summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2020-05-02 03:02:27 +0300
committerDavid S. Miller <davem@davemloft.net>2020-05-02 03:02:27 +0300
commit115506fea499f1cd9a80290b31eca4352e0559e9 (patch)
tree25e4ff3b5a49115d964fab690cf72fa18a5f96bf
parent5b95dea31636ce93660930d16172fe75589b2e70 (diff)
parent57dc6f3b4133f45e73d87895180ca1f3eaf01722 (diff)
downloadlinux-115506fea499f1cd9a80290b31eca4352e0559e9.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says: ==================== pull-request: bpf-next 2020-05-01 (v2) The following pull-request contains BPF updates for your *net-next* tree. We've added 61 non-merge commits during the last 6 day(s) which contain a total of 153 files changed, 6739 insertions(+), 3367 deletions(-). The main changes are: 1) pulled work.sysctl from vfs tree with sysctl bpf changes. 2) bpf_link observability, from Andrii. 3) BTF-defined map in map, from Andrii. 4) asan fixes for selftests, from Andrii. 5) Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH, from Jakub. 6) production cloudflare classifier as a selftes, from Lorenz. 7) bpf_ktime_get_*_ns() helper improvements, from Maciej. 8) unprivileged bpftool feature probe, from Quentin. 9) BPF_ENABLE_STATS command, from Song. 10) enable bpf_[gs]etsockopt() helpers for sock_ops progs, from Stanislav. 11) enable a bunch of common helpers for cg-device, sysctl, sockopt progs, from Stanislav. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--arch/arm64/kernel/armv8_deprecated.c2
-rw-r--r--arch/arm64/kernel/fpsimd.c3
-rw-r--r--arch/mips/lasat/sysctl.c13
-rw-r--r--arch/riscv/net/bpf_jit_comp32.c103
-rw-r--r--arch/s390/appldata/appldata_base.c11
-rw-r--r--arch/s390/kernel/debug.c2
-rw-r--r--arch/s390/kernel/topology.c2
-rw-r--r--arch/s390/mm/cmm.c12
-rw-r--r--arch/x86/kernel/itmt.c3
-rw-r--r--drivers/cdrom/cdrom.c2
-rw-r--r--drivers/char/random.c2
-rw-r--r--drivers/macintosh/mac_hid.c3
-rw-r--r--drivers/media/rc/bpf-lirc.c2
-rw-r--r--drivers/parport/procfs.c39
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/file_table.c4
-rw-r--r--fs/fscache/main.c3
-rw-r--r--fs/inode.c2
-rw-r--r--fs/proc/proc_sysctl.c47
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/xfs/xfs_sysctl.c4
-rw-r--r--include/linux/bpf-cgroup.h23
-rw-r--r--include/linux/bpf.h37
-rw-r--r--include/linux/bpf_types.h6
-rw-r--r--include/linux/compaction.h2
-rw-r--r--include/linux/coredump.h4
-rw-r--r--include/linux/file.h2
-rw-r--r--include/linux/filter.h2
-rw-r--r--include/linux/fs.h6
-rw-r--r--include/linux/ftrace.h3
-rw-r--r--include/linux/hugetlb.h15
-rw-r--r--include/linux/kprobes.h2
-rw-r--r--include/linux/latencytop.h4
-rw-r--r--include/linux/mm.h14
-rw-r--r--include/linux/mmzone.h27
-rw-r--r--include/linux/nmi.h15
-rw-r--r--include/linux/perf_event.h13
-rw-r--r--include/linux/pid.h3
-rw-r--r--include/linux/printk.h2
-rw-r--r--include/linux/sched/sysctl.h44
-rw-r--r--include/linux/security.h2
-rw-r--r--include/linux/sysctl.h61
-rw-r--r--include/linux/timer.h3
-rw-r--r--include/linux/vmstat.h8
-rw-r--r--include/linux/writeback.h28
-rw-r--r--include/uapi/linux/bpf.h69
-rw-r--r--ipc/ipc_sysctl.c10
-rw-r--r--ipc/mq_sysctl.c4
-rw-r--r--kernel/bpf/btf.c2
-rw-r--r--kernel/bpf/cgroup.c146
-rw-r--r--kernel/bpf/core.c6
-rw-r--r--kernel/bpf/helpers.c89
-rw-r--r--kernel/bpf/syscall.c410
-rw-r--r--kernel/bpf/verifier.c80
-rw-r--r--kernel/cgroup/cgroup.c27
-rw-r--r--kernel/events/callchain.c2
-rw-r--r--kernel/events/core.c6
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/latencytop.c4
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/sched/core.c9
-rw-r--r--kernel/sched/fair.c3
-rw-r--r--kernel/sched/rt.c10
-rw-r--r--kernel/sched/topology.c2
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/sysctl.c3057
-rw-r--r--kernel/time/timer.c3
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--kernel/trace/trace.c2
-rw-r--r--kernel/umh.c2
-rw-r--r--kernel/utsname_sysctl.c2
-rw-r--r--kernel/watchdog.c12
-rw-r--r--mm/compaction.c2
-rw-r--r--mm/hugetlb.c9
-rw-r--r--mm/page-writeback.c16
-rw-r--r--mm/page_alloc.c42
-rw-r--r--mm/util.c10
-rw-r--r--mm/vmstat.c4
-rw-r--r--net/bridge/br_netfilter_hooks.c2
-rw-r--r--net/core/filter.c200
-rw-r--r--net/core/neighbour.c28
-rw-r--r--net/core/sock_map.c18
-rw-r--r--net/core/sysctl_net_core.c27
-rw-r--r--net/decnet/dn_dev.c7
-rw-r--r--net/decnet/sysctl_net_decnet.c27
-rw-r--r--net/ipv4/devinet.c9
-rw-r--r--net/ipv4/route.c3
-rw-r--r--net/ipv4/sysctl_net_ipv4.c38
-rw-r--r--net/ipv6/addrconf.c33
-rw-r--r--net/ipv6/ndisc.c3
-rw-r--r--net/ipv6/route.c5
-rw-r--r--net/ipv6/sysctl_net_ipv6.c3
-rw-r--r--net/mpls/af_mpls.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c6
-rw-r--r--net/netfilter/nf_conntrack_standalone.c2
-rw-r--r--net/netfilter/nf_log.c2
-rw-r--r--net/phonet/sysctl.c3
-rw-r--r--net/rds/tcp.c6
-rw-r--r--net/sctp/sysctl.c32
-rw-r--r--net/sunrpc/sysctl.c29
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c7
-rw-r--r--net/xdp/xsk.c4
-rw-r--r--security/apparmor/lsm.c2
-rw-r--r--security/min_addr.c2
-rw-r--r--security/yama/yama_lsm.c2
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-feature.rst12
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-link.rst118
-rw-r--r--tools/bpf/bpftool/Makefile13
-rw-r--r--tools/bpf/bpftool/bash-completion/bpftool41
-rw-r--r--tools/bpf/bpftool/btf.c1
-rw-r--r--tools/bpf/bpftool/cgroup.c48
-rw-r--r--tools/bpf/bpftool/common.c2
-rw-r--r--tools/bpf/bpftool/feature.c143
-rw-r--r--tools/bpf/bpftool/gen.c1
-rw-r--r--tools/bpf/bpftool/jit_disasm.c1
-rw-r--r--tools/bpf/bpftool/link.c333
-rw-r--r--tools/bpf/bpftool/main.c6
-rw-r--r--tools/bpf/bpftool/main.h37
-rw-r--r--tools/include/uapi/linux/bpf.h69
-rw-r--r--tools/lib/bpf/bpf.c29
-rw-r--r--tools/lib/bpf/bpf.h5
-rw-r--r--tools/lib/bpf/bpf_helpers.h7
-rw-r--r--tools/lib/bpf/btf_dump.c2
-rw-r--r--tools/lib/bpf/hashmap.c7
-rw-r--r--tools/lib/bpf/libbpf.c705
-rw-r--r--tools/lib/bpf/libbpf.map7
-rw-r--r--tools/testing/selftests/bpf/.gitignore4
-rw-r--r--tools/testing/selftests/bpf/Makefile16
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c110
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c49
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cls_redirect.c456
-rw-r--r--tools/testing/selftests/bpf/prog_tests/core_reloc.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/enable_stats.c45
-rw-r--r--tools/testing/selftests/bpf/prog_tests/hashmap.c (renamed from tools/testing/selftests/bpf/test_hashmap.c)280
-rw-r--r--tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c5
-rw-r--r--tools/testing/selftests/bpf/prog_tests/perf_buffer.c5
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sk_assign.c21
-rw-r--r--tools/testing/selftests/bpf/progs/connect4_prog.c46
-rw-r--r--tools/testing/selftests/bpf/progs/test_btf_map_in_map.c76
-rw-r--r--tools/testing/selftests/bpf/progs/test_cls_redirect.c1058
-rw-r--r--tools/testing/selftests/bpf/progs/test_cls_redirect.h54
-rw-r--r--tools/testing/selftests/bpf/progs/test_enable_stats.c18
-rw-r--r--tools/testing/selftests/bpf/progs/test_obj_id.c14
-rw-r--r--tools/testing/selftests/bpf/progs/test_sk_assign.c82
-rw-r--r--tools/testing/selftests/bpf/progs/test_sysctl_prog.c2
-rw-r--r--tools/testing/selftests/bpf/test_progs.c21
-rw-r--r--tools/testing/selftests/bpf/test_progs.h7
-rw-r--r--tools/testing/selftests/bpf/test_verifier.c19
-rw-r--r--tools/testing/selftests/bpf/verifier/event_output.c24
-rw-r--r--tools/testing/selftests/bpf/verifier/prevent_map_lookup.c30
-rw-r--r--tools/testing/selftests/bpf/verifier/sock.c115
153 files changed, 6332 insertions, 2960 deletions
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index c19aa81ddc8c..7364de008bab 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -203,7 +203,7 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops)
}
static int emulation_proc_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = 0;
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 94289d126993..35cb5e66c504 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -341,8 +341,7 @@ static unsigned int find_supported_vector_length(unsigned int vl)
#ifdef CONFIG_SYSCTL
static int sve_proc_do_default_vl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int vl = sve_default_vl;
diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c
index e666fe26c50d..2119541a5b8b 100644
--- a/arch/mips/lasat/sysctl.c
+++ b/arch/mips/lasat/sysctl.c
@@ -95,16 +95,15 @@ int proc_lasat_ip(struct ctl_table *table, int write,
len = 0;
p = buffer;
while (len < *lenp) {
- if (get_user(c, p++))
- return -EFAULT;
+ c = *p;
+ p++;
if (c == 0 || c == '\n')
break;
len++;
}
if (len >= sizeof(ipbuf)-1)
len = sizeof(ipbuf) - 1;
- if (copy_from_user(ipbuf, buffer, len))
- return -EFAULT;
+ memcpy(ipbuf, buffer, len);
ipbuf[len] = 0;
*ppos += *lenp;
/* Now see if we can convert it to a valid IP */
@@ -122,11 +121,9 @@ int proc_lasat_ip(struct ctl_table *table, int write,
if (len > *lenp)
len = *lenp;
if (len)
- if (copy_to_user(buffer, ipbuf, len))
- return -EFAULT;
+ memcpy(buffer, ipbuf, len);
if (len < *lenp) {
- if (put_user('\n', ((char *) buffer) + len))
- return -EFAULT;
+ *((char *)buffer + len) = '\n';
len++;
}
*lenp = len;
diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c
index 302934177760..b198eaa74456 100644
--- a/arch/riscv/net/bpf_jit_comp32.c
+++ b/arch/riscv/net/bpf_jit_comp32.c
@@ -13,8 +13,35 @@
#include <linux/filter.h>
#include "bpf_jit.h"
+/*
+ * Stack layout during BPF program execution:
+ *
+ * high
+ * RV32 fp => +----------+
+ * | saved ra |
+ * | saved fp | RV32 callee-saved registers
+ * | ... |
+ * +----------+ <= (fp - 4 * NR_SAVED_REGISTERS)
+ * | hi(R6) |
+ * | lo(R6) |
+ * | hi(R7) | JIT scratch space for BPF registers
+ * | lo(R7) |
+ * | ... |
+ * BPF_REG_FP => +----------+ <= (fp - 4 * NR_SAVED_REGISTERS
+ * | | - 4 * BPF_JIT_SCRATCH_REGS)
+ * | |
+ * | ... | BPF program stack
+ * | |
+ * RV32 sp => +----------+
+ * | |
+ * | ... | Function call stack
+ * | |
+ * +----------+
+ * low
+ */
+
enum {
- /* Stack layout - these are offsets from (top of stack - 4). */
+ /* Stack layout - these are offsets from top of JIT scratch space. */
BPF_R6_HI,
BPF_R6_LO,
BPF_R7_HI,
@@ -29,7 +56,11 @@ enum {
BPF_JIT_SCRATCH_REGS,
};
-#define STACK_OFFSET(k) (-4 - ((k) * 4))
+/* Number of callee-saved registers stored to stack: ra, fp, s1--s7. */
+#define NR_SAVED_REGISTERS 9
+
+/* Offset from fp for BPF registers stored on stack. */
+#define STACK_OFFSET(k) (-4 - (4 * NR_SAVED_REGISTERS) - (4 * (k)))
#define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
#define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
@@ -111,11 +142,9 @@ static void emit_imm64(const s8 *rd, s32 imm_hi, s32 imm_lo,
static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
{
- int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 4;
+ int stack_adjust = ctx->stack_size;
const s8 *r0 = bpf2rv32[BPF_REG_0];
- store_offset -= 4 * BPF_JIT_SCRATCH_REGS;
-
/* Set return value if not tail call. */
if (!is_tail_call) {
emit(rv_addi(RV_REG_A0, lo(r0), 0), ctx);
@@ -123,15 +152,15 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
}
/* Restore callee-saved registers. */
- emit(rv_lw(RV_REG_RA, store_offset - 0, RV_REG_SP), ctx);
- emit(rv_lw(RV_REG_FP, store_offset - 4, RV_REG_SP), ctx);
- emit(rv_lw(RV_REG_S1, store_offset - 8, RV_REG_SP), ctx);
- emit(rv_lw(RV_REG_S2, store_offset - 12, RV_REG_SP), ctx);
- emit(rv_lw(RV_REG_S3, store_offset - 16, RV_REG_SP), ctx);
- emit(rv_lw(RV_REG_S4, store_offset - 20, RV_REG_SP), ctx);
- emit(rv_lw(RV_REG_S5, store_offset - 24, RV_REG_SP), ctx);
- emit(rv_lw(RV_REG_S6, store_offset - 28, RV_REG_SP), ctx);
- emit(rv_lw(RV_REG_S7, store_offset - 32, RV_REG_SP), ctx);
+ emit(rv_lw(RV_REG_RA, stack_adjust - 4, RV_REG_SP), ctx);
+ emit(rv_lw(RV_REG_FP, stack_adjust - 8, RV_REG_SP), ctx);
+ emit(rv_lw(RV_REG_S1, stack_adjust - 12, RV_REG_SP), ctx);
+ emit(rv_lw(RV_REG_S2, stack_adjust - 16, RV_REG_SP), ctx);
+ emit(rv_lw(RV_REG_S3, stack_adjust - 20, RV_REG_SP), ctx);
+ emit(rv_lw(RV_REG_S4, stack_adjust - 24, RV_REG_SP), ctx);
+ emit(rv_lw(RV_REG_S5, stack_adjust - 28, RV_REG_SP), ctx);
+ emit(rv_lw(RV_REG_S6, stack_adjust - 32, RV_REG_SP), ctx);
+ emit(rv_lw(RV_REG_S7, stack_adjust - 36, RV_REG_SP), ctx);
emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx);
@@ -770,12 +799,13 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
emit_bcc(BPF_JGE, lo(idx_reg), RV_REG_T1, off, ctx);
/*
- * if ((temp_tcc = tcc - 1) < 0)
+ * temp_tcc = tcc - 1;
+ * if (tcc < 0)
* goto out;
*/
emit(rv_addi(RV_REG_T1, RV_REG_TCC, -1), ctx);
off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
- emit_bcc(BPF_JSLT, RV_REG_T1, RV_REG_ZERO, off, ctx);
+ emit_bcc(BPF_JSLT, RV_REG_TCC, RV_REG_ZERO, off, ctx);
/*
* prog = array->ptrs[index];
@@ -1259,17 +1289,20 @@ notsupported:
void bpf_jit_build_prologue(struct rv_jit_context *ctx)
{
- /* Make space to save 9 registers: ra, fp, s1--s7. */
- int stack_adjust = 9 * sizeof(u32), store_offset, bpf_stack_adjust;
const s8 *fp = bpf2rv32[BPF_REG_FP];
const s8 *r1 = bpf2rv32[BPF_REG_1];
-
- bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16);
+ int stack_adjust = 0;
+ int bpf_stack_adjust =
+ round_up(ctx->prog->aux->stack_depth, STACK_ALIGN);
+
+ /* Make space for callee-saved registers. */
+ stack_adjust += NR_SAVED_REGISTERS * sizeof(u32);
+ /* Make space for BPF registers on stack. */
+ stack_adjust += BPF_JIT_SCRATCH_REGS * sizeof(u32);
+ /* Make space for BPF stack. */
stack_adjust += bpf_stack_adjust;
-
- store_offset = stack_adjust - 4;
-
- stack_adjust += 4 * BPF_JIT_SCRATCH_REGS;
+ /* Round up for stack alignment. */
+ stack_adjust = round_up(stack_adjust, STACK_ALIGN);
/*
* The first instruction sets the tail-call-counter (TCC) register.
@@ -1280,24 +1313,24 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx)
emit(rv_addi(RV_REG_SP, RV_REG_SP, -stack_adjust), ctx);
/* Save callee-save registers. */
- emit(rv_sw(RV_REG_SP, store_offset - 0, RV_REG_RA), ctx);
- emit(rv_sw(RV_REG_SP, store_offset - 4, RV_REG_FP), ctx);
- emit(rv_sw(RV_REG_SP, store_offset - 8, RV_REG_S1), ctx);
- emit(rv_sw(RV_REG_SP, store_offset - 12, RV_REG_S2), ctx);
- emit(rv_sw(RV_REG_SP, store_offset - 16, RV_REG_S3), ctx);
- emit(rv_sw(RV_REG_SP, store_offset - 20, RV_REG_S4), ctx);
- emit(rv_sw(RV_REG_SP, store_offset - 24, RV_REG_S5), ctx);
- emit(rv_sw(RV_REG_SP, store_offset - 28, RV_REG_S6), ctx);
- emit(rv_sw(RV_REG_SP, store_offset - 32, RV_REG_S7), ctx);
+ emit(rv_sw(RV_REG_SP, stack_adjust - 4, RV_REG_RA), ctx);
+ emit(rv_sw(RV_REG_SP, stack_adjust - 8, RV_REG_FP), ctx);
+ emit(rv_sw(RV_REG_SP, stack_adjust - 12, RV_REG_S1), ctx);
+ emit(rv_sw(RV_REG_SP, stack_adjust - 16, RV_REG_S2), ctx);
+ emit(rv_sw(RV_REG_SP, stack_adjust - 20, RV_REG_S3), ctx);
+ emit(rv_sw(RV_REG_SP, stack_adjust - 24, RV_REG_S4), ctx);
+ emit(rv_sw(RV_REG_SP, stack_adjust - 28, RV_REG_S5), ctx);
+ emit(rv_sw(RV_REG_SP, stack_adjust - 32, RV_REG_S6), ctx);
+ emit(rv_sw(RV_REG_SP, stack_adjust - 36, RV_REG_S7), ctx);
/* Set fp: used as the base address for stacked BPF registers. */
emit(rv_addi(RV_REG_FP, RV_REG_SP, stack_adjust), ctx);
- /* Set up BPF stack pointer. */
+ /* Set up BPF frame pointer. */
emit(rv_addi(lo(fp), RV_REG_SP, bpf_stack_adjust), ctx);
emit(rv_addi(hi(fp), RV_REG_ZERO, 0), ctx);
- /* Set up context pointer. */
+ /* Set up BPF context pointer. */
emit(rv_addi(lo(r1), RV_REG_A0, 0), ctx);
emit(rv_addi(hi(r1), RV_REG_ZERO, 0), ctx);
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index aa738cad1338..d74a4c7d5df6 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -51,10 +51,9 @@ static struct platform_device *appldata_pdev;
*/
static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata";
static int appldata_timer_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
static int appldata_interval_handler(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
static struct ctl_table_header *appldata_sysctl_header;
static struct ctl_table appldata_table[] = {
@@ -217,7 +216,7 @@ static void __appldata_vtimer_setup(int cmd)
*/
static int
appldata_timer_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int timer_active = appldata_timer_active;
int rc;
@@ -250,7 +249,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write,
*/
static int
appldata_interval_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int interval = appldata_interval;
int rc;
@@ -280,7 +279,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write,
*/
static int
appldata_generic_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct appldata_ops *ops = NULL, *tmp_ops;
struct list_head *lh;
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 6d321f5f101d..636446003a06 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -867,7 +867,7 @@ static int debug_active = 1;
* if debug_active is already off
*/
static int s390dbf_procactive(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!write || debug_stoppable || !debug_active)
return proc_dointvec(table, write, buffer, lenp, ppos);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 5f70cefc13e4..332b542548cd 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -594,7 +594,7 @@ static int __init topology_setup(char *str)
early_param("topology", topology_setup);
static int topology_ctl_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int enabled = topology_is_enabled();
int new_mode;
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index ae989b740376..36bce727897b 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -245,7 +245,7 @@ static int cmm_skip_blanks(char *cp, char **endp)
}
static int cmm_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
long nr = cmm_get_pages();
struct ctl_table ctl_entry = {
@@ -264,7 +264,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write,
}
static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
long nr = cmm_get_timed_pages();
@@ -284,7 +284,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
}
static int cmm_timeout_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char buf[64], *p;
long nr, seconds;
@@ -297,8 +297,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write,
if (write) {
len = min(*lenp, sizeof(buf));
- if (copy_from_user(buf, buffer, len))
- return -EFAULT;
+ memcpy(buf, buffer, len);
buf[len - 1] = '\0';
cmm_skip_blanks(buf, &p);
nr = simple_strtoul(p, &p, 0);
@@ -311,8 +310,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write,
cmm_timeout_pages, cmm_timeout_seconds);
if (len > *lenp)
len = *lenp;
- if (copy_to_user(buffer, buf, len))
- return -EFAULT;
+ memcpy(buffer, buf, len);
*lenp = len;
*ppos += len;
}
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 1cb3ca9bba49..1afbdd1dd777 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable;
unsigned int __read_mostly sysctl_sched_itmt_enabled;
static int sched_itmt_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int old_sysctl;
int ret;
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index faca0f346fff..e3bbe108eb54 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3631,7 +3631,7 @@ static void cdrom_update_settings(void)
}
static int cdrom_sysctl_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 0d10e31fd342..1e0db78b83ba 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -2057,7 +2057,7 @@ static char sysctl_bootid[16];
* sysctl system call, as 16 bytes of binary data.
*/
static int proc_do_uuid(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table fake_table;
unsigned char buf[64], tmp_uuid[16], *uuid;
diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c
index 7af0c536d568..28b8581b44dd 100644
--- a/drivers/macintosh/mac_hid.c
+++ b/drivers/macintosh/mac_hid.c
@@ -183,8 +183,7 @@ static void mac_hid_stop_emulation(void)
}
static int mac_hid_toggle_emumouse(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
int old_val = *valp;
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 0f3417d161b8..069c42f22a8c 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -103,6 +103,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_peek_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_prandom_u32:
diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c
index 48804049d697..ee7b5daabfd4 100644
--- a/drivers/parport/procfs.c
+++ b/drivers/parport/procfs.c
@@ -34,7 +34,7 @@
#define PARPORT_MAX_SPINTIME_VALUE 1000
static int do_active_device(struct ctl_table *table, int write,
- void __user *result, size_t *lenp, loff_t *ppos)
+ void *result, size_t *lenp, loff_t *ppos)
{
struct parport *port = (struct parport *)table->extra1;
char buffer[256];
@@ -65,13 +65,13 @@ static int do_active_device(struct ctl_table *table, int write,
*lenp = len;
*ppos += len;
-
- return copy_to_user(result, buffer, len) ? -EFAULT : 0;
+ memcpy(result, buffer, len);
+ return 0;
}
#ifdef CONFIG_PARPORT_1284
static int do_autoprobe(struct ctl_table *table, int write,
- void __user *result, size_t *lenp, loff_t *ppos)
+ void *result, size_t *lenp, loff_t *ppos)
{
struct parport_device_info *info = table->extra2;
const char *str;
@@ -108,13 +108,13 @@ static int do_autoprobe(struct ctl_table *table, int write,
*ppos += len;
- return copy_to_user (result, buffer, len) ? -EFAULT : 0;
+ memcpy(result, buffer, len);
+ return 0;
}
#endif /* IEEE1284.3 support. */
static int do_hardware_base_addr(struct ctl_table *table, int write,
- void __user *result,
- size_t *lenp, loff_t *ppos)
+ void *result, size_t *lenp, loff_t *ppos)
{
struct parport *port = (struct parport *)table->extra1;
char buffer[20];
@@ -136,13 +136,12 @@ static int do_hardware_base_addr(struct ctl_table *table, int write,
*lenp = len;
*ppos += len;
-
- return copy_to_user(result, buffer, len) ? -EFAULT : 0;
+ memcpy(result, buffer, len);
+ return 0;
}
static int do_hardware_irq(struct ctl_table *table, int write,
- void __user *result,
- size_t *lenp, loff_t *ppos)
+ void *result, size_t *lenp, loff_t *ppos)
{
struct parport *port = (struct parport *)table->extra1;
char buffer[20];
@@ -164,13 +163,12 @@ static int do_hardware_irq(struct ctl_table *table, int write,
*lenp = len;
*ppos += len;
-
- return copy_to_user(result, buffer, len) ? -EFAULT : 0;
+ memcpy(result, buffer, len);
+ return 0;
}
static int do_hardware_dma(struct ctl_table *table, int write,
- void __user *result,
- size_t *lenp, loff_t *ppos)
+ void *result, size_t *lenp, loff_t *ppos)
{
struct parport *port = (struct parport *)table->extra1;
char buffer[20];
@@ -192,13 +190,12 @@ static int do_hardware_dma(struct ctl_table *table, int write,
*lenp = len;
*ppos += len;
-
- return copy_to_user(result, buffer, len) ? -EFAULT : 0;
+ memcpy(result, buffer, len);
+ return 0;
}
static int do_hardware_modes(struct ctl_table *table, int write,
- void __user *result,
- size_t *lenp, loff_t *ppos)
+ void *result, size_t *lenp, loff_t *ppos)
{
struct parport *port = (struct parport *)table->extra1;
char buffer[40];
@@ -231,8 +228,8 @@ static int do_hardware_modes(struct ctl_table *table, int write,
*lenp = len;
*ppos += len;
-
- return copy_to_user(result, buffer, len) ? -EFAULT : 0;
+ memcpy(result, buffer, len);
+ return 0;
}
#define PARPORT_PORT_DIR(CHILD) { .procname = NULL, .mode = 0555, .child = CHILD }
diff --git a/fs/dcache.c b/fs/dcache.c
index b280e07e162b..8dd4d8d7bd0b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -165,7 +165,7 @@ static long get_nr_dentry_negative(void)
return sum < 0 ? 0 : sum;
}
-int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
+int proc_nr_dentry(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = get_nr_dentry();
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index dc1a1d5d825b..f00fcc4a4f72 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -47,7 +47,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
}
int drop_caches_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int ret;
diff --git a/fs/file_table.c b/fs/file_table.c
index 30d55c9a1744..3b612535391f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -80,14 +80,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
*/
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
int proc_nr_files(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
files_stat.nr_files = get_nr_files();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#else
int proc_nr_files(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 59c2494efda3..c1e6cc9091aa 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -51,8 +51,7 @@ static unsigned fscache_op_max_active = 2;
static struct ctl_table_header *fscache_sysctl_header;
static int fscache_max_active_sysctl(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct workqueue_struct **wqp = table->extra1;
unsigned int *datap = table->data;
diff --git a/fs/inode.c b/fs/inode.c
index 93d9252a00ab..cc6e701b7e5d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -108,7 +108,7 @@ long get_nr_dirty_inodes(void)
*/
#ifdef CONFIG_SYSCTL
int proc_nr_inodes(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
inodes_stat.nr_inodes = get_nr_inodes();
inodes_stat.nr_unused = get_nr_inodes_unused();
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b6f5d459b087..df2143e05c57 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -539,13 +539,13 @@ out:
return err;
}
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
+static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
size_t count, loff_t *ppos, int write)
{
struct inode *inode = file_inode(filp);
struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
- void *new_buf = NULL;
+ void *kbuf;
ssize_t error;
if (IS_ERR(head))
@@ -564,27 +564,38 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
if (!table->proc_handler)
goto out;
- error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count,
- ppos, &new_buf);
+ if (write) {
+ kbuf = memdup_user_nul(ubuf, count);
+ if (IS_ERR(kbuf)) {
+ error = PTR_ERR(kbuf);
+ goto out;
+ }
+ } else {
+ error = -ENOMEM;
+ kbuf = kzalloc(count, GFP_KERNEL);
+ if (!kbuf)
+ goto out;
+ }
+
+ error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
+ ppos);
if (error)
- goto out;
+ goto out_free_buf;
/* careful: calling conventions are nasty here */
- if (new_buf) {
- mm_segment_t old_fs;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- error = table->proc_handler(table, write, (void __user *)new_buf,
- &count, ppos);
- set_fs(old_fs);
- kfree(new_buf);
- } else {
- error = table->proc_handler(table, write, buf, &count, ppos);
+ error = table->proc_handler(table, write, kbuf, &count, ppos);
+ if (error)
+ goto out_free_buf;
+
+ if (!write) {
+ error = -EFAULT;
+ if (copy_to_user(ubuf, kbuf, count))
+ goto out_free_buf;
}
- if (!error)
- error = count;
+ error = count;
+out_free_buf:
+ kfree(kbuf);
out:
sysctl_head_finish(head);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index b6a4f692d345..7b4bac91146b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2841,7 +2841,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = {
EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
static int do_proc_dqstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int type = (unsigned long *)table->data - dqstats.stat;
s64 value = percpu_counter_sum(&dqstats.counter[type]);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 31b3bdbd2eba..021ef96d0542 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -13,7 +13,7 @@ STATIC int
xfs_stats_clear_proc_handler(
struct ctl_table *ctl,
int write,
- void __user *buffer,
+ void *buffer,
size_t *lenp,
loff_t *ppos)
{
@@ -33,7 +33,7 @@ STATIC int
xfs_panic_mask_proc_handler(
struct ctl_table *ctl,
int write,
- void __user *buffer,
+ void *buffer,
size_t *lenp,
loff_t *ppos)
{
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c11b413d5b1a..272626cc3fc9 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -57,8 +57,6 @@ struct bpf_cgroup_link {
enum bpf_attach_type type;
};
-extern const struct bpf_link_ops bpf_cgroup_link_lops;
-
struct bpf_prog_list {
struct list_head node;
struct bpf_prog *prog;
@@ -100,8 +98,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
struct bpf_cgroup_link *link,
enum bpf_attach_type type);
-int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
- struct bpf_prog *new_prog);
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr);
@@ -112,8 +108,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
u32 flags);
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type);
-int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog,
- struct bpf_prog *new_prog);
int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr);
@@ -138,8 +132,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
struct ctl_table *table, int write,
- void __user *buf, size_t *pcount,
- loff_t *ppos, void **new_buf,
+ void **buf, size_t *pcount, loff_t *ppos,
enum bpf_attach_type type);
int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
@@ -302,12 +295,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
})
-#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \
({ \
int __ret = 0; \
if (cgroup_bpf_enabled) \
__ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \
- buf, count, pos, nbuf, \
+ buf, count, pos, \
BPF_CGROUP_SYSCTL); \
__ret; \
})
@@ -354,7 +347,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
#else
struct bpf_prog;
-struct bpf_link;
struct cgroup_bpf {};
static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
@@ -378,13 +370,6 @@ static inline int cgroup_bpf_link_attach(const union bpf_attr *attr,
return -EINVAL;
}
-static inline int cgroup_bpf_replace(struct bpf_link *link,
- struct bpf_prog *old_prog,
- struct bpf_prog *new_prog)
-{
- return -EINVAL;
-}
-
static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
@@ -429,7 +414,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
optlen, max_optlen, retval) ({ retval; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index fd2b2322412d..1262ec460ab3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -987,6 +987,7 @@ _out: \
#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);
+extern struct mutex bpf_stats_enabled_mutex;
/*
* Block execution of BPF programs attached to instrumentation (perf,
@@ -1026,9 +1027,11 @@ extern const struct file_operations bpf_prog_fops;
extern const struct bpf_verifier_ops _name ## _verifier_ops;
#define BPF_MAP_TYPE(_id, _ops) \
extern const struct bpf_map_ops _ops;
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
extern const struct bpf_prog_ops bpf_offload_prog_ops;
extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
@@ -1085,21 +1088,35 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
struct bpf_link {
atomic64_t refcnt;
+ u32 id;
+ enum bpf_link_type type;
const struct bpf_link_ops *ops;
struct bpf_prog *prog;
struct work_struct work;
};
+struct bpf_link_primer {
+ struct bpf_link *link;
+ struct file *file;
+ int fd;
+ u32 id;
+};
+
struct bpf_link_ops {
void (*release)(struct bpf_link *link);
void (*dealloc)(struct bpf_link *link);
-
+ int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog);
+ void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
+ int (*fill_link_info)(const struct bpf_link *link,
+ struct bpf_link_info *info);
};
-void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
- struct bpf_prog *prog);
-void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
- int link_fd);
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog);
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer);
+int bpf_link_settle(struct bpf_link_primer *primer);
+void bpf_link_cleanup(struct bpf_link_primer *primer);
void bpf_link_inc(struct bpf_link *link);
void bpf_link_put(struct bpf_link *link);
int bpf_link_new_fd(struct bpf_link *link);
@@ -1215,6 +1232,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
struct bpf_prog *bpf_prog_by_id(u32 id);
+const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
@@ -1365,6 +1383,12 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id)
{
return ERR_PTR(-ENOTSUPP);
}
+
+static inline const struct bpf_func_proto *
+bpf_base_func_proto(enum bpf_func_id func_id)
+{
+ return NULL;
+}
#endif /* CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
@@ -1502,6 +1526,7 @@ extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
+extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto;
extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
@@ -1523,6 +1548,7 @@ extern const struct bpf_func_proto bpf_strtoul_proto;
extern const struct bpf_func_proto bpf_tcp_sock_proto;
extern const struct bpf_func_proto bpf_jiffies64_proto;
extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto;
+extern const struct bpf_func_proto bpf_event_output_data_proto;
const struct bpf_func_proto *bpf_tracing_func_proto(
enum bpf_func_id func_id, const struct bpf_prog *prog);
@@ -1530,6 +1556,7 @@ const struct bpf_func_proto *bpf_tracing_func_proto(
/* Shared helpers among cBPF and eBPF. */
void bpf_user_rnd_init_once(void);
u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
#if defined(CONFIG_NET)
bool bpf_sock_common_is_valid_access(int off, int size,
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index ba0c2d56f8a3..8345cdf553b8 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -118,3 +118,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
#if defined(CONFIG_BPF_JIT)
BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
#endif
+
+BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
+BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
+#ifdef CONFIG_CGROUP_BPF
+BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup)
+#endif
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 4b898cdbdf05..a0eabfbeb0e1 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -86,7 +86,7 @@ static inline unsigned long compact_gap(unsigned int order)
#ifdef CONFIG_COMPACTION
extern int sysctl_compact_memory;
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos);
+ void *buffer, size_t *length, loff_t *ppos);
extern int sysctl_extfrag_threshold;
extern int sysctl_compact_unevictable_allowed;
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index abf4b4e65dbb..7a899e83835d 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -22,4 +22,8 @@ extern void do_coredump(const kernel_siginfo_t *siginfo);
static inline void do_coredump(const kernel_siginfo_t *siginfo) {}
#endif
+extern int core_uses_pid;
+extern char core_pattern[];
+extern unsigned int core_pipe_limit;
+
#endif /* _LINUX_COREDUMP_H */
diff --git a/include/linux/file.h b/include/linux/file.h
index 142d102f285e..122f80084a3e 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -94,4 +94,6 @@ extern void fd_install(unsigned int fd, struct file *file);
extern void flush_delayed_fput(void);
extern void __fput_sync(struct file *);
+extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
+
#endif /* __LINUX_FILE_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9b5aa5c483cc..af37318bb1c5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -863,8 +863,6 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
bpf_aux_classic_check_t trans, bool save_orig);
void bpf_prog_destroy(struct bpf_prog *fp);
-const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id);
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4f6f59b4f22a..9b028d260649 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3536,11 +3536,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
struct ctl_table;
int proc_nr_files(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
int proc_nr_dentry(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
int proc_nr_inodes(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
int __init get_filesystem_list(char *buf);
#define __FMODE_EXEC ((__force int) FMODE_EXEC)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index db95244a62d4..ddfc377de0d2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1005,8 +1005,7 @@ extern void disable_trace_on_warning(void);
extern int __disable_trace_on_warning;
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
#else /* CONFIG_TRACING */
static inline void disable_trace_on_warning(void) { }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 43a1cef8f0f1..92c21c5ccc58 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -105,14 +105,13 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
void hugepage_put_subpool(struct hugepage_subpool *spool);
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
-int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
-int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
-int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
-
-#ifdef CONFIG_NUMA
-int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-#endif
+int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
+int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
+int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 04bdaf01112c..594265bfd390 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -312,7 +312,7 @@ DEFINE_INSN_CACHE_OPS(optinsn);
#ifdef CONFIG_SYSCTL
extern int sysctl_kprobes_optimization;
extern int proc_kprobes_optimization_handler(struct ctl_table *table,
- int write, void __user *buffer,
+ int write, void *buffer,
size_t *length, loff_t *ppos);
#endif
extern void wait_for_kprobe_optimizer(void);
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index 9022f0c2e2e4..abe3d95f795b 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -38,8 +38,8 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
void clear_tsk_latency_tracing(struct task_struct *p);
-extern int sysctl_latencytop(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
#else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5a323422d783..a7b1ef8ed970 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -201,10 +201,10 @@ extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
-extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *,
- size_t *, loff_t *);
-extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
- size_t *, loff_t *);
+int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
+int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
@@ -2957,8 +2957,8 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
#ifdef CONFIG_SYSCTL
extern int sysctl_drop_caches;
-int drop_caches_sysctl_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
+int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
#endif
void drop_slab(void);
@@ -3140,5 +3140,7 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr);
#endif
+extern int sysctl_nr_trim_pages;
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1b9de7d220fb..93cf20f41e26 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -909,24 +909,23 @@ static inline int is_highmem(struct zone *zone)
/* These two functions are used to setup the per zone pages min values */
struct ctl_table;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
+
+int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
+int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
+ size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
+ size_t *, loff_t *);
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
+ void *, size_t *, loff_t *);
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
+ void *, size_t *, loff_t *);
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-
-extern int numa_zonelist_order_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
+ void *, size_t *, loff_t *);
+int numa_zonelist_order_handler(struct ctl_table *, int,
+ void *, size_t *, loff_t *);
+extern int percpu_pagelist_fraction;
extern char numa_zonelist_order[];
#define NUMA_ZONELIST_ORDER_LEN 16
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 9003e29cde46..750c7f395ca9 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -202,16 +202,11 @@ static inline void watchdog_update_hrtimer_threshold(u64 period) { }
#endif
struct ctl_table;
-extern int proc_watchdog(struct ctl_table *, int ,
- void __user *, size_t *, loff_t *);
-extern int proc_nmi_watchdog(struct ctl_table *, int ,
- void __user *, size_t *, loff_t *);
-extern int proc_soft_watchdog(struct ctl_table *, int ,
- void __user *, size_t *, loff_t *);
-extern int proc_watchdog_thresh(struct ctl_table *, int ,
- void __user *, size_t *, loff_t *);
-extern int proc_watchdog_cpumask(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
+int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *);
+int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *);
+int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *);
+int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *);
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
#include <asm/nmi.h>
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9c3e7619c929..347ea379622a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1280,15 +1280,12 @@ extern int sysctl_perf_cpu_time_max_percent;
extern void perf_sample_event_took(u64 sample_len_ns);
-extern int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-
+int perf_proc_update_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
int perf_event_max_stack_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN 0
diff --git a/include/linux/pid.h b/include/linux/pid.h
index cc896f0fc4e3..93543cbc0e6b 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -108,6 +108,9 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new,
struct pid_namespace;
extern struct pid_namespace init_pid_ns;
+extern int pid_max;
+extern int pid_max_min, pid_max_max;
+
/*
* look up a PID in the hash table. Must be called with the tasklist_lock
* or rcu_read_lock() held.
diff --git a/include/linux/printk.h b/include/linux/printk.h
index e061635e0409..fcde0772ec98 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -189,7 +189,7 @@ extern int printk_delay_msec;
extern int dmesg_restrict;
extern int
-devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf,
+devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf,
size_t *lenp, loff_t *ppos);
extern void wake_up_klogd(void);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d4f6215ee03f..7b4d3a49b6c5 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -12,9 +12,8 @@ extern unsigned int sysctl_hung_task_panic;
extern unsigned long sysctl_hung_task_timeout_secs;
extern unsigned long sysctl_hung_task_check_interval_secs;
extern int sysctl_hung_task_warnings;
-extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos);
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
#else
/* Avoid need for ifdefs elsewhere in the code */
enum { sysctl_hung_task_timeout_secs = 0 };
@@ -43,8 +42,7 @@ extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
int sched_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
- loff_t *ppos);
+ void *buffer, size_t *length, loff_t *ppos);
#endif
/*
@@ -72,33 +70,21 @@ extern unsigned int sysctl_sched_autogroup_enabled;
extern int sysctl_sched_rr_timeslice;
extern int sched_rr_timeslice;
-extern int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-
-extern int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-
-#ifdef CONFIG_UCLAMP_TASK
-extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-#endif
-
-extern int sysctl_numa_balancing(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-
-extern int sysctl_schedstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
extern unsigned int sysctl_sched_energy_aware;
-extern int sched_energy_aware_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+int sched_energy_aware_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
#endif
#endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index a8d9310472df..6aa229b252ce 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -211,7 +211,7 @@ struct request_sock;
#ifdef CONFIG_MMU
extern int mmap_min_addr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
#endif
/* security_inode_init_security callback function to write xattrs */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 02fa84493f23..f2401e45a3c2 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -44,35 +44,26 @@ struct ctl_dir;
extern const int sysctl_vals[];
-typedef int proc_handler (struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-
-extern int proc_dostring(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-extern int proc_dointvec(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-extern int proc_douintvec(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-extern int proc_dointvec_minmax(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-extern int proc_douintvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-extern int proc_dointvec_jiffies(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-extern int proc_dointvec_ms_jiffies(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-extern int proc_doulongvec_minmax(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
- void __user *, size_t *, loff_t *);
-extern int proc_do_large_bitmap(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-extern int proc_do_static_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+
+int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
+int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
+int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *,
+ size_t *, loff_t *);
+int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_do_static_key(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
/*
* Register a set of sysctl names by calling register_sysctl_table
@@ -207,7 +198,15 @@ void unregister_sysctl_table(struct ctl_table_header * table);
extern int sysctl_init(void);
+extern int pwrsw_enabled;
+extern int unaligned_enabled;
+extern int unaligned_dump_stack;
+extern int no_unaligned_warning;
+
extern struct ctl_table sysctl_mount_point[];
+extern struct ctl_table random_table[];
+extern struct ctl_table firmware_config_table[];
+extern struct ctl_table epoll_table[];
#else /* CONFIG_SYSCTL */
static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
@@ -238,7 +237,7 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
#endif /* CONFIG_SYSCTL */
-int sysctl_max_threads(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+int sysctl_max_threads(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
#endif /* _LINUX_SYSCTL_H */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 0dc19a8c39c9..07910ae5ddd9 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -201,8 +201,7 @@ struct ctl_table;
extern unsigned int sysctl_timer_migration;
int timer_migration_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
#endif
unsigned long __round_jiffies(unsigned long j, int cpu);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 292485f3d24d..cb507151710f 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -16,8 +16,8 @@ extern int sysctl_stat_interval;
#define DISABLE_NUMA_STAT 0
extern int sysctl_vm_numa_stat;
DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
-extern int sysctl_vm_numa_stat_handler(struct ctl_table *table,
- int write, void __user *buffer, size_t *length, loff_t *ppos);
+int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos);
#endif
struct reclaim_stat {
@@ -274,8 +274,8 @@ void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);
struct ctl_table;
-int vmstat_refresh(struct ctl_table *, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
+ loff_t *ppos);
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a19d845dd7eb..f8a7e1a850fb 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -362,24 +362,18 @@ extern int vm_highmem_is_dirtyable;
extern int block_dump;
extern int laptop_mode;
-extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-extern int dirty_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-extern int dirty_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+int dirty_background_ratio_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+int dirty_background_bytes_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+int dirty_ratio_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+int dirty_bytes_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
int dirtytime_interval_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-
-struct ctl_table;
-int dirty_writeback_centisecs_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
+ void *buffer, size_t *lenp, loff_t *ppos);
+int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7bbf1b65be10..b3643e27e264 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -113,6 +113,9 @@ enum bpf_cmd {
BPF_MAP_DELETE_BATCH,
BPF_LINK_CREATE,
BPF_LINK_UPDATE,
+ BPF_LINK_GET_FD_BY_ID,
+ BPF_LINK_GET_NEXT_ID,
+ BPF_ENABLE_STATS,
};
enum bpf_map_type {
@@ -220,6 +223,15 @@ enum bpf_attach_type {
#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+enum bpf_link_type {
+ BPF_LINK_TYPE_UNSPEC = 0,
+ BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
+ BPF_LINK_TYPE_TRACING = 2,
+ BPF_LINK_TYPE_CGROUP = 3,
+
+ MAX_BPF_LINK_TYPE,
+};
+
/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
*
* NONE(default): No further bpf programs allowed in the subtree.
@@ -379,6 +391,12 @@ enum {
*/
#define BPF_F_QUERY_EFFECTIVE (1U << 0)
+/* type for BPF_ENABLE_STATS */
+enum bpf_stats_type {
+ /* enabled run_time_ns and run_cnt */
+ BPF_STATS_RUN_TIME = 0,
+};
+
enum bpf_stack_build_id_status {
/* user space need an empty entry to identify end of a trace */
BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -523,6 +541,7 @@ union bpf_attr {
__u32 prog_id;
__u32 map_id;
__u32 btf_id;
+ __u32 link_id;
};
__u32 next_id;
__u32 open_flags;
@@ -589,6 +608,10 @@ union bpf_attr {
__u32 old_prog_fd;
} link_update;
+ struct { /* struct used by BPF_ENABLE_STATS command */
+ __u32 type;
+ } enable_stats;
+
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@@ -652,6 +675,8 @@ union bpf_attr {
* u64 bpf_ktime_get_ns(void)
* Description
* Return the time elapsed since system boot, in nanoseconds.
+ * Does not include time the system was suspended.
+ * See: clock_gettime(CLOCK_MONOTONIC)
* Return
* Current *ktime*.
*
@@ -1562,7 +1587,7 @@ union bpf_attr {
* Return
* 0
*
- * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen)
+ * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
* Description
* Emulate a call to **setsockopt()** on the socket associated to
* *bpf_socket*, which must be a full socket. The *level* at
@@ -1570,6 +1595,11 @@ union bpf_attr {
* must be specified, see **setsockopt(2)** for more information.
* The option value of length *optlen* is pointed by *optval*.
*
+ * *bpf_socket* should be one of the following:
+ * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * and **BPF_CGROUP_INET6_CONNECT**.
+ *
* This helper actually implements a subset of **setsockopt()**.
* It supports the following *level*\ s:
*
@@ -1764,7 +1794,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen)
+ * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
* Description
* Emulate a call to **getsockopt()** on the socket associated to
* *bpf_socket*, which must be a full socket. The *level* at
@@ -1773,6 +1803,11 @@ union bpf_attr {
* The retrieved value is stored in the structure pointed by
* *opval* and of length *optlen*.
*
+ * *bpf_socket* should be one of the following:
+ * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * and **BPF_CGROUP_INET6_CONNECT**.
+ *
* This helper actually implements a subset of **getsockopt()**.
* It supports the following *level*\ s:
*
@@ -3025,6 +3060,14 @@ union bpf_attr {
* * **-EOPNOTSUPP** Unsupported operation, for example a
* call from outside of TC ingress.
* * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
+ *
+ * u64 bpf_ktime_get_boot_ns(void)
+ * Description
+ * Return the time elapsed since system boot, in nanoseconds.
+ * Does include the time the system was suspended.
+ * See: clock_gettime(CLOCK_BOOTTIME)
+ * Return
+ * Current *ktime*.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3151,7 +3194,8 @@ union bpf_attr {
FN(xdp_output), \
FN(get_netns_cookie), \
FN(get_current_ancestor_cgroup_id), \
- FN(sk_assign),
+ FN(sk_assign), \
+ FN(ktime_get_boot_ns),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -3598,6 +3642,25 @@ struct bpf_btf_info {
__u32 id;
} __attribute__((aligned(8)));
+struct bpf_link_info {
+ __u32 type;
+ __u32 id;
+ __u32 prog_id;
+ union {
+ struct {
+ __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */
+ __u32 tp_name_len; /* in/out: tp_name buffer len */
+ } raw_tracepoint;
+ struct {
+ __u32 attach_type;
+ } tracing;
+ struct {
+ __u64 cgroup_id;
+ __u32 attach_type;
+ } cgroup;
+ };
+} __attribute__((aligned(8)));
+
/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
* by user and intended to be used by socket (e.g. to bind to, depends on
* attach attach type).
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index affd66537e87..d1b8644bfb88 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -24,7 +24,7 @@ static void *get_ipc(struct ctl_table *table)
#ifdef CONFIG_PROC_SYSCTL
static int proc_ipc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
@@ -35,7 +35,7 @@ static int proc_ipc_dointvec(struct ctl_table *table, int write,
}
static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
@@ -46,7 +46,7 @@ static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write,
}
static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ipc_namespace *ns = current->nsproxy->ipc_ns;
int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -59,7 +59,7 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
}
static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
memcpy(&ipc_table, table, sizeof(ipc_table));
@@ -70,7 +70,7 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write,
}
static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
int dummy = 0;
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 7c00f28923a8..72a92a08c848 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -19,7 +19,7 @@ static void *get_mq(struct ctl_table *table)
}
static int proc_mq_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table mq_table;
memcpy(&mq_table, table, sizeof(mq_table));
@@ -29,7 +29,7 @@ static int proc_mq_dointvec(struct ctl_table *table, int write,
}
static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table mq_table;
memcpy(&mq_table, table, sizeof(mq_table));
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d65c6912bdaf..a2cfba89a8e1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
static union {
struct bpf_ctx_convert {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
@@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = {
0, /* avoid empty array */
};
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
static const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index cb305e71e7de..5c0e964105ac 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -557,8 +557,9 @@ found:
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
- struct bpf_prog *new_prog)
+static int __cgroup_bpf_replace(struct cgroup *cgrp,
+ struct bpf_cgroup_link *link,
+ struct bpf_prog *new_prog)
{
struct list_head *progs = &cgrp->bpf.progs[link->type];
struct bpf_prog *old_prog;
@@ -583,6 +584,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
return 0;
}
+static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_cgroup_link *cg_link;
+ int ret;
+
+ cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+ mutex_lock(&cgroup_mutex);
+ /* link might have been auto-released by dying cgroup, so fail */
+ if (!cg_link->cgroup) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
@@ -808,17 +833,56 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link)
kfree(cg_link);
}
-const struct bpf_link_ops bpf_cgroup_link_lops = {
+static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ seq_printf(seq,
+ "cgroup_id:\t%llu\n"
+ "attach_type:\t%d\n",
+ cg_id,
+ cg_link->type);
+}
+
+static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ info->cgroup.cgroup_id = cg_id;
+ info->cgroup.attach_type = cg_link->type;
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_cgroup_link_lops = {
.release = bpf_cgroup_link_release,
.dealloc = bpf_cgroup_link_dealloc,
+ .update_prog = cgroup_bpf_replace,
+ .show_fdinfo = bpf_cgroup_link_show_fdinfo,
+ .fill_link_info = bpf_cgroup_link_fill_link_info,
};
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_cgroup_link *link;
- struct file *link_file;
struct cgroup *cgrp;
- int err, link_fd;
+ int err;
if (attr->link_create.flags)
return -EINVAL;
@@ -832,26 +896,25 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
err = -ENOMEM;
goto out_put_cgroup;
}
- bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
+ prog);
link->cgroup = cgrp;
link->type = attr->link_create.attach_type;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_cgroup;
}
err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
BPF_F_ALLOW_MULTI);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_cgroup:
cgroup_put(cgrp);
@@ -1054,36 +1117,21 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
return !allow;
}
-EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
- return bpf_get_trace_printk_proto();
- /* fall through */
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return NULL;
+ return bpf_base_func_proto(func_id);
}
}
@@ -1137,16 +1185,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @head: sysctl table header
* @table: sysctl table
* @write: sysctl is being read (= 0) or written (= 1)
- * @buf: pointer to buffer passed by user space
+ * @buf: pointer to buffer (in and out)
* @pcount: value-result argument: value is size of buffer pointed to by @buf,
* result is size of @new_buf if program set new value, initial value
* otherwise
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
- * @new_buf: pointer to pointer to new buffer that will be allocated if program
- * overrides new value provided by user space on sysctl write
- * NOTE: it's caller responsibility to free *new_buf if it was set
* @type: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
@@ -1157,8 +1202,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
struct ctl_table *table, int write,
- void __user *buf, size_t *pcount,
- loff_t *ppos, void **new_buf,
+ void **buf, size_t *pcount, loff_t *ppos,
enum bpf_attach_type type)
{
struct bpf_sysctl_kern ctx = {
@@ -1173,36 +1217,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
.new_updated = 0,
};
struct cgroup *cgrp;
+ loff_t pos = 0;
int ret;
ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
- if (ctx.cur_val) {
- mm_segment_t old_fs;
- loff_t pos = 0;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
- &ctx.cur_len, &pos)) {
- /* Let BPF program decide how to proceed. */
- ctx.cur_len = 0;
- }
- set_fs(old_fs);
- } else {
+ if (!ctx.cur_val ||
+ table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
/* Let BPF program decide how to proceed. */
ctx.cur_len = 0;
}
- if (write && buf && *pcount) {
+ if (write && *buf && *pcount) {
/* BPF program should be able to override new value with a
* buffer bigger than provided by user.
*/
ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
- if (!ctx.new_val ||
- copy_from_user(ctx.new_val, buf, ctx.new_len))
+ if (ctx.new_val) {
+ memcpy(ctx.new_val, *buf, ctx.new_len);
+ } else {
/* Let BPF program decide how to proceed. */
ctx.new_len = 0;
+ }
}
rcu_read_lock();
@@ -1213,7 +1249,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.cur_val);
if (ret == 1 && ctx.new_updated) {
- *new_buf = ctx.new_val;
+ kfree(*buf);
+ *buf = ctx.new_val;
*pcount = ctx.new_len;
} else {
kfree(ctx.new_val);
@@ -1221,7 +1258,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
return ret == 1 ? 0 : -EPERM;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
#ifdef CONFIG_NET
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
@@ -1326,7 +1362,6 @@ out:
sockopt_free_buf(&ctx);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int optname, char __user *optval,
@@ -1413,7 +1448,6 @@ out:
sockopt_free_buf(&ctx);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
#endif
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 916f5132a984..6aa11de67315 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32)
return res;
}
+BPF_CALL_0(bpf_get_raw_cpu_id)
+{
+ return raw_smp_processor_id();
+}
+
/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
@@ -2151,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bafc53ddd350..5c0290e0696e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -151,7 +151,19 @@ BPF_CALL_0(bpf_ktime_get_ns)
const struct bpf_func_proto bpf_ktime_get_ns_proto = {
.func = bpf_ktime_get_ns,
- .gpl_only = true,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_boot_ns)
+{
+ /* NMI safe access to clock boottime */
+ return ktime_get_boot_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
+ .func = bpf_ktime_get_boot_ns,
+ .gpl_only = false,
.ret_type = RET_INTEGER,
};
@@ -562,3 +574,78 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
.arg3_type = ARG_PTR_TO_UNINIT_MEM,
.arg4_type = ARG_CONST_SIZE,
};
+
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+ .func = bpf_get_raw_cpu_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
+ u64, flags, void *, data, u64, size)
+{
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
+}
+
+const struct bpf_func_proto bpf_event_output_data_proto = {
+ .func = bpf_event_output_data,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+const struct bpf_func_proto *
+bpf_base_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_map_push_elem:
+ return &bpf_map_push_elem_proto;
+ case BPF_FUNC_map_pop_elem:
+ return &bpf_map_pop_elem_proto;
+ case BPF_FUNC_map_peek_elem:
+ return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
+ default:
+ break;
+ }
+
+ if (!capable(CAP_SYS_ADMIN))
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_spin_lock:
+ return &bpf_spin_lock_proto;
+ case BPF_FUNC_spin_unlock:
+ return &bpf_spin_unlock_proto;
+ case BPF_FUNC_trace_printk:
+ return bpf_get_trace_printk_proto();
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7626b8024471..bb1ab7da6103 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -42,6 +42,8 @@ static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
+static DEFINE_IDR(link_idr);
+static DEFINE_SPINLOCK(link_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -49,9 +51,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
[_id] = &_ops,
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
/*
@@ -1546,9 +1550,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
@@ -2181,25 +2187,39 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
-void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
- struct bpf_prog *prog)
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog)
{
atomic64_set(&link->refcnt, 1);
+ link->type = type;
+ link->id = 0;
link->ops = ops;
link->prog = prog;
}
+static void bpf_link_free_id(int id)
+{
+ if (!id)
+ return;
+
+ spin_lock_bh(&link_idr_lock);
+ idr_remove(&link_idr, id);
+ spin_unlock_bh(&link_idr_lock);
+}
+
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper manages marking bpf_link as
- * defunct, releases anon_inode file and puts reserved FD.
+ * anon_inode's release() call. This helper marksbpf_link as
+ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
+ * is not decremented, it's the responsibility of a calling code that failed
+ * to complete bpf_link initialization.
*/
-void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
- int link_fd)
+void bpf_link_cleanup(struct bpf_link_primer *primer)
{
- link->prog = NULL;
- fput(link_file);
- put_unused_fd(link_fd);
+ primer->link->prog = NULL;
+ bpf_link_free_id(primer->id);
+ fput(primer->file);
+ put_unused_fd(primer->fd);
}
void bpf_link_inc(struct bpf_link *link)
@@ -2210,6 +2230,7 @@ void bpf_link_inc(struct bpf_link *link)
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
+ bpf_link_free_id(link->id);
if (link->prog) {
/* detach BPF program, clean up used resources */
link->ops->release(link);
@@ -2251,35 +2272,35 @@ static int bpf_link_release(struct inode *inode, struct file *filp)
}
#ifdef CONFIG_PROC_FS
-static const struct bpf_link_ops bpf_raw_tp_lops;
-static const struct bpf_link_ops bpf_tracing_link_lops;
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
+static const char *bpf_link_type_strs[] = {
+ [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
+#include <linux/bpf_types.h>
+};
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_link *link = filp->private_data;
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- const char *link_type;
-
- if (link->ops == &bpf_raw_tp_lops)
- link_type = "raw_tracepoint";
- else if (link->ops == &bpf_tracing_link_lops)
- link_type = "tracing";
-#ifdef CONFIG_CGROUP_BPF
- else if (link->ops == &bpf_cgroup_link_lops)
- link_type = "cgroup";
-#endif
- else
- link_type = "unknown";
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
+ "link_id:\t%u\n"
"prog_tag:\t%s\n"
"prog_id:\t%u\n",
- link_type,
+ bpf_link_type_strs[link->type],
+ link->id,
prog_tag,
prog->aux->id);
+ if (link->ops->show_fdinfo)
+ link->ops->show_fdinfo(link, m);
}
#endif
@@ -2292,36 +2313,77 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
-int bpf_link_new_fd(struct bpf_link *link)
+static int bpf_link_alloc_id(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
-}
+ int id;
-/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
- * instead of immediately installing fd in fdtable, just reserve it and
- * return. Caller then need to either install it with fd_install(fd, file) or
- * release with put_unused_fd(fd).
- * This is useful for cases when bpf_link attachment/detachment are
- * complicated and expensive operations and should be delayed until all the fd
- * reservation and anon_inode creation succeeds.
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&link_idr_lock);
+ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
+ spin_unlock_bh(&link_idr_lock);
+ idr_preload_end();
+
+ return id;
+}
+
+/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
+ * reserving unused FD and allocating ID from link_idr. This is to be paired
+ * with bpf_link_settle() to install FD and ID and expose bpf_link to
+ * user-space, if bpf_link is successfully attached. If not, bpf_link and
+ * pre-allocated resources are to be freed with bpf_cleanup() call. All the
+ * transient state is passed around in struct bpf_link_primer.
+ * This is preferred way to create and initialize bpf_link, especially when
+ * there are complicated and expensive operations inbetween creating bpf_link
+ * itself and attaching it to BPF hook. By using bpf_link_prime() and
+ * bpf_link_settle() kernel code using bpf_link doesn't have to perform
+ * expensive (and potentially failing) roll back operations in a rare case
+ * that file, FD, or ID can't be allocated.
*/
-struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
{
struct file *file;
- int fd;
+ int fd, id;
fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
- return ERR_PTR(fd);
+ return fd;
+
+
+ id = bpf_link_alloc_id(link);
+ if (id < 0) {
+ put_unused_fd(fd);
+ return id;
+ }
file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
if (IS_ERR(file)) {
+ bpf_link_free_id(id);
put_unused_fd(fd);
- return file;
+ return PTR_ERR(file);
}
- *reserved_fd = fd;
- return file;
+ primer->link = link;
+ primer->file = file;
+ primer->fd = fd;
+ primer->id = id;
+ return 0;
+}
+
+int bpf_link_settle(struct bpf_link_primer *primer)
+{
+ /* make bpf_link fetchable by ID */
+ spin_lock_bh(&link_idr_lock);
+ primer->link->id = primer->id;
+ spin_unlock_bh(&link_idr_lock);
+ /* make bpf_link fetchable by FD */
+ fd_install(primer->fd, primer->file);
+ /* pass through installed FD */
+ return primer->fd;
+}
+
+int bpf_link_new_fd(struct bpf_link *link)
+{
+ return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
@@ -2345,6 +2407,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
struct bpf_tracing_link {
struct bpf_link link;
+ enum bpf_attach_type attach_type;
};
static void bpf_tracing_link_release(struct bpf_link *link)
@@ -2360,16 +2423,40 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link)
kfree(tr_link);
}
+static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link);
+
+ seq_printf(seq,
+ "attach_type:\t%d\n",
+ tr_link->attach_type);
+}
+
+static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link);
+
+ info->tracing.attach_type = tr_link->attach_type;
+
+ return 0;
+}
+
static const struct bpf_link_ops bpf_tracing_link_lops = {
.release = bpf_tracing_link_release,
.dealloc = bpf_tracing_link_dealloc,
+ .show_fdinfo = bpf_tracing_link_show_fdinfo,
+ .fill_link_info = bpf_tracing_link_fill_link_info,
};
static int bpf_tracing_prog_attach(struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_tracing_link *link;
- struct file *link_file;
- int link_fd, err;
+ int err;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
@@ -2402,24 +2489,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
err = -ENOMEM;
goto out_put_prog;
}
- bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog);
+ link->attach_type = prog->expected_attach_type;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_prog;
}
err = bpf_trampoline_link_prog(prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_prog;
}
- fd_install(link_fd, link_file);
- return link_fd;
-
+ return bpf_link_settle(&link_primer);
out_put_prog:
bpf_prog_put(prog);
return err;
@@ -2447,22 +2533,69 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
kfree(raw_tp);
}
-static const struct bpf_link_ops bpf_raw_tp_lops = {
+static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+
+ seq_printf(seq,
+ "tp_name:\t%s\n",
+ raw_tp_link->btp->tp->name);
+}
+
+static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
+ const char *tp_name = raw_tp_link->btp->tp->name;
+ u32 ulen = info->raw_tracepoint.tp_name_len;
+ size_t tp_len = strlen(tp_name);
+
+ if (ulen && !ubuf)
+ return -EINVAL;
+
+ info->raw_tracepoint.tp_name_len = tp_len + 1;
+
+ if (!ubuf)
+ return 0;
+
+ if (ulen >= tp_len + 1) {
+ if (copy_to_user(ubuf, tp_name, tp_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, tp_name, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
.dealloc = bpf_raw_tp_link_dealloc,
+ .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
+ .fill_link_info = bpf_raw_tp_link_fill_link_info,
};
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
+ struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- struct file *link_file;
struct bpf_prog *prog;
const char *tp_name;
char buf[128];
- int link_fd, err;
+ int err;
if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
return -EINVAL;
@@ -2515,24 +2648,23 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
err = -ENOMEM;
goto out_put_btp;
}
- bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ &bpf_raw_tp_link_lops, prog);
link->btp = btp;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_btp;
}
err = bpf_probe_register(link->btp, prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_btp;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_btp:
bpf_put_raw_tracepoint(btp);
@@ -3313,6 +3445,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf,
return btf_get_info_by_fd(btf, attr, uattr);
}
+static int bpf_link_get_info_by_fd(struct bpf_link *link,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_link_info info;
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
+ info.type = link->type;
+ info.id = link->id;
+ info.prog_id = link->prog->aux->id;
+
+ if (link->ops->fill_link_info) {
+ err = link->ops->fill_link_info(link, &info);
+ if (err)
+ return err;
+ }
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -3337,6 +3505,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
uattr);
else if (f.file->f_op == &btf_fops)
err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
+ else if (f.file->f_op == &bpf_link_fops)
+ err = bpf_link_get_info_by_fd(f.file->private_data,
+ attr, uattr);
else
err = -EINVAL;
@@ -3464,7 +3635,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (file->f_op == &bpf_link_fops) {
struct bpf_link *link = file->private_data;
- if (link->ops == &bpf_raw_tp_lops) {
+ if (link->ops == &bpf_raw_tp_link_lops) {
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
struct bpf_raw_event_map *btp = raw_tp->btp;
@@ -3645,13 +3816,10 @@ static int link_update(union bpf_attr *attr)
goto out_put_progs;
}
-#ifdef CONFIG_CGROUP_BPF
- if (link->ops == &bpf_cgroup_link_lops) {
- ret = cgroup_bpf_replace(link, old_prog, new_prog);
- goto out_put_progs;
- }
-#endif
- ret = -EINVAL;
+ if (link->ops->update_prog)
+ ret = link->ops->update_prog(link, new_prog, old_prog);
+ else
+ ret = EINVAL;
out_put_progs:
if (old_prog)
@@ -3663,6 +3831,102 @@ out_put_link:
return ret;
}
+static int bpf_link_inc_not_zero(struct bpf_link *link)
+{
+ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
+}
+
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ u32 id = attr->link_id;
+ int fd, err;
+
+ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&link_idr_lock);
+ link = idr_find(&link_idr, id);
+ /* before link is "settled", ID is 0, pretend it doesn't exist yet */
+ if (link) {
+ if (link->id)
+ err = bpf_link_inc_not_zero(link);
+ else
+ err = -EAGAIN;
+ } else {
+ err = -ENOENT;
+ }
+ spin_unlock_bh(&link_idr_lock);
+
+ if (err)
+ return err;
+
+ fd = bpf_link_new_fd(link);
+ if (fd < 0)
+ bpf_link_put(link);
+
+ return fd;
+}
+
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
+
+static int bpf_stats_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&bpf_stats_enabled_mutex);
+ static_key_slow_dec(&bpf_stats_enabled_key.key);
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return 0;
+}
+
+static const struct file_operations bpf_stats_fops = {
+ .release = bpf_stats_release,
+};
+
+static int bpf_enable_runtime_stats(void)
+{
+ int fd;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+
+ /* Set a very high limit to avoid overflow */
+ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return -EBUSY;
+ }
+
+ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
+ if (fd >= 0)
+ static_key_slow_inc(&bpf_stats_enabled_key.key);
+
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return fd;
+}
+
+#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
+
+static int bpf_enable_stats(union bpf_attr *attr)
+{
+
+ if (CHECK_ATTR(BPF_ENABLE_STATS))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (attr->enable_stats.type) {
+ case BPF_STATS_RUN_TIME:
+ return bpf_enable_runtime_stats();
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr;
@@ -3780,6 +4044,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_LINK_UPDATE:
err = link_update(&attr);
break;
+ case BPF_LINK_GET_FD_BY_ID:
+ err = bpf_link_get_fd_by_id(&attr);
+ break;
+ case BPF_LINK_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &link_idr, &link_idr_lock);
+ break;
+ case BPF_ENABLE_STATS:
+ err = bpf_enable_stats(&attr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fa1d8245b925..70ad009577f8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _verifier_ops,
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
/* bpf_check() is a static code analyzer that walks eBPF program
@@ -168,6 +170,8 @@ struct bpf_verifier_stack_elem {
int insn_idx;
int prev_insn_idx;
struct bpf_verifier_stack_elem *next;
+ /* length of verifier log at the time this state was pushed on stack */
+ u32 log_pos;
};
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
@@ -283,6 +287,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
log->ubuf = NULL;
}
+static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+{
+ char zero = 0;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ log->len_used = new_pos;
+ if (put_user(zero, log->ubuf + new_pos))
+ log->ubuf = NULL;
+}
+
/* log_level controls verbosity level of eBPF verifier.
* bpf_verifier_log_write() is used to dump the verification trace to the log,
* so the user can figure out what's wrong with the program
@@ -413,11 +429,30 @@ static bool is_release_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_sk_release;
}
-static bool is_acquire_function(enum bpf_func_id func_id)
+static bool may_be_acquire_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_lookup_tcp ||
func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp;
+ func_id == BPF_FUNC_skc_lookup_tcp ||
+ func_id == BPF_FUNC_map_lookup_elem;
+}
+
+static bool is_acquire_function(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
+
+ if (func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp ||
+ func_id == BPF_FUNC_skc_lookup_tcp)
+ return true;
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ (map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map_type == BPF_MAP_TYPE_SOCKHASH))
+ return true;
+
+ return false;
}
static bool is_ptr_cast_function(enum bpf_func_id func_id)
@@ -846,7 +881,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
}
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
- int *insn_idx)
+ int *insn_idx, bool pop_log)
{
struct bpf_verifier_state *cur = env->cur_state;
struct bpf_verifier_stack_elem *elem, *head = env->head;
@@ -860,6 +895,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
if (err)
return err;
}
+ if (pop_log)
+ bpf_vlog_reset(&env->log, head->log_pos);
if (insn_idx)
*insn_idx = head->insn_idx;
if (prev_insn_idx)
@@ -887,6 +924,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
+ elem->log_pos = env->log.len_used;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
@@ -915,7 +953,7 @@ err:
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
/* pop all elements and return */
- while (!pop_stack(env, NULL, NULL));
+ while (!pop_stack(env, NULL, NULL, false));
return NULL;
}
@@ -3915,7 +3953,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_sock_map_update &&
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_map &&
- func_id != BPF_FUNC_sk_select_reuseport)
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
case BPF_MAP_TYPE_SOCKHASH:
@@ -3923,7 +3962,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_sock_hash_update &&
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_hash &&
- func_id != BPF_FUNC_sk_select_reuseport)
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
@@ -4093,7 +4133,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
/* A reference acquiring function cannot acquire
* another refcounted ptr.
*/
- if (is_acquire_function(func_id) && count)
+ if (may_be_acquire_function(func_id) && count)
return false;
/* We only support one arg being unreferenced at the moment,
@@ -4604,7 +4644,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (is_ptr_cast_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
- } else if (is_acquire_function(func_id)) {
+ } else if (is_acquire_function(func_id, meta.map_ptr)) {
int id = acquire_reference_state(env, insn_idx);
if (id < 0)
@@ -5609,7 +5649,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
{
struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
- bool src_known, dst_known;
+ bool src_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
s32 s32_min_val, s32_max_val;
@@ -5631,7 +5671,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
if (alu32) {
src_known = tnum_subreg_is_const(src_reg.var_off);
- dst_known = tnum_subreg_is_const(dst_reg->var_off);
if ((src_known &&
(s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
@@ -5643,7 +5682,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
}
} else {
src_known = tnum_is_const(src_reg.var_off);
- dst_known = tnum_is_const(dst_reg->var_off);
if ((src_known &&
(smin_val != smax_val || umin_val != umax_val)) ||
smin_val > smax_val || umin_val > umax_val) {
@@ -6515,12 +6553,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (is_null) {
reg->type = SCALAR_VALUE;
} else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (reg->map_ptr->inner_map_meta) {
+ const struct bpf_map *map = reg->map_ptr;
+
+ if (map->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
- reg->map_ptr = reg->map_ptr->inner_map_meta;
- } else if (reg->map_ptr->map_type ==
- BPF_MAP_TYPE_XSKMAP) {
+ reg->map_ptr = map->inner_map_meta;
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ reg->type = PTR_TO_SOCKET;
} else {
reg->type = PTR_TO_MAP_VALUE;
}
@@ -8409,6 +8451,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
static int do_check(struct bpf_verifier_env *env)
{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
@@ -8685,7 +8728,7 @@ static int do_check(struct bpf_verifier_env *env)
process_bpf_exit:
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
- &env->insn_idx);
+ &env->insn_idx, pop_log);
if (err < 0) {
if (err != -ENOENT)
return err;
@@ -10208,6 +10251,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;
@@ -10270,7 +10314,9 @@ out:
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
}
- while (!pop_stack(env, NULL, NULL));
+ while (!pop_stack(env, NULL, NULL, false));
+ if (!ret && pop_log)
+ bpf_vlog_reset(&env->log, 0);
free_states(env);
if (ret)
/* clean aux data in case subprog was rejected */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 06b5ea9d899d..557a9b9d2244 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6508,33 +6508,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
return ret;
}
-int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog,
- struct bpf_prog *new_prog)
-{
- struct bpf_cgroup_link *cg_link;
- int ret;
-
- if (link->ops != &bpf_cgroup_link_lops)
- return -EINVAL;
-
- cg_link = container_of(link, struct bpf_cgroup_link, link);
-
- mutex_lock(&cgroup_mutex);
- /* link might have been auto-released by dying cgroup, so fail */
- if (!cg_link->cgroup) {
- ret = -EINVAL;
- goto out_unlock;
- }
- if (old_prog && link->prog != old_prog) {
- ret = -EPERM;
- goto out_unlock;
- }
- ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
-out_unlock:
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type)
{
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c2b41a263166..bdb1533ada81 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -236,7 +236,7 @@ exit_put:
* sysctl_perf_event_max_contexts_per_stack.
*/
int perf_event_max_stack_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *value = table->data;
int new_value = *value, ret;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 633b4ae72ed5..468139611e06 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2625c241ac00..ffbe03a45c16 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void)
static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
+ void *buffer, size_t *length,
loff_t *ppos)
{
int ret;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 8d1c15832e55..166d7bf49666 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void)
return 0;
}
-int sysctl_latencytop(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int err;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 01f8ba32cc0c..3ccaba5f15c0 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
#ifdef CONFIG_CHECKPOINT_RESTORE
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9a9b6156270b..471f649b5868 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char old_str[DEVKMSG_STR_MAX_SIZE];
unsigned int old;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a2fbf98fd6f..3e89a042a48f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { }
#endif
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
int old_min, old_max;
@@ -2718,7 +2717,7 @@ void set_numabalancing_state(bool enabled)
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -2792,8 +2791,8 @@ static void __init init_schedstats(void)
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02f323b85b6d..b6077fd5b32f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
*/
int sched_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index df11d88c9895..45da29de3ecc 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void)
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
}
-int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int old_period, old_runtime;
static DEFINE_MUTEX(mutex);
@@ -2754,9 +2753,8 @@ undo:
return ret;
}
-int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
static DEFINE_MUTEX(mutex);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8344757bba6e..fa64b2ee9fe6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -209,7 +209,7 @@ bool sched_energy_update;
#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 55a6184f5990..d653d8426de9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
}
static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a176d8727a3..7adfe5dbce9d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -68,6 +68,9 @@
#include <linux/bpf.h>
#include <linux/mount.h>
#include <linux/userfaultfd_k.h>
+#include <linux/coredump.h>
+#include <linux/latencytop.h>
+#include <linux/pid.h>
#include "../lib/kstrtox.h"
@@ -103,22 +106,6 @@
#if defined(CONFIG_SYSCTL)
-/* External variables not in a header file. */
-extern int suid_dumpable;
-#ifdef CONFIG_COREDUMP
-extern int core_uses_pid;
-extern char core_pattern[];
-extern unsigned int core_pipe_limit;
-#endif
-extern int pid_max;
-extern int pid_max_min, pid_max_max;
-extern int percpu_pagelist_fraction;
-extern int latencytop_enabled;
-extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
-#ifndef CONFIG_MMU
-extern int sysctl_nr_trim_pages;
-#endif
-
/* Constants used for minimum and maximum */
#ifdef CONFIG_LOCKUP_DETECTOR
static int sixty = 60;
@@ -160,24 +147,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#ifdef CONFIG_INOTIFY_USER
#include <linux/inotify.h>
#endif
-#ifdef CONFIG_SPARC
-#endif
-
-#ifdef CONFIG_PARISC
-extern int pwrsw_enabled;
-#endif
-
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
-extern int unaligned_enabled;
-#endif
-
-#ifdef CONFIG_IA64
-extern int unaligned_dump_stack;
-#endif
-
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
-extern int no_unaligned_warning;
-#endif
#ifdef CONFIG_PROC_SYSCTL
@@ -207,102 +176,1472 @@ enum sysctl_writes_mode {
};
static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
+#endif /* CONFIG_PROC_SYSCTL */
+
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+int sysctl_legacy_va_layout;
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+static int min_sched_granularity_ns = 100000; /* 100 usecs */
+static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
+static int min_wakeup_granularity_ns; /* 0 usecs */
+static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
-static int proc_do_cad_pid(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_taint(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos);
+static int min_extfrag_threshold;
+static int max_extfrag_threshold = 1000;
#endif
+
+#endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_stats_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static int saved_val;
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+ val = saved_val;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret && val != saved_val) {
+ if (val)
+ static_key_slow_inc(key);
+ else
+ static_key_slow_dec(key);
+ saved_val = val;
+ }
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return ret;
+}
#endif
+/*
+ * /proc/sys support
+ */
+
+#ifdef CONFIG_PROC_SYSCTL
+
+static int _proc_do_string(char *data, int maxlen, int write,
+ char *buffer, size_t *lenp, loff_t *ppos)
+{
+ size_t len;
+ char c, *p;
+
+ if (!data || !maxlen || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
+ /* Only continue writes not past the end of buffer. */
+ len = strlen(data);
+ if (len > maxlen - 1)
+ len = maxlen - 1;
+
+ if (*ppos > len)
+ return 0;
+ len = *ppos;
+ } else {
+ /* Start writing from beginning of buffer. */
+ len = 0;
+ }
+
+ *ppos += *lenp;
+ p = buffer;
+ while ((p - buffer) < *lenp && len < maxlen - 1) {
+ c = *(p++);
+ if (c == 0 || c == '\n')
+ break;
+ data[len++] = c;
+ }
+ data[len] = 0;
+ } else {
+ len = strlen(data);
+ if (len > maxlen)
+ len = maxlen;
+
+ if (*ppos > len) {
+ *lenp = 0;
+ return 0;
+ }
+
+ data += *ppos;
+ len -= *ppos;
+
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ memcpy(buffer, data, len);
+ if (len < *lenp) {
+ buffer[len] = '\n';
+ len++;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
+ return 0;
+}
+
+static void warn_sysctl_write(struct ctl_table *table)
+{
+ pr_warn_once("%s wrote to %s when file position was not 0!\n"
+ "This will not be supported in the future. To silence this\n"
+ "warning, set kernel.sysctl_writes_strict = -1\n",
+ current->comm, table->procname);
+}
+
+/**
+ * proc_first_pos_non_zero_ignore - check if first position is allowed
+ * @ppos: file position
+ * @table: the sysctl table
+ *
+ * Returns true if the first position is non-zero and the sysctl_writes_strict
+ * mode indicates this is not allowed for numeric input types. String proc
+ * handlers can ignore the return value.
+ */
+static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
+ struct ctl_table *table)
+{
+ if (!*ppos)
+ return false;
+
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ return true;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ return false;
+ default:
+ return false;
+ }
+}
+
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a string from/to the user buffer. If the kernel
+ * buffer provided is not large enough to hold the string, the
+ * string is truncated. The copied string is %NULL-terminated.
+ * If the string is being read by the user process, it is copied
+ * and a newline '\n' is added. It is truncated if the buffer is
+ * not large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write)
+ proc_first_pos_non_zero_ignore(ppos, table);
+
+ return _proc_do_string(table->data, table->maxlen, write, buffer, lenp,
+ ppos);
+}
+
+static size_t proc_skip_spaces(char **buf)
+{
+ size_t ret;
+ char *tmp = skip_spaces(*buf);
+ ret = tmp - *buf;
+ *buf = tmp;
+ return ret;
+}
+
+static void proc_skip_char(char **buf, size_t *size, const char v)
+{
+ while (*size) {
+ if (**buf != v)
+ break;
+ (*size)--;
+ (*buf)++;
+ }
+}
+
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * In case of success 0 is returned and @res will contain the parsed integer,
+ * @endp will hold any trailing characters.
+ * This function will fail the parse on overflow. If there wasn't an overflow
+ * the function will defer the decision what characters count as invalid to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
+#define TMPBUFLEN 22
+/**
+ * proc_get_long - reads an ASCII formatted integer from a user buffer
+ *
+ * @buf: a kernel buffer
+ * @size: size of the kernel buffer
+ * @val: this is where the number will be stored
+ * @neg: set to %TRUE if number is negative
+ * @perm_tr: a vector which contains the allowed trailers
+ * @perm_tr_len: size of the perm_tr vector
+ * @tr: pointer to store the trailer character
+ *
+ * In case of success %0 is returned and @buf and @size are updated with
+ * the amount of bytes read. If @tr is non-NULL and a trailing
+ * character exists (size is non-zero after returning from this
+ * function), @tr is updated with the trailing character.
+ */
+static int proc_get_long(char **buf, size_t *size,
+ unsigned long *val, bool *neg,
+ const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+ int len;
+ char *p, tmp[TMPBUFLEN];
+
+ if (!*size)
+ return -EINVAL;
+
+ len = *size;
+ if (len > TMPBUFLEN - 1)
+ len = TMPBUFLEN - 1;
+
+ memcpy(tmp, *buf, len);
+
+ tmp[len] = 0;
+ p = tmp;
+ if (*p == '-' && *size > 1) {
+ *neg = true;
+ p++;
+ } else
+ *neg = false;
+ if (!isdigit(*p))
+ return -EINVAL;
+
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
+
+ len = p - tmp;
+
+ /* We don't know if the next char is whitespace thus we may accept
+ * invalid integers (e.g. 1234...a) or two integers instead of one
+ * (e.g. 123...1). So lets not allow such large numbers. */
+ if (len == TMPBUFLEN - 1)
+ return -EINVAL;
+
+ if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
+ return -EINVAL;
+
+ if (tr && (len < *size))
+ *tr = *p;
+
+ *buf += len;
+ *size -= len;
+
+ return 0;
+}
+
+/**
+ * proc_put_long - converts an integer to a decimal ASCII formatted string
+ *
+ * @buf: the user buffer
+ * @size: the size of the user buffer
+ * @val: the integer to be converted
+ * @neg: sign of the number, %TRUE for negative
+ *
+ * In case of success @buf and @size are updated with the amount of bytes
+ * written.
+ */
+static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg)
+{
+ int len;
+ char tmp[TMPBUFLEN], *p = tmp;
+
+ sprintf(p, "%s%lu", neg ? "-" : "", val);
+ len = strlen(tmp);
+ if (len > *size)
+ len = *size;
+ memcpy(*buf, tmp, len);
+ *size -= len;
+ *buf += len;
+}
+#undef TMPBUFLEN
+
+static void proc_put_char(void **buf, size_t *size, char c)
+{
+ if (*size) {
+ char **buffer = (char **)buf;
+ **buffer = c;
+
+ (*size)--;
+ (*buffer)++;
+ *buf = *buffer;
+ }
+}
+
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp) {
+ if (*lvalp > (unsigned long) INT_MAX + 1)
+ return -EINVAL;
+ *valp = -*lvalp;
+ } else {
+ if (*lvalp > (unsigned long) INT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ }
+ } else {
+ int val = *valp;
+ if (val < 0) {
+ *negp = true;
+ *lvalp = -(unsigned long)val;
+ } else {
+ *negp = false;
+ *lvalp = (unsigned long)val;
+ }
+ }
+ return 0;
+}
+
+static int do_proc_douintvec_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long)val;
+ }
+ return 0;
+}
+
+static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
+
+static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ int *i, vleft, first = 1, err = 0;
+ size_t left;
+ char *p;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+ left = *lenp;
+
+ if (!conv)
+ conv = do_proc_dointvec_conv;
+
+ if (write) {
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ p = buffer;
+ }
+
+ for (; left && vleft--; i++, first=0) {
+ unsigned long lval;
+ bool neg;
+
+ if (write) {
+ left -= proc_skip_spaces(&p);
+
+ if (!left)
+ break;
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (conv(&neg, &lval, i, 1, data)) {
+ err = -EINVAL;
+ break;
+ }
+ } else {
+ if (conv(&neg, &lval, i, 0, data)) {
+ err = -EINVAL;
+ break;
+ }
+ if (!first)
+ proc_put_char(&buffer, &left, '\t');
+ proc_put_long(&buffer, &left, lval, neg);
+ }
+ }
+
+ if (!write && !first && left && !err)
+ proc_put_char(&buffer, &left, '\n');
+ if (write && !err && left)
+ left -= proc_skip_spaces(&p);
+ if (write && first)
+ return err ? : -EINVAL;
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_dointvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_dointvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec_w(unsigned int *tbl_data,
+ struct ctl_table *table,
+ void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+ bool neg;
+ char *p = buffer;
+
+ left = *lenp;
+
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto bail_early;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+
+ left -= proc_skip_spaces(&p);
+ if (!left) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err || neg) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (conv(&lval, tbl_data, 1, data)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (!err && left)
+ left -= proc_skip_spaces(&p);
+
+out_free:
+ if (err)
+ return -EINVAL;
+
+ return 0;
+
+ /* This is in keeping with old __do_proc_dointvec() */
+bail_early:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+
+ left = *lenp;
+
+ if (conv(&lval, tbl_data, 0, data)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ proc_put_long(&buffer, &left, lval, false);
+ if (!left)
+ goto out;
+
+ proc_put_char(&buffer, &left, '\n');
+
+out:
+ *lenp -= left;
+ *ppos += *lenp;
+
+ return err;
+}
+
+static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned int *i, vleft;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (unsigned int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+
+ /*
+ * Arrays are not supported, keep this simple. *Do not* add
+ * support for them.
+ */
+ if (vleft != 1) {
+ *lenp = 0;
+ return -EINVAL;
+ }
+
+ if (!conv)
+ conv = do_proc_douintvec_conv;
+
+ if (write)
+ return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
+ conv, data);
+ return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_douintvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
+/**
+ * proc_dointvec - read a vector of integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+#ifdef CONFIG_COMPACTION
+static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, old;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ old = *(int *)table->data;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+ if (old != *(int *)table->data)
+ pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
+ table->procname, current->comm,
+ task_pid_nr(current));
+ return ret;
+}
+#endif
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
+}
+
+/*
+ * Taint values can only be increased
+ * This means we can safely use a temporary.
+ */
+static int proc_taint(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this
+ */
+ int i;
+ for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
+ if ((tmptaint >> i) & 1)
+ add_taint(i, LOCKDEP_STILL_OK);
+ }
+ }
+
+ return err;
+}
+
#ifdef CONFIG_PRINTK
static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
#endif
+/**
+ * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_dointvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_dointvec_minmax() handler.
+ */
+struct do_proc_dointvec_minmax_conv_param {
+ int *min;
+ int *max;
+};
+
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ int tmp, ret;
+ struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -EINVAL;
+ *valp = tmp;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_dointvec_minmax - read a vector of integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success or -EINVAL on write when the range check fails.
+ */
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dointvec_minmax_conv_param param = {
+ .min = (int *) table->extra1,
+ .max = (int *) table->extra2,
+ };
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_minmax_conv, &param);
+}
+
+/**
+ * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_douintvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_douintvec_minmax() handler.
+ */
+struct do_proc_douintvec_minmax_conv_param {
+ unsigned int *min;
+ unsigned int *max;
+};
+
+static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ int ret;
+ unsigned int tmp;
+ struct do_proc_douintvec_minmax_conv_param *param = data;
+ /* write via temporary local uint for bounds-checking */
+ unsigned int *up = write ? &tmp : valp;
+
+ ret = do_proc_douintvec_conv(lvalp, up, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -ERANGE;
+
+ *valp = tmp;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max). There is a final sanity
+ * check for UINT_MAX to avoid having to support wrap around uses from
+ * userspace.
+ *
+ * Returns 0 on success or -ERANGE on write when the range check fails.
+ */
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_douintvec_minmax_conv_param param = {
+ .min = (unsigned int *) table->extra1,
+ .max = (unsigned int *) table->extra2,
+ };
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_minmax_conv, &param);
+}
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned int val;
+
+ val = round_pipe_size(*lvalp);
+ if (val == 0)
+ return -EINVAL;
+
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_dopipe_max_size_conv, NULL);
+}
+
+static void validate_coredump_safety(void)
+{
+#ifdef CONFIG_COREDUMP
+ if (suid_dumpable == SUID_DUMP_ROOT &&
+ core_pattern[0] != '/' && core_pattern[0] != '|') {
+ printk(KERN_WARNING
+"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
+"Pipe handler or fully qualified core dump path required.\n"
+"Set kernel.core_pattern before fs.suid_dumpable.\n"
+ );
+ }
+#endif
+}
+
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
#ifdef CONFIG_COREDUMP
static int proc_dostring_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
#endif
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_MAGIC_SYSRQ
static int sysrq_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-#endif
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int tmp, ret;
-static struct ctl_table kern_table[];
-static struct ctl_table vm_table[];
-static struct ctl_table fs_table[];
-static struct ctl_table debug_table[];
-static struct ctl_table dev_table[];
-extern struct ctl_table random_table[];
-#ifdef CONFIG_EPOLL
-extern struct ctl_table epoll_table[];
-#endif
+ tmp = sysrq_mask();
-#ifdef CONFIG_FW_LOADER_USER_HELPER
-extern struct ctl_table firmware_config_table[];
-#endif
+ ret = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (ret || !write)
+ return ret;
-#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
- defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
-int sysctl_legacy_va_layout;
+ if (write)
+ sysrq_toggle_support(tmp);
+
+ return 0;
+}
#endif
-/* The default sysctl tables: */
+static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos,
+ unsigned long convmul, unsigned long convdiv)
+{
+ unsigned long *i, *min, *max;
+ int vleft, first = 1, err = 0;
+ size_t left;
+ char *p;
-static struct ctl_table sysctl_base_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = kern_table,
- },
- {
- .procname = "vm",
- .mode = 0555,
- .child = vm_table,
- },
- {
- .procname = "fs",
- .mode = 0555,
- .child = fs_table,
- },
- {
- .procname = "debug",
- .mode = 0555,
- .child = debug_table,
- },
- {
- .procname = "dev",
- .mode = 0555,
- .child = dev_table,
- },
- { }
-};
+ if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000; /* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
-static int min_wakeup_granularity_ns; /* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
+ i = (unsigned long *) data;
+ min = (unsigned long *) table->extra1;
+ max = (unsigned long *) table->extra2;
+ vleft = table->maxlen / sizeof(unsigned long);
+ left = *lenp;
-#ifdef CONFIG_COMPACTION
-static int min_extfrag_threshold;
-static int max_extfrag_threshold = 1000;
-#endif
+ if (write) {
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ p = buffer;
+ }
+
+ for (; left && vleft--; i++, first = 0) {
+ unsigned long val;
+
+ if (write) {
+ bool neg;
+
+ left -= proc_skip_spaces(&p);
+ if (!left)
+ break;
+
+ err = proc_get_long(&p, &left, &val, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (neg)
+ continue;
+ val = convmul * val / convdiv;
+ if ((min && val < *min) || (max && val > *max)) {
+ err = -EINVAL;
+ break;
+ }
+ *i = val;
+ } else {
+ val = convdiv * (*i) / convmul;
+ if (!first)
+ proc_put_char(&buffer, &left, '\t');
+ proc_put_long(&buffer, &left, val, false);
+ }
+ }
+
+ if (!write && !first && left && !err)
+ proc_put_char(&buffer, &left, '\n');
+ if (write && !err)
+ left -= proc_skip_spaces(&p);
+ if (write && first)
+ return err ? : -EINVAL;
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul,
+ unsigned long convdiv)
+{
+ return __do_proc_doulongvec_minmax(table->data, table, write,
+ buffer, lenp, ppos, convmul, convdiv);
+}
+
+/**
+ * proc_doulongvec_minmax - read a vector of long integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
+}
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer,
+ lenp, ppos, HZ, 1000l);
+}
+
+
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > INT_MAX / HZ)
+ return 1;
+ *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = lval / HZ;
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
+ return 1;
+ *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_clock_t(lval);
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+
+ if (jif > INT_MAX)
+ return 1;
+ *valp = (int)jif;
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_msecs(lval);
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_userhz_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_ms_jiffies_conv, NULL);
+}
+
+static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp;
+ int r;
+
+ tmp = pid_vnr(cad_pid);
+
+ r = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
+/**
+ * proc_do_large_bitmap - read/write from/to a large bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen.
+ *
+ * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+ * large bitmaps may be represented in a compact manner. Writing into
+ * the file will clear the bitmap then update it with the given input.
+ *
+ * Returns 0 on success.
+ */
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err = 0;
+ bool first = 1;
+ size_t left = *lenp;
+ unsigned long bitmap_len = table->maxlen;
+ unsigned long *bitmap = *(unsigned long **) table->data;
+ unsigned long *tmp_bitmap = NULL;
+ char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
+
+ if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ char *p = buffer;
+ size_t skipped = 0;
+
+ if (left > PAGE_SIZE - 1) {
+ left = PAGE_SIZE - 1;
+ /* How much of the buffer we'll skip this pass */
+ skipped = *lenp - left;
+ }
+
+ tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
+ if (!tmp_bitmap)
+ return -ENOMEM;
+ proc_skip_char(&p, &left, '\n');
+ while (!err && left) {
+ unsigned long val_a, val_b;
+ bool neg;
+ size_t saved_left;
+
+ /* In case we stop parsing mid-number, we can reset */
+ saved_left = left;
+ err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
+ sizeof(tr_a), &c);
+ /*
+ * If we consumed the entirety of a truncated buffer or
+ * only one char is left (may be a "-"), then stop here,
+ * reset, & come back for more.
+ */
+ if ((left <= 1) && skipped) {
+ left = saved_left;
+ break;
+ }
+
+ if (err)
+ break;
+ if (val_a >= bitmap_len || neg) {
+ err = -EINVAL;
+ break;
+ }
+
+ val_b = val_a;
+ if (left) {
+ p++;
+ left--;
+ }
+
+ if (c == '-') {
+ err = proc_get_long(&p, &left, &val_b,
+ &neg, tr_b, sizeof(tr_b),
+ &c);
+ /*
+ * If we consumed all of a truncated buffer or
+ * then stop here, reset, & come back for more.
+ */
+ if (!left && skipped) {
+ left = saved_left;
+ break;
+ }
+
+ if (err)
+ break;
+ if (val_b >= bitmap_len || neg ||
+ val_a > val_b) {
+ err = -EINVAL;
+ break;
+ }
+ if (left) {
+ p++;
+ left--;
+ }
+ }
+
+ bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
+ first = 0;
+ proc_skip_char(&p, &left, '\n');
+ }
+ left += skipped;
+ } else {
+ unsigned long bit_a, bit_b = 0;
+
+ while (left) {
+ bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
+ if (bit_a >= bitmap_len)
+ break;
+ bit_b = find_next_zero_bit(bitmap, bitmap_len,
+ bit_a + 1) - 1;
+
+ if (!first)
+ proc_put_char(&buffer, &left, ',');
+ proc_put_long(&buffer, &left, bit_a, false);
+ if (bit_a != bit_b) {
+ proc_put_char(&buffer, &left, '-');
+ proc_put_long(&buffer, &left, bit_b, false);
+ }
+
+ first = 0; bit_b++;
+ }
+ proc_put_char(&buffer, &left, '\n');
+ }
+
+ if (!err) {
+ if (write) {
+ if (*ppos)
+ bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
+ else
+ bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
+ }
+ *lenp -= left;
+ *ppos += *lenp;
+ }
+
+ bitmap_free(tmp_bitmap);
+ return err;
+}
+
+#else /* CONFIG_PROC_SYSCTL */
+
+int proc_dostring(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+#endif /* CONFIG_PROC_SYSCTL */
+
+#if defined(CONFIG_SYSCTL)
+int proc_do_static_key(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static DEFINE_MUTEX(static_key_mutex);
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&static_key_mutex);
+ val = static_key_enabled(key);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (val)
+ static_key_enable(key);
+ else
+ static_key_disable(key);
+ }
+ mutex_unlock(&static_key_mutex);
+ return ret;
+}
static struct ctl_table kern_table[] = {
{
@@ -613,7 +1952,7 @@ static struct ctl_table kern_table[] = {
.procname = "soft-power",
.data = &pwrsw_enabled,
.maxlen = sizeof (int),
- .mode = 0644,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
@@ -1072,7 +2411,7 @@ static struct ctl_table kern_table[] = {
.procname = "ignore-unaligned-usertrap",
.data = &no_unaligned_warning,
.maxlen = sizeof (int),
- .mode = 0644,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
@@ -1244,7 +2583,7 @@ static struct ctl_table kern_table[] = {
.data = &bpf_stats_enabled_key.key,
.maxlen = sizeof(bpf_stats_enabled_key),
.mode = 0644,
- .proc_handler = proc_do_static_key,
+ .proc_handler = bpf_stats_handler,
},
#endif
#if defined(CONFIG_TREE_RCU)
@@ -1320,7 +2659,7 @@ static struct ctl_table vm_table[] = {
.proc_handler = overcommit_kbytes_handler,
},
{
- .procname = "page-cluster",
+ .procname = "page-cluster",
.data = &page_cluster,
.maxlen = sizeof(int),
.mode = 0644,
@@ -1491,7 +2830,7 @@ static struct ctl_table vm_table[] = {
.data = &watermark_boost_factor,
.maxlen = sizeof(watermark_boost_factor),
.mode = 0644,
- .proc_handler = watermark_boost_factor_sysctl_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
@@ -1959,6 +3298,35 @@ static struct ctl_table dev_table[] = {
{ }
};
+static struct ctl_table sysctl_base_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = kern_table,
+ },
+ {
+ .procname = "vm",
+ .mode = 0555,
+ .child = vm_table,
+ },
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = fs_table,
+ },
+ {
+ .procname = "debug",
+ .mode = 0555,
+ .child = debug_table,
+ },
+ {
+ .procname = "dev",
+ .mode = 0555,
+ .child = dev_table,
+ },
+ { }
+};
+
int __init sysctl_init(void)
{
struct ctl_table_header *hdr;
@@ -1967,1474 +3335,7 @@ int __init sysctl_init(void)
kmemleak_not_leak(hdr);
return 0;
}
-
#endif /* CONFIG_SYSCTL */
-
-/*
- * /proc/sys support
- */
-
-#ifdef CONFIG_PROC_SYSCTL
-
-static int _proc_do_string(char *data, int maxlen, int write,
- char __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- size_t len;
- char __user *p;
- char c;
-
- if (!data || !maxlen || !*lenp) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
- /* Only continue writes not past the end of buffer. */
- len = strlen(data);
- if (len > maxlen - 1)
- len = maxlen - 1;
-
- if (*ppos > len)
- return 0;
- len = *ppos;
- } else {
- /* Start writing from beginning of buffer. */
- len = 0;
- }
-
- *ppos += *lenp;
- p = buffer;
- while ((p - buffer) < *lenp && len < maxlen - 1) {
- if (get_user(c, p++))
- return -EFAULT;
- if (c == 0 || c == '\n')
- break;
- data[len++] = c;
- }
- data[len] = 0;
- } else {
- len = strlen(data);
- if (len > maxlen)
- len = maxlen;
-
- if (*ppos > len) {
- *lenp = 0;
- return 0;
- }
-
- data += *ppos;
- len -= *ppos;
-
- if (len > *lenp)
- len = *lenp;
- if (len)
- if (copy_to_user(buffer, data, len))
- return -EFAULT;
- if (len < *lenp) {
- if (put_user('\n', buffer + len))
- return -EFAULT;
- len++;
- }
- *lenp = len;
- *ppos += len;
- }
- return 0;
-}
-
-static void warn_sysctl_write(struct ctl_table *table)
-{
- pr_warn_once("%s wrote to %s when file position was not 0!\n"
- "This will not be supported in the future. To silence this\n"
- "warning, set kernel.sysctl_writes_strict = -1\n",
- current->comm, table->procname);
-}
-
-/**
- * proc_first_pos_non_zero_ignore - check if first position is allowed
- * @ppos: file position
- * @table: the sysctl table
- *
- * Returns true if the first position is non-zero and the sysctl_writes_strict
- * mode indicates this is not allowed for numeric input types. String proc
- * handlers can ignore the return value.
- */
-static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
- struct ctl_table *table)
-{
- if (!*ppos)
- return false;
-
- switch (sysctl_writes_strict) {
- case SYSCTL_WRITES_STRICT:
- return true;
- case SYSCTL_WRITES_WARN:
- warn_sysctl_write(table);
- return false;
- default:
- return false;
- }
-}
-
-/**
- * proc_dostring - read a string sysctl
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes a string from/to the user buffer. If the kernel
- * buffer provided is not large enough to hold the string, the
- * string is truncated. The copied string is %NULL-terminated.
- * If the string is being read by the user process, it is copied
- * and a newline '\n' is added. It is truncated if the buffer is
- * not large enough.
- *
- * Returns 0 on success.
- */
-int proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write)
- proc_first_pos_non_zero_ignore(ppos, table);
-
- return _proc_do_string((char *)(table->data), table->maxlen, write,
- (char __user *)buffer, lenp, ppos);
-}
-
-static size_t proc_skip_spaces(char **buf)
-{
- size_t ret;
- char *tmp = skip_spaces(*buf);
- ret = tmp - *buf;
- *buf = tmp;
- return ret;
-}
-
-static void proc_skip_char(char **buf, size_t *size, const char v)
-{
- while (*size) {
- if (**buf != v)
- break;
- (*size)--;
- (*buf)++;
- }
-}
-
-/**
- * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
- * fail on overflow
- *
- * @cp: kernel buffer containing the string to parse
- * @endp: pointer to store the trailing characters
- * @base: the base to use
- * @res: where the parsed integer will be stored
- *
- * In case of success 0 is returned and @res will contain the parsed integer,
- * @endp will hold any trailing characters.
- * This function will fail the parse on overflow. If there wasn't an overflow
- * the function will defer the decision what characters count as invalid to the
- * caller.
- */
-static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
- unsigned long *res)
-{
- unsigned long long result;
- unsigned int rv;
-
- cp = _parse_integer_fixup_radix(cp, &base);
- rv = _parse_integer(cp, base, &result);
- if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
- return -ERANGE;
-
- cp += rv;
-
- if (endp)
- *endp = (char *)cp;
-
- *res = (unsigned long)result;
- return 0;
-}
-
-#define TMPBUFLEN 22
-/**
- * proc_get_long - reads an ASCII formatted integer from a user buffer
- *
- * @buf: a kernel buffer
- * @size: size of the kernel buffer
- * @val: this is where the number will be stored
- * @neg: set to %TRUE if number is negative
- * @perm_tr: a vector which contains the allowed trailers
- * @perm_tr_len: size of the perm_tr vector
- * @tr: pointer to store the trailer character
- *
- * In case of success %0 is returned and @buf and @size are updated with
- * the amount of bytes read. If @tr is non-NULL and a trailing
- * character exists (size is non-zero after returning from this
- * function), @tr is updated with the trailing character.
- */
-static int proc_get_long(char **buf, size_t *size,
- unsigned long *val, bool *neg,
- const char *perm_tr, unsigned perm_tr_len, char *tr)
-{
- int len;
- char *p, tmp[TMPBUFLEN];
-
- if (!*size)
- return -EINVAL;
-
- len = *size;
- if (len > TMPBUFLEN - 1)
- len = TMPBUFLEN - 1;
-
- memcpy(tmp, *buf, len);
-
- tmp[len] = 0;
- p = tmp;
- if (*p == '-' && *size > 1) {
- *neg = true;
- p++;
- } else
- *neg = false;
- if (!isdigit(*p))
- return -EINVAL;
-
- if (strtoul_lenient(p, &p, 0, val))
- return -EINVAL;
-
- len = p - tmp;
-
- /* We don't know if the next char is whitespace thus we may accept
- * invalid integers (e.g. 1234...a) or two integers instead of one
- * (e.g. 123...1). So lets not allow such large numbers. */
- if (len == TMPBUFLEN - 1)
- return -EINVAL;
-
- if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
- return -EINVAL;
-
- if (tr && (len < *size))
- *tr = *p;
-
- *buf += len;
- *size -= len;
-
- return 0;
-}
-
-/**
- * proc_put_long - converts an integer to a decimal ASCII formatted string
- *
- * @buf: the user buffer
- * @size: the size of the user buffer
- * @val: the integer to be converted
- * @neg: sign of the number, %TRUE for negative
- *
- * In case of success %0 is returned and @buf and @size are updated with
- * the amount of bytes written.
- */
-static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
- bool neg)
-{
- int len;
- char tmp[TMPBUFLEN], *p = tmp;
-
- sprintf(p, "%s%lu", neg ? "-" : "", val);
- len = strlen(tmp);
- if (len > *size)
- len = *size;
- if (copy_to_user(*buf, tmp, len))
- return -EFAULT;
- *size -= len;
- *buf += len;
- return 0;
-}
-#undef TMPBUFLEN
-
-static int proc_put_char(void __user **buf, size_t *size, char c)
-{
- if (*size) {
- char __user **buffer = (char __user **)buf;
- if (put_user(c, *buffer))
- return -EFAULT;
- (*size)--, (*buffer)++;
- *buf = *buffer;
- }
- return 0;
-}
-
-static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (*negp) {
- if (*lvalp > (unsigned long) INT_MAX + 1)
- return -EINVAL;
- *valp = -*lvalp;
- } else {
- if (*lvalp > (unsigned long) INT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- }
- } else {
- int val = *valp;
- if (val < 0) {
- *negp = true;
- *lvalp = -(unsigned long)val;
- } else {
- *negp = false;
- *lvalp = (unsigned long)val;
- }
- }
- return 0;
-}
-
-static int do_proc_douintvec_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- if (write) {
- if (*lvalp > UINT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long)val;
- }
- return 0;
-}
-
-static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
-
-static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
-{
- int *i, vleft, first = 1, err = 0;
- size_t left;
- char *kbuf = NULL, *p;
-
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (int *) tbl_data;
- vleft = table->maxlen / sizeof(*i);
- left = *lenp;
-
- if (!conv)
- conv = do_proc_dointvec_conv;
-
- if (write) {
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto out;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- }
-
- for (; left && vleft--; i++, first=0) {
- unsigned long lval;
- bool neg;
-
- if (write) {
- left -= proc_skip_spaces(&p);
-
- if (!left)
- break;
- err = proc_get_long(&p, &left, &lval, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err)
- break;
- if (conv(&neg, &lval, i, 1, data)) {
- err = -EINVAL;
- break;
- }
- } else {
- if (conv(&neg, &lval, i, 0, data)) {
- err = -EINVAL;
- break;
- }
- if (!first)
- err = proc_put_char(&buffer, &left, '\t');
- if (err)
- break;
- err = proc_put_long(&buffer, &left, lval, neg);
- if (err)
- break;
- }
- }
-
- if (!write && !first && left && !err)
- err = proc_put_char(&buffer, &left, '\n');
- if (write && !err && left)
- left -= proc_skip_spaces(&p);
- if (write) {
- kfree(kbuf);
- if (first)
- return err ? : -EINVAL;
- }
- *lenp -= left;
-out:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
-{
- return __do_proc_dointvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
-}
-
-static int do_proc_douintvec_w(unsigned int *tbl_data,
- struct ctl_table *table,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned long lval;
- int err = 0;
- size_t left;
- bool neg;
- char *kbuf = NULL, *p;
-
- left = *lenp;
-
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto bail_early;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
-
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return -EINVAL;
-
- left -= proc_skip_spaces(&p);
- if (!left) {
- err = -EINVAL;
- goto out_free;
- }
-
- err = proc_get_long(&p, &left, &lval, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err || neg) {
- err = -EINVAL;
- goto out_free;
- }
-
- if (conv(&lval, tbl_data, 1, data)) {
- err = -EINVAL;
- goto out_free;
- }
-
- if (!err && left)
- left -= proc_skip_spaces(&p);
-
-out_free:
- kfree(kbuf);
- if (err)
- return -EINVAL;
-
- return 0;
-
- /* This is in keeping with old __do_proc_dointvec() */
-bail_early:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned long lval;
- int err = 0;
- size_t left;
-
- left = *lenp;
-
- if (conv(&lval, tbl_data, 0, data)) {
- err = -EINVAL;
- goto out;
- }
-
- err = proc_put_long(&buffer, &left, lval, false);
- if (err || !left)
- goto out;
-
- err = proc_put_char(&buffer, &left, '\n');
-
-out:
- *lenp -= left;
- *ppos += *lenp;
-
- return err;
-}
-
-static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned int *i, vleft;
-
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (unsigned int *) tbl_data;
- vleft = table->maxlen / sizeof(*i);
-
- /*
- * Arrays are not supported, keep this simple. *Do not* add
- * support for them.
- */
- if (vleft != 1) {
- *lenp = 0;
- return -EINVAL;
- }
-
- if (!conv)
- conv = do_proc_douintvec_conv;
-
- if (write)
- return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
- conv, data);
- return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
-}
-
-static int do_proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- return __do_proc_douintvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
-}
-
-/**
- * proc_dointvec - read a vector of integers
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * Returns 0 on success.
- */
-int proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
-}
-
-#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int ret, old;
-
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- old = *(int *)table->data;
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (ret)
- return ret;
- if (old != *(int *)table->data)
- pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
- table->procname, current->comm,
- task_pid_nr(current));
- return ret;
-}
-#endif
-
-/**
- * proc_douintvec - read a vector of unsigned integers
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * Returns 0 on success.
- */
-int proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_conv, NULL);
-}
-
-/*
- * Taint values can only be increased
- * This means we can safely use a temporary.
- */
-static int proc_taint(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long tmptaint = get_taint();
- int err;
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- t = *table;
- t.data = &tmptaint;
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- if (write) {
- /*
- * Poor man's atomic or. Not worth adding a primitive
- * to everyone's atomic.h for this
- */
- int i;
- for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
- if ((tmptaint >> i) & 1)
- add_taint(i, LOCKDEP_STILL_OK);
- }
- }
-
- return err;
-}
-
-#ifdef CONFIG_PRINTK
-static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
-#endif
-
-/**
- * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_dointvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_dointvec_minmax() handler.
- */
-struct do_proc_dointvec_minmax_conv_param {
- int *min;
- int *max;
-};
-
-static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- int tmp, ret;
- struct do_proc_dointvec_minmax_conv_param *param = data;
- /*
- * If writing, first do so via a temporary local int so we can
- * bounds-check it before touching *valp.
- */
- int *ip = write ? &tmp : valp;
-
- ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -EINVAL;
- *valp = tmp;
- }
-
- return 0;
-}
-
-/**
- * proc_dointvec_minmax - read a vector of integers with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success or -EINVAL on write when the range check fails.
- */
-int proc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct do_proc_dointvec_minmax_conv_param param = {
- .min = (int *) table->extra1,
- .max = (int *) table->extra2,
- };
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_minmax_conv, &param);
-}
-
-/**
- * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_douintvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_douintvec_minmax() handler.
- */
-struct do_proc_douintvec_minmax_conv_param {
- unsigned int *min;
- unsigned int *max;
-};
-
-static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- int ret;
- unsigned int tmp;
- struct do_proc_douintvec_minmax_conv_param *param = data;
- /* write via temporary local uint for bounds-checking */
- unsigned int *up = write ? &tmp : valp;
-
- ret = do_proc_douintvec_conv(lvalp, up, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -ERANGE;
-
- *valp = tmp;
- }
-
- return 0;
-}
-
-/**
- * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
- * values from/to the user buffer, treated as an ASCII string. Negative
- * strings are not allowed.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max). There is a final sanity
- * check for UINT_MAX to avoid having to support wrap around uses from
- * userspace.
- *
- * Returns 0 on success or -ERANGE on write when the range check fails.
- */
-int proc_douintvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct do_proc_douintvec_minmax_conv_param param = {
- .min = (unsigned int *) table->extra1,
- .max = (unsigned int *) table->extra2,
- };
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_minmax_conv, &param);
-}
-
-static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned int val;
-
- val = round_pipe_size(*lvalp);
- if (val == 0)
- return -EINVAL;
-
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
- }
-
- return 0;
-}
-
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_dopipe_max_size_conv, NULL);
-}
-
-static void validate_coredump_safety(void)
-{
-#ifdef CONFIG_COREDUMP
- if (suid_dumpable == SUID_DUMP_ROOT &&
- core_pattern[0] != '/' && core_pattern[0] != '|') {
- printk(KERN_WARNING
-"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
-"Pipe handler or fully qualified core dump path required.\n"
-"Set kernel.core_pattern before fs.suid_dumpable.\n"
- );
- }
-#endif
-}
-
-static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-
-#ifdef CONFIG_COREDUMP
-static int proc_dostring_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dostring(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-#endif
-
-#ifdef CONFIG_MAGIC_SYSRQ
-static int sysrq_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int tmp, ret;
-
- tmp = sysrq_mask();
-
- ret = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (ret || !write)
- return ret;
-
- if (write)
- sysrq_toggle_support(tmp);
-
- return 0;
-}
-#endif
-
-static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- unsigned long convmul,
- unsigned long convdiv)
-{
- unsigned long *i, *min, *max;
- int vleft, first = 1, err = 0;
- size_t left;
- char *kbuf = NULL, *p;
-
- if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (unsigned long *) data;
- min = (unsigned long *) table->extra1;
- max = (unsigned long *) table->extra2;
- vleft = table->maxlen / sizeof(unsigned long);
- left = *lenp;
-
- if (write) {
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto out;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- }
-
- for (; left && vleft--; i++, first = 0) {
- unsigned long val;
-
- if (write) {
- bool neg;
-
- left -= proc_skip_spaces(&p);
- if (!left)
- break;
-
- err = proc_get_long(&p, &left, &val, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err)
- break;
- if (neg)
- continue;
- val = convmul * val / convdiv;
- if ((min && val < *min) || (max && val > *max)) {
- err = -EINVAL;
- break;
- }
- *i = val;
- } else {
- val = convdiv * (*i) / convmul;
- if (!first) {
- err = proc_put_char(&buffer, &left, '\t');
- if (err)
- break;
- }
- err = proc_put_long(&buffer, &left, val, false);
- if (err)
- break;
- }
- }
-
- if (!write && !first && left && !err)
- err = proc_put_char(&buffer, &left, '\n');
- if (write && !err)
- left -= proc_skip_spaces(&p);
- if (write) {
- kfree(kbuf);
- if (first)
- return err ? : -EINVAL;
- }
- *lenp -= left;
-out:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- unsigned long convmul,
- unsigned long convdiv)
-{
- return __do_proc_doulongvec_minmax(table->data, table, write,
- buffer, lenp, ppos, convmul, convdiv);
-}
-
-/**
- * proc_doulongvec_minmax - read a vector of long integers with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
-}
-
-/**
- * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string. The values
- * are treated as milliseconds, and converted to jiffies when they are stored.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return do_proc_doulongvec_minmax(table, write, buffer,
- lenp, ppos, HZ, 1000l);
-}
-
-
-static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (*lvalp > INT_MAX / HZ)
- return 1;
- *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = lval / HZ;
- }
- return 0;
-}
-
-static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
- return 1;
- *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_clock_t(lval);
- }
- return 0;
-}
-
-static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
-
- if (jif > INT_MAX)
- return 1;
- *valp = (int)jif;
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_msecs(lval);
- }
- return 0;
-}
-
-/**
- * proc_dointvec_jiffies - read a vector of integers as seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in seconds, and are converted into
- * jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_jiffies_conv,NULL);
-}
-
-/**
- * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: pointer to the file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/USER_HZ seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_userhz_jiffies_conv,NULL);
-}
-
-/**
- * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- * @ppos: the current position in the file
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/1000 seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_ms_jiffies_conv, NULL);
-}
-
-static int proc_do_cad_pid(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct pid *new_pid;
- pid_t tmp;
- int r;
-
- tmp = pid_vnr(cad_pid);
-
- r = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (r || !write)
- return r;
-
- new_pid = find_get_pid(tmp);
- if (!new_pid)
- return -ESRCH;
-
- put_pid(xchg(&cad_pid, new_pid));
- return 0;
-}
-
-/**
- * proc_do_large_bitmap - read/write from/to a large bitmap
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * The bitmap is stored at table->data and the bitmap length (in bits)
- * in table->maxlen.
- *
- * We use a range comma separated format (e.g. 1,3-4,10-10) so that
- * large bitmaps may be represented in a compact manner. Writing into
- * the file will clear the bitmap then update it with the given input.
- *
- * Returns 0 on success.
- */
-int proc_do_large_bitmap(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int err = 0;
- bool first = 1;
- size_t left = *lenp;
- unsigned long bitmap_len = table->maxlen;
- unsigned long *bitmap = *(unsigned long **) table->data;
- unsigned long *tmp_bitmap = NULL;
- char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
-
- if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- char *kbuf, *p;
- size_t skipped = 0;
-
- if (left > PAGE_SIZE - 1) {
- left = PAGE_SIZE - 1;
- /* How much of the buffer we'll skip this pass */
- skipped = *lenp - left;
- }
-
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
- if (!tmp_bitmap) {
- kfree(kbuf);
- return -ENOMEM;
- }
- proc_skip_char(&p, &left, '\n');
- while (!err && left) {
- unsigned long val_a, val_b;
- bool neg;
- size_t saved_left;
-
- /* In case we stop parsing mid-number, we can reset */
- saved_left = left;
- err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
- sizeof(tr_a), &c);
- /*
- * If we consumed the entirety of a truncated buffer or
- * only one char is left (may be a "-"), then stop here,
- * reset, & come back for more.
- */
- if ((left <= 1) && skipped) {
- left = saved_left;
- break;
- }
-
- if (err)
- break;
- if (val_a >= bitmap_len || neg) {
- err = -EINVAL;
- break;
- }
-
- val_b = val_a;
- if (left) {
- p++;
- left--;
- }
-
- if (c == '-') {
- err = proc_get_long(&p, &left, &val_b,
- &neg, tr_b, sizeof(tr_b),
- &c);
- /*
- * If we consumed all of a truncated buffer or
- * then stop here, reset, & come back for more.
- */
- if (!left && skipped) {
- left = saved_left;
- break;
- }
-
- if (err)
- break;
- if (val_b >= bitmap_len || neg ||
- val_a > val_b) {
- err = -EINVAL;
- break;
- }
- if (left) {
- p++;
- left--;
- }
- }
-
- bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
- first = 0;
- proc_skip_char(&p, &left, '\n');
- }
- kfree(kbuf);
- left += skipped;
- } else {
- unsigned long bit_a, bit_b = 0;
-
- while (left) {
- bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
- if (bit_a >= bitmap_len)
- break;
- bit_b = find_next_zero_bit(bitmap, bitmap_len,
- bit_a + 1) - 1;
-
- if (!first) {
- err = proc_put_char(&buffer, &left, ',');
- if (err)
- break;
- }
- err = proc_put_long(&buffer, &left, bit_a, false);
- if (err)
- break;
- if (bit_a != bit_b) {
- err = proc_put_char(&buffer, &left, '-');
- if (err)
- break;
- err = proc_put_long(&buffer, &left, bit_b, false);
- if (err)
- break;
- }
-
- first = 0; bit_b++;
- }
- if (!err)
- err = proc_put_char(&buffer, &left, '\n');
- }
-
- if (!err) {
- if (write) {
- if (*ppos)
- bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
- else
- bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
- }
- *lenp -= left;
- *ppos += *lenp;
- }
-
- bitmap_free(tmp_bitmap);
- return err;
-}
-
-#else /* CONFIG_PROC_SYSCTL */
-
-int proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_douintvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_do_large_bitmap(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-#endif /* CONFIG_PROC_SYSCTL */
-
-#if defined(CONFIG_SYSCTL)
-int proc_do_static_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- struct static_key *key = (struct static_key *)table->data;
- static DEFINE_MUTEX(static_key_mutex);
- int val, ret;
- struct ctl_table tmp = {
- .data = &val,
- .maxlen = sizeof(val),
- .mode = table->mode,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- };
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&static_key_mutex);
- val = static_key_enabled(key);
- ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (write && !ret) {
- if (val)
- static_key_enable(key);
- else
- static_key_disable(key);
- }
- mutex_unlock(&static_key_mutex);
- return ret;
-}
-#endif
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a5221abb4594..398e6eadb861 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -249,8 +249,7 @@ void timers_update_nohz(void)
}
int timer_migration_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1796747a77..e875c95d3ced 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -797,6 +797,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_peek_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8d2b98812625..167a74a15b1a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer)
}
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int save_tracepoint_printk;
diff --git a/kernel/umh.c b/kernel/umh.c
index 7f255b5a8845..9788ed481a6a 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
EXPORT_SYMBOL(call_usermodehelper);
static int proc_cap_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3732c888a949..4ca61d49885b 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table)
* to observe. Should this be in kernel/sys.c ????
*/
static int proc_do_uts_string(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table uts_table;
int r;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b6b1f54a7837..53ff2c81b084 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -661,7 +661,7 @@ static void proc_watchdog_update(void)
* proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
*/
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old, *param = table->data;
@@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
* /proc/sys/kernel/watchdog
*/
int proc_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
@@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/nmi_watchdog
*/
int proc_nmi_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!nmi_watchdog_available && write)
return -ENOTSUPP;
@@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/soft_watchdog
*/
int proc_soft_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
@@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/watchdog_thresh
*/
int proc_watchdog_thresh(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old;
@@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
* been brought online, if desired.
*/
int proc_watchdog_cpumask(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err;
diff --git a/mm/compaction.c b/mm/compaction.c
index 46f0fcc93081..d8cfb7b99a83 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2463,7 +2463,7 @@ int sysctl_compact_memory;
* /proc/sys/vm/compact_memory
*/
int sysctl_compaction_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
if (write)
compact_nodes();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bcabbe02192b..f9a97320e1de 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3352,7 +3352,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
#ifdef CONFIG_SYSCTL
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp = h->max_huge_pages;
@@ -3375,7 +3375,7 @@ out:
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(false, table, write,
@@ -3384,7 +3384,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
#ifdef CONFIG_NUMA
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(true, table, write,
buffer, length, ppos);
@@ -3392,8 +3392,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
#endif /* CONFIG_NUMA */
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7326b54ab728..d3ee4c4dafac 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -512,8 +512,7 @@ bool node_dirty_ok(struct pglist_data *pgdat)
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -524,8 +523,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write,
}
int dirty_background_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -535,9 +533,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write,
return ret;
}
-int dirty_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
int ret;
@@ -551,8 +548,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
}
int dirty_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
int ret;
@@ -1972,7 +1968,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
unsigned int old_interval = dirty_writeback_interval;
int ret;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 69827d4fa052..0c43e9ae5004 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5546,21 +5546,11 @@ char numa_zonelist_order[] = "Node";
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
- loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
- char *str;
- int ret;
-
- if (!write)
- return proc_dostring(table, write, buffer, length, ppos);
- str = memdup_user_nul(buffer, 16);
- if (IS_ERR(str))
- return PTR_ERR(str);
-
- ret = __parse_numa_zonelist_order(str);
- kfree(str);
- return ret;
+ if (write)
+ return __parse_numa_zonelist_order(buffer);
+ return proc_dostring(table, write, buffer, length, ppos);
}
@@ -7963,7 +7953,7 @@ core_initcall(init_per_zone_wmark_min)
* changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -7978,20 +7968,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- int rc;
-
- rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (rc)
- return rc;
-
- return 0;
-}
-
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8021,7 +7999,7 @@ static void setup_min_unmapped_ratio(void)
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8048,7 +8026,7 @@ static void setup_min_slab_ratio(void)
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8072,7 +8050,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
* if in function of the boot time zone sizes.
*/
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_minmax(table, write, buffer, length, ppos);
setup_per_zone_lowmem_reserve();
@@ -8094,7 +8072,7 @@ static void __zone_pcp_update(struct zone *zone)
* pagelist can have before it gets flushed back to buddy allocator.
*/
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int old_percpu_pagelist_fraction;
diff --git a/mm/util.c b/mm/util.c
index 988d11e6c17c..8defc8ec141f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -717,9 +717,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-int overcommit_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
@@ -729,9 +728,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write,
return ret;
}
-int overcommit_kbytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 96d21a792b57..c03a8c914922 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -76,7 +76,7 @@ static void invalid_numa_statistics(void)
static DEFINE_MUTEX(vm_numa_stat_lock);
int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int ret, oldval;
@@ -1751,7 +1751,7 @@ static void refresh_vm_stats(struct work_struct *work)
}
int vmstat_refresh(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
long val;
int err;
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 59980ecfc962..04c3f9a82650 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1027,7 +1027,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
#ifdef CONFIG_SYSCTL
static
int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/net/core/filter.c b/net/core/filter.c
index 7d6ceaa54d21..dfaf5df13722 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -256,17 +256,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
offset);
}
-BPF_CALL_0(bpf_get_raw_cpu_id)
-{
- return raw_smp_processor_id();
-}
-
-static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
- .func = bpf_get_raw_cpu_id,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
-};
-
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
struct bpf_insn *insn_buf)
{
@@ -4205,36 +4194,19 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags,
- void *, data, u64, size)
-{
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
-
- return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
-}
+#define SOCKOPT_CC_REINIT (1 << 0)
-static const struct bpf_func_proto bpf_event_output_data_proto = {
- .func = bpf_event_output_data,
- .gpl_only = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE_OR_ZERO,
-};
-
-BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
- int, level, int, optname, char *, optval, int, optlen)
+static int _bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen, u32 flags)
{
- struct sock *sk = bpf_sock->sk;
int ret = 0;
int val;
if (!sk_fullsock(sk))
return -EINVAL;
+ sock_owned_by_me(sk);
+
if (level == SOL_SOCKET) {
if (optlen != sizeof(int))
return -EINVAL;
@@ -4329,7 +4301,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
sk->sk_prot->setsockopt == tcp_setsockopt) {
if (optname == TCP_CONGESTION) {
char name[TCP_CA_NAME_MAX];
- bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN;
+ bool reinit = flags & SOCKOPT_CC_REINIT;
strncpy(name, optval, min_t(long, optlen,
TCP_CA_NAME_MAX-1));
@@ -4376,24 +4348,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
return ret;
}
-static const struct bpf_func_proto bpf_setsockopt_proto = {
- .func = bpf_setsockopt,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE,
-};
-
-BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
- int, level, int, optname, char *, optval, int, optlen)
+static int _bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
{
- struct sock *sk = bpf_sock->sk;
-
if (!sk_fullsock(sk))
goto err_clear;
+
+ sock_owned_by_me(sk);
+
#ifdef CONFIG_INET
if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
struct inet_connection_sock *icsk;
@@ -4459,8 +4421,71 @@ err_clear:
return -EINVAL;
}
-static const struct bpf_func_proto bpf_getsockopt_proto = {
- .func = bpf_getsockopt,
+BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ u32 flags = 0;
+ return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen,
+ flags);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
+ .func = bpf_sock_addr_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
+ .func = bpf_sock_addr_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ u32 flags = 0;
+ if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
+ flags |= SOCKOPT_CC_REINIT;
+ return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen,
+ flags);
+}
+
+static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
+ .func = bpf_sock_ops_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
+ .func = bpf_sock_ops_getsockopt,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -5983,52 +6008,7 @@ bool bpf_helper_changes_pkt_data(void *func)
return false;
}
-const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
-{
- switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
- case BPF_FUNC_get_prandom_u32:
- return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_raw_smp_processor_id_proto;
- case BPF_FUNC_get_numa_node_id:
- return &bpf_get_numa_node_id_proto;
- case BPF_FUNC_tail_call:
- return &bpf_tail_call_proto;
- case BPF_FUNC_ktime_get_ns:
- return &bpf_ktime_get_ns_proto;
- default:
- break;
- }
-
- if (!capable(CAP_SYS_ADMIN))
- return NULL;
-
- switch (func_id) {
- case BPF_FUNC_spin_lock:
- return &bpf_spin_lock_proto;
- case BPF_FUNC_spin_unlock:
- return &bpf_spin_unlock_proto;
- case BPF_FUNC_trace_printk:
- return bpf_get_trace_printk_proto();
- case BPF_FUNC_jiffies64:
- return &bpf_jiffies64_proto;
- default:
- return NULL;
- }
-}
+const struct bpf_func_proto bpf_event_output_data_proto __weak;
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
@@ -6119,6 +6099,22 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ return &bpf_sock_addr_setsockopt_proto;
+ default:
+ return NULL;
+ }
+ case BPF_FUNC_getsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ return &bpf_sock_addr_getsockopt_proto;
+ default:
+ return NULL;
+ }
default:
return bpf_base_func_proto(func_id);
}
@@ -6213,6 +6209,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_adjust_room_proto;
case BPF_FUNC_skb_change_tail:
return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
@@ -6335,9 +6333,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_setsockopt:
- return &bpf_setsockopt_proto;
+ return &bpf_sock_ops_setsockopt_proto;
case BPF_FUNC_getsockopt:
- return &bpf_getsockopt_proto;
+ return &bpf_sock_ops_getsockopt_proto;
case BPF_FUNC_sock_ops_cb_flags_set:
return &bpf_sock_ops_cb_flags_set_proto;
case BPF_FUNC_sock_map_update:
@@ -8786,6 +8784,10 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
if (!reuse) {
+ /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
+ if (sk_is_refcounted(selected_sk))
+ sock_put(selected_sk);
+
/* reuseport_array has only sk with non NULL sk_reuseport_cb.
* The only (!reuse) case here is - the sk has already been
* unhashed (e.g. by close()), so treat it as -ENOENT.
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 39d37d0ef575..3f2263e79e4b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3379,7 +3379,7 @@ EXPORT_SYMBOL(neigh_app_ns);
static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
static int proc_unres_qlen(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int size, ret;
struct ctl_table tmp = *ctl;
@@ -3443,8 +3443,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
}
static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
struct ctl_table tmp = *ctl;
int ret;
@@ -3457,8 +3457,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
return ret;
}
-int neigh_proc_dointvec(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -3467,8 +3467,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write,
}
EXPORT_SYMBOL(neigh_proc_dointvec);
-int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
+int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3479,8 +3478,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3489,8 +3488,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
}
int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3500,8 +3498,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
@@ -3510,8 +3508,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
}
static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
struct neigh_parms *p = ctl->extra2;
int ret;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index b08dfae10f88..00a26cf2cfe9 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -343,7 +343,14 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
- return __sock_map_lookup_elem(map, *(u32 *)key);
+ struct sock *sk;
+
+ sk = __sock_map_lookup_elem(map, *(u32 *)key);
+ if (!sk || !sk_fullsock(sk))
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
}
static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
@@ -1051,7 +1058,14 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
static void *sock_hash_lookup(struct bpf_map *map, void *key)
{
- return __sock_hash_lookup_elem(map, key);
+ struct sock *sk;
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk || !sk_fullsock(sk))
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
}
static void sock_hash_release_progs(struct bpf_map *map)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 9f9e00ba3ad7..0ddb13a6282b 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int orig_size, size;
int ret, i;
@@ -115,8 +115,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
static DEFINE_MUTEX(flow_limit_update_mutex);
static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct sd_flow_limit *cur;
struct softnet_data *sd;
@@ -180,10 +179,7 @@ write_unlock:
}
if (len < *lenp)
kbuf[len++] = '\n';
- if (copy_to_user(buffer, kbuf, len)) {
- ret = -EFAULT;
- goto done;
- }
+ memcpy(buffer, kbuf, len);
*lenp = len;
*ppos += len;
}
@@ -194,8 +190,7 @@ done:
}
static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int old, *ptr;
int ret;
@@ -217,7 +212,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_NET_SCHED
static int set_default_qdisc(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char id[IFNAMSIZ];
struct ctl_table tbl = {
@@ -236,7 +231,7 @@ static int set_default_qdisc(struct ctl_table *table, int write,
#endif
static int proc_do_dev_weight(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -251,7 +246,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write,
}
static int proc_do_rss_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table fake_table;
char buf[NETDEV_RSS_KEY_LEN * 3];
@@ -264,7 +259,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write,
#ifdef CONFIG_BPF_JIT
static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret, jit_enable = *(int *)table->data;
@@ -291,8 +286,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
# ifdef CONFIG_HAVE_EBPF_JIT
static int
proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -303,8 +297,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
static int
proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index cca7ae712995..65abcf1b3210 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -160,8 +160,8 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU
static int min_priority[1];
static int max_priority[] = { 127 }; /* From DECnet spec */
-static int dn_forwarding_proc(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
+static int dn_forwarding_proc(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
static struct dn_dev_sysctl_table {
struct ctl_table_header *sysctl_header;
struct ctl_table dn_dev_vars[5];
@@ -245,8 +245,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
}
static int dn_forwarding_proc(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
#ifdef CONFIG_DECNET_ROUTER
struct net_device *dev = table->extra1;
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 55bf64a22b59..deae519bdeec 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -134,8 +134,7 @@ static int parse_addr(__le16 *addr, char *str)
}
static int dn_node_address_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char addr[DN_ASCBUF_LEN];
size_t len;
@@ -148,10 +147,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write,
if (write) {
len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
-
- if (copy_from_user(addr, buffer, len))
- return -EFAULT;
-
+ memcpy(addr, buffer, len);
addr[len] = 0;
strip_it(addr);
@@ -173,11 +169,9 @@ static int dn_node_address_handler(struct ctl_table *table, int write,
len = strlen(addr);
addr[len++] = '\n';
- if (len > *lenp) len = *lenp;
-
- if (copy_to_user(buffer, addr, len))
- return -EFAULT;
-
+ if (len > *lenp)
+ len = *lenp;
+ memcpy(buffer, addr, len);
*lenp = len;
*ppos += len;
@@ -185,8 +179,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write,
}
static int dn_def_dev_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
size_t len;
struct net_device *dev;
@@ -201,9 +194,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write,
if (*lenp > 16)
return -E2BIG;
- if (copy_from_user(devname, buffer, *lenp))
- return -EFAULT;
-
+ memcpy(devname, buffer, *lenp);
devname[*lenp] = 0;
strip_it(devname);
@@ -238,9 +229,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write,
if (len > *lenp) len = *lenp;
- if (copy_to_user(buffer, devname, len))
- return -EFAULT;
-
+ memcpy(buffer, devname, len);
*lenp = len;
*ppos += len;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c0dd561aa190..fc94f82f82c7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2366,8 +2366,7 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
}
static int devinet_conf_proc(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int old_value = *(int *)ctl->data;
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -2419,8 +2418,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
}
static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -2463,8 +2461,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
}
static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 788c69d9bfe0..041f4dcac440 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3336,8 +3336,7 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = (struct net *)__ctl->extra1;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3a628423d27b..5653e3b011bf 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -71,8 +71,7 @@ static void set_local_port_range(struct net *net, int range[2])
/* Validate changes from /proc interface. */
static int ipv4_local_port_range(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net =
container_of(table->data, struct net, ipv4.ip_local_ports.range);
@@ -107,7 +106,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
/* Validate changes from /proc interface. */
static int ipv4_privileged_ports(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
ipv4.sysctl_ip_prot_sock);
@@ -168,8 +167,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig
/* Validate changes from /proc interface. */
static int ipv4_ping_group_range(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct user_namespace *user_ns = current_user_ns();
int ret;
@@ -204,8 +202,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
}
static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net;
int ret;
@@ -221,7 +218,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
}
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(ctl->data, struct net,
ipv4.tcp_congestion_control);
@@ -241,9 +238,8 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
}
static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
- int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
int ret;
@@ -258,9 +254,8 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
}
static int proc_allowed_congestion_control(struct ctl_table *ctl,
- int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
int ret;
@@ -296,8 +291,7 @@ static int sscanf_key(char *buf, __le32 *key)
}
static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
ipv4.sysctl_tcp_fastopen);
@@ -399,7 +393,7 @@ static void proc_configure_early_demux(int enabled, int protocol)
}
static int proc_tcp_early_demux(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = 0;
@@ -415,7 +409,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write,
}
static int proc_udp_early_demux(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = 0;
@@ -431,8 +425,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write,
}
static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
- int write,
- void __user *buffer,
+ int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
@@ -447,8 +440,7 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
}
static int proc_tcp_available_ulp(struct ctl_table *ctl,
- int write,
- void __user *buffer, size_t *lenp,
+ int write, void *buffer, size_t *lenp,
loff_t *ppos)
{
struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, };
@@ -466,7 +458,7 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl,
#ifdef CONFIG_IP_ROUTE_MULTIPATH
static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2c4f20ec1e2a..26e666fe9a0e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6095,9 +6095,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
#ifdef CONFIG_SYSCTL
-static
-int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -6121,9 +6120,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
return ret;
}
-static
-int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct inet6_dev *idev = ctl->extra1;
int min_mtu = IPV6_MIN_MTU;
@@ -6193,9 +6191,8 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf)
return 0;
}
-static
-int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -6219,9 +6216,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
return ret;
}
-static
-int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int ret;
@@ -6262,7 +6258,7 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
}
static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = 0;
@@ -6324,7 +6320,7 @@ out:
}
static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int err;
@@ -6391,8 +6387,7 @@ out:
static
int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
- int write,
- void __user *buffer,
+ int write, void *buffer,
size_t *lenp,
loff_t *ppos)
{
@@ -6492,10 +6487,8 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val)
return 0;
}
-static
-int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 2d09c4da03ee..27f29b957ee7 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1835,7 +1835,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
}
}
-int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct net_device *dev = ctl->extra1;
struct inet6_dev *idev;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 803212aae4ca..3912aac7854d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6092,9 +6092,8 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
#ifdef CONFIG_SYSCTL
-static
-int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net;
int delay;
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 63b657aa8d29..fac2135aa47b 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -26,8 +26,7 @@ static int auto_flowlabels_min;
static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net;
int ret;
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 4701edffb1f7..a42e4ed5ab0e 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1362,8 +1362,7 @@ done:
(&((struct mpls_dev *)0)->field)
static int mpls_conf_proc(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int oval = *(int *)ctl->data;
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -2594,7 +2593,7 @@ nolabels:
}
static int mpls_platform_labels(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = table->data;
int platform_labels = net->mpls.platform_labels;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 8d14a1acbc37..412656c34f20 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1736,7 +1736,7 @@ static int three = 3;
static int
proc_do_defense_mode(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct netns_ipvs *ipvs = table->extra2;
int *valp = table->data;
@@ -1763,7 +1763,7 @@ proc_do_defense_mode(struct ctl_table *table, int write,
static int
proc_do_sync_threshold(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
int val[2];
@@ -1788,7 +1788,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write,
static int
proc_do_sync_ports(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
int val = *valp;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5a3e6c43ee68..6a26299cb064 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -519,7 +519,7 @@ static unsigned int nf_conntrack_htable_size_user __read_mostly;
static int
nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index bb25d4c794c7..6cb9f9474b05 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -414,7 +414,7 @@ static struct ctl_table nf_log_sysctl_ftable[] = {
};
static int nf_log_proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
const struct nf_logger *logger;
char buf[NFLOGGER_NAME_LEN];
diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c
index 251e750fd9aa..0d0bf41381c2 100644
--- a/net/phonet/sysctl.c
+++ b/net/phonet/sysctl.c
@@ -49,8 +49,7 @@ void phonet_get_local_port_range(int *min, int *max)
}
static int proc_local_port_range(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int range[2] = {local_port_range[0], local_port_range[1]};
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 66121bc6f34e..46782fac4c16 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -62,8 +62,7 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0);
static struct kmem_cache *rds_tcp_conn_slab;
static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *fpos);
+ void *buffer, size_t *lenp, loff_t *fpos);
static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
@@ -676,8 +675,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
}
static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *fpos)
+ void *buffer, size_t *lenp, loff_t *fpos)
{
struct net *net = current->nsproxy->net_ns;
int err;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 4740aa70e652..c16c80963e55 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -43,20 +43,15 @@ static unsigned long max_autoclose_max =
? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ;
static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
+static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
static int proc_sctp_do_auth(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
static struct ctl_table sctp_table[] = {
{
@@ -343,8 +338,7 @@ static struct ctl_table sctp_net_table[] = {
};
static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = current->nsproxy->net_ns;
struct ctl_table tbl;
@@ -389,8 +383,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write,
}
static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = current->nsproxy->net_ns;
unsigned int min = *(unsigned int *) ctl->extra1;
@@ -418,8 +411,7 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write,
}
static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = current->nsproxy->net_ns;
unsigned int min = *(unsigned int *) ctl->extra1;
@@ -447,8 +439,7 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write,
}
static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (write)
pr_warn_once("Changing rto_alpha or rto_beta may lead to "
@@ -458,8 +449,7 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write,
}
static int proc_sctp_do_auth(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = current->nsproxy->net_ns;
struct ctl_table tbl;
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index d75f17b56f0e..999eee1ed61c 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -60,7 +60,7 @@ rpc_unregister_sysctl(void)
}
static int proc_do_xprt(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char tmpbuf[256];
size_t len;
@@ -70,15 +70,15 @@ static int proc_do_xprt(struct ctl_table *table, int write,
return 0;
}
len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
- return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
+ return memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
}
static int
-proc_dodebug(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp,
+ loff_t *ppos)
{
- char tmpbuf[20], c, *s = NULL;
- char __user *p;
+ char tmpbuf[20], *s = NULL;
+ char *p;
unsigned int value;
size_t left, len;
@@ -90,18 +90,17 @@ proc_dodebug(struct ctl_table *table, int write,
left = *lenp;
if (write) {
- if (!access_ok(buffer, left))
- return -EFAULT;
p = buffer;
- while (left && __get_user(c, p) >= 0 && isspace(c))
- left--, p++;
+ while (left && isspace(*p)) {
+ left--;
+ p++;
+ }
if (!left)
goto done;
if (left > sizeof(tmpbuf) - 1)
return -EINVAL;
- if (copy_from_user(tmpbuf, p, left))
- return -EFAULT;
+ memcpy(tmpbuf, p, left);
tmpbuf[left] = '\0';
value = simple_strtol(tmpbuf, &s, 0);
@@ -121,11 +120,9 @@ proc_dodebug(struct ctl_table *table, int write,
len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data);
if (len > left)
len = left;
- if (copy_to_user(buffer, tmpbuf, len))
- return -EFAULT;
+ memcpy(buffer, tmpbuf, len);
if ((left -= len) > 0) {
- if (put_user('\n', (char __user *)buffer + len))
- return -EFAULT;
+ *((char *)buffer + len) = '\n';
left--;
}
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 97bca509a391..526da5d4710b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -80,8 +80,7 @@ atomic_t rdma_stat_sq_prod;
* current value.
*/
static int read_reset_stat(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
atomic_t *stat = (atomic_t *)table->data;
@@ -103,8 +102,8 @@ static int read_reset_stat(struct ctl_table *table, int write,
len -= *ppos;
if (len > *lenp)
len = *lenp;
- if (len && copy_to_user(buffer, str_buf, len))
- return -EFAULT;
+ if (len)
+ memcpy(buffer, str_buf, len);
*lenp = len;
*ppos += len;
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index c350108aa38d..f6e6609f70a3 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -322,7 +322,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
if (!xskq_cons_peek_desc(xs->tx, desc, umem))
continue;
- /* This is the backpreassure mechanism for the Tx path.
+ /* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
@@ -406,7 +406,7 @@ static int xsk_generic_xmit(struct sock *sk)
addr = desc.addr;
buffer = xdp_umem_get_data(xs->umem, addr);
err = skb_store_bits(skb, 0, buffer, len);
- /* This is the backpreassure mechanism for the Tx path.
+ /* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index b621ad74f54a..27e371b44dad 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -1696,7 +1696,7 @@ static int __init alloc_buffers(void)
#ifdef CONFIG_SYSCTL
static int apparmor_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!policy_admin_capable(NULL))
return -EPERM;
diff --git a/security/min_addr.c b/security/min_addr.c
index 94d2b0cf0e7b..88c9a6a21f47 100644
--- a/security/min_addr.c
+++ b/security/min_addr.c
@@ -30,7 +30,7 @@ static void update_mmap_min_addr(void)
* calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly
*/
int mmap_min_addr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 94dc346370b1..536c99646f6a 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -430,7 +430,7 @@ static struct security_hook_list yama_hooks[] __lsm_ro_after_init = {
#ifdef CONFIG_SYSCTL
static int yama_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table table_copy;
diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
index b04156cfd7a3..1fa755f55e0c 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
@@ -19,7 +19,7 @@ SYNOPSIS
FEATURE COMMANDS
================
-| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**macros** [**prefix** *PREFIX*]]
+| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]]
| **bpftool** **feature help**
|
| *COMPONENT* := { **kernel** | **dev** *NAME* }
@@ -49,6 +49,16 @@ DESCRIPTION
Keyword **kernel** can be omitted. If no probe target is
specified, probing the kernel is the default behaviour.
+ When the **unprivileged** keyword is used, bpftool will dump
+ only the features available to a user who does not have the
+ **CAP_SYS_ADMIN** capability set. The features available in
+ that case usually represent a small subset of the parameters
+ supported by the system. Unprivileged users MUST use the
+ **unprivileged** keyword: This is to avoid misdetection if
+ bpftool is inadvertently run as non-root, for example. This
+ keyword is unavailable if bpftool was compiled without
+ libcap.
+
**bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]]
Probe network device for supported eBPF features and dump
results to the console.
diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst
new file mode 100644
index 000000000000..ee6500d6e6e4
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst
@@ -0,0 +1,118 @@
+================
+bpftool-link
+================
+-------------------------------------------------------------------------------
+tool for inspection and simple manipulation of eBPF links
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+ **bpftool** [*OPTIONS*] **link** *COMMAND*
+
+ *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } }
+
+ *COMMANDS* := { **show** | **list** | **pin** | **help** }
+
+LINK COMMANDS
+=============
+
+| **bpftool** **link { show | list }** [*LINK*]
+| **bpftool** **link pin** *LINK* *FILE*
+| **bpftool** **link help**
+|
+| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* }
+
+
+DESCRIPTION
+===========
+ **bpftool link { show | list }** [*LINK*]
+ Show information about active links. If *LINK* is
+ specified show information only about given link,
+ otherwise list all links currently active on the system.
+
+ Output will start with link ID followed by link type and
+ zero or more named attributes, some of which depend on type
+ of link.
+
+ **bpftool link pin** *LINK* *FILE*
+ Pin link *LINK* as *FILE*.
+
+ Note: *FILE* must be located in *bpffs* mount. It must not
+ contain a dot character ('.'), which is reserved for future
+ extensions of *bpffs*.
+
+ **bpftool link help**
+ Print short help message.
+
+OPTIONS
+=======
+ -h, --help
+ Print short generic help message (similar to **bpftool help**).
+
+ -V, --version
+ Print version number (similar to **bpftool version**).
+
+ -j, --json
+ Generate JSON output. For commands that cannot produce JSON, this
+ option has no effect.
+
+ -p, --pretty
+ Generate human-readable JSON output. Implies **-j**.
+
+ -f, --bpffs
+ When showing BPF links, show file names of pinned
+ links.
+
+ -n, --nomount
+ Do not automatically attempt to mount any virtual file system
+ (such as tracefs or BPF virtual file system) when necessary.
+
+ -d, --debug
+ Print all logs available, even debug-level information. This
+ includes logs from libbpf.
+
+EXAMPLES
+========
+**# bpftool link show**
+
+::
+
+ 10: cgroup prog 25
+ cgroup_id 614 attach_type egress
+
+**# bpftool --json --pretty link show**
+
+::
+
+ [{
+ "type": "cgroup",
+ "prog_id": 25,
+ "cgroup_id": 614,
+ "attach_type": "egress"
+ }
+ ]
+
+|
+| **# bpftool link pin id 10 /sys/fs/bpf/link**
+| **# ls -l /sys/fs/bpf/**
+
+::
+
+ -rw------- 1 root root 0 Apr 23 21:39 link
+
+
+SEE ALSO
+========
+ **bpf**\ (2),
+ **bpf-helpers**\ (7),
+ **bpftool**\ (8),
+ **bpftool-prog\ (8),
+ **bpftool-map**\ (8),
+ **bpftool-cgroup**\ (8),
+ **bpftool-feature**\ (8),
+ **bpftool-net**\ (8),
+ **bpftool-perf**\ (8),
+ **bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index f584d1fdfc64..2759f9cc3289 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -55,16 +55,15 @@ ifneq ($(EXTRA_LDFLAGS),)
LDFLAGS += $(EXTRA_LDFLAGS)
endif
-LIBS = $(LIBBPF) -lelf -lz
-
INSTALL ?= install
RM ?= rm -f
CLANG ?= clang
FEATURE_USER = .bpftool
-FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib \
+FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib libcap \
+ clang-bpf-global-var
+FEATURE_DISPLAY = libbfd disassembler-four-args zlib libcap \
clang-bpf-global-var
-FEATURE_DISPLAY = libbfd disassembler-four-args zlib clang-bpf-global-var
check_feat := 1
NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall
@@ -90,6 +89,12 @@ ifeq ($(feature-reallocarray), 0)
CFLAGS += -DCOMPAT_NEED_REALLOCARRAY
endif
+LIBS = $(LIBBPF) -lelf -lz
+ifeq ($(feature-libcap), 1)
+CFLAGS += -DUSE_LIBCAP
+LIBS += -lcap
+endif
+
include $(wildcard $(OUTPUT)*.d)
all: $(OUTPUT)bpftool
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 45ee99b159e2..fc989ead7313 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -98,6 +98,12 @@ _bpftool_get_btf_ids()
command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
}
+_bpftool_get_link_ids()
+{
+ COMPREPLY+=( $( compgen -W "$( bpftool -jp link 2>&1 | \
+ command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
+}
+
_bpftool_get_obj_map_names()
{
local obj
@@ -1073,7 +1079,7 @@ _bpftool()
COMPREPLY+=( $( compgen -W 'macros' -- "$cur" ) )
fi
_bpftool_one_of_list 'kernel dev'
- _bpftool_once_attr 'full'
+ _bpftool_once_attr 'full unprivileged'
return 0
;;
*)
@@ -1082,6 +1088,39 @@ _bpftool()
;;
esac
;;
+ link)
+ case $command in
+ show|list|pin)
+ case $prev in
+ id)
+ _bpftool_get_link_ids
+ return 0
+ ;;
+ esac
+ ;;
+ esac
+
+ local LINK_TYPE='id pinned'
+ case $command in
+ show|list)
+ [[ $prev != "$command" ]] && return 0
+ COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) )
+ return 0
+ ;;
+ pin)
+ if [[ $prev == "$command" ]]; then
+ COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) )
+ else
+ _filedir
+ fi
+ return 0
+ ;;
+ *)
+ [[ $prev == $object ]] && \
+ COMPREPLY=( $( compgen -W 'help pin show list' -- "$cur" ) )
+ ;;
+ esac
+ ;;
esac
} &&
complete -F _bpftool bpftool
diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index bcaf55b59498..41a1346934a1 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -15,7 +15,6 @@
#include <linux/hashtable.h>
#include <sys/types.h>
#include <sys/stat.h>
-#include <unistd.h>
#include "json_writer.h"
#include "main.h"
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 62c6a1d7cd18..1693c802bb20 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -31,42 +31,20 @@
static unsigned int query_flags;
-static const char * const attach_type_strings[] = {
- [BPF_CGROUP_INET_INGRESS] = "ingress",
- [BPF_CGROUP_INET_EGRESS] = "egress",
- [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
- [BPF_CGROUP_SOCK_OPS] = "sock_ops",
- [BPF_CGROUP_DEVICE] = "device",
- [BPF_CGROUP_INET4_BIND] = "bind4",
- [BPF_CGROUP_INET6_BIND] = "bind6",
- [BPF_CGROUP_INET4_CONNECT] = "connect4",
- [BPF_CGROUP_INET6_CONNECT] = "connect6",
- [BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
- [BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
- [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
- [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
- [BPF_CGROUP_SYSCTL] = "sysctl",
- [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4",
- [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6",
- [BPF_CGROUP_GETSOCKOPT] = "getsockopt",
- [BPF_CGROUP_SETSOCKOPT] = "setsockopt",
- [__MAX_BPF_ATTACH_TYPE] = NULL,
-};
-
static enum bpf_attach_type parse_attach_type(const char *str)
{
enum bpf_attach_type type;
for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) {
- if (attach_type_strings[type] &&
- is_prefix(str, attach_type_strings[type]))
+ if (attach_type_name[type] &&
+ is_prefix(str, attach_type_name[type]))
return type;
}
return __MAX_BPF_ATTACH_TYPE;
}
-static int show_bpf_prog(int id, const char *attach_type_str,
+static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
const char *attach_flags_str,
int level)
{
@@ -86,18 +64,22 @@ static int show_bpf_prog(int id, const char *attach_type_str,
if (json_output) {
jsonw_start_object(json_wtr);
jsonw_uint_field(json_wtr, "id", info.id);
- jsonw_string_field(json_wtr, "attach_type",
- attach_type_str);
+ if (attach_type < ARRAY_SIZE(attach_type_name))
+ jsonw_string_field(json_wtr, "attach_type",
+ attach_type_name[attach_type]);
+ else
+ jsonw_uint_field(json_wtr, "attach_type", attach_type);
jsonw_string_field(json_wtr, "attach_flags",
attach_flags_str);
jsonw_string_field(json_wtr, "name", info.name);
jsonw_end_object(json_wtr);
} else {
- printf("%s%-8u %-15s %-15s %-15s\n", level ? " " : "",
- info.id,
- attach_type_str,
- attach_flags_str,
- info.name);
+ printf("%s%-8u ", level ? " " : "", info.id);
+ if (attach_type < ARRAY_SIZE(attach_type_name))
+ printf("%-15s", attach_type_name[attach_type]);
+ else
+ printf("type %-10u", attach_type);
+ printf(" %-15s %-15s\n", attach_flags_str, info.name);
}
close(prog_fd);
@@ -171,7 +153,7 @@ static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
}
for (iter = 0; iter < prog_cnt; iter++)
- show_bpf_prog(prog_ids[iter], attach_type_strings[type],
+ show_bpf_prog(prog_ids[iter], type,
attach_flags_str, level);
return 0;
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index f2223dbdfb0a..c47bdc65de8e 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -262,6 +262,8 @@ int get_fd_type(int fd)
return BPF_OBJ_MAP;
else if (strstr(buf, "bpf-prog"))
return BPF_OBJ_PROG;
+ else if (strstr(buf, "bpf-link"))
+ return BPF_OBJ_LINK;
return BPF_OBJ_UNKNOWN;
}
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index 88718ee6a438..f54347f55ee0 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -6,6 +6,9 @@
#include <string.h>
#include <unistd.h>
#include <net/if.h>
+#ifdef USE_LIBCAP
+#include <sys/capability.h>
+#endif
#include <sys/utsname.h>
#include <sys/vfs.h>
@@ -35,6 +38,11 @@ static const char * const helper_name[] = {
#undef BPF_HELPER_MAKE_ENTRY
+static bool full_mode;
+#ifdef USE_LIBCAP
+static bool run_as_unprivileged;
+#endif
+
/* Miscellaneous utility functions */
static bool check_procfs(void)
@@ -471,6 +479,13 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types,
}
res = bpf_probe_prog_type(prog_type, ifindex);
+#ifdef USE_LIBCAP
+ /* Probe may succeed even if program load fails, for unprivileged users
+ * check that we did not fail because of insufficient permissions
+ */
+ if (run_as_unprivileged && errno == EPERM)
+ res = false;
+#endif
supported_types[prog_type] |= res;
@@ -499,6 +514,10 @@ probe_map_type(enum bpf_map_type map_type, const char *define_prefix,
res = bpf_probe_map_type(map_type, ifindex);
+ /* Probe result depends on the success of map creation, no additional
+ * check required for unprivileged users
+ */
+
maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1;
if (strlen(map_type_name[map_type]) > maxlen) {
p_info("map type name too long");
@@ -518,12 +537,19 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
const char *define_prefix, unsigned int id,
const char *ptype_name, __u32 ifindex)
{
- bool res;
+ bool res = false;
- if (!supported_type)
- res = false;
- else
+ if (supported_type) {
res = bpf_probe_helper(id, prog_type, ifindex);
+#ifdef USE_LIBCAP
+ /* Probe may succeed even if program load fails, for
+ * unprivileged users check that we did not fail because of
+ * insufficient permissions
+ */
+ if (run_as_unprivileged && errno == EPERM)
+ res = false;
+#endif
+ }
if (json_output) {
if (res)
@@ -540,8 +566,7 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
static void
probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
- const char *define_prefix, bool full_mode,
- __u32 ifindex)
+ const char *define_prefix, __u32 ifindex)
{
const char *ptype_name = prog_type_name[prog_type];
char feat_name[128];
@@ -678,8 +703,7 @@ static void section_map_types(const char *define_prefix, __u32 ifindex)
}
static void
-section_helpers(bool *supported_types, const char *define_prefix,
- bool full_mode, __u32 ifindex)
+section_helpers(bool *supported_types, const char *define_prefix, __u32 ifindex)
{
unsigned int i;
@@ -704,8 +728,8 @@ section_helpers(bool *supported_types, const char *define_prefix,
define_prefix, define_prefix, define_prefix,
define_prefix);
for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++)
- probe_helpers_for_progtype(i, supported_types[i],
- define_prefix, full_mode, ifindex);
+ probe_helpers_for_progtype(i, supported_types[i], define_prefix,
+ ifindex);
print_end_section();
}
@@ -720,23 +744,86 @@ static void section_misc(const char *define_prefix, __u32 ifindex)
print_end_section();
}
-static int do_probe(int argc, char **argv)
+static int handle_perms(void)
{
- enum probe_component target = COMPONENT_UNSPEC;
- const char *define_prefix = NULL;
- bool supported_types[128] = {};
- bool full_mode = false;
- __u32 ifindex = 0;
- char *ifname;
+#ifdef USE_LIBCAP
+ cap_value_t cap_list[1] = { CAP_SYS_ADMIN };
+ bool has_sys_admin_cap = false;
+ cap_flag_value_t val;
+ int res = -1;
+ cap_t caps;
+
+ caps = cap_get_proc();
+ if (!caps) {
+ p_err("failed to get capabilities for process: %s",
+ strerror(errno));
+ return -1;
+ }
+
+ if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &val)) {
+ p_err("bug: failed to retrieve CAP_SYS_ADMIN status");
+ goto exit_free;
+ }
+ if (val == CAP_SET)
+ has_sys_admin_cap = true;
+
+ if (!run_as_unprivileged && !has_sys_admin_cap) {
+ p_err("full feature probing requires CAP_SYS_ADMIN, run as root or use 'unprivileged'");
+ goto exit_free;
+ }
+
+ if ((run_as_unprivileged && !has_sys_admin_cap) ||
+ (!run_as_unprivileged && has_sys_admin_cap)) {
+ /* We are all good, exit now */
+ res = 0;
+ goto exit_free;
+ }
+ /* if (run_as_unprivileged && has_sys_admin_cap), drop CAP_SYS_ADMIN */
+
+ if (cap_set_flag(caps, CAP_EFFECTIVE, ARRAY_SIZE(cap_list), cap_list,
+ CAP_CLEAR)) {
+ p_err("bug: failed to clear CAP_SYS_ADMIN from capabilities");
+ goto exit_free;
+ }
+
+ if (cap_set_proc(caps)) {
+ p_err("failed to drop CAP_SYS_ADMIN: %s", strerror(errno));
+ goto exit_free;
+ }
+
+ res = 0;
+
+exit_free:
+ if (cap_free(caps) && !res) {
+ p_err("failed to clear storage object for capabilities: %s",
+ strerror(errno));
+ res = -1;
+ }
+
+ return res;
+#else
/* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN).
- * Let's approximate, and restrict usage to root user only.
+ * We do not use libpcap so let's approximate, and restrict usage to
+ * root user only.
*/
if (geteuid()) {
- p_err("please run this command as root user");
+ p_err("full feature probing requires root privileges");
return -1;
}
+ return 0;
+#endif /* USE_LIBCAP */
+}
+
+static int do_probe(int argc, char **argv)
+{
+ enum probe_component target = COMPONENT_UNSPEC;
+ const char *define_prefix = NULL;
+ bool supported_types[128] = {};
+ __u32 ifindex = 0;
+ char *ifname;
+
set_max_rlimit();
while (argc) {
@@ -785,6 +872,14 @@ static int do_probe(int argc, char **argv)
if (!REQ_ARGS(1))
return -1;
define_prefix = GET_ARG();
+ } else if (is_prefix(*argv, "unprivileged")) {
+#ifdef USE_LIBCAP
+ run_as_unprivileged = true;
+ NEXT_ARG();
+#else
+ p_err("unprivileged run not supported, recompile bpftool with libcap");
+ return -1;
+#endif
} else {
p_err("expected no more arguments, 'kernel', 'dev', 'macros' or 'prefix', got: '%s'?",
*argv);
@@ -792,6 +887,12 @@ static int do_probe(int argc, char **argv)
}
}
+ /* Full feature detection requires CAP_SYS_ADMIN privilege.
+ * Let's approximate, and warn if user is not root.
+ */
+ if (handle_perms())
+ return -1;
+
if (json_output) {
define_prefix = NULL;
jsonw_start_object(json_wtr);
@@ -803,7 +904,7 @@ static int do_probe(int argc, char **argv)
goto exit_close_json;
section_program_types(supported_types, define_prefix, ifindex);
section_map_types(define_prefix, ifindex);
- section_helpers(supported_types, define_prefix, full_mode, ifindex);
+ section_helpers(supported_types, define_prefix, ifindex);
section_misc(define_prefix, ifindex);
exit_close_json:
@@ -822,7 +923,7 @@ static int do_help(int argc, char **argv)
}
fprintf(stderr,
- "Usage: %s %s probe [COMPONENT] [full] [macros [prefix PREFIX]]\n"
+ "Usage: %s %s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n"
" %s %s help\n"
"\n"
" COMPONENT := { kernel | dev NAME }\n"
diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index f8113b3646f5..0e5f0236cc76 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -17,7 +17,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
-#include <unistd.h>
#include <bpf/btf.h>
#include "bpf/libbpf_internal.h"
diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c
index f7f5885aa3ba..e7e7eee9f172 100644
--- a/tools/bpf/bpftool/jit_disasm.c
+++ b/tools/bpf/bpftool/jit_disasm.c
@@ -15,7 +15,6 @@
#include <stdio.h>
#include <stdarg.h>
#include <stdint.h>
-#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c
new file mode 100644
index 000000000000..adc7dc431ed8
--- /dev/null
+++ b/tools/bpf/bpftool/link.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2020 Facebook */
+
+#include <errno.h>
+#include <net/if.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <bpf/bpf.h>
+
+#include "json_writer.h"
+#include "main.h"
+
+static const char * const link_type_name[] = {
+ [BPF_LINK_TYPE_UNSPEC] = "unspec",
+ [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint",
+ [BPF_LINK_TYPE_TRACING] = "tracing",
+ [BPF_LINK_TYPE_CGROUP] = "cgroup",
+};
+
+static int link_parse_fd(int *argc, char ***argv)
+{
+ if (is_prefix(**argv, "id")) {
+ unsigned int id;
+ char *endptr;
+
+ NEXT_ARGP();
+
+ id = strtoul(**argv, &endptr, 0);
+ if (*endptr) {
+ p_err("can't parse %s as ID", **argv);
+ return -1;
+ }
+ NEXT_ARGP();
+
+ return bpf_link_get_fd_by_id(id);
+ } else if (is_prefix(**argv, "pinned")) {
+ char *path;
+
+ NEXT_ARGP();
+
+ path = **argv;
+ NEXT_ARGP();
+
+ return open_obj_pinned_any(path, BPF_OBJ_LINK);
+ }
+
+ p_err("expected 'id' or 'pinned', got: '%s'?", **argv);
+ return -1;
+}
+
+static void
+show_link_header_json(struct bpf_link_info *info, json_writer_t *wtr)
+{
+ jsonw_uint_field(wtr, "id", info->id);
+ if (info->type < ARRAY_SIZE(link_type_name))
+ jsonw_string_field(wtr, "type", link_type_name[info->type]);
+ else
+ jsonw_uint_field(wtr, "type", info->type);
+
+ jsonw_uint_field(json_wtr, "prog_id", info->prog_id);
+}
+
+static int get_prog_info(int prog_id, struct bpf_prog_info *info)
+{
+ __u32 len = sizeof(*info);
+ int err, prog_fd;
+
+ prog_fd = bpf_prog_get_fd_by_id(prog_id);
+ if (prog_fd < 0)
+ return prog_fd;
+
+ memset(info, 0, sizeof(*info));
+ err = bpf_obj_get_info_by_fd(prog_fd, info, &len);
+ if (err)
+ p_err("can't get prog info: %s", strerror(errno));
+ close(prog_fd);
+ return err;
+}
+
+static int show_link_close_json(int fd, struct bpf_link_info *info)
+{
+ struct bpf_prog_info prog_info;
+ int err;
+
+ jsonw_start_object(json_wtr);
+
+ show_link_header_json(info, json_wtr);
+
+ switch (info->type) {
+ case BPF_LINK_TYPE_RAW_TRACEPOINT:
+ jsonw_string_field(json_wtr, "tp_name",
+ (const char *)info->raw_tracepoint.tp_name);
+ break;
+ case BPF_LINK_TYPE_TRACING:
+ err = get_prog_info(info->prog_id, &prog_info);
+ if (err)
+ return err;
+
+ if (prog_info.type < ARRAY_SIZE(prog_type_name))
+ jsonw_string_field(json_wtr, "prog_type",
+ prog_type_name[prog_info.type]);
+ else
+ jsonw_uint_field(json_wtr, "prog_type",
+ prog_info.type);
+
+ if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name))
+ jsonw_string_field(json_wtr, "attach_type",
+ attach_type_name[info->tracing.attach_type]);
+ else
+ jsonw_uint_field(json_wtr, "attach_type",
+ info->tracing.attach_type);
+ break;
+ case BPF_LINK_TYPE_CGROUP:
+ jsonw_lluint_field(json_wtr, "cgroup_id",
+ info->cgroup.cgroup_id);
+ if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name))
+ jsonw_string_field(json_wtr, "attach_type",
+ attach_type_name[info->cgroup.attach_type]);
+ else
+ jsonw_uint_field(json_wtr, "attach_type",
+ info->cgroup.attach_type);
+ break;
+ default:
+ break;
+ }
+
+ if (!hash_empty(link_table.table)) {
+ struct pinned_obj *obj;
+
+ jsonw_name(json_wtr, "pinned");
+ jsonw_start_array(json_wtr);
+ hash_for_each_possible(link_table.table, obj, hash, info->id) {
+ if (obj->id == info->id)
+ jsonw_string(json_wtr, obj->path);
+ }
+ jsonw_end_array(json_wtr);
+ }
+ jsonw_end_object(json_wtr);
+
+ return 0;
+}
+
+static void show_link_header_plain(struct bpf_link_info *info)
+{
+ printf("%u: ", info->id);
+ if (info->type < ARRAY_SIZE(link_type_name))
+ printf("%s ", link_type_name[info->type]);
+ else
+ printf("type %u ", info->type);
+
+ printf("prog %u ", info->prog_id);
+}
+
+static int show_link_close_plain(int fd, struct bpf_link_info *info)
+{
+ struct bpf_prog_info prog_info;
+ int err;
+
+ show_link_header_plain(info);
+
+ switch (info->type) {
+ case BPF_LINK_TYPE_RAW_TRACEPOINT:
+ printf("\n\ttp '%s' ",
+ (const char *)info->raw_tracepoint.tp_name);
+ break;
+ case BPF_LINK_TYPE_TRACING:
+ err = get_prog_info(info->prog_id, &prog_info);
+ if (err)
+ return err;
+
+ if (prog_info.type < ARRAY_SIZE(prog_type_name))
+ printf("\n\tprog_type %s ",
+ prog_type_name[prog_info.type]);
+ else
+ printf("\n\tprog_type %u ", prog_info.type);
+
+ if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name))
+ printf("attach_type %s ",
+ attach_type_name[info->tracing.attach_type]);
+ else
+ printf("attach_type %u ", info->tracing.attach_type);
+ break;
+ case BPF_LINK_TYPE_CGROUP:
+ printf("\n\tcgroup_id %zu ", (size_t)info->cgroup.cgroup_id);
+ if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name))
+ printf("attach_type %s ",
+ attach_type_name[info->cgroup.attach_type]);
+ else
+ printf("attach_type %u ", info->cgroup.attach_type);
+ break;
+ default:
+ break;
+ }
+
+ if (!hash_empty(link_table.table)) {
+ struct pinned_obj *obj;
+
+ hash_for_each_possible(link_table.table, obj, hash, info->id) {
+ if (obj->id == info->id)
+ printf("\n\tpinned %s", obj->path);
+ }
+ }
+
+ printf("\n");
+
+ return 0;
+}
+
+static int do_show_link(int fd)
+{
+ struct bpf_link_info info;
+ __u32 len = sizeof(info);
+ char raw_tp_name[256];
+ int err;
+
+ memset(&info, 0, sizeof(info));
+again:
+ err = bpf_obj_get_info_by_fd(fd, &info, &len);
+ if (err) {
+ p_err("can't get link info: %s",
+ strerror(errno));
+ close(fd);
+ return err;
+ }
+ if (info.type == BPF_LINK_TYPE_RAW_TRACEPOINT &&
+ !info.raw_tracepoint.tp_name) {
+ info.raw_tracepoint.tp_name = (unsigned long)&raw_tp_name;
+ info.raw_tracepoint.tp_name_len = sizeof(raw_tp_name);
+ goto again;
+ }
+
+ if (json_output)
+ show_link_close_json(fd, &info);
+ else
+ show_link_close_plain(fd, &info);
+
+ close(fd);
+ return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+ __u32 id = 0;
+ int err, fd;
+
+ if (show_pinned)
+ build_pinned_obj_table(&link_table, BPF_OBJ_LINK);
+
+ if (argc == 2) {
+ fd = link_parse_fd(&argc, &argv);
+ if (fd < 0)
+ return fd;
+ return do_show_link(fd);
+ }
+
+ if (argc)
+ return BAD_ARG();
+
+ if (json_output)
+ jsonw_start_array(json_wtr);
+ while (true) {
+ err = bpf_link_get_next_id(id, &id);
+ if (err) {
+ if (errno == ENOENT)
+ break;
+ p_err("can't get next link: %s%s", strerror(errno),
+ errno == EINVAL ? " -- kernel too old?" : "");
+ break;
+ }
+
+ fd = bpf_link_get_fd_by_id(id);
+ if (fd < 0) {
+ if (errno == ENOENT)
+ continue;
+ p_err("can't get link by id (%u): %s",
+ id, strerror(errno));
+ break;
+ }
+
+ err = do_show_link(fd);
+ if (err)
+ break;
+ }
+ if (json_output)
+ jsonw_end_array(json_wtr);
+
+ return errno == ENOENT ? 0 : -1;
+}
+
+static int do_pin(int argc, char **argv)
+{
+ int err;
+
+ err = do_pin_any(argc, argv, link_parse_fd);
+ if (!err && json_output)
+ jsonw_null(json_wtr);
+ return err;
+}
+
+static int do_help(int argc, char **argv)
+{
+ if (json_output) {
+ jsonw_null(json_wtr);
+ return 0;
+ }
+
+ fprintf(stderr,
+ "Usage: %1$s %2$s { show | list } [LINK]\n"
+ " %1$s %2$s pin LINK FILE\n"
+ " %1$s %2$s help\n"
+ "\n"
+ " " HELP_SPEC_LINK "\n"
+ " " HELP_SPEC_PROGRAM "\n"
+ " " HELP_SPEC_OPTIONS "\n"
+ "",
+ bin_name, argv[-2]);
+
+ return 0;
+}
+
+static const struct cmd cmds[] = {
+ { "show", do_show },
+ { "list", do_show },
+ { "help", do_help },
+ { "pin", do_pin },
+ { 0 }
+};
+
+int do_link(int argc, char **argv)
+{
+ return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 466c269eabdd..1413a154806e 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -30,6 +30,7 @@ bool verifier_logs;
bool relaxed_maps;
struct pinned_obj_table prog_table;
struct pinned_obj_table map_table;
+struct pinned_obj_table link_table;
static void __noreturn clean_and_exit(int i)
{
@@ -58,7 +59,7 @@ static int do_help(int argc, char **argv)
" %s batch file FILE\n"
" %s version\n"
"\n"
- " OBJECT := { prog | map | cgroup | perf | net | feature | btf | gen | struct_ops }\n"
+ " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, bin_name, bin_name);
@@ -215,6 +216,7 @@ static const struct cmd cmds[] = {
{ "batch", do_batch },
{ "prog", do_prog },
{ "map", do_map },
+ { "link", do_link },
{ "cgroup", do_cgroup },
{ "perf", do_perf },
{ "net", do_net },
@@ -364,6 +366,7 @@ int main(int argc, char **argv)
hash_init(prog_table.table);
hash_init(map_table.table);
+ hash_init(link_table.table);
opterr = 0;
while ((opt = getopt_long(argc, argv, "Vhpjfmnd",
@@ -422,6 +425,7 @@ int main(int argc, char **argv)
if (show_pinned) {
delete_pinned_obj_table(&prog_table);
delete_pinned_obj_table(&map_table);
+ delete_pinned_obj_table(&link_table);
}
return ret;
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 86f14ce26fd7..9b1fb81a8331 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -50,6 +50,8 @@
"\t {-m|--mapcompat} | {-n|--nomount} }"
#define HELP_SPEC_MAP \
"MAP := { id MAP_ID | pinned FILE | name MAP_NAME }"
+#define HELP_SPEC_LINK \
+ "LINK := { id LINK_ID | pinned FILE }"
static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_UNSPEC] = "unspec",
@@ -83,6 +85,38 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_EXT] = "ext",
};
+static const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
+ [BPF_CGROUP_INET_INGRESS] = "ingress",
+ [BPF_CGROUP_INET_EGRESS] = "egress",
+ [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
+ [BPF_CGROUP_SOCK_OPS] = "sock_ops",
+ [BPF_CGROUP_DEVICE] = "device",
+ [BPF_CGROUP_INET4_BIND] = "bind4",
+ [BPF_CGROUP_INET6_BIND] = "bind6",
+ [BPF_CGROUP_INET4_CONNECT] = "connect4",
+ [BPF_CGROUP_INET6_CONNECT] = "connect6",
+ [BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
+ [BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
+ [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
+ [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
+ [BPF_CGROUP_SYSCTL] = "sysctl",
+ [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4",
+ [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6",
+ [BPF_CGROUP_GETSOCKOPT] = "getsockopt",
+ [BPF_CGROUP_SETSOCKOPT] = "setsockopt",
+
+ [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser",
+ [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict",
+ [BPF_SK_MSG_VERDICT] = "sk_msg_verdict",
+ [BPF_LIRC_MODE2] = "lirc_mode2",
+ [BPF_FLOW_DISSECTOR] = "flow_dissector",
+ [BPF_TRACE_RAW_TP] = "raw_tp",
+ [BPF_TRACE_FENTRY] = "fentry",
+ [BPF_TRACE_FEXIT] = "fexit",
+ [BPF_MODIFY_RETURN] = "mod_ret",
+ [BPF_LSM_MAC] = "lsm_mac",
+};
+
extern const char * const map_type_name[];
extern const size_t map_type_name_size;
@@ -90,6 +124,7 @@ enum bpf_obj_type {
BPF_OBJ_UNKNOWN,
BPF_OBJ_PROG,
BPF_OBJ_MAP,
+ BPF_OBJ_LINK,
};
extern const char *bin_name;
@@ -102,6 +137,7 @@ extern bool verifier_logs;
extern bool relaxed_maps;
extern struct pinned_obj_table prog_table;
extern struct pinned_obj_table map_table;
+extern struct pinned_obj_table link_table;
void __printf(1, 2) p_err(const char *fmt, ...);
void __printf(1, 2) p_info(const char *fmt, ...);
@@ -153,6 +189,7 @@ int do_pin_fd(int fd, const char *name);
int do_prog(int argc, char **arg);
int do_map(int argc, char **arg);
+int do_link(int argc, char **arg);
int do_event_pipe(int argc, char **argv);
int do_cgroup(int argc, char **arg);
int do_perf(int argc, char **arg);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7bbf1b65be10..b3643e27e264 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -113,6 +113,9 @@ enum bpf_cmd {
BPF_MAP_DELETE_BATCH,
BPF_LINK_CREATE,
BPF_LINK_UPDATE,
+ BPF_LINK_GET_FD_BY_ID,
+ BPF_LINK_GET_NEXT_ID,
+ BPF_ENABLE_STATS,
};
enum bpf_map_type {
@@ -220,6 +223,15 @@ enum bpf_attach_type {
#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+enum bpf_link_type {
+ BPF_LINK_TYPE_UNSPEC = 0,
+ BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
+ BPF_LINK_TYPE_TRACING = 2,
+ BPF_LINK_TYPE_CGROUP = 3,
+
+ MAX_BPF_LINK_TYPE,
+};
+
/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
*
* NONE(default): No further bpf programs allowed in the subtree.
@@ -379,6 +391,12 @@ enum {
*/
#define BPF_F_QUERY_EFFECTIVE (1U << 0)
+/* type for BPF_ENABLE_STATS */
+enum bpf_stats_type {
+ /* enabled run_time_ns and run_cnt */
+ BPF_STATS_RUN_TIME = 0,
+};
+
enum bpf_stack_build_id_status {
/* user space need an empty entry to identify end of a trace */
BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -523,6 +541,7 @@ union bpf_attr {
__u32 prog_id;
__u32 map_id;
__u32 btf_id;
+ __u32 link_id;
};
__u32 next_id;
__u32 open_flags;
@@ -589,6 +608,10 @@ union bpf_attr {
__u32 old_prog_fd;
} link_update;
+ struct { /* struct used by BPF_ENABLE_STATS command */
+ __u32 type;
+ } enable_stats;
+
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@@ -652,6 +675,8 @@ union bpf_attr {
* u64 bpf_ktime_get_ns(void)
* Description
* Return the time elapsed since system boot, in nanoseconds.
+ * Does not include time the system was suspended.
+ * See: clock_gettime(CLOCK_MONOTONIC)
* Return
* Current *ktime*.
*
@@ -1562,7 +1587,7 @@ union bpf_attr {
* Return
* 0
*
- * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen)
+ * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
* Description
* Emulate a call to **setsockopt()** on the socket associated to
* *bpf_socket*, which must be a full socket. The *level* at
@@ -1570,6 +1595,11 @@ union bpf_attr {
* must be specified, see **setsockopt(2)** for more information.
* The option value of length *optlen* is pointed by *optval*.
*
+ * *bpf_socket* should be one of the following:
+ * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * and **BPF_CGROUP_INET6_CONNECT**.
+ *
* This helper actually implements a subset of **setsockopt()**.
* It supports the following *level*\ s:
*
@@ -1764,7 +1794,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen)
+ * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
* Description
* Emulate a call to **getsockopt()** on the socket associated to
* *bpf_socket*, which must be a full socket. The *level* at
@@ -1773,6 +1803,11 @@ union bpf_attr {
* The retrieved value is stored in the structure pointed by
* *opval* and of length *optlen*.
*
+ * *bpf_socket* should be one of the following:
+ * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * and **BPF_CGROUP_INET6_CONNECT**.
+ *
* This helper actually implements a subset of **getsockopt()**.
* It supports the following *level*\ s:
*
@@ -3025,6 +3060,14 @@ union bpf_attr {
* * **-EOPNOTSUPP** Unsupported operation, for example a
* call from outside of TC ingress.
* * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
+ *
+ * u64 bpf_ktime_get_boot_ns(void)
+ * Description
+ * Return the time elapsed since system boot, in nanoseconds.
+ * Does include the time the system was suspended.
+ * See: clock_gettime(CLOCK_BOOTTIME)
+ * Return
+ * Current *ktime*.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3151,7 +3194,8 @@ union bpf_attr {
FN(xdp_output), \
FN(get_netns_cookie), \
FN(get_current_ancestor_cgroup_id), \
- FN(sk_assign),
+ FN(sk_assign), \
+ FN(ktime_get_boot_ns),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -3598,6 +3642,25 @@ struct bpf_btf_info {
__u32 id;
} __attribute__((aligned(8)));
+struct bpf_link_info {
+ __u32 type;
+ __u32 id;
+ __u32 prog_id;
+ union {
+ struct {
+ __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */
+ __u32 tp_name_len; /* in/out: tp_name buffer len */
+ } raw_tracepoint;
+ struct {
+ __u32 attach_type;
+ } tracing;
+ struct {
+ __u64 cgroup_id;
+ __u32 attach_type;
+ } cgroup;
+ };
+} __attribute__((aligned(8)));
+
/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
* by user and intended to be used by socket (e.g. to bind to, depends on
* attach attach type).
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 5cc1b0785d18..43322f0d6c7f 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -721,6 +721,11 @@ int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id)
return bpf_obj_get_next_id(start_id, next_id, BPF_BTF_GET_NEXT_ID);
}
+int bpf_link_get_next_id(__u32 start_id, __u32 *next_id)
+{
+ return bpf_obj_get_next_id(start_id, next_id, BPF_LINK_GET_NEXT_ID);
+}
+
int bpf_prog_get_fd_by_id(__u32 id)
{
union bpf_attr attr;
@@ -751,13 +756,23 @@ int bpf_btf_get_fd_by_id(__u32 id)
return sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr));
}
-int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len)
+int bpf_link_get_fd_by_id(__u32 id)
+{
+ union bpf_attr attr;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.link_id = id;
+
+ return sys_bpf(BPF_LINK_GET_FD_BY_ID, &attr, sizeof(attr));
+}
+
+int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len)
{
union bpf_attr attr;
int err;
memset(&attr, 0, sizeof(attr));
- attr.info.bpf_fd = prog_fd;
+ attr.info.bpf_fd = bpf_fd;
attr.info.info_len = *info_len;
attr.info.info = ptr_to_u64(info);
@@ -826,3 +841,13 @@ int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
return err;
}
+
+int bpf_enable_stats(enum bpf_stats_type type)
+{
+ union bpf_attr attr;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.enable_stats.type = type;
+
+ return sys_bpf(BPF_ENABLE_STATS, &attr, sizeof(attr));
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 46d47afdd887..1901b2777854 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -216,10 +216,12 @@ LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data,
LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id);
LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id);
LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id);
+LIBBPF_API int bpf_link_get_next_id(__u32 start_id, __u32 *next_id);
LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id);
LIBBPF_API int bpf_map_get_fd_by_id(__u32 id);
LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id);
-LIBBPF_API int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len);
+LIBBPF_API int bpf_link_get_fd_by_id(__u32 id);
+LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len);
LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type,
__u32 query_flags, __u32 *attach_flags,
__u32 *prog_ids, __u32 *prog_cnt);
@@ -229,6 +231,7 @@ LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf,
LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf,
__u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
__u64 *probe_offset, __u64 *probe_addr);
+LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type);
#ifdef __cplusplus
} /* extern "C" */
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index f69cc208778a..da00b87aa199 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -2,10 +2,17 @@
#ifndef __BPF_HELPERS__
#define __BPF_HELPERS__
+/*
+ * Note that bpf programs need to include either
+ * vmlinux.h (auto-generated from BTF) or linux/types.h
+ * in advance since bpf_helper_defs.h uses such types
+ * as __u64.
+ */
#include "bpf_helper_defs.h"
#define __uint(name, val) int (*name)[val]
#define __type(name, val) typeof(val) *name
+#define __array(name, val) typeof(val) *name[]
/* Helper macro to print out debug messages */
#define bpf_printk(fmt, ...) \
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 0c28ee82834b..de07e559a11d 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -658,7 +658,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
if (!btf_dump_is_blacklisted(d, id)) {
btf_dump_emit_typedef_def(d, id, t, 0);
btf_dump_printf(d, ";\n\n");
- };
+ }
tstate->fwd_emitted = 1;
break;
default:
diff --git a/tools/lib/bpf/hashmap.c b/tools/lib/bpf/hashmap.c
index 54c30c802070..cffb96202e0d 100644
--- a/tools/lib/bpf/hashmap.c
+++ b/tools/lib/bpf/hashmap.c
@@ -59,7 +59,14 @@ struct hashmap *hashmap__new(hashmap_hash_fn hash_fn,
void hashmap__clear(struct hashmap *map)
{
+ struct hashmap_entry *cur, *tmp;
+ int bkt;
+
+ hashmap__for_each_entry_safe(map, cur, tmp, bkt) {
+ free(cur);
+ }
free(map->buckets);
+ map->buckets = NULL;
map->cap = map->cap_bits = map->sz = 0;
}
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 8f480e29a6b0..977add1b73e2 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -310,6 +310,7 @@ struct bpf_map {
int map_ifindex;
int inner_map_fd;
struct bpf_map_def def;
+ __u32 btf_var_idx;
__u32 btf_key_type_id;
__u32 btf_value_type_id;
__u32 btf_vmlinux_value_type_id;
@@ -318,6 +319,9 @@ struct bpf_map {
enum libbpf_map_type libbpf_type;
void *mmaped;
struct bpf_struct_ops *st_ops;
+ struct bpf_map *inner_map;
+ void **init_slots;
+ int init_slots_sz;
char *pin_path;
bool pinned;
bool reused;
@@ -389,6 +393,7 @@ struct bpf_object {
int nr_reloc_sects;
int maps_shndx;
int btf_maps_shndx;
+ __u32 btf_maps_sec_btf_id;
int text_shndx;
int symbols_shndx;
int data_shndx;
@@ -1914,109 +1919,54 @@ static int build_map_pin_path(struct bpf_map *map, const char *path)
return 0;
}
-static int bpf_object__init_user_btf_map(struct bpf_object *obj,
- const struct btf_type *sec,
- int var_idx, int sec_idx,
- const Elf_Data *data, bool strict,
- const char *pin_root_path)
+
+static int parse_btf_map_def(struct bpf_object *obj,
+ struct bpf_map *map,
+ const struct btf_type *def,
+ bool strict, bool is_inner,
+ const char *pin_root_path)
{
- const struct btf_type *var, *def, *t;
- const struct btf_var_secinfo *vi;
- const struct btf_var *var_extra;
+ const struct btf_type *t;
const struct btf_member *m;
- const char *map_name;
- struct bpf_map *map;
int vlen, i;
- vi = btf_var_secinfos(sec) + var_idx;
- var = btf__type_by_id(obj->btf, vi->type);
- var_extra = btf_var(var);
- map_name = btf__name_by_offset(obj->btf, var->name_off);
- vlen = btf_vlen(var);
-
- if (map_name == NULL || map_name[0] == '\0') {
- pr_warn("map #%d: empty name.\n", var_idx);
- return -EINVAL;
- }
- if ((__u64)vi->offset + vi->size > data->d_size) {
- pr_warn("map '%s' BTF data is corrupted.\n", map_name);
- return -EINVAL;
- }
- if (!btf_is_var(var)) {
- pr_warn("map '%s': unexpected var kind %u.\n",
- map_name, btf_kind(var));
- return -EINVAL;
- }
- if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED &&
- var_extra->linkage != BTF_VAR_STATIC) {
- pr_warn("map '%s': unsupported var linkage %u.\n",
- map_name, var_extra->linkage);
- return -EOPNOTSUPP;
- }
-
- def = skip_mods_and_typedefs(obj->btf, var->type, NULL);
- if (!btf_is_struct(def)) {
- pr_warn("map '%s': unexpected def kind %u.\n",
- map_name, btf_kind(var));
- return -EINVAL;
- }
- if (def->size > vi->size) {
- pr_warn("map '%s': invalid def size.\n", map_name);
- return -EINVAL;
- }
-
- map = bpf_object__add_map(obj);
- if (IS_ERR(map))
- return PTR_ERR(map);
- map->name = strdup(map_name);
- if (!map->name) {
- pr_warn("map '%s': failed to alloc map name.\n", map_name);
- return -ENOMEM;
- }
- map->libbpf_type = LIBBPF_MAP_UNSPEC;
- map->def.type = BPF_MAP_TYPE_UNSPEC;
- map->sec_idx = sec_idx;
- map->sec_offset = vi->offset;
- pr_debug("map '%s': at sec_idx %d, offset %zu.\n",
- map_name, map->sec_idx, map->sec_offset);
-
vlen = btf_vlen(def);
m = btf_members(def);
for (i = 0; i < vlen; i++, m++) {
const char *name = btf__name_by_offset(obj->btf, m->name_off);
if (!name) {
- pr_warn("map '%s': invalid field #%d.\n", map_name, i);
+ pr_warn("map '%s': invalid field #%d.\n", map->name, i);
return -EINVAL;
}
if (strcmp(name, "type") == 0) {
- if (!get_map_field_int(map_name, obj->btf, m,
+ if (!get_map_field_int(map->name, obj->btf, m,
&map->def.type))
return -EINVAL;
pr_debug("map '%s': found type = %u.\n",
- map_name, map->def.type);
+ map->name, map->def.type);
} else if (strcmp(name, "max_entries") == 0) {
- if (!get_map_field_int(map_name, obj->btf, m,
+ if (!get_map_field_int(map->name, obj->btf, m,
&map->def.max_entries))
return -EINVAL;
pr_debug("map '%s': found max_entries = %u.\n",
- map_name, map->def.max_entries);
+ map->name, map->def.max_entries);
} else if (strcmp(name, "map_flags") == 0) {
- if (!get_map_field_int(map_name, obj->btf, m,
+ if (!get_map_field_int(map->name, obj->btf, m,
&map->def.map_flags))
return -EINVAL;
pr_debug("map '%s': found map_flags = %u.\n",
- map_name, map->def.map_flags);
+ map->name, map->def.map_flags);
} else if (strcmp(name, "key_size") == 0) {
__u32 sz;
- if (!get_map_field_int(map_name, obj->btf, m, &sz))
+ if (!get_map_field_int(map->name, obj->btf, m, &sz))
return -EINVAL;
pr_debug("map '%s': found key_size = %u.\n",
- map_name, sz);
+ map->name, sz);
if (map->def.key_size && map->def.key_size != sz) {
pr_warn("map '%s': conflicting key size %u != %u.\n",
- map_name, map->def.key_size, sz);
+ map->name, map->def.key_size, sz);
return -EINVAL;
}
map->def.key_size = sz;
@@ -2026,25 +1976,25 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
t = btf__type_by_id(obj->btf, m->type);
if (!t) {
pr_warn("map '%s': key type [%d] not found.\n",
- map_name, m->type);
+ map->name, m->type);
return -EINVAL;
}
if (!btf_is_ptr(t)) {
pr_warn("map '%s': key spec is not PTR: %u.\n",
- map_name, btf_kind(t));
+ map->name, btf_kind(t));
return -EINVAL;
}
sz = btf__resolve_size(obj->btf, t->type);
if (sz < 0) {
pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n",
- map_name, t->type, (ssize_t)sz);
+ map->name, t->type, (ssize_t)sz);
return sz;
}
pr_debug("map '%s': found key [%u], sz = %zd.\n",
- map_name, t->type, (ssize_t)sz);
+ map->name, t->type, (ssize_t)sz);
if (map->def.key_size && map->def.key_size != sz) {
pr_warn("map '%s': conflicting key size %u != %zd.\n",
- map_name, map->def.key_size, (ssize_t)sz);
+ map->name, map->def.key_size, (ssize_t)sz);
return -EINVAL;
}
map->def.key_size = sz;
@@ -2052,13 +2002,13 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
} else if (strcmp(name, "value_size") == 0) {
__u32 sz;
- if (!get_map_field_int(map_name, obj->btf, m, &sz))
+ if (!get_map_field_int(map->name, obj->btf, m, &sz))
return -EINVAL;
pr_debug("map '%s': found value_size = %u.\n",
- map_name, sz);
+ map->name, sz);
if (map->def.value_size && map->def.value_size != sz) {
pr_warn("map '%s': conflicting value size %u != %u.\n",
- map_name, map->def.value_size, sz);
+ map->name, map->def.value_size, sz);
return -EINVAL;
}
map->def.value_size = sz;
@@ -2068,71 +2018,207 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
t = btf__type_by_id(obj->btf, m->type);
if (!t) {
pr_warn("map '%s': value type [%d] not found.\n",
- map_name, m->type);
+ map->name, m->type);
return -EINVAL;
}
if (!btf_is_ptr(t)) {
pr_warn("map '%s': value spec is not PTR: %u.\n",
- map_name, btf_kind(t));
+ map->name, btf_kind(t));
return -EINVAL;
}
sz = btf__resolve_size(obj->btf, t->type);
if (sz < 0) {
pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n",
- map_name, t->type, (ssize_t)sz);
+ map->name, t->type, (ssize_t)sz);
return sz;
}
pr_debug("map '%s': found value [%u], sz = %zd.\n",
- map_name, t->type, (ssize_t)sz);
+ map->name, t->type, (ssize_t)sz);
if (map->def.value_size && map->def.value_size != sz) {
pr_warn("map '%s': conflicting value size %u != %zd.\n",
- map_name, map->def.value_size, (ssize_t)sz);
+ map->name, map->def.value_size, (ssize_t)sz);
return -EINVAL;
}
map->def.value_size = sz;
map->btf_value_type_id = t->type;
+ }
+ else if (strcmp(name, "values") == 0) {
+ int err;
+
+ if (is_inner) {
+ pr_warn("map '%s': multi-level inner maps not supported.\n",
+ map->name);
+ return -ENOTSUP;
+ }
+ if (i != vlen - 1) {
+ pr_warn("map '%s': '%s' member should be last.\n",
+ map->name, name);
+ return -EINVAL;
+ }
+ if (!bpf_map_type__is_map_in_map(map->def.type)) {
+ pr_warn("map '%s': should be map-in-map.\n",
+ map->name);
+ return -ENOTSUP;
+ }
+ if (map->def.value_size && map->def.value_size != 4) {
+ pr_warn("map '%s': conflicting value size %u != 4.\n",
+ map->name, map->def.value_size);
+ return -EINVAL;
+ }
+ map->def.value_size = 4;
+ t = btf__type_by_id(obj->btf, m->type);
+ if (!t) {
+ pr_warn("map '%s': map-in-map inner type [%d] not found.\n",
+ map->name, m->type);
+ return -EINVAL;
+ }
+ if (!btf_is_array(t) || btf_array(t)->nelems) {
+ pr_warn("map '%s': map-in-map inner spec is not a zero-sized array.\n",
+ map->name);
+ return -EINVAL;
+ }
+ t = skip_mods_and_typedefs(obj->btf, btf_array(t)->type,
+ NULL);
+ if (!btf_is_ptr(t)) {
+ pr_warn("map '%s': map-in-map inner def is of unexpected kind %u.\n",
+ map->name, btf_kind(t));
+ return -EINVAL;
+ }
+ t = skip_mods_and_typedefs(obj->btf, t->type, NULL);
+ if (!btf_is_struct(t)) {
+ pr_warn("map '%s': map-in-map inner def is of unexpected kind %u.\n",
+ map->name, btf_kind(t));
+ return -EINVAL;
+ }
+
+ map->inner_map = calloc(1, sizeof(*map->inner_map));
+ if (!map->inner_map)
+ return -ENOMEM;
+ map->inner_map->sec_idx = obj->efile.btf_maps_shndx;
+ map->inner_map->name = malloc(strlen(map->name) +
+ sizeof(".inner") + 1);
+ if (!map->inner_map->name)
+ return -ENOMEM;
+ sprintf(map->inner_map->name, "%s.inner", map->name);
+
+ err = parse_btf_map_def(obj, map->inner_map, t, strict,
+ true /* is_inner */, NULL);
+ if (err)
+ return err;
} else if (strcmp(name, "pinning") == 0) {
__u32 val;
int err;
- if (!get_map_field_int(map_name, obj->btf, m, &val))
+ if (is_inner) {
+ pr_debug("map '%s': inner def can't be pinned.\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!get_map_field_int(map->name, obj->btf, m, &val))
return -EINVAL;
pr_debug("map '%s': found pinning = %u.\n",
- map_name, val);
+ map->name, val);
if (val != LIBBPF_PIN_NONE &&
val != LIBBPF_PIN_BY_NAME) {
pr_warn("map '%s': invalid pinning value %u.\n",
- map_name, val);
+ map->name, val);
return -EINVAL;
}
if (val == LIBBPF_PIN_BY_NAME) {
err = build_map_pin_path(map, pin_root_path);
if (err) {
pr_warn("map '%s': couldn't build pin path.\n",
- map_name);
+ map->name);
return err;
}
}
} else {
if (strict) {
pr_warn("map '%s': unknown field '%s'.\n",
- map_name, name);
+ map->name, name);
return -ENOTSUP;
}
pr_debug("map '%s': ignoring unknown field '%s'.\n",
- map_name, name);
+ map->name, name);
}
}
if (map->def.type == BPF_MAP_TYPE_UNSPEC) {
- pr_warn("map '%s': map type isn't specified.\n", map_name);
+ pr_warn("map '%s': map type isn't specified.\n", map->name);
return -EINVAL;
}
return 0;
}
+static int bpf_object__init_user_btf_map(struct bpf_object *obj,
+ const struct btf_type *sec,
+ int var_idx, int sec_idx,
+ const Elf_Data *data, bool strict,
+ const char *pin_root_path)
+{
+ const struct btf_type *var, *def;
+ const struct btf_var_secinfo *vi;
+ const struct btf_var *var_extra;
+ const char *map_name;
+ struct bpf_map *map;
+
+ vi = btf_var_secinfos(sec) + var_idx;
+ var = btf__type_by_id(obj->btf, vi->type);
+ var_extra = btf_var(var);
+ map_name = btf__name_by_offset(obj->btf, var->name_off);
+
+ if (map_name == NULL || map_name[0] == '\0') {
+ pr_warn("map #%d: empty name.\n", var_idx);
+ return -EINVAL;
+ }
+ if ((__u64)vi->offset + vi->size > data->d_size) {
+ pr_warn("map '%s' BTF data is corrupted.\n", map_name);
+ return -EINVAL;
+ }
+ if (!btf_is_var(var)) {
+ pr_warn("map '%s': unexpected var kind %u.\n",
+ map_name, btf_kind(var));
+ return -EINVAL;
+ }
+ if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED &&
+ var_extra->linkage != BTF_VAR_STATIC) {
+ pr_warn("map '%s': unsupported var linkage %u.\n",
+ map_name, var_extra->linkage);
+ return -EOPNOTSUPP;
+ }
+
+ def = skip_mods_and_typedefs(obj->btf, var->type, NULL);
+ if (!btf_is_struct(def)) {
+ pr_warn("map '%s': unexpected def kind %u.\n",
+ map_name, btf_kind(var));
+ return -EINVAL;
+ }
+ if (def->size > vi->size) {
+ pr_warn("map '%s': invalid def size.\n", map_name);
+ return -EINVAL;
+ }
+
+ map = bpf_object__add_map(obj);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ map->name = strdup(map_name);
+ if (!map->name) {
+ pr_warn("map '%s': failed to alloc map name.\n", map_name);
+ return -ENOMEM;
+ }
+ map->libbpf_type = LIBBPF_MAP_UNSPEC;
+ map->def.type = BPF_MAP_TYPE_UNSPEC;
+ map->sec_idx = sec_idx;
+ map->sec_offset = vi->offset;
+ map->btf_var_idx = var_idx;
+ pr_debug("map '%s': at sec_idx %d, offset %zu.\n",
+ map_name, map->sec_idx, map->sec_offset);
+
+ return parse_btf_map_def(obj, map, def, strict, false, pin_root_path);
+}
+
static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict,
const char *pin_root_path)
{
@@ -2163,6 +2249,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict,
name = btf__name_by_offset(obj->btf, t->name_off);
if (strcmp(name, MAPS_ELF_SEC) == 0) {
sec = t;
+ obj->efile.btf_maps_sec_btf_id = i;
break;
}
}
@@ -2549,7 +2636,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
/* Only do relo for section with exec instructions */
if (!section_have_execinstr(obj, sec) &&
- strcmp(name, ".rel" STRUCT_OPS_SEC)) {
+ strcmp(name, ".rel" STRUCT_OPS_SEC) &&
+ strcmp(name, ".rel" MAPS_ELF_SEC)) {
pr_debug("skip relo %s(%d) for section(%d)\n",
name, idx, sec);
continue;
@@ -3482,124 +3570,181 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
return 0;
}
+static void bpf_map__destroy(struct bpf_map *map);
+
+static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map)
+{
+ struct bpf_create_map_attr create_attr;
+ struct bpf_map_def *def = &map->def;
+
+ memset(&create_attr, 0, sizeof(create_attr));
+
+ if (obj->caps.name)
+ create_attr.name = map->name;
+ create_attr.map_ifindex = map->map_ifindex;
+ create_attr.map_type = def->type;
+ create_attr.map_flags = def->map_flags;
+ create_attr.key_size = def->key_size;
+ create_attr.value_size = def->value_size;
+
+ if (def->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && !def->max_entries) {
+ int nr_cpus;
+
+ nr_cpus = libbpf_num_possible_cpus();
+ if (nr_cpus < 0) {
+ pr_warn("map '%s': failed to determine number of system CPUs: %d\n",
+ map->name, nr_cpus);
+ return nr_cpus;
+ }
+ pr_debug("map '%s': setting size to %d\n", map->name, nr_cpus);
+ create_attr.max_entries = nr_cpus;
+ } else {
+ create_attr.max_entries = def->max_entries;
+ }
+
+ if (bpf_map__is_struct_ops(map))
+ create_attr.btf_vmlinux_value_type_id =
+ map->btf_vmlinux_value_type_id;
+
+ create_attr.btf_fd = 0;
+ create_attr.btf_key_type_id = 0;
+ create_attr.btf_value_type_id = 0;
+ if (obj->btf && !bpf_map_find_btf_info(obj, map)) {
+ create_attr.btf_fd = btf__fd(obj->btf);
+ create_attr.btf_key_type_id = map->btf_key_type_id;
+ create_attr.btf_value_type_id = map->btf_value_type_id;
+ }
+
+ if (bpf_map_type__is_map_in_map(def->type)) {
+ if (map->inner_map) {
+ int err;
+
+ err = bpf_object__create_map(obj, map->inner_map);
+ if (err) {
+ pr_warn("map '%s': failed to create inner map: %d\n",
+ map->name, err);
+ return err;
+ }
+ map->inner_map_fd = bpf_map__fd(map->inner_map);
+ }
+ if (map->inner_map_fd >= 0)
+ create_attr.inner_map_fd = map->inner_map_fd;
+ }
+
+ map->fd = bpf_create_map_xattr(&create_attr);
+ if (map->fd < 0 && (create_attr.btf_key_type_id ||
+ create_attr.btf_value_type_id)) {
+ char *cp, errmsg[STRERR_BUFSIZE];
+ int err = -errno;
+
+ cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg));
+ pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n",
+ map->name, cp, err);
+ create_attr.btf_fd = 0;
+ create_attr.btf_key_type_id = 0;
+ create_attr.btf_value_type_id = 0;
+ map->btf_key_type_id = 0;
+ map->btf_value_type_id = 0;
+ map->fd = bpf_create_map_xattr(&create_attr);
+ }
+
+ if (map->fd < 0)
+ return -errno;
+
+ if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) {
+ bpf_map__destroy(map->inner_map);
+ zfree(&map->inner_map);
+ }
+
+ return 0;
+}
+
static int
bpf_object__create_maps(struct bpf_object *obj)
{
- struct bpf_create_map_attr create_attr = {};
- int nr_cpus = 0;
- unsigned int i;
+ struct bpf_map *map;
+ char *cp, errmsg[STRERR_BUFSIZE];
+ unsigned int i, j;
int err;
for (i = 0; i < obj->nr_maps; i++) {
- struct bpf_map *map = &obj->maps[i];
- struct bpf_map_def *def = &map->def;
- char *cp, errmsg[STRERR_BUFSIZE];
- int *pfd = &map->fd;
+ map = &obj->maps[i];
if (map->pin_path) {
err = bpf_object__reuse_map(map);
if (err) {
- pr_warn("error reusing pinned map %s\n",
+ pr_warn("map '%s': error reusing pinned map\n",
map->name);
- return err;
+ goto err_out;
}
}
if (map->fd >= 0) {
- pr_debug("skip map create (preset) %s: fd=%d\n",
+ pr_debug("map '%s': skipping creation (preset fd=%d)\n",
map->name, map->fd);
continue;
}
- if (obj->caps.name)
- create_attr.name = map->name;
- create_attr.map_ifindex = map->map_ifindex;
- create_attr.map_type = def->type;
- create_attr.map_flags = def->map_flags;
- create_attr.key_size = def->key_size;
- create_attr.value_size = def->value_size;
- if (def->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
- !def->max_entries) {
- if (!nr_cpus)
- nr_cpus = libbpf_num_possible_cpus();
- if (nr_cpus < 0) {
- pr_warn("failed to determine number of system CPUs: %d\n",
- nr_cpus);
- err = nr_cpus;
- goto err_out;
- }
- pr_debug("map '%s': setting size to %d\n",
- map->name, nr_cpus);
- create_attr.max_entries = nr_cpus;
- } else {
- create_attr.max_entries = def->max_entries;
- }
- create_attr.btf_fd = 0;
- create_attr.btf_key_type_id = 0;
- create_attr.btf_value_type_id = 0;
- if (bpf_map_type__is_map_in_map(def->type) &&
- map->inner_map_fd >= 0)
- create_attr.inner_map_fd = map->inner_map_fd;
- if (bpf_map__is_struct_ops(map))
- create_attr.btf_vmlinux_value_type_id =
- map->btf_vmlinux_value_type_id;
-
- if (obj->btf && !bpf_map_find_btf_info(obj, map)) {
- create_attr.btf_fd = btf__fd(obj->btf);
- create_attr.btf_key_type_id = map->btf_key_type_id;
- create_attr.btf_value_type_id = map->btf_value_type_id;
- }
-
- *pfd = bpf_create_map_xattr(&create_attr);
- if (*pfd < 0 && (create_attr.btf_key_type_id ||
- create_attr.btf_value_type_id)) {
- err = -errno;
- cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg));
- pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n",
- map->name, cp, err);
- create_attr.btf_fd = 0;
- create_attr.btf_key_type_id = 0;
- create_attr.btf_value_type_id = 0;
- map->btf_key_type_id = 0;
- map->btf_value_type_id = 0;
- *pfd = bpf_create_map_xattr(&create_attr);
- }
+ err = bpf_object__create_map(obj, map);
+ if (err)
+ goto err_out;
- if (*pfd < 0) {
- size_t j;
-
- err = -errno;
-err_out:
- cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg));
- pr_warn("failed to create map (name: '%s'): %s(%d)\n",
- map->name, cp, err);
- pr_perm_msg(err);
- for (j = 0; j < i; j++)
- zclose(obj->maps[j].fd);
- return err;
- }
+ pr_debug("map '%s': created successfully, fd=%d\n", map->name,
+ map->fd);
if (bpf_map__is_internal(map)) {
err = bpf_object__populate_internal_map(obj, map);
if (err < 0) {
- zclose(*pfd);
+ zclose(map->fd);
goto err_out;
}
}
+ if (map->init_slots_sz) {
+ for (j = 0; j < map->init_slots_sz; j++) {
+ const struct bpf_map *targ_map;
+ int fd;
+
+ if (!map->init_slots[j])
+ continue;
+
+ targ_map = map->init_slots[j];
+ fd = bpf_map__fd(targ_map);
+ err = bpf_map_update_elem(map->fd, &j, &fd, 0);
+ if (err) {
+ err = -errno;
+ pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n",
+ map->name, j, targ_map->name,
+ fd, err);
+ goto err_out;
+ }
+ pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n",
+ map->name, j, targ_map->name, fd);
+ }
+ zfree(&map->init_slots);
+ map->init_slots_sz = 0;
+ }
+
if (map->pin_path && !map->pinned) {
err = bpf_map__pin(map, NULL);
if (err) {
- pr_warn("failed to auto-pin map name '%s' at '%s'\n",
- map->name, map->pin_path);
- return err;
+ pr_warn("map '%s': failed to auto-pin at '%s': %d\n",
+ map->name, map->pin_path, err);
+ zclose(map->fd);
+ goto err_out;
}
}
-
- pr_debug("created map %s: fd=%d\n", map->name, *pfd);
}
return 0;
+
+err_out:
+ cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg));
+ pr_warn("map '%s': failed to create: %s(%d)\n", map->name, cp, err);
+ pr_perm_msg(err);
+ for (j = 0; j < i; j++)
+ zclose(obj->maps[j].fd);
+ return err;
}
static int
@@ -4851,9 +4996,118 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path)
return 0;
}
-static int bpf_object__collect_struct_ops_map_reloc(struct bpf_object *obj,
- GElf_Shdr *shdr,
- Elf_Data *data);
+static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
+ GElf_Shdr *shdr, Elf_Data *data);
+
+static int bpf_object__collect_map_relos(struct bpf_object *obj,
+ GElf_Shdr *shdr, Elf_Data *data)
+{
+ int i, j, nrels, new_sz, ptr_sz = sizeof(void *);
+ const struct btf_var_secinfo *vi = NULL;
+ const struct btf_type *sec, *var, *def;
+ const struct btf_member *member;
+ struct bpf_map *map, *targ_map;
+ const char *name, *mname;
+ Elf_Data *symbols;
+ unsigned int moff;
+ GElf_Sym sym;
+ GElf_Rel rel;
+ void *tmp;
+
+ if (!obj->efile.btf_maps_sec_btf_id || !obj->btf)
+ return -EINVAL;
+ sec = btf__type_by_id(obj->btf, obj->efile.btf_maps_sec_btf_id);
+ if (!sec)
+ return -EINVAL;
+
+ symbols = obj->efile.symbols;
+ nrels = shdr->sh_size / shdr->sh_entsize;
+ for (i = 0; i < nrels; i++) {
+ if (!gelf_getrel(data, i, &rel)) {
+ pr_warn(".maps relo #%d: failed to get ELF relo\n", i);
+ return -LIBBPF_ERRNO__FORMAT;
+ }
+ if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) {
+ pr_warn(".maps relo #%d: symbol %zx not found\n",
+ i, (size_t)GELF_R_SYM(rel.r_info));
+ return -LIBBPF_ERRNO__FORMAT;
+ }
+ name = elf_strptr(obj->efile.elf, obj->efile.strtabidx,
+ sym.st_name) ? : "<?>";
+ if (sym.st_shndx != obj->efile.btf_maps_shndx) {
+ pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n",
+ i, name);
+ return -LIBBPF_ERRNO__RELOC;
+ }
+
+ pr_debug(".maps relo #%d: for %zd value %zd rel.r_offset %zu name %d ('%s')\n",
+ i, (ssize_t)(rel.r_info >> 32), (size_t)sym.st_value,
+ (size_t)rel.r_offset, sym.st_name, name);
+
+ for (j = 0; j < obj->nr_maps; j++) {
+ map = &obj->maps[j];
+ if (map->sec_idx != obj->efile.btf_maps_shndx)
+ continue;
+
+ vi = btf_var_secinfos(sec) + map->btf_var_idx;
+ if (vi->offset <= rel.r_offset &&
+ rel.r_offset + sizeof(void *) <= vi->offset + vi->size)
+ break;
+ }
+ if (j == obj->nr_maps) {
+ pr_warn(".maps relo #%d: cannot find map '%s' at rel.r_offset %zu\n",
+ i, name, (size_t)rel.r_offset);
+ return -EINVAL;
+ }
+
+ if (!bpf_map_type__is_map_in_map(map->def.type))
+ return -EINVAL;
+ if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS &&
+ map->def.key_size != sizeof(int)) {
+ pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n",
+ i, map->name, sizeof(int));
+ return -EINVAL;
+ }
+
+ targ_map = bpf_object__find_map_by_name(obj, name);
+ if (!targ_map)
+ return -ESRCH;
+
+ var = btf__type_by_id(obj->btf, vi->type);
+ def = skip_mods_and_typedefs(obj->btf, var->type, NULL);
+ if (btf_vlen(def) == 0)
+ return -EINVAL;
+ member = btf_members(def) + btf_vlen(def) - 1;
+ mname = btf__name_by_offset(obj->btf, member->name_off);
+ if (strcmp(mname, "values"))
+ return -EINVAL;
+
+ moff = btf_member_bit_offset(def, btf_vlen(def) - 1) / 8;
+ if (rel.r_offset - vi->offset < moff)
+ return -EINVAL;
+
+ moff = rel.r_offset - vi->offset - moff;
+ if (moff % ptr_sz)
+ return -EINVAL;
+ moff /= ptr_sz;
+ if (moff >= map->init_slots_sz) {
+ new_sz = moff + 1;
+ tmp = realloc(map->init_slots, new_sz * ptr_sz);
+ if (!tmp)
+ return -ENOMEM;
+ map->init_slots = tmp;
+ memset(map->init_slots + map->init_slots_sz, 0,
+ (new_sz - map->init_slots_sz) * ptr_sz);
+ map->init_slots_sz = new_sz;
+ }
+ map->init_slots[moff] = targ_map;
+
+ pr_debug(".maps relo #%d: map '%s' slot [%d] points to map '%s'\n",
+ i, map->name, moff, name);
+ }
+
+ return 0;
+}
static int bpf_object__collect_reloc(struct bpf_object *obj)
{
@@ -4876,21 +5130,17 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
}
if (idx == obj->efile.st_ops_shndx) {
- err = bpf_object__collect_struct_ops_map_reloc(obj,
- shdr,
- data);
- if (err)
- return err;
- continue;
- }
-
- prog = bpf_object__find_prog_by_idx(obj, idx);
- if (!prog) {
- pr_warn("relocation failed: no section(%d)\n", idx);
- return -LIBBPF_ERRNO__RELOC;
+ err = bpf_object__collect_st_ops_relos(obj, shdr, data);
+ } else if (idx == obj->efile.btf_maps_shndx) {
+ err = bpf_object__collect_map_relos(obj, shdr, data);
+ } else {
+ prog = bpf_object__find_prog_by_idx(obj, idx);
+ if (!prog) {
+ pr_warn("relocation failed: no prog in section(%d)\n", idx);
+ return -LIBBPF_ERRNO__RELOC;
+ }
+ err = bpf_program__collect_reloc(prog, shdr, data, obj);
}
-
- err = bpf_program__collect_reloc(prog, shdr, data, obj);
if (err)
return err;
}
@@ -5955,6 +6205,40 @@ int bpf_object__pin(struct bpf_object *obj, const char *path)
return 0;
}
+static void bpf_map__destroy(struct bpf_map *map)
+{
+ if (map->clear_priv)
+ map->clear_priv(map, map->priv);
+ map->priv = NULL;
+ map->clear_priv = NULL;
+
+ if (map->inner_map) {
+ bpf_map__destroy(map->inner_map);
+ zfree(&map->inner_map);
+ }
+
+ zfree(&map->init_slots);
+ map->init_slots_sz = 0;
+
+ if (map->mmaped) {
+ munmap(map->mmaped, bpf_map_mmap_sz(map));
+ map->mmaped = NULL;
+ }
+
+ if (map->st_ops) {
+ zfree(&map->st_ops->data);
+ zfree(&map->st_ops->progs);
+ zfree(&map->st_ops->kern_func_off);
+ zfree(&map->st_ops);
+ }
+
+ zfree(&map->name);
+ zfree(&map->pin_path);
+
+ if (map->fd >= 0)
+ zclose(map->fd);
+}
+
void bpf_object__close(struct bpf_object *obj)
{
size_t i;
@@ -5970,29 +6254,8 @@ void bpf_object__close(struct bpf_object *obj)
btf__free(obj->btf);
btf_ext__free(obj->btf_ext);
- for (i = 0; i < obj->nr_maps; i++) {
- struct bpf_map *map = &obj->maps[i];
-
- if (map->clear_priv)
- map->clear_priv(map, map->priv);
- map->priv = NULL;
- map->clear_priv = NULL;
-
- if (map->mmaped) {
- munmap(map->mmaped, bpf_map_mmap_sz(map));
- map->mmaped = NULL;
- }
-
- if (map->st_ops) {
- zfree(&map->st_ops->data);
- zfree(&map->st_ops->progs);
- zfree(&map->st_ops->kern_func_off);
- zfree(&map->st_ops);
- }
-
- zfree(&map->name);
- zfree(&map->pin_path);
- }
+ for (i = 0; i < obj->nr_maps; i++)
+ bpf_map__destroy(&obj->maps[i]);
zfree(&obj->kconfig);
zfree(&obj->externs);
@@ -6516,9 +6779,8 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj,
}
/* Collect the reloc from ELF and populate the st_ops->progs[] */
-static int bpf_object__collect_struct_ops_map_reloc(struct bpf_object *obj,
- GElf_Shdr *shdr,
- Elf_Data *data)
+static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
+ GElf_Shdr *shdr, Elf_Data *data)
{
const struct btf_member *member;
struct bpf_struct_ops *st_ops;
@@ -6672,6 +6934,7 @@ int libbpf_find_vmlinux_btf_id(const char *name,
enum bpf_attach_type attach_type)
{
struct btf *btf;
+ int err;
btf = libbpf_find_kernel_btf();
if (IS_ERR(btf)) {
@@ -6679,7 +6942,9 @@ int libbpf_find_vmlinux_btf_id(const char *name,
return -EINVAL;
}
- return __find_vmlinux_btf_id(btf, name, attach_type);
+ err = __find_vmlinux_btf_id(btf, name, attach_type);
+ btf__free(btf);
+ return err;
}
static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd)
@@ -7006,7 +7271,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
err = bpf_object__load(obj);
if (err) {
bpf_object__close(obj);
- return -EINVAL;
+ return err;
}
*pobj = obj;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index bb8831605b25..e03bd4db827e 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -254,3 +254,10 @@ LIBBPF_0.0.8 {
bpf_program__set_lsm;
bpf_set_link_xdp_fd_opts;
} LIBBPF_0.0.7;
+
+LIBBPF_0.0.9 {
+ global:
+ bpf_enable_stats;
+ bpf_link_get_fd_by_id;
+ bpf_link_get_next_id;
+} LIBBPF_0.0.8;
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index c30079c86998..3ff031972975 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -30,8 +30,6 @@ test_tcpnotify_user
test_libbpf
test_tcp_check_syncookie_user
test_sysctl
-test_hashmap
-test_btf_dump
test_current_pid_tgid_new_ns
xdping
test_cpp
@@ -39,4 +37,4 @@ test_cpp
/no_alu32
/bpf_gcc
/tools
-
+/runqslower
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7729892e0b04..3d942be23d09 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -20,9 +20,10 @@ CLANG ?= clang
LLC ?= llc
LLVM_OBJCOPY ?= llvm-objcopy
BPF_GCC ?= $(shell command -v bpf-gcc;)
-CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) -I$(CURDIR) \
- -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \
- -I$(APIDIR) \
+SAN_CFLAGS ?=
+CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \
+ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
+ -I$(TOOLSINCDIR) -I$(APIDIR) \
-Dbpf_prog_load=bpf_prog_test_load \
-Dbpf_load_program=bpf_test_load_program
LDLIBS += -lcap -lelf -lz -lrt -lpthread
@@ -32,7 +33,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \
test_cgroup_storage \
- test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
+ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl \
test_progs-no_alu32 \
test_current_pid_tgid_new_ns
@@ -141,7 +142,8 @@ VMLINUX_BTF := $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
$(OUTPUT)/runqslower: $(BPFOBJ)
$(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \
OUTPUT=$(SCRATCH_DIR)/ VMLINUX_BTF=$(VMLINUX_BTF) \
- BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR)
+ BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) && \
+ cp $(SCRATCH_DIR)/runqslower $@
$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ)
@@ -241,7 +243,7 @@ define GCC_BPF_BUILD_RULE
$(BPF_GCC) $3 $4 -O2 -c $1 -o $2
endef
-SKEL_BLACKLIST := btf__% test_pinning_invalid.c
+SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c
# Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on
# $eval()) and pass control to DEFINE_TEST_RUNNER_RULES.
@@ -323,7 +325,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \
$(TRUNNER_BPF_SKELS) \
$$(BPFOBJ) | $(TRUNNER_OUTPUT)
$$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@)
- cd $$(@D) && $$(CC) $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F)
+ cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F)
$(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \
%.c \
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
index f10029821e16..7afa4160416f 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
@@ -1,26 +1,30 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
+#define nr_iters 2
+
void test_bpf_obj_id(void)
{
const __u64 array_magic_value = 0xfaceb00c;
const __u32 array_key = 0;
- const int nr_iters = 2;
const char *file = "./test_obj_id.o";
const char *expected_prog_name = "test_obj_id";
const char *expected_map_name = "test_map_id";
const __u64 nsec_per_sec = 1000000000;
- struct bpf_object *objs[nr_iters];
+ struct bpf_object *objs[nr_iters] = {};
+ struct bpf_link *links[nr_iters] = {};
+ struct bpf_program *prog;
int prog_fds[nr_iters], map_fds[nr_iters];
/* +1 to test for the info_len returned by kernel */
struct bpf_prog_info prog_infos[nr_iters + 1];
struct bpf_map_info map_infos[nr_iters + 1];
+ struct bpf_link_info link_infos[nr_iters + 1];
/* Each prog only uses one map. +1 to test nr_map_ids
* returned by kernel.
*/
__u32 map_ids[nr_iters + 1];
- char jited_insns[128], xlated_insns[128], zeros[128];
+ char jited_insns[128], xlated_insns[128], zeros[128], tp_name[128];
__u32 i, next_id, info_len, nr_id_found, duration = 0;
struct timespec real_time_ts, boot_time_ts;
int err = 0;
@@ -36,14 +40,15 @@ void test_bpf_obj_id(void)
CHECK(err >= 0 || errno != ENOENT,
"get-fd-by-notexist-map-id", "err %d errno %d\n", err, errno);
- for (i = 0; i < nr_iters; i++)
- objs[i] = NULL;
+ err = bpf_link_get_fd_by_id(0);
+ CHECK(err >= 0 || errno != ENOENT,
+ "get-fd-by-notexist-link-id", "err %d errno %d\n", err, errno);
/* Check bpf_obj_get_info_by_fd() */
bzero(zeros, sizeof(zeros));
for (i = 0; i < nr_iters; i++) {
now = time(NULL);
- err = bpf_prog_load(file, BPF_PROG_TYPE_SOCKET_FILTER,
+ err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT,
&objs[i], &prog_fds[i]);
/* test_obj_id.o is a dumb prog. It should never fail
* to load.
@@ -60,6 +65,17 @@ void test_bpf_obj_id(void)
if (CHECK_FAIL(err))
goto done;
+ prog = bpf_object__find_program_by_title(objs[i],
+ "raw_tp/sys_enter");
+ if (CHECK_FAIL(!prog))
+ goto done;
+ links[i] = bpf_program__attach(prog);
+ err = libbpf_get_error(links[i]);
+ if (CHECK(err, "prog_attach", "prog #%d, err %d\n", i, err)) {
+ links[i] = NULL;
+ goto done;
+ }
+
/* Check getting map info */
info_len = sizeof(struct bpf_map_info) * 2;
bzero(&map_infos[i], info_len);
@@ -107,7 +123,7 @@ void test_bpf_obj_id(void)
load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec)
+ (prog_infos[i].load_time / nsec_per_sec);
if (CHECK(err ||
- prog_infos[i].type != BPF_PROG_TYPE_SOCKET_FILTER ||
+ prog_infos[i].type != BPF_PROG_TYPE_RAW_TRACEPOINT ||
info_len != sizeof(struct bpf_prog_info) ||
(env.jit_enabled && !prog_infos[i].jited_prog_len) ||
(env.jit_enabled &&
@@ -120,7 +136,11 @@ void test_bpf_obj_id(void)
*(int *)(long)prog_infos[i].map_ids != map_infos[i].id ||
strcmp((char *)prog_infos[i].name, expected_prog_name),
"get-prog-info(fd)",
- "err %d errno %d i %d type %d(%d) info_len %u(%zu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n",
+ "err %d errno %d i %d type %d(%d) info_len %u(%zu) "
+ "jit_enabled %d jited_prog_len %u xlated_prog_len %u "
+ "jited_prog %d xlated_prog %d load_time %lu(%lu) "
+ "uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) "
+ "name %s(%s)\n",
err, errno, i,
prog_infos[i].type, BPF_PROG_TYPE_SOCKET_FILTER,
info_len, sizeof(struct bpf_prog_info),
@@ -135,6 +155,33 @@ void test_bpf_obj_id(void)
*(int *)(long)prog_infos[i].map_ids, map_infos[i].id,
prog_infos[i].name, expected_prog_name))
goto done;
+
+ /* Check getting link info */
+ info_len = sizeof(struct bpf_link_info) * 2;
+ bzero(&link_infos[i], info_len);
+ link_infos[i].raw_tracepoint.tp_name = (__u64)&tp_name;
+ link_infos[i].raw_tracepoint.tp_name_len = sizeof(tp_name);
+ err = bpf_obj_get_info_by_fd(bpf_link__fd(links[i]),
+ &link_infos[i], &info_len);
+ if (CHECK(err ||
+ link_infos[i].type != BPF_LINK_TYPE_RAW_TRACEPOINT ||
+ link_infos[i].prog_id != prog_infos[i].id ||
+ link_infos[i].raw_tracepoint.tp_name != (__u64)&tp_name ||
+ strcmp((char *)link_infos[i].raw_tracepoint.tp_name,
+ "sys_enter") ||
+ info_len != sizeof(struct bpf_link_info),
+ "get-link-info(fd)",
+ "err %d errno %d info_len %u(%zu) type %d(%d) id %d "
+ "prog_id %d (%d) tp_name %s(%s)\n",
+ err, errno,
+ info_len, sizeof(struct bpf_link_info),
+ link_infos[i].type, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ link_infos[i].id,
+ link_infos[i].prog_id, prog_infos[i].id,
+ (char *)link_infos[i].raw_tracepoint.tp_name,
+ "sys_enter"))
+ goto done;
+
}
/* Check bpf_prog_get_next_id() */
@@ -247,7 +294,52 @@ void test_bpf_obj_id(void)
"nr_id_found %u(%u)\n",
nr_id_found, nr_iters);
+ /* Check bpf_link_get_next_id() */
+ nr_id_found = 0;
+ next_id = 0;
+ while (!bpf_link_get_next_id(next_id, &next_id)) {
+ struct bpf_link_info link_info;
+ int link_fd, cmp_res;
+
+ info_len = sizeof(link_info);
+ memset(&link_info, 0, info_len);
+
+ link_fd = bpf_link_get_fd_by_id(next_id);
+ if (link_fd < 0 && errno == ENOENT)
+ /* The bpf_link is in the dead row */
+ continue;
+ if (CHECK(link_fd < 0, "get-link-fd(next_id)",
+ "link_fd %d next_id %u errno %d\n",
+ link_fd, next_id, errno))
+ break;
+
+ for (i = 0; i < nr_iters; i++)
+ if (link_infos[i].id == next_id)
+ break;
+
+ if (i == nr_iters)
+ continue;
+
+ nr_id_found++;
+
+ err = bpf_obj_get_info_by_fd(link_fd, &link_info, &info_len);
+ cmp_res = memcmp(&link_info, &link_infos[i],
+ offsetof(struct bpf_link_info, raw_tracepoint));
+ CHECK(err || info_len != sizeof(link_info) || cmp_res,
+ "check get-link-info(next_id->fd)",
+ "err %d errno %d info_len %u(%zu) memcmp %d\n",
+ err, errno, info_len, sizeof(struct bpf_link_info),
+ cmp_res);
+
+ close(link_fd);
+ }
+ CHECK(nr_id_found != nr_iters,
+ "check total link id found by get_next_id",
+ "nr_id_found %u(%u)\n", nr_id_found, nr_iters);
+
done:
- for (i = 0; i < nr_iters; i++)
+ for (i = 0; i < nr_iters; i++) {
+ bpf_link__destroy(links[i]);
bpf_object__close(objs[i]);
+ }
}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
new file mode 100644
index 000000000000..f7ee8fa377ad
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+
+#include "test_btf_map_in_map.skel.h"
+
+void test_btf_map_in_map(void)
+{
+ int duration = 0, err, key = 0, val;
+ struct test_btf_map_in_map* skel;
+
+ skel = test_btf_map_in_map__open_and_load();
+ if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n"))
+ return;
+
+ err = test_btf_map_in_map__attach(skel);
+ if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+ goto cleanup;
+
+ /* inner1 = input, inner2 = input + 1 */
+ val = bpf_map__fd(skel->maps.inner_map1);
+ bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0);
+ val = bpf_map__fd(skel->maps.inner_map2);
+ bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0);
+ skel->bss->input = 1;
+ usleep(1);
+
+ bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val);
+ CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1);
+ bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val);
+ CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2);
+
+ /* inner1 = input + 1, inner2 = input */
+ val = bpf_map__fd(skel->maps.inner_map2);
+ bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0);
+ val = bpf_map__fd(skel->maps.inner_map1);
+ bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0);
+ skel->bss->input = 3;
+ usleep(1);
+
+ bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val);
+ CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4);
+ bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val);
+ CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3);
+
+cleanup:
+ test_btf_map_in_map__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
new file mode 100644
index 000000000000..f259085cca6a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
@@ -0,0 +1,456 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2020 Cloudflare
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <string.h>
+
+#include <linux/pkt_cls.h>
+
+#include <test_progs.h>
+
+#include "progs/test_cls_redirect.h"
+#include "test_cls_redirect.skel.h"
+
+#define ENCAP_IP INADDR_LOOPBACK
+#define ENCAP_PORT (1234)
+
+struct addr_port {
+ in_port_t port;
+ union {
+ struct in_addr in_addr;
+ struct in6_addr in6_addr;
+ };
+};
+
+struct tuple {
+ int family;
+ struct addr_port src;
+ struct addr_port dst;
+};
+
+static int start_server(const struct sockaddr *addr, socklen_t len, int type)
+{
+ int fd = socket(addr->sa_family, type, 0);
+ if (CHECK_FAIL(fd == -1))
+ return -1;
+ if (CHECK_FAIL(bind(fd, addr, len) == -1))
+ goto err;
+ if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1))
+ goto err;
+
+ return fd;
+
+err:
+ close(fd);
+ return -1;
+}
+
+static int connect_to_server(const struct sockaddr *addr, socklen_t len,
+ int type)
+{
+ int fd = socket(addr->sa_family, type, 0);
+ if (CHECK_FAIL(fd == -1))
+ return -1;
+ if (CHECK_FAIL(connect(fd, addr, len)))
+ goto err;
+
+ return fd;
+
+err:
+ close(fd);
+ return -1;
+}
+
+static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap)
+{
+ const struct sockaddr_in6 *in6;
+ const struct sockaddr_in *in;
+
+ switch (sa->sa_family) {
+ case AF_INET:
+ in = (const struct sockaddr_in *)sa;
+ ap->in_addr = in->sin_addr;
+ ap->port = in->sin_port;
+ return true;
+
+ case AF_INET6:
+ in6 = (const struct sockaddr_in6 *)sa;
+ ap->in6_addr = in6->sin6_addr;
+ ap->port = in6->sin6_port;
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type,
+ int *server, int *conn, struct tuple *tuple)
+{
+ struct sockaddr_storage ss;
+ socklen_t slen = sizeof(ss);
+ struct sockaddr *sa = (struct sockaddr *)&ss;
+
+ *server = start_server(addr, len, type);
+ if (*server < 0)
+ return false;
+
+ if (CHECK_FAIL(getsockname(*server, sa, &slen)))
+ goto close_server;
+
+ *conn = connect_to_server(sa, slen, type);
+ if (*conn < 0)
+ goto close_server;
+
+ /* We want to simulate packets arriving at conn, so we have to
+ * swap src and dst.
+ */
+ slen = sizeof(ss);
+ if (CHECK_FAIL(getsockname(*conn, sa, &slen)))
+ goto close_conn;
+
+ if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst)))
+ goto close_conn;
+
+ slen = sizeof(ss);
+ if (CHECK_FAIL(getpeername(*conn, sa, &slen)))
+ goto close_conn;
+
+ if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src)))
+ goto close_conn;
+
+ tuple->family = ss.ss_family;
+ return true;
+
+close_conn:
+ close(*conn);
+ *conn = -1;
+close_server:
+ close(*server);
+ *server = -1;
+ return false;
+}
+
+static socklen_t prepare_addr(struct sockaddr_storage *addr, int family)
+{
+ struct sockaddr_in *addr4;
+ struct sockaddr_in6 *addr6;
+
+ switch (family) {
+ case AF_INET:
+ addr4 = (struct sockaddr_in *)addr;
+ memset(addr4, 0, sizeof(*addr4));
+ addr4->sin_family = family;
+ addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ return sizeof(*addr4);
+ case AF_INET6:
+ addr6 = (struct sockaddr_in6 *)addr;
+ memset(addr6, 0, sizeof(*addr6));
+ addr6->sin6_family = family;
+ addr6->sin6_addr = in6addr_loopback;
+ return sizeof(*addr6);
+ default:
+ fprintf(stderr, "Invalid family %d", family);
+ return 0;
+ }
+}
+
+static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr)
+{
+ return tattr->data_size_out < tattr->data_size_in;
+}
+
+enum type {
+ UDP,
+ TCP,
+ __NR_KIND,
+};
+
+enum hops {
+ NO_HOPS,
+ ONE_HOP,
+};
+
+enum flags {
+ NONE,
+ SYN,
+ ACK,
+};
+
+enum conn {
+ KNOWN_CONN,
+ UNKNOWN_CONN,
+};
+
+enum result {
+ ACCEPT,
+ FORWARD,
+};
+
+struct test_cfg {
+ enum type type;
+ enum result result;
+ enum conn conn;
+ enum hops hops;
+ enum flags flags;
+};
+
+static int test_str(void *buf, size_t len, const struct test_cfg *test,
+ int family)
+{
+ const char *family_str, *type, *conn, *hops, *result, *flags;
+
+ family_str = "IPv4";
+ if (family == AF_INET6)
+ family_str = "IPv6";
+
+ type = "TCP";
+ if (test->type == UDP)
+ type = "UDP";
+
+ conn = "known";
+ if (test->conn == UNKNOWN_CONN)
+ conn = "unknown";
+
+ hops = "no hops";
+ if (test->hops == ONE_HOP)
+ hops = "one hop";
+
+ result = "accept";
+ if (test->result == FORWARD)
+ result = "forward";
+
+ flags = "none";
+ if (test->flags == SYN)
+ flags = "SYN";
+ else if (test->flags == ACK)
+ flags = "ACK";
+
+ return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)", family_str,
+ type, result, conn, hops, flags);
+}
+
+static struct test_cfg tests[] = {
+ { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN },
+ { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK },
+ { TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK },
+ { TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK },
+ { UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE },
+ { UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE },
+ { UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE },
+};
+
+static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto)
+{
+ const uint8_t hlen =
+ (sizeof(struct guehdr) / sizeof(uint32_t)) + hop_count;
+ *encap = (encap_headers_t){
+ .eth = { .h_proto = htons(ETH_P_IP) },
+ .ip = {
+ .ihl = 5,
+ .version = 4,
+ .ttl = IPDEFTTL,
+ .protocol = IPPROTO_UDP,
+ .daddr = htonl(ENCAP_IP)
+ },
+ .udp = {
+ .dest = htons(ENCAP_PORT),
+ },
+ .gue = {
+ .hlen = hlen,
+ .proto_ctype = proto
+ },
+ .unigue = {
+ .hop_count = hop_count
+ },
+ };
+}
+
+static size_t build_input(const struct test_cfg *test, void *const buf,
+ const struct tuple *tuple)
+{
+ in_port_t sport = tuple->src.port;
+ encap_headers_t encap;
+ struct iphdr ip;
+ struct ipv6hdr ipv6;
+ struct tcphdr tcp;
+ struct udphdr udp;
+ struct in_addr next_hop;
+ uint8_t *p = buf;
+ int proto;
+
+ proto = IPPROTO_IPIP;
+ if (tuple->family == AF_INET6)
+ proto = IPPROTO_IPV6;
+
+ encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto);
+ p = mempcpy(p, &encap, sizeof(encap));
+
+ if (test->hops == ONE_HOP) {
+ next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) };
+ p = mempcpy(p, &next_hop, sizeof(next_hop));
+ }
+
+ proto = IPPROTO_TCP;
+ if (test->type == UDP)
+ proto = IPPROTO_UDP;
+
+ switch (tuple->family) {
+ case AF_INET:
+ ip = (struct iphdr){
+ .ihl = 5,
+ .version = 4,
+ .ttl = IPDEFTTL,
+ .protocol = proto,
+ .saddr = tuple->src.in_addr.s_addr,
+ .daddr = tuple->dst.in_addr.s_addr,
+ };
+ p = mempcpy(p, &ip, sizeof(ip));
+ break;
+ case AF_INET6:
+ ipv6 = (struct ipv6hdr){
+ .version = 6,
+ .hop_limit = IPDEFTTL,
+ .nexthdr = proto,
+ .saddr = tuple->src.in6_addr,
+ .daddr = tuple->dst.in6_addr,
+ };
+ p = mempcpy(p, &ipv6, sizeof(ipv6));
+ break;
+ default:
+ return 0;
+ }
+
+ if (test->conn == UNKNOWN_CONN)
+ sport--;
+
+ switch (test->type) {
+ case TCP:
+ tcp = (struct tcphdr){
+ .source = sport,
+ .dest = tuple->dst.port,
+ };
+ if (test->flags == SYN)
+ tcp.syn = true;
+ if (test->flags == ACK)
+ tcp.ack = true;
+ p = mempcpy(p, &tcp, sizeof(tcp));
+ break;
+ case UDP:
+ udp = (struct udphdr){
+ .source = sport,
+ .dest = tuple->dst.port,
+ };
+ p = mempcpy(p, &udp, sizeof(udp));
+ break;
+ default:
+ return 0;
+ }
+
+ return (void *)p - buf;
+}
+
+static void close_fds(int *fds, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ if (fds[i] > 0)
+ close(fds[i]);
+}
+
+void test_cls_redirect(void)
+{
+ struct test_cls_redirect *skel = NULL;
+ struct bpf_prog_test_run_attr tattr = {};
+ int families[] = { AF_INET, AF_INET6 };
+ struct sockaddr_storage ss;
+ struct sockaddr *addr;
+ socklen_t slen;
+ int i, j, err;
+
+ int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
+ int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
+ struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];
+
+ skel = test_cls_redirect__open();
+ if (CHECK_FAIL(!skel))
+ return;
+
+ skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
+ skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
+
+ if (CHECK_FAIL(test_cls_redirect__load(skel)))
+ goto cleanup;
+
+ addr = (struct sockaddr *)&ss;
+ for (i = 0; i < ARRAY_SIZE(families); i++) {
+ slen = prepare_addr(&ss, families[i]);
+ if (CHECK_FAIL(!slen))
+ goto cleanup;
+
+ if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM,
+ &servers[UDP][i], &conns[UDP][i],
+ &tuples[UDP][i])))
+ goto cleanup;
+
+ if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM,
+ &servers[TCP][i], &conns[TCP][i],
+ &tuples[TCP][i])))
+ goto cleanup;
+ }
+
+ tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect);
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ struct test_cfg *test = &tests[i];
+
+ for (j = 0; j < ARRAY_SIZE(families); j++) {
+ struct tuple *tuple = &tuples[test->type][j];
+ char input[256];
+ char tmp[256];
+
+ test_str(tmp, sizeof(tmp), test, tuple->family);
+ if (!test__start_subtest(tmp))
+ continue;
+
+ tattr.data_out = tmp;
+ tattr.data_size_out = sizeof(tmp);
+
+ tattr.data_in = input;
+ tattr.data_size_in = build_input(test, input, tuple);
+ if (CHECK_FAIL(!tattr.data_size_in))
+ continue;
+
+ err = bpf_prog_test_run_xattr(&tattr);
+ if (CHECK_FAIL(err))
+ continue;
+
+ if (tattr.retval != TC_ACT_REDIRECT) {
+ PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n",
+ tattr.retval);
+ continue;
+ }
+
+ switch (test->result) {
+ case ACCEPT:
+ if (CHECK_FAIL(!was_decapsulated(&tattr)))
+ continue;
+ break;
+ case FORWARD:
+ if (CHECK_FAIL(was_decapsulated(&tattr)))
+ continue;
+ break;
+ default:
+ PRINT_FAIL("unknown result %d\n", test->result);
+ continue;
+ }
+ }
+ }
+
+cleanup:
+ test_cls_redirect__destroy(skel);
+ close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0]));
+ close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0]));
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
index 31e177adbdf1..084ed26a7d78 100644
--- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
@@ -392,7 +392,7 @@ static struct core_reloc_test_case test_cases[] = {
.input = STRUCT_TO_CHAR_PTR(core_reloc_existence___minimal) {
.a = 42,
},
- .input_len = sizeof(struct core_reloc_existence),
+ .input_len = sizeof(struct core_reloc_existence___minimal),
.output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) {
.a_exists = 1,
.b_exists = 0,
diff --git a/tools/testing/selftests/bpf/prog_tests/enable_stats.c b/tools/testing/selftests/bpf/prog_tests/enable_stats.c
new file mode 100644
index 000000000000..2cb2085917e7
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/enable_stats.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "test_enable_stats.skel.h"
+
+void test_enable_stats(void)
+{
+ struct test_enable_stats *skel;
+ int stats_fd, err, prog_fd;
+ struct bpf_prog_info info;
+ __u32 info_len = sizeof(info);
+ int duration = 0;
+
+ skel = test_enable_stats__open_and_load();
+ if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n"))
+ return;
+
+ stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME);
+ if (CHECK(stats_fd < 0, "get_stats_fd", "failed %d\n", errno)) {
+ test_enable_stats__destroy(skel);
+ return;
+ }
+
+ err = test_enable_stats__attach(skel);
+ if (CHECK(err, "attach_raw_tp", "err %d\n", err))
+ goto cleanup;
+
+ test_enable_stats__detach(skel);
+
+ prog_fd = bpf_program__fd(skel->progs.test_enable_stats);
+ memset(&info, 0, info_len);
+ err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+ if (CHECK(err, "get_prog_info",
+ "failed to get bpf_prog_info for fd %d\n", prog_fd))
+ goto cleanup;
+ if (CHECK(info.run_time_ns == 0, "check_stats_enabled",
+ "failed to enable run_time_ns stats\n"))
+ goto cleanup;
+
+ CHECK(info.run_cnt != skel->bss->count, "check_run_cnt_valid",
+ "invalid run_cnt stats\n");
+
+cleanup:
+ test_enable_stats__destroy(skel);
+ close(stats_fd);
+}
diff --git a/tools/testing/selftests/bpf/test_hashmap.c b/tools/testing/selftests/bpf/prog_tests/hashmap.c
index c490e012c23f..428d488830c6 100644
--- a/tools/testing/selftests/bpf/test_hashmap.c
+++ b/tools/testing/selftests/bpf/prog_tests/hashmap.c
@@ -5,26 +5,17 @@
*
* Copyright (c) 2019 Facebook
*/
-#include <stdio.h>
-#include <errno.h>
-#include <linux/err.h>
+#include "test_progs.h"
#include "bpf/hashmap.h"
-#define CHECK(condition, format...) ({ \
- int __ret = !!(condition); \
- if (__ret) { \
- fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \
- fprintf(stderr, format); \
- } \
- __ret; \
-})
+static int duration = 0;
-size_t hash_fn(const void *k, void *ctx)
+static size_t hash_fn(const void *k, void *ctx)
{
return (long)k;
}
-bool equal_fn(const void *a, const void *b, void *ctx)
+static bool equal_fn(const void *a, const void *b, void *ctx)
{
return (long)a == (long)b;
}
@@ -49,53 +40,55 @@ static inline size_t exp_cap(size_t sz)
#define ELEM_CNT 62
-int test_hashmap_generic(void)
+static void test_hashmap_generic(void)
{
struct hashmap_entry *entry, *tmp;
int err, bkt, found_cnt, i;
long long found_msk;
struct hashmap *map;
- fprintf(stderr, "%s: ", __func__);
-
map = hashmap__new(hash_fn, equal_fn, NULL);
- if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map)))
- return 1;
+ if (CHECK(IS_ERR(map), "hashmap__new",
+ "failed to create map: %ld\n", PTR_ERR(map)))
+ return;
for (i = 0; i < ELEM_CNT; i++) {
const void *oldk, *k = (const void *)(long)i;
void *oldv, *v = (void *)(long)(1024 + i);
err = hashmap__update(map, k, v, &oldk, &oldv);
- if (CHECK(err != -ENOENT, "unexpected result: %d\n", err))
- return 1;
+ if (CHECK(err != -ENOENT, "hashmap__update",
+ "unexpected result: %d\n", err))
+ goto cleanup;
if (i % 2) {
err = hashmap__add(map, k, v);
} else {
err = hashmap__set(map, k, v, &oldk, &oldv);
- if (CHECK(oldk != NULL || oldv != NULL,
+ if (CHECK(oldk != NULL || oldv != NULL, "check_kv",
"unexpected k/v: %p=%p\n", oldk, oldv))
- return 1;
+ goto cleanup;
}
- if (CHECK(err, "failed to add k/v %ld = %ld: %d\n",
+ if (CHECK(err, "elem_add", "failed to add k/v %ld = %ld: %d\n",
(long)k, (long)v, err))
- return 1;
+ goto cleanup;
- if (CHECK(!hashmap__find(map, k, &oldv),
+ if (CHECK(!hashmap__find(map, k, &oldv), "elem_find",
"failed to find key %ld\n", (long)k))
- return 1;
- if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv))
- return 1;
+ goto cleanup;
+ if (CHECK(oldv != v, "elem_val",
+ "found value is wrong: %ld\n", (long)oldv))
+ goto cleanup;
}
- if (CHECK(hashmap__size(map) != ELEM_CNT,
+ if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size",
"invalid map size: %zu\n", hashmap__size(map)))
- return 1;
+ goto cleanup;
if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)),
+ "hashmap_cap",
"unexpected map capacity: %zu\n", hashmap__capacity(map)))
- return 1;
+ goto cleanup;
found_msk = 0;
hashmap__for_each_entry(map, entry, bkt) {
@@ -103,42 +96,47 @@ int test_hashmap_generic(void)
long v = (long)entry->value;
found_msk |= 1ULL << k;
- if (CHECK(v - k != 1024, "invalid k/v pair: %ld = %ld\n", k, v))
- return 1;
+ if (CHECK(v - k != 1024, "check_kv",
+ "invalid k/v pair: %ld = %ld\n", k, v))
+ goto cleanup;
}
- if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1,
+ if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt",
"not all keys iterated: %llx\n", found_msk))
- return 1;
+ goto cleanup;
for (i = 0; i < ELEM_CNT; i++) {
const void *oldk, *k = (const void *)(long)i;
void *oldv, *v = (void *)(long)(256 + i);
err = hashmap__add(map, k, v);
- if (CHECK(err != -EEXIST, "unexpected add result: %d\n", err))
- return 1;
+ if (CHECK(err != -EEXIST, "hashmap__add",
+ "unexpected add result: %d\n", err))
+ goto cleanup;
if (i % 2)
err = hashmap__update(map, k, v, &oldk, &oldv);
else
err = hashmap__set(map, k, v, &oldk, &oldv);
- if (CHECK(err, "failed to update k/v %ld = %ld: %d\n",
- (long)k, (long)v, err))
- return 1;
- if (CHECK(!hashmap__find(map, k, &oldv),
+ if (CHECK(err, "elem_upd",
+ "failed to update k/v %ld = %ld: %d\n",
+ (long)k, (long)v, err))
+ goto cleanup;
+ if (CHECK(!hashmap__find(map, k, &oldv), "elem_find",
"failed to find key %ld\n", (long)k))
- return 1;
- if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv))
- return 1;
+ goto cleanup;
+ if (CHECK(oldv != v, "elem_val",
+ "found value is wrong: %ld\n", (long)oldv))
+ goto cleanup;
}
- if (CHECK(hashmap__size(map) != ELEM_CNT,
+ if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size",
"invalid updated map size: %zu\n", hashmap__size(map)))
- return 1;
+ goto cleanup;
if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)),
+ "hashmap__capacity",
"unexpected map capacity: %zu\n", hashmap__capacity(map)))
- return 1;
+ goto cleanup;
found_msk = 0;
hashmap__for_each_entry_safe(map, entry, tmp, bkt) {
@@ -146,20 +144,21 @@ int test_hashmap_generic(void)
long v = (long)entry->value;
found_msk |= 1ULL << k;
- if (CHECK(v - k != 256,
+ if (CHECK(v - k != 256, "elem_check",
"invalid updated k/v pair: %ld = %ld\n", k, v))
- return 1;
+ goto cleanup;
}
- if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1,
+ if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt",
"not all keys iterated after update: %llx\n", found_msk))
- return 1;
+ goto cleanup;
found_cnt = 0;
hashmap__for_each_key_entry(map, entry, (void *)0) {
found_cnt++;
}
- if (CHECK(!found_cnt, "didn't find any entries for key 0\n"))
- return 1;
+ if (CHECK(!found_cnt, "found_cnt",
+ "didn't find any entries for key 0\n"))
+ goto cleanup;
found_msk = 0;
found_cnt = 0;
@@ -173,30 +172,31 @@ int test_hashmap_generic(void)
found_cnt++;
found_msk |= 1ULL << (long)k;
- if (CHECK(!hashmap__delete(map, k, &oldk, &oldv),
+ if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del",
"failed to delete k/v %ld = %ld\n",
(long)k, (long)v))
- return 1;
- if (CHECK(oldk != k || oldv != v,
+ goto cleanup;
+ if (CHECK(oldk != k || oldv != v, "check_old",
"invalid deleted k/v: expected %ld = %ld, got %ld = %ld\n",
(long)k, (long)v, (long)oldk, (long)oldv))
- return 1;
- if (CHECK(hashmap__delete(map, k, &oldk, &oldv),
+ goto cleanup;
+ if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del",
"unexpectedly deleted k/v %ld = %ld\n",
(long)oldk, (long)oldv))
- return 1;
+ goto cleanup;
}
- if (CHECK(!found_cnt || !found_msk,
+ if (CHECK(!found_cnt || !found_msk, "found_entries",
"didn't delete any key entries\n"))
- return 1;
- if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt,
+ goto cleanup;
+ if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, "elem_cnt",
"invalid updated map size (already deleted: %d): %zu\n",
found_cnt, hashmap__size(map)))
- return 1;
+ goto cleanup;
if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)),
+ "hashmap__capacity",
"unexpected map capacity: %zu\n", hashmap__capacity(map)))
- return 1;
+ goto cleanup;
hashmap__for_each_entry_safe(map, entry, tmp, bkt) {
const void *oldk, *k;
@@ -208,53 +208,56 @@ int test_hashmap_generic(void)
found_cnt++;
found_msk |= 1ULL << (long)k;
- if (CHECK(!hashmap__delete(map, k, &oldk, &oldv),
+ if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del",
"failed to delete k/v %ld = %ld\n",
(long)k, (long)v))
- return 1;
- if (CHECK(oldk != k || oldv != v,
+ goto cleanup;
+ if (CHECK(oldk != k || oldv != v, "elem_check",
"invalid old k/v: expect %ld = %ld, got %ld = %ld\n",
(long)k, (long)v, (long)oldk, (long)oldv))
- return 1;
- if (CHECK(hashmap__delete(map, k, &oldk, &oldv),
+ goto cleanup;
+ if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del",
"unexpectedly deleted k/v %ld = %ld\n",
(long)k, (long)v))
- return 1;
+ goto cleanup;
}
if (CHECK(found_cnt != ELEM_CNT || found_msk != (1ULL << ELEM_CNT) - 1,
+ "found_cnt",
"not all keys were deleted: found_cnt:%d, found_msk:%llx\n",
found_cnt, found_msk))
- return 1;
- if (CHECK(hashmap__size(map) != 0,
+ goto cleanup;
+ if (CHECK(hashmap__size(map) != 0, "hashmap__size",
"invalid updated map size (already deleted: %d): %zu\n",
found_cnt, hashmap__size(map)))
- return 1;
+ goto cleanup;
found_cnt = 0;
hashmap__for_each_entry(map, entry, bkt) {
- CHECK(false, "unexpected map entries left: %ld = %ld\n",
- (long)entry->key, (long)entry->value);
- return 1;
+ CHECK(false, "elem_exists",
+ "unexpected map entries left: %ld = %ld\n",
+ (long)entry->key, (long)entry->value);
+ goto cleanup;
}
- hashmap__free(map);
+ hashmap__clear(map);
hashmap__for_each_entry(map, entry, bkt) {
- CHECK(false, "unexpected map entries left: %ld = %ld\n",
- (long)entry->key, (long)entry->value);
- return 1;
+ CHECK(false, "elem_exists",
+ "unexpected map entries left: %ld = %ld\n",
+ (long)entry->key, (long)entry->value);
+ goto cleanup;
}
- fprintf(stderr, "OK\n");
- return 0;
+cleanup:
+ hashmap__free(map);
}
-size_t collision_hash_fn(const void *k, void *ctx)
+static size_t collision_hash_fn(const void *k, void *ctx)
{
return 0;
}
-int test_hashmap_multimap(void)
+static void test_hashmap_multimap(void)
{
void *k1 = (void *)0, *k2 = (void *)1;
struct hashmap_entry *entry;
@@ -262,121 +265,116 @@ int test_hashmap_multimap(void)
long found_msk;
int err, bkt;
- fprintf(stderr, "%s: ", __func__);
-
/* force collisions */
map = hashmap__new(collision_hash_fn, equal_fn, NULL);
- if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map)))
- return 1;
-
+ if (CHECK(IS_ERR(map), "hashmap__new",
+ "failed to create map: %ld\n", PTR_ERR(map)))
+ return;
/* set up multimap:
* [0] -> 1, 2, 4;
* [1] -> 8, 16, 32;
*/
err = hashmap__append(map, k1, (void *)1);
- if (CHECK(err, "failed to add k/v: %d\n", err))
- return 1;
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
err = hashmap__append(map, k1, (void *)2);
- if (CHECK(err, "failed to add k/v: %d\n", err))
- return 1;
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
err = hashmap__append(map, k1, (void *)4);
- if (CHECK(err, "failed to add k/v: %d\n", err))
- return 1;
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
err = hashmap__append(map, k2, (void *)8);
- if (CHECK(err, "failed to add k/v: %d\n", err))
- return 1;
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
err = hashmap__append(map, k2, (void *)16);
- if (CHECK(err, "failed to add k/v: %d\n", err))
- return 1;
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
err = hashmap__append(map, k2, (void *)32);
- if (CHECK(err, "failed to add k/v: %d\n", err))
- return 1;
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
- if (CHECK(hashmap__size(map) != 6,
+ if (CHECK(hashmap__size(map) != 6, "hashmap_size",
"invalid map size: %zu\n", hashmap__size(map)))
- return 1;
+ goto cleanup;
/* verify global iteration still works and sees all values */
found_msk = 0;
hashmap__for_each_entry(map, entry, bkt) {
found_msk |= (long)entry->value;
}
- if (CHECK(found_msk != (1 << 6) - 1,
+ if (CHECK(found_msk != (1 << 6) - 1, "found_msk",
"not all keys iterated: %lx\n", found_msk))
- return 1;
+ goto cleanup;
/* iterate values for key 1 */
found_msk = 0;
hashmap__for_each_key_entry(map, entry, k1) {
found_msk |= (long)entry->value;
}
- if (CHECK(found_msk != (1 | 2 | 4),
+ if (CHECK(found_msk != (1 | 2 | 4), "found_msk",
"invalid k1 values: %lx\n", found_msk))
- return 1;
+ goto cleanup;
/* iterate values for key 2 */
found_msk = 0;
hashmap__for_each_key_entry(map, entry, k2) {
found_msk |= (long)entry->value;
}
- if (CHECK(found_msk != (8 | 16 | 32),
+ if (CHECK(found_msk != (8 | 16 | 32), "found_msk",
"invalid k2 values: %lx\n", found_msk))
- return 1;
+ goto cleanup;
- fprintf(stderr, "OK\n");
- return 0;
+cleanup:
+ hashmap__free(map);
}
-int test_hashmap_empty()
+static void test_hashmap_empty()
{
struct hashmap_entry *entry;
int bkt;
struct hashmap *map;
void *k = (void *)0;
- fprintf(stderr, "%s: ", __func__);
-
/* force collisions */
map = hashmap__new(hash_fn, equal_fn, NULL);
- if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map)))
- return 1;
+ if (CHECK(IS_ERR(map), "hashmap__new",
+ "failed to create map: %ld\n", PTR_ERR(map)))
+ goto cleanup;
- if (CHECK(hashmap__size(map) != 0,
+ if (CHECK(hashmap__size(map) != 0, "hashmap__size",
"invalid map size: %zu\n", hashmap__size(map)))
- return 1;
- if (CHECK(hashmap__capacity(map) != 0,
+ goto cleanup;
+ if (CHECK(hashmap__capacity(map) != 0, "hashmap__capacity",
"invalid map capacity: %zu\n", hashmap__capacity(map)))
- return 1;
- if (CHECK(hashmap__find(map, k, NULL), "unexpected find\n"))
- return 1;
- if (CHECK(hashmap__delete(map, k, NULL, NULL), "unexpected delete\n"))
- return 1;
+ goto cleanup;
+ if (CHECK(hashmap__find(map, k, NULL), "elem_find",
+ "unexpected find\n"))
+ goto cleanup;
+ if (CHECK(hashmap__delete(map, k, NULL, NULL), "elem_del",
+ "unexpected delete\n"))
+ goto cleanup;
hashmap__for_each_entry(map, entry, bkt) {
- CHECK(false, "unexpected iterated entry\n");
- return 1;
+ CHECK(false, "elem_found", "unexpected iterated entry\n");
+ goto cleanup;
}
hashmap__for_each_key_entry(map, entry, k) {
- CHECK(false, "unexpected key entry\n");
- return 1;
+ CHECK(false, "key_found", "unexpected key entry\n");
+ goto cleanup;
}
- fprintf(stderr, "OK\n");
- return 0;
+cleanup:
+ hashmap__free(map);
}
-int main(int argc, char **argv)
+void test_hashmap()
{
- bool failed = false;
-
- if (test_hashmap_generic())
- failed = true;
- if (test_hashmap_multimap())
- failed = true;
- if (test_hashmap_empty())
- failed = true;
-
- return failed;
+ if (test__start_subtest("generic"))
+ test_hashmap_generic();
+ if (test__start_subtest("multimap"))
+ test_hashmap_multimap();
+ if (test__start_subtest("empty"))
+ test_hashmap_empty();
}
diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
index 542240e16564..e74dc501b27f 100644
--- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
+++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
@@ -80,9 +80,6 @@ void test_ns_current_pid_tgid(void)
"User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid))
goto cleanup;
cleanup:
- if (!link) {
- bpf_link__destroy(link);
- link = NULL;
- }
+ bpf_link__destroy(link);
bpf_object__close(obj);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
index 1450ea2dd4cc..a122ce3b360e 100644
--- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
+++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
@@ -6,6 +6,11 @@
#include <test_progs.h>
#include "bpf/libbpf_internal.h"
+/* AddressSanitizer sometimes crashes due to data dereference below, due to
+ * this being mmap()'ed memory. Disable instrumentation with
+ * no_sanitize_address attribute
+ */
+__attribute__((no_sanitize_address))
static void on_sample(void *ctx, int cpu, void *data, __u32 size)
{
int cpu_data = *(int *)data, duration = 0;
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
index d572e1a2c297..47fa04adc147 100644
--- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
+++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
@@ -20,6 +20,7 @@
#define CONNECT_PORT 4321
#define TEST_DADDR (0xC0A80203)
#define NS_SELF "/proc/self/ns/net"
+#define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map"
static const struct timeval timeo_sec = { .tv_sec = 3 };
static const size_t timeo_optlen = sizeof(timeo_sec);
@@ -265,6 +266,7 @@ void test_sk_assign(void)
TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true),
};
int server = -1;
+ int server_map;
int self_net;
self_net = open(NS_SELF, O_RDONLY);
@@ -278,9 +280,17 @@ void test_sk_assign(void)
goto cleanup;
}
+ server_map = bpf_obj_get(SERVER_MAP_PATH);
+ if (CHECK_FAIL(server_map < 0)) {
+ perror("Unable to open " SERVER_MAP_PATH);
+ goto cleanup;
+ }
+
for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) {
struct test_sk_cfg *test = &tests[i];
const struct sockaddr *addr;
+ const int zero = 0;
+ int err;
if (!test__start_subtest(test->name))
continue;
@@ -288,7 +298,13 @@ void test_sk_assign(void)
addr = (const struct sockaddr *)test->addr;
server = start_server(addr, test->len, test->type);
if (server == -1)
- goto cleanup;
+ goto close;
+
+ err = bpf_map_update_elem(server_map, &zero, &server, BPF_ANY);
+ if (CHECK_FAIL(err)) {
+ perror("Unable to update server_map");
+ goto close;
+ }
/* connect to unbound ports */
prepare_addr(test->addr, test->family, CONNECT_PORT,
@@ -302,7 +318,10 @@ void test_sk_assign(void)
close:
close(server);
+ close(server_map);
cleanup:
+ if (CHECK_FAIL(unlink(SERVER_MAP_PATH)))
+ perror("Unable to unlink " SERVER_MAP_PATH);
if (CHECK_FAIL(setns(self_net, CLONE_NEWNET)))
perror("Failed to setns("NS_SELF")");
close(self_net);
diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c
index ad3c498a8150..c2c85c31cffd 100644
--- a/tools/testing/selftests/bpf/progs/connect4_prog.c
+++ b/tools/testing/selftests/bpf/progs/connect4_prog.c
@@ -8,6 +8,7 @@
#include <linux/in.h>
#include <linux/in6.h>
#include <sys/socket.h>
+#include <netinet/tcp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
@@ -16,6 +17,10 @@
#define DST_REWRITE_IP4 0x7f000001U
#define DST_REWRITE_PORT4 4444
+#ifndef TCP_CA_NAME_MAX
+#define TCP_CA_NAME_MAX 16
+#endif
+
int _version SEC("version") = 1;
__attribute__ ((noinline))
@@ -33,6 +38,43 @@ int do_bind(struct bpf_sock_addr *ctx)
return 1;
}
+static __inline int verify_cc(struct bpf_sock_addr *ctx,
+ char expected[TCP_CA_NAME_MAX])
+{
+ char buf[TCP_CA_NAME_MAX];
+ int i;
+
+ if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf)))
+ return 1;
+
+ for (i = 0; i < TCP_CA_NAME_MAX; i++) {
+ if (buf[i] != expected[i])
+ return 1;
+ if (buf[i] == 0)
+ break;
+ }
+
+ return 0;
+}
+
+static __inline int set_cc(struct bpf_sock_addr *ctx)
+{
+ char reno[TCP_CA_NAME_MAX] = "reno";
+ char cubic[TCP_CA_NAME_MAX] = "cubic";
+
+ if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &reno, sizeof(reno)))
+ return 1;
+ if (verify_cc(ctx, reno))
+ return 1;
+
+ if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic)))
+ return 1;
+ if (verify_cc(ctx, cubic))
+ return 1;
+
+ return 0;
+}
+
SEC("cgroup/connect4")
int connect_v4_prog(struct bpf_sock_addr *ctx)
{
@@ -66,6 +108,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx)
bpf_sk_release(sk);
+ /* Rewrite congestion control. */
+ if (ctx->type == SOCK_STREAM && set_cc(ctx))
+ return 0;
+
/* Rewrite destination. */
ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4);
ctx->user_port = bpf_htons(DST_REWRITE_PORT4);
diff --git a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c
new file mode 100644
index 000000000000..e5093796be97
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct inner_map {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} inner_map1 SEC(".maps"),
+ inner_map2 SEC(".maps");
+
+struct outer_arr {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 3);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ /* it's possible to use anonymous struct as inner map definition here */
+ __array(values, struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ /* changing max_entries to 2 will fail during load
+ * due to incompatibility with inner_map definition */
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+ });
+} outer_arr SEC(".maps") = {
+ /* (void *) cast is necessary because we didn't use `struct inner_map`
+ * in __inner(values, ...)
+ * Actually, a conscious effort is required to screw up initialization
+ * of inner map slots, which is a great thing!
+ */
+ .values = { (void *)&inner_map1, 0, (void *)&inner_map2 },
+};
+
+struct outer_hash {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __uint(max_entries, 5);
+ __uint(key_size, sizeof(int));
+ /* Here everything works flawlessly due to reuse of struct inner_map
+ * and compiler will complain at the attempt to use non-inner_map
+ * references below. This is great experience.
+ */
+ __array(values, struct inner_map);
+} outer_hash SEC(".maps") = {
+ .values = {
+ [0] = &inner_map2,
+ [4] = &inner_map1,
+ },
+};
+
+int input = 0;
+
+SEC("raw_tp/sys_enter")
+int handle__sys_enter(void *ctx)
+{
+ struct inner_map *inner_map;
+ int key = 0, val;
+
+ inner_map = bpf_map_lookup_elem(&outer_arr, &key);
+ if (!inner_map)
+ return 1;
+ val = input;
+ bpf_map_update_elem(inner_map, &key, &val, 0);
+
+ inner_map = bpf_map_lookup_elem(&outer_hash, &key);
+ if (!inner_map)
+ return 1;
+ val = input + 1;
+ bpf_map_update_elem(inner_map, &key, &val, 0);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
new file mode 100644
index 000000000000..1668b993eb86
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
@@ -0,0 +1,1058 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2019, 2020 Cloudflare
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/bpf.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#include "test_cls_redirect.h"
+
+#define offsetofend(TYPE, MEMBER) \
+ (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))
+
+#define IP_OFFSET_MASK (0x1FFF)
+#define IP_MF (0x2000)
+
+char _license[] SEC("license") = "Dual BSD/GPL";
+
+/**
+ * Destination port and IP used for UDP encapsulation.
+ */
+static volatile const __be16 ENCAPSULATION_PORT;
+static volatile const __be32 ENCAPSULATION_IP;
+
+typedef struct {
+ uint64_t processed_packets_total;
+ uint64_t l3_protocol_packets_total_ipv4;
+ uint64_t l3_protocol_packets_total_ipv6;
+ uint64_t l4_protocol_packets_total_tcp;
+ uint64_t l4_protocol_packets_total_udp;
+ uint64_t accepted_packets_total_syn;
+ uint64_t accepted_packets_total_syn_cookies;
+ uint64_t accepted_packets_total_last_hop;
+ uint64_t accepted_packets_total_icmp_echo_request;
+ uint64_t accepted_packets_total_established;
+ uint64_t forwarded_packets_total_gue;
+ uint64_t forwarded_packets_total_gre;
+
+ uint64_t errors_total_unknown_l3_proto;
+ uint64_t errors_total_unknown_l4_proto;
+ uint64_t errors_total_malformed_ip;
+ uint64_t errors_total_fragmented_ip;
+ uint64_t errors_total_malformed_icmp;
+ uint64_t errors_total_unwanted_icmp;
+ uint64_t errors_total_malformed_icmp_pkt_too_big;
+ uint64_t errors_total_malformed_tcp;
+ uint64_t errors_total_malformed_udp;
+ uint64_t errors_total_icmp_echo_replies;
+ uint64_t errors_total_malformed_encapsulation;
+ uint64_t errors_total_encap_adjust_failed;
+ uint64_t errors_total_encap_buffer_too_small;
+ uint64_t errors_total_redirect_loop;
+} metrics_t;
+
+typedef enum {
+ INVALID = 0,
+ UNKNOWN,
+ ECHO_REQUEST,
+ SYN,
+ SYN_COOKIE,
+ ESTABLISHED,
+} verdict_t;
+
+typedef struct {
+ uint16_t src, dst;
+} flow_ports_t;
+
+_Static_assert(
+ sizeof(flow_ports_t) !=
+ offsetofend(struct bpf_sock_tuple, ipv4.dport) -
+ offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
+ "flow_ports_t must match sport and dport in struct bpf_sock_tuple");
+_Static_assert(
+ sizeof(flow_ports_t) !=
+ offsetofend(struct bpf_sock_tuple, ipv6.dport) -
+ offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
+ "flow_ports_t must match sport and dport in struct bpf_sock_tuple");
+
+typedef int ret_t;
+
+/* This is a bit of a hack. We need a return value which allows us to
+ * indicate that the regular flow of the program should continue,
+ * while allowing functions to use XDP_PASS and XDP_DROP, etc.
+ */
+static const ret_t CONTINUE_PROCESSING = -1;
+
+/* Convenience macro to call functions which return ret_t.
+ */
+#define MAYBE_RETURN(x) \
+ do { \
+ ret_t __ret = x; \
+ if (__ret != CONTINUE_PROCESSING) \
+ return __ret; \
+ } while (0)
+
+/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
+ * or not aligned if the arch supports efficient unaligned access.
+ *
+ * Since the verifier ensures that eBPF packet accesses follow these rules,
+ * we can tell LLVM to emit code as if we always had a larger alignment.
+ * It will yell at us if we end up on a platform where this is not valid.
+ */
+typedef uint8_t *net_ptr __attribute__((align_value(8)));
+
+typedef struct buf {
+ struct __sk_buff *skb;
+ net_ptr head;
+ /* NB: tail musn't have alignment other than 1, otherwise
+ * LLVM will go and eliminate code, e.g. when checking packet lengths.
+ */
+ uint8_t *const tail;
+} buf_t;
+
+static size_t buf_off(const buf_t *buf)
+{
+ /* Clang seems to optimize constructs like
+ * a - b + c
+ * if c is known:
+ * r? = c
+ * r? -= b
+ * r? += a
+ *
+ * This is a problem if a and b are packet pointers,
+ * since the verifier allows subtracting two pointers to
+ * get a scalar, but not a scalar and a pointer.
+ *
+ * Use inline asm to break this optimization.
+ */
+ size_t off = (size_t)buf->head;
+ asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
+ return off;
+}
+
+static bool buf_copy(buf_t *buf, void *dst, size_t len)
+{
+ if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
+ return false;
+ }
+
+ buf->head += len;
+ return true;
+}
+
+static bool buf_skip(buf_t *buf, const size_t len)
+{
+ /* Check whether off + len is valid in the non-linear part. */
+ if (buf_off(buf) + len > buf->skb->len) {
+ return false;
+ }
+
+ buf->head += len;
+ return true;
+}
+
+/* Returns a pointer to the start of buf, or NULL if len is
+ * larger than the remaining data. Consumes len bytes on a successful
+ * call.
+ *
+ * If scratch is not NULL, the function will attempt to load non-linear
+ * data via bpf_skb_load_bytes. On success, scratch is returned.
+ */
+static void *buf_assign(buf_t *buf, const size_t len, void *scratch)
+{
+ if (buf->head + len > buf->tail) {
+ if (scratch == NULL) {
+ return NULL;
+ }
+
+ return buf_copy(buf, scratch, len) ? scratch : NULL;
+ }
+
+ void *ptr = buf->head;
+ buf->head += len;
+ return ptr;
+}
+
+static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
+{
+ if (ipv4->ihl <= 5) {
+ return true;
+ }
+
+ return buf_skip(buf, (ipv4->ihl - 5) * 4);
+}
+
+static bool ipv4_is_fragment(const struct iphdr *ip)
+{
+ uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
+ return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
+}
+
+static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
+{
+ struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
+ if (ipv4 == NULL) {
+ return NULL;
+ }
+
+ if (ipv4->ihl < 5) {
+ return NULL;
+ }
+
+ if (!pkt_skip_ipv4_options(pkt, ipv4)) {
+ return NULL;
+ }
+
+ return ipv4;
+}
+
+/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
+static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
+{
+ if (!buf_copy(pkt, ports, sizeof(*ports))) {
+ return false;
+ }
+
+ /* Ports in the L4 headers are reversed, since we are parsing an ICMP
+ * payload which is going towards the eyeball.
+ */
+ uint16_t dst = ports->src;
+ ports->src = ports->dst;
+ ports->dst = dst;
+ return true;
+}
+
+static uint16_t pkt_checksum_fold(uint32_t csum)
+{
+ /* The highest reasonable value for an IPv4 header
+ * checksum requires two folds, so we just do that always.
+ */
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+ return (uint16_t)~csum;
+}
+
+static void pkt_ipv4_checksum(struct iphdr *iph)
+{
+ iph->check = 0;
+
+ /* An IP header without options is 20 bytes. Two of those
+ * are the checksum, which we always set to zero. Hence,
+ * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
+ * which fits in 32 bit.
+ */
+ _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
+ uint32_t acc = 0;
+ uint16_t *ipw = (uint16_t *)iph;
+
+#pragma clang loop unroll(full)
+ for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
+ acc += ipw[i];
+ }
+
+ iph->check = pkt_checksum_fold(acc);
+}
+
+static bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
+ const struct ipv6hdr *ipv6,
+ uint8_t *upper_proto,
+ bool *is_fragment)
+{
+ /* We understand five extension headers.
+ * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
+ * headers should occur once, except Destination Options, which may
+ * occur twice. Hence we give up after 6 headers.
+ */
+ struct {
+ uint8_t next;
+ uint8_t len;
+ } exthdr = {
+ .next = ipv6->nexthdr,
+ };
+ *is_fragment = false;
+
+#pragma clang loop unroll(full)
+ for (int i = 0; i < 6; i++) {
+ switch (exthdr.next) {
+ case IPPROTO_FRAGMENT:
+ *is_fragment = true;
+ /* NB: We don't check that hdrlen == 0 as per spec. */
+ /* fallthrough; */
+
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_MH:
+ if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
+ return false;
+ }
+
+ /* hdrlen is in 8-octet units, and excludes the first 8 octets. */
+ if (!buf_skip(pkt,
+ (exthdr.len + 1) * 8 - sizeof(exthdr))) {
+ return false;
+ }
+
+ /* Decode next header */
+ break;
+
+ default:
+ /* The next header is not one of the known extension
+ * headers, treat it as the upper layer header.
+ *
+ * This handles IPPROTO_NONE.
+ *
+ * Encapsulating Security Payload (50) and Authentication
+ * Header (51) also end up here (and will trigger an
+ * unknown proto error later). They have a custom header
+ * format and seem too esoteric to care about.
+ */
+ *upper_proto = exthdr.next;
+ return true;
+ }
+ }
+
+ /* We never found an upper layer header. */
+ return false;
+}
+
+/* This function has to be inlined, because the verifier otherwise rejects it
+ * due to returning a pointer to the stack. This is technically correct, since
+ * scratch is allocated on the stack. However, this usage should be safe since
+ * it's the callers stack after all.
+ */
+static inline __attribute__((__always_inline__)) struct ipv6hdr *
+pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
+ bool *is_fragment)
+{
+ struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
+ if (ipv6 == NULL) {
+ return NULL;
+ }
+
+ if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
+ return NULL;
+ }
+
+ return ipv6;
+}
+
+/* Global metrics, per CPU
+ */
+struct bpf_map_def metrics_map SEC("maps") = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(unsigned int),
+ .value_size = sizeof(metrics_t),
+ .max_entries = 1,
+};
+
+static metrics_t *get_global_metrics(void)
+{
+ uint64_t key = 0;
+ return bpf_map_lookup_elem(&metrics_map, &key);
+}
+
+static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
+{
+ const int payload_off =
+ sizeof(*encap) +
+ sizeof(struct in_addr) * encap->unigue.hop_count;
+ int32_t encap_overhead = payload_off - sizeof(struct ethhdr);
+
+ // Changing the ethertype if the encapsulated packet is ipv6
+ if (encap->gue.proto_ctype == IPPROTO_IPV6) {
+ encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
+ }
+
+ if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
+ BPF_F_ADJ_ROOM_FIXED_GSO)) {
+ return TC_ACT_SHOT;
+ }
+
+ return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
+}
+
+static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
+ struct in_addr *next_hop, metrics_t *metrics)
+{
+ metrics->forwarded_packets_total_gre++;
+
+ const int payload_off =
+ sizeof(*encap) +
+ sizeof(struct in_addr) * encap->unigue.hop_count;
+ int32_t encap_overhead =
+ payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
+ int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
+ uint16_t proto = ETH_P_IP;
+
+ /* Loop protection: the inner packet's TTL is decremented as a safeguard
+ * against any forwarding loop. As the only interesting field is the TTL
+ * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
+ * as they handle the split packets if needed (no need for the data to be
+ * in the linear section).
+ */
+ if (encap->gue.proto_ctype == IPPROTO_IPV6) {
+ proto = ETH_P_IPV6;
+ uint8_t ttl;
+ int rc;
+
+ rc = bpf_skb_load_bytes(
+ skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
+ &ttl, 1);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (ttl == 0) {
+ metrics->errors_total_redirect_loop++;
+ return TC_ACT_SHOT;
+ }
+
+ ttl--;
+ rc = bpf_skb_store_bytes(
+ skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
+ &ttl, 1, 0);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+ } else {
+ uint8_t ttl;
+ int rc;
+
+ rc = bpf_skb_load_bytes(
+ skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
+ 1);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (ttl == 0) {
+ metrics->errors_total_redirect_loop++;
+ return TC_ACT_SHOT;
+ }
+
+ /* IPv4 also has a checksum to patch. While the TTL is only one byte,
+ * this function only works for 2 and 4 bytes arguments (the result is
+ * the same).
+ */
+ rc = bpf_l3_csum_replace(
+ skb, payload_off + offsetof(struct iphdr, check), ttl,
+ ttl - 1, 2);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ ttl--;
+ rc = bpf_skb_store_bytes(
+ skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
+ 0);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+ }
+
+ if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
+ BPF_F_ADJ_ROOM_FIXED_GSO)) {
+ metrics->errors_total_encap_adjust_failed++;
+ return TC_ACT_SHOT;
+ }
+
+ if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
+ metrics->errors_total_encap_buffer_too_small++;
+ return TC_ACT_SHOT;
+ }
+
+ buf_t pkt = {
+ .skb = skb,
+ .head = (uint8_t *)(long)skb->data,
+ .tail = (uint8_t *)(long)skb->data_end,
+ };
+
+ encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
+ if (encap_gre == NULL) {
+ metrics->errors_total_encap_buffer_too_small++;
+ return TC_ACT_SHOT;
+ }
+
+ encap_gre->ip.protocol = IPPROTO_GRE;
+ encap_gre->ip.daddr = next_hop->s_addr;
+ encap_gre->ip.saddr = ENCAPSULATION_IP;
+ encap_gre->ip.tot_len =
+ bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
+ encap_gre->gre.flags = 0;
+ encap_gre->gre.protocol = bpf_htons(proto);
+ pkt_ipv4_checksum((void *)&encap_gre->ip);
+
+ return bpf_redirect(skb->ifindex, 0);
+}
+
+static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
+ struct in_addr *next_hop, metrics_t *metrics)
+{
+ /* swap L2 addresses */
+ /* This assumes that packets are received from a router.
+ * So just swapping the MAC addresses here will make the packet go back to
+ * the router, which will send it to the appropriate machine.
+ */
+ unsigned char temp[ETH_ALEN];
+ memcpy(temp, encap->eth.h_dest, sizeof(temp));
+ memcpy(encap->eth.h_dest, encap->eth.h_source,
+ sizeof(encap->eth.h_dest));
+ memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));
+
+ if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
+ encap->unigue.last_hop_gre) {
+ return forward_with_gre(skb, encap, next_hop, metrics);
+ }
+
+ metrics->forwarded_packets_total_gue++;
+ uint32_t old_saddr = encap->ip.saddr;
+ encap->ip.saddr = encap->ip.daddr;
+ encap->ip.daddr = next_hop->s_addr;
+ if (encap->unigue.next_hop < encap->unigue.hop_count) {
+ encap->unigue.next_hop++;
+ }
+
+ /* Remove ip->saddr, add next_hop->s_addr */
+ const uint64_t off = offsetof(typeof(*encap), ip.check);
+ int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
+ if (ret < 0) {
+ return TC_ACT_SHOT;
+ }
+
+ return bpf_redirect(skb->ifindex, 0);
+}
+
+static ret_t skip_next_hops(buf_t *pkt, int n)
+{
+ switch (n) {
+ case 1:
+ if (!buf_skip(pkt, sizeof(struct in_addr)))
+ return TC_ACT_SHOT;
+ case 0:
+ return CONTINUE_PROCESSING;
+
+ default:
+ return TC_ACT_SHOT;
+ }
+}
+
+/* Get the next hop from the GLB header.
+ *
+ * Sets next_hop->s_addr to 0 if there are no more hops left.
+ * pkt is positioned just after the variable length GLB header
+ * iff the call is successful.
+ */
+static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
+ struct in_addr *next_hop)
+{
+ if (encap->unigue.next_hop > encap->unigue.hop_count) {
+ return TC_ACT_SHOT;
+ }
+
+ /* Skip "used" next hops. */
+ MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));
+
+ if (encap->unigue.next_hop == encap->unigue.hop_count) {
+ /* No more next hops, we are at the end of the GLB header. */
+ next_hop->s_addr = 0;
+ return CONTINUE_PROCESSING;
+ }
+
+ if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
+ return TC_ACT_SHOT;
+ }
+
+ /* Skip the remainig next hops (may be zero). */
+ return skip_next_hops(pkt, encap->unigue.hop_count -
+ encap->unigue.next_hop - 1);
+}
+
+/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
+ * This is a kludge that let's us work around verifier limitations:
+ *
+ * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
+ *
+ * clang will substitue a costant for sizeof, which allows the verifier
+ * to track it's value. Based on this, it can figure out the constant
+ * return value, and calling code works while still being "generic" to
+ * IPv4 and IPv6.
+ */
+static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
+ uint64_t iphlen, uint16_t sport, uint16_t dport)
+{
+ switch (iphlen) {
+ case sizeof(struct iphdr): {
+ struct iphdr *ipv4 = (struct iphdr *)iph;
+ tuple->ipv4.daddr = ipv4->daddr;
+ tuple->ipv4.saddr = ipv4->saddr;
+ tuple->ipv4.sport = sport;
+ tuple->ipv4.dport = dport;
+ return sizeof(tuple->ipv4);
+ }
+
+ case sizeof(struct ipv6hdr): {
+ struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
+ memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
+ sizeof(tuple->ipv6.daddr));
+ memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
+ sizeof(tuple->ipv6.saddr));
+ tuple->ipv6.sport = sport;
+ tuple->ipv6.dport = dport;
+ return sizeof(tuple->ipv6);
+ }
+
+ default:
+ return 0;
+ }
+}
+
+static verdict_t classify_tcp(struct __sk_buff *skb,
+ struct bpf_sock_tuple *tuple, uint64_t tuplen,
+ void *iph, struct tcphdr *tcp)
+{
+ struct bpf_sock *sk =
+ bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
+ if (sk == NULL) {
+ return UNKNOWN;
+ }
+
+ if (sk->state != BPF_TCP_LISTEN) {
+ bpf_sk_release(sk);
+ return ESTABLISHED;
+ }
+
+ if (iph != NULL && tcp != NULL) {
+ /* Kludge: we've run out of arguments, but need the length of the ip header. */
+ uint64_t iphlen = sizeof(struct iphdr);
+ if (tuplen == sizeof(tuple->ipv6)) {
+ iphlen = sizeof(struct ipv6hdr);
+ }
+
+ if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
+ sizeof(*tcp)) == 0) {
+ bpf_sk_release(sk);
+ return SYN_COOKIE;
+ }
+ }
+
+ bpf_sk_release(sk);
+ return UNKNOWN;
+}
+
+static verdict_t classify_udp(struct __sk_buff *skb,
+ struct bpf_sock_tuple *tuple, uint64_t tuplen)
+{
+ struct bpf_sock *sk =
+ bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
+ if (sk == NULL) {
+ return UNKNOWN;
+ }
+
+ if (sk->state == BPF_TCP_ESTABLISHED) {
+ bpf_sk_release(sk);
+ return ESTABLISHED;
+ }
+
+ bpf_sk_release(sk);
+ return UNKNOWN;
+}
+
+static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
+ struct bpf_sock_tuple *tuple, uint64_t tuplen,
+ metrics_t *metrics)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ return classify_tcp(skb, tuple, tuplen, NULL, NULL);
+
+ case IPPROTO_UDP:
+ return classify_udp(skb, tuple, tuplen);
+
+ default:
+ metrics->errors_total_malformed_icmp++;
+ return INVALID;
+ }
+}
+
+static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
+{
+ struct icmphdr icmp;
+ if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
+ metrics->errors_total_malformed_icmp++;
+ return INVALID;
+ }
+
+ /* We should never receive encapsulated echo replies. */
+ if (icmp.type == ICMP_ECHOREPLY) {
+ metrics->errors_total_icmp_echo_replies++;
+ return INVALID;
+ }
+
+ if (icmp.type == ICMP_ECHO) {
+ return ECHO_REQUEST;
+ }
+
+ if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
+ metrics->errors_total_unwanted_icmp++;
+ return INVALID;
+ }
+
+ struct iphdr _ip4;
+ const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
+ if (ipv4 == NULL) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ /* The source address in the outer IP header is from the entity that
+ * originated the ICMP message. Use the original IP header to restore
+ * the correct flow tuple.
+ */
+ struct bpf_sock_tuple tuple;
+ tuple.ipv4.saddr = ipv4->daddr;
+ tuple.ipv4.daddr = ipv4->saddr;
+
+ if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
+ sizeof(tuple.ipv4), metrics);
+}
+
+static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
+{
+ struct icmp6hdr icmp6;
+ if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
+ metrics->errors_total_malformed_icmp++;
+ return INVALID;
+ }
+
+ /* We should never receive encapsulated echo replies. */
+ if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
+ metrics->errors_total_icmp_echo_replies++;
+ return INVALID;
+ }
+
+ if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
+ return ECHO_REQUEST;
+ }
+
+ if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
+ metrics->errors_total_unwanted_icmp++;
+ return INVALID;
+ }
+
+ bool is_fragment;
+ uint8_t l4_proto;
+ struct ipv6hdr _ipv6;
+ const struct ipv6hdr *ipv6 =
+ pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
+ if (ipv6 == NULL) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ if (is_fragment) {
+ metrics->errors_total_fragmented_ip++;
+ return INVALID;
+ }
+
+ /* Swap source and dest addresses. */
+ struct bpf_sock_tuple tuple;
+ memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
+ memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));
+
+ if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
+ metrics);
+}
+
+static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
+ metrics_t *metrics)
+{
+ metrics->l4_protocol_packets_total_tcp++;
+
+ struct tcphdr _tcp;
+ struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
+ if (tcp == NULL) {
+ metrics->errors_total_malformed_tcp++;
+ return INVALID;
+ }
+
+ if (tcp->syn) {
+ return SYN;
+ }
+
+ struct bpf_sock_tuple tuple;
+ uint64_t tuplen =
+ fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
+ return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
+}
+
+static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
+ metrics_t *metrics)
+{
+ metrics->l4_protocol_packets_total_udp++;
+
+ struct udphdr _udp;
+ struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
+ if (udph == NULL) {
+ metrics->errors_total_malformed_udp++;
+ return INVALID;
+ }
+
+ struct bpf_sock_tuple tuple;
+ uint64_t tuplen =
+ fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
+ return classify_udp(pkt->skb, &tuple, tuplen);
+}
+
+static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
+{
+ metrics->l3_protocol_packets_total_ipv4++;
+
+ struct iphdr _ip4;
+ struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
+ if (ipv4 == NULL) {
+ metrics->errors_total_malformed_ip++;
+ return INVALID;
+ }
+
+ if (ipv4->version != 4) {
+ metrics->errors_total_malformed_ip++;
+ return INVALID;
+ }
+
+ if (ipv4_is_fragment(ipv4)) {
+ metrics->errors_total_fragmented_ip++;
+ return INVALID;
+ }
+
+ switch (ipv4->protocol) {
+ case IPPROTO_ICMP:
+ return process_icmpv4(pkt, metrics);
+
+ case IPPROTO_TCP:
+ return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);
+
+ case IPPROTO_UDP:
+ return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);
+
+ default:
+ metrics->errors_total_unknown_l4_proto++;
+ return INVALID;
+ }
+}
+
+static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
+{
+ metrics->l3_protocol_packets_total_ipv6++;
+
+ uint8_t l4_proto;
+ bool is_fragment;
+ struct ipv6hdr _ipv6;
+ struct ipv6hdr *ipv6 =
+ pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
+ if (ipv6 == NULL) {
+ metrics->errors_total_malformed_ip++;
+ return INVALID;
+ }
+
+ if (ipv6->version != 6) {
+ metrics->errors_total_malformed_ip++;
+ return INVALID;
+ }
+
+ if (is_fragment) {
+ metrics->errors_total_fragmented_ip++;
+ return INVALID;
+ }
+
+ switch (l4_proto) {
+ case IPPROTO_ICMPV6:
+ return process_icmpv6(pkt, metrics);
+
+ case IPPROTO_TCP:
+ return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);
+
+ case IPPROTO_UDP:
+ return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);
+
+ default:
+ metrics->errors_total_unknown_l4_proto++;
+ return INVALID;
+ }
+}
+
+SEC("classifier/cls_redirect")
+int cls_redirect(struct __sk_buff *skb)
+{
+ metrics_t *metrics = get_global_metrics();
+ if (metrics == NULL) {
+ return TC_ACT_SHOT;
+ }
+
+ metrics->processed_packets_total++;
+
+ /* Pass bogus packets as long as we're not sure they're
+ * destined for us.
+ */
+ if (skb->protocol != bpf_htons(ETH_P_IP)) {
+ return TC_ACT_OK;
+ }
+
+ encap_headers_t *encap;
+
+ /* Make sure that all encapsulation headers are available in
+ * the linear portion of the skb. This makes it easy to manipulate them.
+ */
+ if (bpf_skb_pull_data(skb, sizeof(*encap))) {
+ return TC_ACT_OK;
+ }
+
+ buf_t pkt = {
+ .skb = skb,
+ .head = (uint8_t *)(long)skb->data,
+ .tail = (uint8_t *)(long)skb->data_end,
+ };
+
+ encap = buf_assign(&pkt, sizeof(*encap), NULL);
+ if (encap == NULL) {
+ return TC_ACT_OK;
+ }
+
+ if (encap->ip.ihl != 5) {
+ /* We never have any options. */
+ return TC_ACT_OK;
+ }
+
+ if (encap->ip.daddr != ENCAPSULATION_IP ||
+ encap->ip.protocol != IPPROTO_UDP) {
+ return TC_ACT_OK;
+ }
+
+ /* TODO Check UDP length? */
+ if (encap->udp.dest != ENCAPSULATION_PORT) {
+ return TC_ACT_OK;
+ }
+
+ /* We now know that the packet is destined to us, we can
+ * drop bogus ones.
+ */
+ if (ipv4_is_fragment((void *)&encap->ip)) {
+ metrics->errors_total_fragmented_ip++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->gue.variant != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->gue.control != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->gue.flags != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->gue.hlen !=
+ sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->unigue.version != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->unigue.reserved != 0) {
+ return TC_ACT_SHOT;
+ }
+
+ struct in_addr next_hop;
+ MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
+
+ if (next_hop.s_addr == 0) {
+ metrics->accepted_packets_total_last_hop++;
+ return accept_locally(skb, encap);
+ }
+
+ verdict_t verdict;
+ switch (encap->gue.proto_ctype) {
+ case IPPROTO_IPIP:
+ verdict = process_ipv4(&pkt, metrics);
+ break;
+
+ case IPPROTO_IPV6:
+ verdict = process_ipv6(&pkt, metrics);
+ break;
+
+ default:
+ metrics->errors_total_unknown_l3_proto++;
+ return TC_ACT_SHOT;
+ }
+
+ switch (verdict) {
+ case INVALID:
+ /* metrics have already been bumped */
+ return TC_ACT_SHOT;
+
+ case UNKNOWN:
+ return forward_to_next_hop(skb, encap, &next_hop, metrics);
+
+ case ECHO_REQUEST:
+ metrics->accepted_packets_total_icmp_echo_request++;
+ break;
+
+ case SYN:
+ if (encap->unigue.forward_syn) {
+ return forward_to_next_hop(skb, encap, &next_hop,
+ metrics);
+ }
+
+ metrics->accepted_packets_total_syn++;
+ break;
+
+ case SYN_COOKIE:
+ metrics->accepted_packets_total_syn_cookies++;
+ break;
+
+ case ESTABLISHED:
+ metrics->accepted_packets_total_established++;
+ break;
+ }
+
+ return accept_locally(skb, encap);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.h b/tools/testing/selftests/bpf/progs/test_cls_redirect.h
new file mode 100644
index 000000000000..76eab0aacba0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/* Copyright 2019, 2020 Cloudflare */
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+
+struct gre_base_hdr {
+ uint16_t flags;
+ uint16_t protocol;
+} __attribute__((packed));
+
+struct guehdr {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ uint8_t hlen : 5, control : 1, variant : 2;
+#else
+ uint8_t variant : 2, control : 1, hlen : 5;
+#endif
+ uint8_t proto_ctype;
+ uint16_t flags;
+};
+
+struct unigue {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ uint8_t _r : 2, last_hop_gre : 1, forward_syn : 1, version : 4;
+#else
+ uint8_t version : 4, forward_syn : 1, last_hop_gre : 1, _r : 2;
+#endif
+ uint8_t reserved;
+ uint8_t next_hop;
+ uint8_t hop_count;
+ // Next hops go here
+} __attribute__((packed));
+
+typedef struct {
+ struct ethhdr eth;
+ struct iphdr ip;
+ struct gre_base_hdr gre;
+} __attribute__((packed)) encap_gre_t;
+
+typedef struct {
+ struct ethhdr eth;
+ struct iphdr ip;
+ struct udphdr udp;
+ struct guehdr gue;
+ struct unigue unigue;
+} __attribute__((packed)) encap_headers_t;
diff --git a/tools/testing/selftests/bpf/progs/test_enable_stats.c b/tools/testing/selftests/bpf/progs/test_enable_stats.c
new file mode 100644
index 000000000000..01a002ade529
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_enable_stats.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u64 count = 0;
+
+SEC("raw_tracepoint/sys_enter")
+int test_enable_stats(void *ctx)
+{
+ count += 1;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_obj_id.c b/tools/testing/selftests/bpf/progs/test_obj_id.c
index 98b9de2fafd0..ded71b3ff6b4 100644
--- a/tools/testing/selftests/bpf/progs/test_obj_id.c
+++ b/tools/testing/selftests/bpf/progs/test_obj_id.c
@@ -3,16 +3,8 @@
*/
#include <stddef.h>
#include <linux/bpf.h>
-#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
-/* It is a dumb bpf program such that it must have no
- * issue to be loaded since testing the verifier is
- * not the focus here.
- */
-
-int _version SEC("version") = 1;
-
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
@@ -20,13 +12,13 @@ struct {
__type(value, __u64);
} test_map_id SEC(".maps");
-SEC("test_obj_id_dummy")
-int test_obj_id(struct __sk_buff *skb)
+SEC("raw_tp/sys_enter")
+int test_obj_id(void *ctx)
{
__u32 key = 0;
__u64 *value;
value = bpf_map_lookup_elem(&test_map_id, &key);
- return TC_ACT_OK;
+ return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
index 8f530843b4da..1ecd987005d2 100644
--- a/tools/testing/selftests/bpf/progs/test_sk_assign.c
+++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
@@ -16,6 +16,26 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
+/* Pin map under /sys/fs/bpf/tc/globals/<map name> */
+#define PIN_GLOBAL_NS 2
+
+/* Must match struct bpf_elf_map layout from iproute2 */
+struct {
+ __u32 type;
+ __u32 size_key;
+ __u32 size_value;
+ __u32 max_elem;
+ __u32 flags;
+ __u32 id;
+ __u32 pinning;
+} server_map SEC("maps") = {
+ .type = BPF_MAP_TYPE_SOCKMAP,
+ .size_key = sizeof(int),
+ .size_value = sizeof(__u64),
+ .max_elem = 1,
+ .pinning = PIN_GLOBAL_NS,
+};
+
int _version SEC("version") = 1;
char _license[] SEC("license") = "GPL";
@@ -72,7 +92,9 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4)
{
struct bpf_sock_tuple ln = {0};
struct bpf_sock *sk;
+ const int zero = 0;
size_t tuple_len;
+ __be16 dport;
int ret;
tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
@@ -83,32 +105,11 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4)
if (sk)
goto assign;
- if (ipv4) {
- if (tuple->ipv4.dport != bpf_htons(4321))
- return TC_ACT_OK;
-
- ln.ipv4.daddr = bpf_htonl(0x7f000001);
- ln.ipv4.dport = bpf_htons(1234);
-
- sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv4),
- BPF_F_CURRENT_NETNS, 0);
- } else {
- if (tuple->ipv6.dport != bpf_htons(4321))
- return TC_ACT_OK;
-
- /* Upper parts of daddr are already zero. */
- ln.ipv6.daddr[3] = bpf_htonl(0x1);
- ln.ipv6.dport = bpf_htons(1234);
-
- sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv6),
- BPF_F_CURRENT_NETNS, 0);
- }
+ dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport;
+ if (dport != bpf_htons(4321))
+ return TC_ACT_OK;
- /* workaround: We can't do a single socket lookup here, because then
- * the compiler will likely spill tuple_len to the stack. This makes it
- * lose all bounds information in the verifier, which then rejects the
- * call as unsafe.
- */
+ sk = bpf_map_lookup_elem(&server_map, &zero);
if (!sk)
return TC_ACT_SHOT;
@@ -123,7 +124,9 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4)
{
struct bpf_sock_tuple ln = {0};
struct bpf_sock *sk;
+ const int zero = 0;
size_t tuple_len;
+ __be16 dport;
int ret;
tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
@@ -137,32 +140,11 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4)
bpf_sk_release(sk);
}
- if (ipv4) {
- if (tuple->ipv4.dport != bpf_htons(4321))
- return TC_ACT_OK;
+ dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport;
+ if (dport != bpf_htons(4321))
+ return TC_ACT_OK;
- ln.ipv4.daddr = bpf_htonl(0x7f000001);
- ln.ipv4.dport = bpf_htons(1234);
-
- sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
- BPF_F_CURRENT_NETNS, 0);
- } else {
- if (tuple->ipv6.dport != bpf_htons(4321))
- return TC_ACT_OK;
-
- /* Upper parts of daddr are already zero. */
- ln.ipv6.daddr[3] = bpf_htonl(0x1);
- ln.ipv6.dport = bpf_htons(1234);
-
- sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
- BPF_F_CURRENT_NETNS, 0);
- }
-
- /* workaround: We can't do a single socket lookup here, because then
- * the compiler will likely spill tuple_len to the stack. This makes it
- * lose all bounds information in the verifier, which then rejects the
- * call as unsafe.
- */
+ sk = bpf_map_lookup_elem(&server_map, &zero);
if (!sk)
return TC_ACT_SHOT;
diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c
index 2d0b0b82a78a..50525235380e 100644
--- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c
+++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c
@@ -45,7 +45,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx)
unsigned long tcp_mem[3] = {0, 0, 0};
char value[MAX_VALUE_STR_LEN];
unsigned char i, off = 0;
- int ret;
+ volatile int ret;
if (ctx->write)
return 0;
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index b521e0a512b6..93970ec1c9e9 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -351,6 +351,7 @@ int extract_build_id(char *build_id, size_t size)
len = size;
memcpy(build_id, line, len);
build_id[len] = '\0';
+ free(line);
return 0;
err:
fclose(fp);
@@ -420,6 +421,18 @@ static int libbpf_print_fn(enum libbpf_print_level level,
return 0;
}
+static void free_str_set(const struct str_set *set)
+{
+ int i;
+
+ if (!set)
+ return;
+
+ for (i = 0; i < set->cnt; i++)
+ free((void *)set->strs[i]);
+ free(set->strs);
+}
+
static int parse_str_list(const char *s, struct str_set *set)
{
char *input, *state = NULL, *next, **tmp, **strs = NULL;
@@ -756,11 +769,11 @@ int main(int argc, char **argv)
fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n",
env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt);
- free(env.test_selector.blacklist.strs);
- free(env.test_selector.whitelist.strs);
+ free_str_set(&env.test_selector.blacklist);
+ free_str_set(&env.test_selector.whitelist);
free(env.test_selector.num_set);
- free(env.subtest_selector.blacklist.strs);
- free(env.subtest_selector.whitelist.strs);
+ free_str_set(&env.subtest_selector.blacklist);
+ free_str_set(&env.subtest_selector.whitelist);
free(env.subtest_selector.num_set);
return env.fail_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index f4aff6b8284b..10188cc8e9e0 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -105,6 +105,13 @@ struct ipv6_packet {
} __packed;
extern struct ipv6_packet pkt_v6;
+#define PRINT_FAIL(format...) \
+ ({ \
+ test__fail(); \
+ fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__); \
+ fprintf(stdout, ##format); \
+ })
+
#define _CHECK(condition, tag, duration, format...) ({ \
int __ret = !!(condition); \
int __save_errno = errno; \
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 87eaa49609a0..21a1ce219c1c 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -50,7 +50,7 @@
#define MAX_INSNS BPF_MAXINSNS
#define MAX_TEST_INSNS 1000000
#define MAX_FIXUPS 8
-#define MAX_NR_MAPS 19
+#define MAX_NR_MAPS 20
#define MAX_TEST_RUNS 8
#define POINTER_VALUE 0xcafe4all
#define TEST_DATA_LEN 64
@@ -86,6 +86,7 @@ struct bpf_test {
int fixup_map_array_small[MAX_FIXUPS];
int fixup_sk_storage_map[MAX_FIXUPS];
int fixup_map_event_output[MAX_FIXUPS];
+ int fixup_map_reuseport_array[MAX_FIXUPS];
const char *errstr;
const char *errstr_unpriv;
uint32_t insn_processed;
@@ -637,6 +638,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
int *fixup_map_array_small = test->fixup_map_array_small;
int *fixup_sk_storage_map = test->fixup_sk_storage_map;
int *fixup_map_event_output = test->fixup_map_event_output;
+ int *fixup_map_reuseport_array = test->fixup_map_reuseport_array;
if (test->fill_helper) {
test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn));
@@ -806,6 +808,14 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
fixup_map_event_output++;
} while (*fixup_map_event_output);
}
+ if (*fixup_map_reuseport_array) {
+ map_fds[19] = __create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ sizeof(u32), sizeof(u64), 1, 0);
+ do {
+ prog[*fixup_map_reuseport_array].imm = map_fds[19];
+ fixup_map_reuseport_array++;
+ } while (*fixup_map_reuseport_array);
+ }
}
static int set_admin(bool admin)
@@ -943,7 +953,12 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
attr.insns = prog;
attr.insns_cnt = prog_len;
attr.license = "GPL";
- attr.log_level = verbose || expected_ret == VERBOSE_ACCEPT ? 1 : 4;
+ if (verbose)
+ attr.log_level = 1;
+ else if (expected_ret == VERBOSE_ACCEPT)
+ attr.log_level = 2;
+ else
+ attr.log_level = 4;
attr.prog_flags = pflags;
fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog));
diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c
index 130553e19eca..99f8f582c02b 100644
--- a/tools/testing/selftests/bpf/verifier/event_output.c
+++ b/tools/testing/selftests/bpf/verifier/event_output.c
@@ -92,3 +92,27 @@
.result = ACCEPT,
.retval = 1,
},
+{
+ "perfevent for cgroup dev",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for cgroup sysctl",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for cgroup sockopt",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
index da7a4b37cb98..fc4e301260f6 100644
--- a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
+++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
@@ -1,34 +1,4 @@
{
- "prevent map lookup in sockmap",
- .insns = {
- BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
- BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
- BPF_LD_MAP_FD(BPF_REG_1, 0),
- BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
- BPF_EXIT_INSN(),
- },
- .fixup_map_sockmap = { 3 },
- .result = REJECT,
- .errstr = "cannot pass map_type 15 into func bpf_map_lookup_elem",
- .prog_type = BPF_PROG_TYPE_SOCK_OPS,
-},
-{
- "prevent map lookup in sockhash",
- .insns = {
- BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
- BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
- BPF_LD_MAP_FD(BPF_REG_1, 0),
- BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
- BPF_EXIT_INSN(),
- },
- .fixup_map_sockhash = { 3 },
- .result = REJECT,
- .errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem",
- .prog_type = BPF_PROG_TYPE_SOCK_OPS,
-},
-{
"prevent map lookup in stack trace",
.insns = {
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c
index 9ed192e14f5f..0bc51ad9e0fb 100644
--- a/tools/testing/selftests/bpf/verifier/sock.c
+++ b/tools/testing/selftests/bpf/verifier/sock.c
@@ -516,3 +516,118 @@
.prog_type = BPF_PROG_TYPE_XDP,
.result = ACCEPT,
},
+{
+ "bpf_map_lookup_elem(sockmap, &key)",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockmap = { 3 },
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .result = REJECT,
+ .errstr = "Unreleased reference id=2 alloc_insn=5",
+},
+{
+ "bpf_map_lookup_elem(sockhash, &key)",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockhash = { 3 },
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .result = REJECT,
+ .errstr = "Unreleased reference id=2 alloc_insn=5",
+},
+{
+ "bpf_map_lookup_elem(sockmap, &key); sk->type [fullsock field]; bpf_sk_release(sk)",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockmap = { 3 },
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .result = ACCEPT,
+},
+{
+ "bpf_map_lookup_elem(sockhash, &key); sk->type [fullsock field]; bpf_sk_release(sk)",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockhash = { 3 },
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .result = ACCEPT,
+},
+{
+ "bpf_sk_select_reuseport(ctx, reuseport_array, &key, flags)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_reuseport_array = { 4 },
+ .prog_type = BPF_PROG_TYPE_SK_REUSEPORT,
+ .result = ACCEPT,
+},
+{
+ "bpf_sk_select_reuseport(ctx, sockmap, &key, flags)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockmap = { 4 },
+ .prog_type = BPF_PROG_TYPE_SK_REUSEPORT,
+ .result = ACCEPT,
+},
+{
+ "bpf_sk_select_reuseport(ctx, sockhash, &key, flags)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockmap = { 4 },
+ .prog_type = BPF_PROG_TYPE_SK_REUSEPORT,
+ .result = ACCEPT,
+},