37 files changed, 1969 insertions, 378 deletions
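The first hunk below corrects the worked tnum example in Documentation/networking/filter.txt: ORing the unknown byte (0x0; 0xff) with the constant 0x40 turns bit 6 into a known 1, so that bit leaves the unknown-bits mask, giving 0xbf rather than the old text's 0xcf. The arithmetic can be reproduced in userspace with a small sketch modeled on kernel/bpf/tnum.c (helper bodies paraphrased, not copied verbatim):

#include <stdio.h>
#include <stdint.h>

/* A tnum tracks a partially known value: bits set in 'mask' are
 * unknown, all other bits are fixed to the matching bits of 'value'.
 */
struct tnum { uint64_t value; uint64_t mask; };

static struct tnum tnum_or(struct tnum a, struct tnum b)
{
	uint64_t v = a.value | b.value;
	uint64_t mu = a.mask | b.mask;

	/* ORing a known 1 into an unknown bit makes it a known 1 */
	return (struct tnum){ v, mu & ~v };
}

static struct tnum tnum_add(struct tnum a, struct tnum b)
{
	uint64_t sm = a.mask + b.mask;
	uint64_t sv = a.value + b.value;
	uint64_t sigma = sm + sv;
	uint64_t chi = sigma ^ sv;	/* bits where carries may differ */
	uint64_t mu = chi | a.mask | b.mask;

	return (struct tnum){ sv & ~mu, mu };
}

int main(void)
{
	struct tnum byte = { 0x0, 0xff };	/* unknown low byte */
	struct tnum t = tnum_or(byte, (struct tnum){ 0x40, 0 });

	printf("(0x%llx; 0x%llx)\n",	/* prints (0x40; 0xbf) */
	       (unsigned long long)t.value, (unsigned long long)t.mask);
	t = tnum_add(t, (struct tnum){ 1, 0 });
	printf("(0x%llx; 0x%llx)\n",	/* prints (0x0; 0x1ff) */
	       (unsigned long long)t.value, (unsigned long long)t.mask);
	return 0;
}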
diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 87814859cfc2..a4508ec1816b 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1134,7 +1134,7 @@ The verifier's knowledge about the variable offset consists of: mask and value; no bit should ever be 1 in both. For example, if a byte is read into a register from memory, the register's top 56 bits are known zero, while the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we -then OR this with 0x40, we get (0x40; 0xcf), then if we add 1 we get (0x0; +then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; 0x1ff), because of potential carries. Besides arithmetic, the register state can also be updated by conditional branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 41e2feb0cf4f..b5030e1a41d8 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -363,15 +363,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) { const u8 *tmp = bpf2a32[TMP_REG_1]; - s32 jmp_offset; - /* checks if divisor is zero or not. If it is, then - * exit directly. - */ - emit(ARM_CMP_I(rn, 0), ctx); - _emit(ARM_COND_EQ, ARM_MOV_I(ARM_R0, 0), ctx); - jmp_offset = epilogue_offset(ctx); - _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); #if __LINUX_ARM_ARCH__ == 7 if (elf_hwcap & HWCAP_IDIVA) { if (op == BPF_DIV) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0775d5ab8ee9..1d4f1da7c58f 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -390,18 +390,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) case BPF_ALU64 | BPF_DIV | BPF_X: case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU64 | BPF_MOD | BPF_X: - { - const u8 r0 = bpf2a64[BPF_REG_0]; - - /* if (src == 0) return 0 */ - jmp_offset = 3; /* skip ahead to else path */ - check_imm19(jmp_offset); - emit(A64_CBNZ(is64, src, jmp_offset), ctx); - emit(A64_MOVZ(1, r0, 0, 0), ctx); - jmp_offset = epilogue_offset(ctx); - check_imm26(jmp_offset); - emit(A64_B(jmp_offset), ctx); - /* else */ switch (BPF_OP(code)) { case BPF_DIV: emit(A64_UDIV(is64, dst, dst, src), ctx); @@ -413,7 +401,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; } break; - } case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU64 | BPF_LSH | BPF_X: emit(A64_LSLV(is64, dst, dst, src), ctx); diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 4e347030ed2c..3e2798bfea4f 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -741,16 +741,11 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_ALU | BPF_DIV | BPF_K: /* ALU_IMM */ case BPF_ALU | BPF_MOD | BPF_K: /* ALU_IMM */ + if (insn->imm == 0) + return -EINVAL; dst = ebpf_to_mips_reg(ctx, insn, dst_reg); if (dst < 0) return dst; - if (insn->imm == 0) { /* Div by zero */ - b_off = b_imm(exit_idx, ctx); - if (is_bad_offset(b_off)) - return -E2BIG; - emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off); - emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO); - } td = get_reg_val_type(ctx, this_idx, insn->dst_reg); if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) /* sign extend */ @@ -770,19 +765,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case 
BPF_ALU64 | BPF_DIV | BPF_K: /* ALU_IMM */ case BPF_ALU64 | BPF_MOD | BPF_K: /* ALU_IMM */ + if (insn->imm == 0) + return -EINVAL; dst = ebpf_to_mips_reg(ctx, insn, dst_reg); if (dst < 0) return dst; - if (insn->imm == 0) { /* Div by zero */ - b_off = b_imm(exit_idx, ctx); - if (is_bad_offset(b_off)) - return -E2BIG; - emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off); - emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO); - } if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT) emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); - if (insn->imm == 1) { /* div by 1 is a nop, mod by 1 is zero */ if (bpf_op == BPF_MOD) @@ -860,11 +849,6 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_DIV: case BPF_MOD: - b_off = b_imm(exit_idx, ctx); - if (is_bad_offset(b_off)) - return -E2BIG; - emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); - emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); emit_instr(ctx, ddivu, dst, src); if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); @@ -943,11 +927,6 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_DIV: case BPF_MOD: - b_off = b_imm(exit_idx, ctx); - if (is_bad_offset(b_off)) - return -E2BIG; - emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); - emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); emit_instr(ctx, divu, dst, src); if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 217a78e84865..0a34b0cec7b7 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -381,10 +381,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, goto bpf_alu32_trunc; case BPF_ALU | BPF_DIV | BPF_X: /* (u32) dst /= (u32) src */ case BPF_ALU | BPF_MOD | BPF_X: /* (u32) dst %= (u32) src */ - PPC_CMPWI(src_reg, 0); - PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12); - PPC_LI(b2p[BPF_REG_0], 0); - PPC_JMP(exit_addr); if (BPF_OP(code) == BPF_MOD) { PPC_DIVWU(b2p[TMP_REG_1], dst_reg, src_reg); PPC_MULW(b2p[TMP_REG_1], src_reg, @@ -395,10 +391,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, goto bpf_alu32_trunc; case BPF_ALU64 | BPF_DIV | BPF_X: /* dst /= src */ case BPF_ALU64 | BPF_MOD | BPF_X: /* dst %= src */ - PPC_CMPDI(src_reg, 0); - PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12); - PPC_LI(b2p[BPF_REG_0], 0); - PPC_JMP(exit_addr); if (BPF_OP(code) == BPF_MOD) { PPC_DIVD(b2p[TMP_REG_1], dst_reg, src_reg); PPC_MULD(b2p[TMP_REG_1], src_reg, diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e50188773ff3..78a19c93b380 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -610,11 +610,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; - jit->seen |= SEEN_RET0; - /* ltr %src,%src (if src == 0 goto fail) */ - EMIT2(0x1200, src_reg, src_reg); - /* jz <ret0> */ - EMIT4_PCREL(0xa7840000, jit->ret0_ip - jit->prg); /* lhi %w0,0 */ EMIT4_IMM(0xa7080000, REG_W0, 0); /* lr %w1,%dst */ @@ -630,11 +625,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? 
REG_W1 : REG_W0; - jit->seen |= SEEN_RET0; - /* ltgr %src,%src (if src == 0 goto fail) */ - EMIT4(0xb9020000, src_reg, src_reg); - /* jz <ret0> */ - EMIT4_PCREL(0xa7840000, jit->ret0_ip - jit->prg); /* lghi %w0,0 */ EMIT4_IMM(0xa7090000, REG_W0, 0); /* lgr %w1,%dst */ diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 50a24d7bd4c5..48a25869349b 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -967,31 +967,17 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit_alu(MULX, src, dst, ctx); break; case BPF_ALU | BPF_DIV | BPF_X: - emit_cmp(src, G0, ctx); - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); - emit_write_y(G0, ctx); emit_alu(DIV, src, dst, ctx); break; - case BPF_ALU64 | BPF_DIV | BPF_X: - emit_cmp(src, G0, ctx); - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); - emit_alu(UDIVX, src, dst, ctx); break; - case BPF_ALU | BPF_MOD | BPF_X: { const u8 tmp = bpf2sparc[TMP_REG_1]; ctx->tmp_1_used = true; - emit_cmp(src, G0, ctx); - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); - emit_write_y(G0, ctx); emit_alu3(DIV, dst, src, tmp, ctx); emit_alu3(MULX, tmp, src, tmp, ctx); @@ -1003,10 +989,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) ctx->tmp_1_used = true; - emit_cmp(src, G0, ctx); - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); - emit_alu3(UDIVX, dst, src, tmp, ctx); emit_alu3(MULX, tmp, src, tmp, ctx); emit_alu3(SUB, dst, tmp, dst, ctx); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5acee5139e28..4923d92f918d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -568,26 +568,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, */ EMIT2(0x31, 0xd2); - if (BPF_SRC(insn->code) == BPF_X) { - /* if (src_reg == 0) return 0 */ - - /* cmp r11, 0 */ - EMIT4(0x49, 0x83, 0xFB, 0x00); - - /* jne .+9 (skip over pop, pop, xor and jmp) */ - EMIT2(X86_JNE, 1 + 1 + 2 + 5); - EMIT1(0x5A); /* pop rdx */ - EMIT1(0x58); /* pop rax */ - EMIT2(0x31, 0xc0); /* xor eax, eax */ - - /* jmp cleanup_addr - * addrs[i] - 11, because there are 11 bytes - * after this insn: div, mov, pop, pop, mov - */ - jmp_offset = ctx->cleanup_addr - (addrs[i] - 11); - EMIT1_off32(0xE9, jmp_offset); - } - if (BPF_CLASS(insn->code) == BPF_ALU64) /* div r11 */ EMIT3(0x49, 0xF7, 0xF3); diff --git a/include/linux/filter.h b/include/linux/filter.h index 425056c7f96c..276932d75975 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -688,6 +688,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb) struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); +bool bpf_opcode_in_insntable(u8 code); + struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); @@ -1003,10 +1005,20 @@ struct bpf_sock_ops_kern { struct sock *sk; u32 op; union { + u32 args[4]; u32 reply; u32 replylong[4]; }; u32 is_fullsock; + u64 temp; /* temp and everything after is not + * initialized to 0 before calling + * the BPF program. New fields that + * should be initialized to 0 should + * be inserted before temp. 
+ * temp is scratch storage used by + * sock_ops_convert_ctx_access + * as temporary storage of a register. + */ }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4f93f0953c41..8f4c54986f97 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -335,6 +335,17 @@ struct tcp_sock { int linger2; + +/* Sock_ops bpf program related variables */ +#ifdef CONFIG_BPF + u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs + * values defined in uapi/linux/tcp.h + */ +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) +#else +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 +#endif + /* Receiver side RTT estimation */ struct { u32 rtt_us; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5a1d26a18599..093e967a2960 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2006,12 +2006,12 @@ void tcp_cleanup_ulp(struct sock *sk); * program loaded). */ #ifdef CONFIG_BPF -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; - memset(&sock_ops, 0, sizeof(sock_ops)); + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_owned_by_me(sk); @@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op) sock_ops.sk = sk; sock_ops.op = op; + if (nargs > 0) + memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) @@ -2027,18 +2029,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op) ret = -1; return ret; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + u32 args[2] = {arg1, arg2}; + + return tcp_call_bpf(sk, op, 2, args); +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + u32 args[3] = {arg1, arg2, arg3}; + + return tcp_call_bpf(sk, op, 3, args); +} + #else -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + return -EPERM; +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + return -EPERM; +} + #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; - timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT); + timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; @@ -2049,7 +2079,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; - rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT); + rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; @@ -2058,7 +2088,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { - return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); + return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } #if IS_ENABLED(CONFIG_SMC) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 406c19d6016b..db6bdc375126 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -642,6 +642,14 @@ union bpf_attr { * @optlen: length of optval in bytes * Return: 0 or negative error * + * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) + * Set callback flags for sock_ops + * @bpf_sock_ops: pointer to 
bpf_sock_ops_kern struct + * @flags: flags value + * Return: 0 for no error + * -EINVAL if there is no full tcp socket + * bits in flags that are not supported by current kernel + * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) * Grow or shrink room in sk_buff. * @skb: pointer to skb @@ -748,7 +756,8 @@ union bpf_attr { FN(perf_event_read_value), \ FN(perf_prog_read_value), \ FN(getsockopt), \ - FN(override_return), + FN(override_return), \ + FN(sock_ops_cb_flags_set), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -952,8 +961,9 @@ struct bpf_map_info { struct bpf_sock_ops { __u32 op; union { - __u32 reply; - __u32 replylong[4]; + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ }; __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -968,8 +978,39 @@ struct bpf_sock_ops { */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; }; +/* Definitions for bpf_sock_ops_cb_flags */ +#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) +#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) +#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently + * supported cb flags + */ + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -1003,6 +1044,43 @@ enum { * a congestion threshold. RTTs above * this indicate congestion */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. + * Arg1: old_state + * Arg2: new_state + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3aa0658add76..5f35f93dcab2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -782,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +/* All UAPI available opcodes. */ +#define BPF_INSN_MAP(INSN_2, INSN_3) \ + /* 32 bit ALU operations. */ \ + /* Register based. 
*/ \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ + INSN_2(ALU, NEG), \ + INSN_3(ALU, END, TO_BE), \ + INSN_3(ALU, END, TO_LE), \ + /* Immediate based. */ \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ + /* 64 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU64, ADD, X), \ + INSN_3(ALU64, SUB, X), \ + INSN_3(ALU64, AND, X), \ + INSN_3(ALU64, OR, X), \ + INSN_3(ALU64, LSH, X), \ + INSN_3(ALU64, RSH, X), \ + INSN_3(ALU64, XOR, X), \ + INSN_3(ALU64, MUL, X), \ + INSN_3(ALU64, MOV, X), \ + INSN_3(ALU64, ARSH, X), \ + INSN_3(ALU64, DIV, X), \ + INSN_3(ALU64, MOD, X), \ + INSN_2(ALU64, NEG), \ + /* Immediate based. */ \ + INSN_3(ALU64, ADD, K), \ + INSN_3(ALU64, SUB, K), \ + INSN_3(ALU64, AND, K), \ + INSN_3(ALU64, OR, K), \ + INSN_3(ALU64, LSH, K), \ + INSN_3(ALU64, RSH, K), \ + INSN_3(ALU64, XOR, K), \ + INSN_3(ALU64, MUL, K), \ + INSN_3(ALU64, MOV, K), \ + INSN_3(ALU64, ARSH, K), \ + INSN_3(ALU64, DIV, K), \ + INSN_3(ALU64, MOD, K), \ + /* Call instruction. */ \ + INSN_2(JMP, CALL), \ + /* Exit instruction. */ \ + INSN_2(JMP, EXIT), \ + /* Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP, JEQ, X), \ + INSN_3(JMP, JNE, X), \ + INSN_3(JMP, JGT, X), \ + INSN_3(JMP, JLT, X), \ + INSN_3(JMP, JGE, X), \ + INSN_3(JMP, JLE, X), \ + INSN_3(JMP, JSGT, X), \ + INSN_3(JMP, JSLT, X), \ + INSN_3(JMP, JSGE, X), \ + INSN_3(JMP, JSLE, X), \ + INSN_3(JMP, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP, JEQ, K), \ + INSN_3(JMP, JNE, K), \ + INSN_3(JMP, JGT, K), \ + INSN_3(JMP, JLT, K), \ + INSN_3(JMP, JGE, K), \ + INSN_3(JMP, JLE, K), \ + INSN_3(JMP, JSGT, K), \ + INSN_3(JMP, JSLT, K), \ + INSN_3(JMP, JSGE, K), \ + INSN_3(JMP, JSLE, K), \ + INSN_3(JMP, JSET, K), \ + INSN_2(JMP, JA), \ + /* Store instructions. */ \ + /* Register based. */ \ + INSN_3(STX, MEM, B), \ + INSN_3(STX, MEM, H), \ + INSN_3(STX, MEM, W), \ + INSN_3(STX, MEM, DW), \ + INSN_3(STX, XADD, W), \ + INSN_3(STX, XADD, DW), \ + /* Immediate based. */ \ + INSN_3(ST, MEM, B), \ + INSN_3(ST, MEM, H), \ + INSN_3(ST, MEM, W), \ + INSN_3(ST, MEM, DW), \ + /* Load instructions. */ \ + /* Register based. */ \ + INSN_3(LDX, MEM, B), \ + INSN_3(LDX, MEM, H), \ + INSN_3(LDX, MEM, W), \ + INSN_3(LDX, MEM, DW), \ + /* Immediate based. */ \ + INSN_3(LD, IMM, DW), \ + /* Misc (old cBPF carry-over). */ \ + INSN_3(LD, ABS, B), \ + INSN_3(LD, ABS, H), \ + INSN_3(LD, ABS, W), \ + INSN_3(LD, IND, B), \ + INSN_3(LD, IND, H), \ + INSN_3(LD, IND, W) + +bool bpf_opcode_in_insntable(u8 code) +{ +#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true +#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true + static const bool public_insntable[256] = { + [0 ... 255] = false, + /* Now overwrite non-defaults ... 
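BPF_INSN_MAP is an X-macro: the one opcode list expands into whichever per-opcode form the caller supplies, so the verifier's opcode whitelist and the interpreter's jump table are generated from the same source and cannot drift apart. A sketch of the expansion for one entry, given the _TBL and _LBL callback macros defined nearby:

/* INSN_3(ALU, ADD, X) becomes, depending on the call site:
 *
 *   [BPF_ALU | BPF_ADD | BPF_X] = true          (BPF_INSN_3_TBL,
 *                                                bpf_opcode_in_insntable)
 *   [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X   (BPF_INSN_3_LBL,
 *                                                interpreter jumptable)
 *
 * INSN_2() does the same for opcodes without a source-operand variant,
 * e.g. INSN_2(JMP, EXIT) -> [BPF_JMP | BPF_EXIT] = ...
 */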
*/ + BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + }; +#undef BPF_INSN_3_TBL +#undef BPF_INSN_2_TBL + return public_insntable[code]; +} + #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context @@ -793,115 +924,18 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; +#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y +#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, + BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), + /* Non-UAPI available opcodes. 
*/ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, - [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, - [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, - [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, - [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; +#undef BPF_INSN_3_LBL +#undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; void *ptr; int off; @@ -965,14 +999,10 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; div64_u64_rem(DST, SRC, &tmp); DST = tmp; CONT; ALU_MOD_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); CONT; @@ -985,13 +1015,9 @@ select_insn: DST = do_div(tmp, (u32) IMM); CONT; ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); DST = (u32) tmp; @@ -1302,8 +1328,14 @@ load_byte: goto load_byte; default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + /* If we ever reach this, we have a bug somewhere. Die hard here + * instead of just returning 0; we could be somewhere in a subprog, + * so execution could continue otherwise which we do /not/ want. + * + * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). 
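Since bpf_opcode_in_insntable() is exported through linux/filter.h, other opcode consumers can perform the same early reject that the verifier does further below. A minimal kernel-context sketch of that pattern (hypothetical helper, not part of this series):

/* Reject a program containing opcodes outside the UAPI set before
 * spending any further verification or JIT effort on it.
 */
static int check_opcodes(const struct bpf_insn *insns, u32 cnt)
{
	u32 i;

	for (i = 0; i < cnt; i++)
		if (!bpf_opcode_in_insntable(insns[i].code))
			return -EINVAL;
	return 0;
}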
+ */ + pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + BUG_ON(1); return 0; } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index d7ea96218516..7b469d10d0e9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -593,11 +593,10 @@ unlock: static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { + struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; - struct lpm_trie_node *node, *next_node = NULL, *parent; struct lpm_trie_node **node_stack = NULL; - struct lpm_trie_node __rcu **root; int err = 0, stack_ptr = -1; unsigned int next_bit; size_t matchlen; @@ -614,22 +613,21 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) */ /* Empty trie */ - if (!rcu_dereference(trie->root)) + search_root = rcu_dereference(trie->root); + if (!search_root) return -ENOENT; /* For invalid key, find the leftmost node in the trie */ - if (!key || key->prefixlen > trie->max_prefixlen) { - root = &trie->root; + if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - } node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), - GFP_USER | __GFP_NOWARN); + GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) return -ENOMEM; /* Try to find the exact node for the given key */ - for (node = rcu_dereference(trie->root); node;) { + for (node = search_root; node;) { node_stack[++stack_ptr] = node; matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -640,10 +638,8 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = rcu_dereference(node->child[next_bit]); } if (!node || node->prefixlen != key->prefixlen || - (node->flags & LPM_TREE_NODE_FLAG_IM)) { - root = &trie->root; + (node->flags & LPM_TREE_NODE_FLAG_IM)) goto find_leftmost; - } /* The node with the exactly-matching key has been found, * find the first node in postorder after the matched node. @@ -651,10 +647,10 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = node_stack[stack_ptr]; while (stack_ptr > 0) { parent = node_stack[stack_ptr - 1]; - if (rcu_dereference(parent->child[0]) == node && - rcu_dereference(parent->child[1])) { - root = &parent->child[1]; - goto find_leftmost; + if (rcu_dereference(parent->child[0]) == node) { + search_root = rcu_dereference(parent->child[1]); + if (search_root) + goto find_leftmost; } if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { next_node = parent; @@ -673,7 +669,7 @@ find_leftmost: /* Find the leftmost non-intermediate node, all intermediate nodes * have exact two children, so this function will never return NULL. 
*/ - for (node = rcu_dereference(*root); node;) { + for (node = search_root; node;) { if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) next_node = node; node = rcu_dereference(node->child[0]); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5bdb0cc84ad2..e24aa3241387 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -709,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dfb138b46488..5fb69a85d967 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4981,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) next_insn: insn++; i++; + continue; + } + + /* Basic sanity check before we invest more work here. */ + if (!bpf_opcode_in_insntable(insn->code)) { + verbose(env, "unknown opcode %02x\n", insn->code); + return -EINVAL; } } @@ -5064,14 +5071,21 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } -/* The verifier does more data flow analysis than llvm and will not explore - * branches that are dead at run time. Malicious programs can have dead code - * too. Therefore replace all dead at-run-time code with nops. +/* The verifier does more data flow analysis than llvm and will not + * explore branches that are dead at run time. Malicious programs can + * have dead code too. Therefore replace all dead at-run-time code + * with 'ja -1'. + * + * Just nops are not optimal, e.g. if they would sit at the end of the + * program and through another bug we would manage to jump there, then + * we'd execute beyond program memory otherwise. Returning exception + * code also wouldn't work since we can have subprogs where the dead + * code could be located. 
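The 'ja -1' trap deserves a closer look: a BPF jump offset is relative to the instruction after the jump, so an offset of -1 targets the jump itself. Each sanitized dead slot therefore becomes a self-loop, as sketched below; even if a bug ever steered execution into dead code, it would spin in place instead of falling through past the end of the program or into a subprog:

/* One dead instruction slot after sanitizing:
 *
 *   insn[i] = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
 *
 * Interpreter view: next_pc = pc + 1 + off = pc + 1 - 1 = pc,
 * i.e. the trap branches back to itself.
 */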
*/ static void sanitize_dead_code(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); struct bpf_insn *insn = env->prog->insnsi; const int insn_cnt = env->prog->len; int i; @@ -5079,7 +5093,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { if (aux_data[i].seen) continue; - memcpy(insn + i, &nop, sizeof(nop)); + memcpy(insn + i, &trap, sizeof(trap)); } } @@ -5386,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - /* due to JIT bugs clear upper 32-bits of src register - * before div/mod operation - */ - insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); - insn_buf[1] = *insn; - cnt = 2; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + struct bpf_insn mask_and_div[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx div 0 -> 0 */ + BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + *insn, + }; + struct bpf_insn mask_and_mod[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx mod 0 -> Rx */ + BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + *insn, + }; + struct bpf_insn *patchlet; + + if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + patchlet = mask_and_div + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); + } else { + patchlet = mask_and_mod + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); + } + + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) return -ENOMEM; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index e3938e395cba..4cd9ea9b3449 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -2003,10 +2003,14 @@ static struct bpf_test tests[] = { { { 4, 0 }, { 5, 10 } } }, { - "INT: DIV by zero", + /* This one doesn't go through verifier, but is just raw insn + * as opposed to cBPF tests from here. Thus div by 0 tests are + * done in test_verifier in BPF kselftests. + */ + "INT: DIV by -1", .u.insns_int = { BPF_ALU64_REG(BPF_MOV, R6, R1), - BPF_ALU64_IMM(BPF_MOV, R7, 0), + BPF_ALU64_IMM(BPF_MOV, R7, -1), BPF_LD_ABS(BPF_B, 3), BPF_ALU32_REG(BPF_DIV, R0, R7), BPF_EXIT_INSN(), diff --git a/net/core/filter.c b/net/core/filter.c index 18da42a81d0c..08ab4c65a998 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -401,8 +401,8 @@ do_pass: /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. 
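Taken together, the fixup_bpf_calls() patchlets above pin down the new runtime semantics: for eBPF, division by zero now yields zero and modulo by zero leaves the dividend unchanged, while converted cBPF programs (handled just below) keep their historical behaviour of returning 0. A runnable userspace sketch of the eBPF semantics, not kernel code:

#include <assert.h>
#include <stdint.h>

/* BPF_DIV after this series: x / 0 == 0 */
static uint64_t bpf_div(uint64_t dst, uint64_t src)
{
	return src ? dst / src : 0;
}

/* BPF_MOD after this series: x % 0 == x */
static uint64_t bpf_mod(uint64_t dst, uint64_t src)
{
	return src ? dst % src : dst;
}

int main(void)
{
	assert(bpf_div(42, 0) == 0);
	assert(bpf_mod(42, 0) == 42);
	assert(bpf_div(42, 5) == 8);
	assert(bpf_mod(42, 5) == 2);
	return 0;
}

This is also why the per-arch JIT hunks earlier simply delete their divide-by-zero branches: the check is now emitted once, as ordinary BPF instructions, by the verifier.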
* In eBPF case it's done by the compiler, here we need to @@ -459,8 +459,15 @@ do_pass: break; if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || - fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. + * For cBPF programs, this was always return 0. + */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); + } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; @@ -3232,6 +3239,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (val == -1) + val = 0; + np->tclass = val; + } + break; + default: + ret = -EINVAL; + } +#endif } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { @@ -3241,7 +3271,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, reinit); + ret = tcp_set_congestion_control(sk, name, false, + reinit); } else { struct tcp_sock *tp = tcp_sk(sk); @@ -3307,6 +3338,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + *((int *)optval) = (int)np->tclass; + break; + default: + goto err_clear; + } +#endif } else { goto err_clear; } @@ -3328,6 +3375,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!sk_fullsock(sk)) + return -EINVAL; + +#ifdef CONFIG_INET + if (val) + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +#else + return -EINVAL; +#endif +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3510,6 +3584,8 @@ static const struct bpf_func_proto * return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: @@ -3826,34 +3902,44 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = 
sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; - - return true; -} -static bool sock_ops_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (type == BPF_WRITE) { switch (off) { - case offsetof(struct bpf_sock_ops, op) ... - offsetof(struct bpf_sock_ops, replylong[3]): + case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): + if (size != size_default) + return false; break; default: return false; } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } } - return __is_valid_sock_ops_access(off, size); + return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4470,10 +4556,37 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; -/* Helper macro for adding read access to tcp_sock fields. */ -#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + FIELD_SIZEOF(struct minmax_sample, t)); + break; + +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -4485,17 +4598,159 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ - offsetof(struct tcp_sock, FIELD_NAME)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + } while (0) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. 
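The macro that follows implements the save/borrow/restore dance described above. Written out as the annotated pseudo-assembly sequence it emits, where reg is R9 or the nearest lower register that clashes with neither dst_reg nor src_reg:

/* stx [ctx + offsetof(kern, temp)], reg       ; save borrowed reg
 * ldx reg, [ctx + offsetof(kern, is_fullsock)]
 * jeq reg, 0, +2                               ; not fullsock: skip write
 * ldx reg, [ctx + offsetof(kern, sk)]          ; reg = socket pointer
 * stx [reg + offsetof(OBJ, OBJ_FIELD)], src    ; the actual field write
 * ldx reg, [ctx + offsetof(kern, temp)]        ; always restore reg
 *
 * (kern stands for struct bpf_sock_ops_kern; the jeq skips only the
 * load and store, so the restore always executes.)
 */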
+ */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_TCP32(snd_cwnd); + SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_TCP32(srtt_us); + SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, + struct 
tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); + break; + + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); + break; + } return insn - insn_buf; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d7cf861bf699..f013ddc191e0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op); + tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -2042,6 +2042,30 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + /* We defined a new enum for TCP states that are exported in BPF + * so as not force the internal TCP states to be frozen. The + * following checks will detect if an internal state value ever + * differs from the BPF value. If this ever happens, then we will + * need to remap the internal value to the BPF value before calling + * tcp_call_bpf_2arg. + */ + BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); + BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); + BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); + BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); + BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); + BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); + BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); + BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); + BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); + BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 0b5a05bd82e3..ddbce73edae8 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk) * within a datacenter, where we have reasonable estimates of * RTTs */ - base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); if (base_rtt > 0) { ca->nv_base_rtt = base_rtt; ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 95461f02ac9a..e9f985e42405 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2905,6 +2905,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, + TCP_SKB_CB(skb)->seq, segs, err); + if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= 
TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); @@ -3469,7 +3473,7 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; - tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6db3124cdbda..257abdde23b0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -213,11 +213,18 @@ static int tcp_write_timeout(struct sock *sk) icsk->icsk_user_timeout); } tcp_fastopen_active_detect_blackhole(sk, expired); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, + icsk->icsk_retransmits, + icsk->icsk_rto, (int)expired); + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } + return 0; } diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 7f61a3d57fa7..64335bb94f9f 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -201,13 +201,16 @@ CLANG_ARCH_ARGS = -target $(ARCH) endif # Trick to allow make to be run from this directory -all: +all: $(LIBBPF) $(MAKE) -C ../../ $(CURDIR)/ clean: $(MAKE) -C ../../ M=$(CURDIR) clean @rm -f *~ +$(LIBBPF): FORCE + $(MAKE) -C $(dir $@) $(notdir $@) + $(obj)/syscall_nrs.s: $(src)/syscall_nrs.c $(call if_changed_dep,cc_s_c) diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 7cc9d228216f..7c25c0c112bc 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -23,8 +23,11 @@ #include <stdbool.h> #include <signal.h> #include <fcntl.h> +#include <sys/wait.h> +#include <time.h> #include <sys/time.h> +#include <sys/resource.h> #include <sys/types.h> #include <linux/netlink.h> @@ -35,6 +38,8 @@ #include <assert.h> #include <libgen.h> +#include <getopt.h> + #include "../bpf/bpf_load.h" #include "../bpf/bpf_util.h" #include "../bpf/libbpf.h" @@ -46,15 +51,42 @@ void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -static int sockmap_test_sockets(int rate, int dot) +/* global sockets */ +int s1, s2, c1, c2, p1, p2; + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"cgroup", required_argument, NULL, 'c' }, + {"rate", required_argument, NULL, 'r' }, + {"verbose", no_argument, NULL, 'v' }, + {"iov_count", required_argument, NULL, 'i' }, + {"length", required_argument, NULL, 'l' }, + {"test", required_argument, NULL, 't' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) +{ + int i; + + printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]); + printf(" options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)\n", + *long_options[i].flag); + else + printf(" -%c\n", long_options[i].val); + } + printf("\n"); +} + +static int sockmap_init_sockets(void) { - int i, sc, err, max_fd, one = 1; - int s1, s2, c1, c2, p1, p2; + int i, err, one = 1; struct sockaddr_in addr; - struct timeval timeout; - char buf[1024] = {0}; int *fds[4] = {&s1, &s2, &c1, &c2}; - fd_set w; s1 = s2 = p1 = p2 = c1 = c2 = 0; @@ -63,8 +95,7 @@ static int sockmap_test_sockets(int rate, int dot) *fds[i] = socket(AF_INET, SOCK_STREAM, 0); if (*fds[i] < 0) { perror("socket s1 failed()"); - err = *fds[i]; - goto out; + return errno; } } @@ -74,16 +105,16 @@ static int sockmap_test_sockets(int rate, int dot) (char *)&one, sizeof(one)); if 
(err) { perror("setsockopt failed()"); - goto out; + return errno; } } /* Non-blocking sockets */ - for (i = 0; i < 4; i++) { + for (i = 0; i < 2; i++) { err = ioctl(*fds[i], FIONBIO, (char *)&one); if (err < 0) { perror("ioctl s1 failed()"); - goto out; + return errno; } } @@ -96,14 +127,14 @@ static int sockmap_test_sockets(int rate, int dot) err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s2 failed()\n"); - goto out; + return errno; } /* Listen server sockets */ @@ -111,14 +142,14 @@ static int sockmap_test_sockets(int rate, int dot) err = listen(s1, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = listen(s2, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } /* Initiate Connect */ @@ -126,46 +157,232 @@ static int sockmap_test_sockets(int rate, int dot) err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { perror("connect c1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { perror("connect c2 failed()\n"); - goto out; + return errno; + } else if (err < 0) { + err = 0; } /* Accept Connecrtions */ p1 = accept(s1, NULL, NULL); if (p1 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } p2 = accept(s2, NULL, NULL); if (p2 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } - max_fd = p2; - timeout.tv_sec = 10; - timeout.tv_usec = 0; - printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", c1, s1, c2, s2); + return 0; +} + +struct msg_stats { + size_t bytes_sent; + size_t bytes_recvd; + struct timespec start; + struct timespec end; +}; + +static int msg_loop(int fd, int iov_count, int iov_length, int cnt, + struct msg_stats *s, bool tx) +{ + struct msghdr msg = {0}; + int err, i, flags = MSG_NOSIGNAL; + struct iovec *iov; + + iov = calloc(iov_count, sizeof(struct iovec)); + if (!iov) + return errno; + + for (i = 0; i < iov_count; i++) { + char *d = calloc(iov_length, sizeof(char)); + + if (!d) { + fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); + goto out_errno; + } + iov[i].iov_base = d; + iov[i].iov_len = iov_length; + } + + msg.msg_iov = iov; + msg.msg_iovlen = iov_count; + + if (tx) { + clock_gettime(CLOCK_MONOTONIC, &s->start); + for (i = 0; i < cnt; i++) { + int sent = sendmsg(fd, &msg, flags); + + if (sent < 0) { + perror("send loop error:"); + goto out_errno; + } + s->bytes_sent += sent; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + } else { + int slct, recv, max_fd = fd; + struct timeval timeout; + float total_bytes; + fd_set w; + + total_bytes = (float)iov_count * (float)iov_length * (float)cnt; + err = clock_gettime(CLOCK_MONOTONIC, &s->start); + if (err < 0) + perror("recv start time: "); + while (s->bytes_recvd < total_bytes) { + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + /* FD sets */ + FD_ZERO(&w); + FD_SET(fd, &w); + + slct = select(max_fd + 1, &w, NULL, NULL, &timeout); + if (slct == -1) { + perror("select()"); + clock_gettime(CLOCK_MONOTONIC, &s->end); + goto out_errno; + } else if (!slct) { + fprintf(stderr, "unexpected timeout\n"); + errno = -EIO; + clock_gettime(CLOCK_MONOTONIC, &s->end); + goto 
out_errno; + } + + recv = recvmsg(fd, &msg, flags); + if (recv < 0) { + if (errno != EWOULDBLOCK) { + clock_gettime(CLOCK_MONOTONIC, &s->end); + perror("recv failed()\n"); + goto out_errno; + } + } + + s->bytes_recvd += recv; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + } + + for (i = 0; i < iov_count; i++) + free(iov[i].iov_base); + free(iov); + return 0; +out_errno: + for (i = 0; i < iov_count; i++) + free(iov[i].iov_base); + free(iov); + return errno; +} + +static float giga = 1000000000; + +static inline float sentBps(struct msg_stats s) +{ + return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec); +} + +static inline float recvdBps(struct msg_stats s) +{ + return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec); +} + +static int sendmsg_test(int iov_count, int iov_buf, int cnt, + int verbose, bool base) +{ + float sent_Bps = 0, recvd_Bps = 0; + int rx_fd, txpid, rxpid, err = 0; + struct msg_stats s = {0}; + int status; + + errno = 0; + + if (base) + rx_fd = p1; + else + rx_fd = p2; + + rxpid = fork(); + if (rxpid == 0) { + err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false); + if (err) + fprintf(stderr, + "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + shutdown(p2, SHUT_RDWR); + shutdown(p1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); + recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); + exit(1); + } else if (rxpid == -1) { + perror("msg_loop_rx: "); + return errno; + } + + txpid = fork(); + if (txpid == 0) { + err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + if (err) + fprintf(stderr, + "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + shutdown(c1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); + recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); + exit(1); + } else if (txpid == -1) { + perror("msg_loop_tx: "); + return errno; + } + + assert(waitpid(rxpid, &status, 0) == rxpid); + assert(waitpid(txpid, &status, 0) == txpid); + return err; +} + +static int forever_ping_pong(int rate, int verbose) +{ + struct timeval timeout; + char buf[1024] = {0}; + int sc; + + timeout.tv_sec = 10; + timeout.tv_usec = 0; /* Ping/Pong data from client to server */ sc = send(c1, buf, sizeof(buf), 0); if (sc < 0) { perror("send failed()\n"); - goto out; + return sc; } do { - int s, rc, i; + int s, rc, i, max_fd = p2; + fd_set w; /* FD sets */ FD_ZERO(&w); @@ -193,7 +410,7 @@ static int sockmap_test_sockets(int rate, int dot) if (rc < 0) { if (errno != EWOULDBLOCK) { perror("recv failed()\n"); - break; + return rc; } } @@ -205,35 +422,92 @@ static int sockmap_test_sockets(int rate, int dot) sc = send(i, buf, rc, 0); if (sc < 0) { perror("send failed()\n"); - break; + return sc; } } - sleep(rate); - if (dot) { + + if (rate) + sleep(rate); + + if (verbose) { printf("."); fflush(stdout); } } while (running); -out: - close(s1); - close(s2); - close(p1); - close(p2); - close(c1); - close(c2); - return err; + return 0; } +enum { + PING_PONG, + SENDMSG, + BASE, +}; + int main(int argc, char **argv) { - int rate = 1, dot = 1; + int iov_count = 1, length = 1024, rate = 1, verbose = 0; + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + int opt, longindex, err, 
cg_fd = 0; + int test = PING_PONG; char filename[256]; - int err, cg_fd; - char *cg_path; - cg_path = argv[argc - 1]; + while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:", + long_options, &longindex)) != -1) { + switch (opt) { + /* Cgroup configuration */ + case 'c': + cg_fd = open(optarg, O_DIRECTORY, O_RDONLY); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, optarg); + return cg_fd; + } + break; + case 'r': + rate = atoi(optarg); + break; + case 'v': + verbose = 1; + break; + case 'i': + iov_count = atoi(optarg); + break; + case 'l': + length = atoi(optarg); + break; + case 't': + if (strcmp(optarg, "ping") == 0) { + test = PING_PONG; + } else if (strcmp(optarg, "sendmsg") == 0) { + test = SENDMSG; + } else if (strcmp(optarg, "base") == 0) { + test = BASE; + } else { + usage(argv); + return -1; + } + break; + case 'h': + default: + usage(argv); + return -1; + } + } + + if (!cg_fd) { + fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n", + argv[0]); + return -1; + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); running = 1; @@ -241,20 +515,16 @@ int main(int argc, char **argv) /* catch SIGINT */ signal(SIGINT, running_handler); + /* If base test skip BPF setup */ + if (test == BASE) + goto run; + if (load_bpf_file(filename)) { fprintf(stderr, "load_bpf_file: (%s) %s\n", filename, strerror(errno)); return 1; } - /* Cgroup configuration */ - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - if (cg_fd < 0) { - fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n", - cg_fd, cg_path); - return cg_fd; - } - /* Attach programs to sockmap */ err = bpf_prog_attach(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER, 0); @@ -280,12 +550,30 @@ int main(int argc, char **argv) return err; } - err = sockmap_test_sockets(rate, dot); +run: + err = sockmap_init_sockets(); if (err) { fprintf(stderr, "ERROR: test socket failed: %d\n", err); - return err; + goto out; } - return 0; + + if (test == PING_PONG) + err = forever_ping_pong(rate, verbose); + else if (test == SENDMSG) + err = sendmsg_test(iov_count, length, rate, verbose, false); + else if (test == BASE) + err = sendmsg_test(iov_count, length, rate, verbose, true); + else + fprintf(stderr, "unknown test\n"); +out: + close(s1); + close(s2); + close(p1); + close(p2); + close(c1); + close(c2); + close(cg_fd); + return err; } void running_handler(int a) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index af1f49ad8b88..db6bdc375126 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -17,7 +17,7 @@ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ +#define BPF_DW 0x18 /* double word (64-bit) */ #define BPF_XADD 0xc0 /* exclusive add */ /* alu/jmp fields */ @@ -642,6 +642,14 @@ union bpf_attr { * @optlen: length of optval in bytes * Return: 0 or negative error * + * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) + * Set callback flags for sock_ops + * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct + * @flags: flags value + * Return: 0 for no error + * -EINVAL if there is no full tcp socket + * bits in flags that are not supported by current kernel + * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) * Grow or shrink room in sk_buff. 
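The helper documented above is intended to be called from a sock_ops program once the connection is established. A minimal sketch of such a caller, assuming the bpf_helpers.h declaration added later in this patch (program and section names are illustrative):

#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("sockops")
int enable_state_cb(struct bpf_sock_ops *skops)
{
	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		/* Ask for further callbacks on RTOs and TCP state
		 * changes for this socket; 0 means all requested
		 * bits were accepted.
		 */
		skops->reply = bpf_sock_ops_cb_flags_set(skops,
				BPF_SOCK_OPS_RTO_CB_FLAG |
				BPF_SOCK_OPS_STATE_CB_FLAG);
	return 1;
}

char _license[] SEC("license") = "GPL";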
* @skb: pointer to skb @@ -748,7 +756,8 @@ union bpf_attr { FN(perf_event_read_value), \ FN(perf_prog_read_value), \ FN(getsockopt), \ - FN(override_return), + FN(override_return), \ + FN(sock_ops_cb_flags_set), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -952,8 +961,9 @@ struct bpf_map_info { struct bpf_sock_ops { __u32 op; union { - __u32 reply; - __u32 replylong[4]; + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ }; __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -968,8 +978,39 @@ struct bpf_sock_ops { */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; }; +/* Definitions for bpf_sock_ops_cb_flags */ +#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) +#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) +#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently + * supported cb flags + */ + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -1003,6 +1044,43 @@ enum { * a congestion threshold. RTTs above * this indicate congestion */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. + * Arg1: old_state + * Arg2: new_state + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! 
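Note the return convention these flag bits imply, which test_tcpbpf_kern.c below leans on: setting the undefined bit 0x80 is expected to hand 0x80 back, while a fully supported mask returns 0. An illustrative user-space model of that contract, assuming the updated uapi header above (this is not the kernel implementation):

#include <linux/bpf.h>
#include <linux/types.h>

/* Store the supported bits, return the unsupported ones (so 0 means
 * complete success); a non-full socket instead fails with -EINVAL.
 */
static int cb_flags_set(__u32 *sk_cb_flags, __u32 flags)
{
	*sk_cb_flags = flags & BPF_SOCK_OPS_ALL_CB_FLAGS;
	return flags & ~BPF_SOCK_OPS_ALL_CB_FLAGS;
}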
*/ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3a44b655d852..bf05bc5e36e5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -11,16 +11,16 @@ ifneq ($(wildcard $(GENHDR)),) endif CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include -LDLIBS += -lcap -lelf -lrt +LDLIBS += -lcap -lelf -lrt -lpthread TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_align test_verifier_log test_dev_cgroup + test_align test_verifier_log test_dev_cgroup test_tcpbpf_user TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ - sample_map_ret0.o + sample_map_ret0.o test_tcpbpf_kern.o TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ test_offload.py diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 33cb00e46c49..dde2c11d7771 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -71,6 +71,8 @@ static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, int optlen) = (void *) BPF_FUNC_getsockopt; +static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) = + (void *) BPF_FUNC_sock_ops_cb_flags_set; static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py new file mode 100755 index 000000000000..481dccdf140c --- /dev/null +++ b/tools/testing/selftests/bpf/tcp_client.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python2 +# +# SPDX-License-Identifier: GPL-2.0 +# + +import sys, os, os.path, getopt +import socket, time +import subprocess +import select + +def read(sock, n): + buf = '' + while len(buf) < n: + rem = n - len(buf) + try: s = sock.recv(rem) + except (socket.error), e: return '' + buf += s + return buf + +def send(sock, s): + total = len(s) + count = 0 + while count < total: + try: n = sock.send(s) + except (socket.error), e: n = 0 + if n == 0: + return count; + count += n + return count + + +serverPort = int(sys.argv[1]) +HostName = socket.gethostname() + +# create active socket +sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) +try: + sock.connect((HostName, serverPort)) +except socket.error as e: + sys.exit(1) + +buf = '' +n = 0 +while n < 1000: + buf += '+' + n += 1 + +sock.settimeout(1); +n = send(sock, buf) +n = read(sock, 500) +sys.exit(0) diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py new file mode 100755 index 000000000000..bc454d7d0be2 --- /dev/null +++ b/tools/testing/selftests/bpf/tcp_server.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python2 +# +# SPDX-License-Identifier: GPL-2.0 +# + +import sys, os, os.path, getopt +import socket, time +import subprocess +import select + +def read(sock, n): + buf = '' + while len(buf) < n: + rem = n - len(buf) + try: s = sock.recv(rem) + except (socket.error), e: return '' + buf += s + return buf 
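Both Python scripts wrap recv()/send() in loops because a stream socket may transfer fewer bytes than requested per call. The same pattern in C, for comparison (a sketch; errors simply end the transfer):

#include <sys/socket.h>
#include <sys/types.h>

static size_t send_all(int fd, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		/* Partial writes are normal on stream sockets */
		ssize_t n = send(fd, buf + off, len - off, 0);

		if (n <= 0)
			break;
		off += n;
	}
	return off;
}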
+ +def send(sock, s): + total = len(s) + count = 0 + while count < total: + try: n = sock.send(s) + except (socket.error), e: n = 0 + if n == 0: + return count; + count += n + return count + + +SERVER_PORT = 12877 +MAX_PORTS = 2 + +serverPort = SERVER_PORT +serverSocket = None + +HostName = socket.gethostname() + +# create passive socket +serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) +host = socket.gethostname() + +try: serverSocket.bind((host, 0)) +except socket.error as msg: + print 'bind fails: ', msg + +sn = serverSocket.getsockname() +serverPort = sn[1] + +cmdStr = ("./tcp_client.py %d &") % (serverPort) +os.system(cmdStr) + +buf = '' +n = 0 +while n < 500: + buf += '.' + n += 1 + +serverSocket.listen(MAX_PORTS) +readList = [serverSocket] + +while True: + readyRead, readyWrite, inError = \ + select.select(readList, [], [], 2) + + if len(readyRead) > 0: + waitCount = 0 + for sock in readyRead: + if sock == serverSocket: + (clientSocket, address) = serverSocket.accept() + address = str(address[0]) + readList.append(clientSocket) + else: + sock.settimeout(1); + s = read(sock, 1000) + n = send(sock, buf) + sock.close() + serverSocket.close() + sys.exit(0) + else: + print 'Select timeout!' + sys.exit(1) diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index e19b410125eb..ff8bd7e3e50c 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -446,11 +446,9 @@ static struct bpf_align_test tests[] = { .insns = { PREP_PKT_POINTERS, BPF_MOV64_IMM(BPF_REG_0, 0), - /* ptr & const => unknown & const */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), - BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 0x40), - /* ptr << const => unknown << const */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + /* (ptr - ptr) << 2 */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2), BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2), /* We have a (4n) value. Let's make a packet offset * out of it. First add 14, to make it a (4n+2) @@ -473,8 +471,26 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, - /* R5 bitwise operator &= on pointer prohibited */ + {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, + /* (ptr - ptr) << 2 == unknown, (4n) */ + {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, + /* (4n) + 14 == (4n+2). We blow our bounds, because + * the add could overflow. + */ + {7, "R5=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, + /* Checked s>=0 */ + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + /* packet pointer + nonnegative (4n+2) */ + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {13, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. + * We checked the bounds, but it might have been able + * to overflow if the packet pointer started in the + * upper half of the address space. + * So we did not get a 'range' on R6, and the access + * attempt will fail. 
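The var_off annotations in these matches follow the verifier's tnum ("tracked number") arithmetic, where each register carries a (value; mask) pair of known and unknown bits. A compact user-space rendering of the two operations this test exercises, mirroring kernel/bpf/tnum.c:

#include <stdint.h>

struct tnum { uint64_t value; uint64_t mask; };

/* Known and unknown bits shift together, so shifting a fully unknown
 * value left by 2 yields (0x0; ...fffc): some multiple of 4.
 */
static struct tnum tnum_lshift(struct tnum a, unsigned int shift)
{
	return (struct tnum){ a.value << shift, a.mask << shift };
}

/* Addition propagates possible carries into the mask; adding the
 * constant 14, i.e. (14; 0x0), to (0x0; ...fffc) gives
 * (0x2; ...fffc), a (4n+2) value - exactly the var_off printed at
 * insn 7 above.
 */
static struct tnum tnum_add(struct tnum a, struct tnum b)
{
	uint64_t sm = a.mask + b.mask;
	uint64_t sv = a.value + b.value;
	uint64_t sigma = sm + sv;
	uint64_t chi = sigma ^ sv;
	uint64_t mu = chi | a.mask | b.mask;

	return (struct tnum){ sv & ~mu, mu };
}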
+ */ + {15, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, } }, { diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c index c1535b34f14f..3489cc283433 100644 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ b/tools/testing/selftests/bpf/test_dev_cgroup.c @@ -21,7 +21,7 @@ #define DEV_CGROUP_PROG "./dev_cgroup.o" -#define TEST_CGROUP "test-bpf-based-device-cgroup/" +#define TEST_CGROUP "/test-bpf-based-device-cgroup/" int main(int argc, char **argv) { diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index 081510853c6d..2be87e9ee28d 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -14,6 +14,7 @@ #include <errno.h> #include <inttypes.h> #include <linux/bpf.h> +#include <pthread.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -641,6 +642,98 @@ static void test_lpm_get_next_key(void) close(map_fd); } +#define MAX_TEST_KEYS 4 +struct lpm_mt_test_info { + int cmd; /* 0: update, 1: delete, 2: lookup, 3: get_next_key */ + int iter; + int map_fd; + struct { + __u32 prefixlen; + __u32 data; + } key[MAX_TEST_KEYS]; +}; + +static void *lpm_test_command(void *arg) +{ + int i, j, ret, iter, key_size; + struct lpm_mt_test_info *info = arg; + struct bpf_lpm_trie_key *key_p; + + key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32); + key_p = alloca(key_size); + for (iter = 0; iter < info->iter; iter++) + for (i = 0; i < MAX_TEST_KEYS; i++) { + /* first half of iterations in forward order, + * and second half in backward order. + */ + j = (iter < (info->iter / 2)) ? i : MAX_TEST_KEYS - i - 1; + key_p->prefixlen = info->key[j].prefixlen; + memcpy(key_p->data, &info->key[j].data, sizeof(__u32)); + if (info->cmd == 0) { + __u32 value = j; + /* update must succeed */ + assert(bpf_map_update_elem(info->map_fd, key_p, &value, 0) == 0); + } else if (info->cmd == 1) { + ret = bpf_map_delete_elem(info->map_fd, key_p); + assert(ret == 0 || errno == ENOENT); + } else if (info->cmd == 2) { + __u32 value; + ret = bpf_map_lookup_elem(info->map_fd, key_p, &value); + assert(ret == 0 || errno == ENOENT); + } else { + struct bpf_lpm_trie_key *next_key_p = alloca(key_size); + ret = bpf_map_get_next_key(info->map_fd, key_p, next_key_p); + assert(ret == 0 || errno == ENOENT || errno == ENOMEM); + } + } + + // Pass successful exit info back to the main thread + pthread_exit((void *)info); +} + +static void setup_lpm_mt_test_info(struct lpm_mt_test_info *info, int map_fd) +{ + info->iter = 2000; + info->map_fd = map_fd; + info->key[0].prefixlen = 16; + inet_pton(AF_INET, "192.168.0.0", &info->key[0].data); + info->key[1].prefixlen = 24; + inet_pton(AF_INET, "192.168.0.0", &info->key[1].data); + info->key[2].prefixlen = 24; + inet_pton(AF_INET, "192.168.128.0", &info->key[2].data); + info->key[3].prefixlen = 24; + inet_pton(AF_INET, "192.168.1.0", &info->key[3].data); +} + +static void test_lpm_multi_thread(void) +{ + struct lpm_mt_test_info info[4]; + size_t key_size, value_size; + pthread_t thread_id[4]; + int i, map_fd; + void *ret; + + /* create a trie */ + value_size = sizeof(__u32); + key_size = sizeof(struct bpf_lpm_trie_key) + value_size; + map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, value_size, + 100, BPF_F_NO_PREALLOC); + + /* create 4 threads to test update, delete, lookup and get_next_key */ + setup_lpm_mt_test_info(&info[0], map_fd); + for (i = 0; i < 4; 
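The key_size arithmetic above reflects the LPM trie key layout: a __u32 prefix length immediately followed by the address bytes in network byte order. A sketch of building the 192.168.1.0/24 key that setup_lpm_mt_test_info() installs (function name illustrative):

#include <arpa/inet.h>
#include <linux/bpf.h>

/* buf must be at least key_size bytes, i.e.
 * sizeof(struct bpf_lpm_trie_key) + sizeof(__u32) for IPv4.
 */
static void build_v4_key(char *buf)
{
	struct bpf_lpm_trie_key *key = (struct bpf_lpm_trie_key *)buf;

	key->prefixlen = 24;	/* compare only the top 24 bits */
	/* inet_pton() stores the address in network byte order,
	 * which is the order the trie walks bit by bit.
	 */
	inet_pton(AF_INET, "192.168.1.0", key->data);
}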
i++) { + if (i != 0) + memcpy(&info[i], &info[0], sizeof(info[i])); + info[i].cmd = i; + assert(pthread_create(&thread_id[i], NULL, &lpm_test_command, &info[i]) == 0); + } + + for (i = 0; i < 4; i++) + assert(pthread_join(thread_id[i], &ret) == 0 && ret == (void *)&info[i]); + + close(map_fd); +} + int main(void) { struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY }; @@ -667,6 +760,8 @@ int main(void) test_lpm_get_next_key(); + test_lpm_multi_thread(); + printf("test_lpm: OK\n"); return 0; } diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 040356ecc862..436c4c72414f 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -242,7 +242,7 @@ static void test_hashmap_percpu(int task, void *data) static void test_hashmap_walk(int task, void *data) { - int fd, i, max_entries = 100000; + int fd, i, max_entries = 1000; long long key, value, next_key; bool next_key_valid = true; @@ -463,7 +463,7 @@ static void test_devmap(int task, void *data) #define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o" static void test_sockmap(int tasks, void *data) { - int one = 1, map_fd_rx, map_fd_tx, map_fd_break, s, sc, rc; + int one = 1, map_fd_rx = 0, map_fd_tx = 0, map_fd_break, s, sc, rc; struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_break; int ports[] = {50200, 50201, 50202, 50204}; int err, i, fd, udp, sfd[6] = {0xdeadbeef}; @@ -868,9 +868,12 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } - /* Test map close sockets */ - for (i = 0; i < 6; i++) + /* Test map close sockets and empty maps */ + for (i = 0; i < 6; i++) { + bpf_map_delete_elem(map_fd_tx, &i); + bpf_map_delete_elem(map_fd_rx, &i); close(sfd[i]); + } close(fd); close(map_fd_rx); bpf_object__close(obj); @@ -881,8 +884,13 @@ out: printf("Failed to create sockmap '%i:%s'!\n", i, strerror(errno)); exit(1); out_sockmap: - for (i = 0; i < 6; i++) + for (i = 0; i < 6; i++) { + if (map_fd_tx) + bpf_map_delete_elem(map_fd_tx, &i); + if (map_fd_rx) + bpf_map_delete_elem(map_fd_rx, &i); close(sfd[i]); + } close(fd); exit(1); } @@ -931,8 +939,12 @@ static void test_map_large(void) close(fd); } -static void run_parallel(int tasks, void (*fn)(int task, void *data), - void *data) +#define run_parallel(N, FN, DATA) \ + printf("Fork %d tasks to '" #FN "'\n", N); \ + __run_parallel(N, FN, DATA) + +static void __run_parallel(int tasks, void (*fn)(int task, void *data), + void *data) { pid_t pid[tasks]; int i; @@ -972,7 +984,7 @@ static void test_map_stress(void) #define DO_UPDATE 1 #define DO_DELETE 0 -static void do_work(int fn, void *data) +static void test_update_delete(int fn, void *data) { int do_update = ((int *)data)[1]; int fd = ((int *)data)[0]; @@ -1012,7 +1024,7 @@ static void test_map_parallel(void) */ data[0] = fd; data[1] = DO_UPDATE; - run_parallel(TASKS, do_work, data); + run_parallel(TASKS, test_update_delete, data); /* Check that key=0 is already there. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && @@ -1035,7 +1047,7 @@ static void test_map_parallel(void) /* Now let's delete all elemenets in parallel. */ data[1] = DO_DELETE; - run_parallel(TASKS, do_work, data); + run_parallel(TASKS, test_update_delete, data); /* Nothing should be left. 
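Turning run_parallel() into a macro is what enables the new log line: the preprocessor's # operator stringizes the FN argument, so each fork is reported under the worker's real function name (presumably also the motivation for renaming do_work to the more descriptive test_update_delete). A reduced illustration of the mechanism:

#include <stdio.h>

#define run_named(FN) printf("Fork tasks to '" #FN "'\n"); FN()

static void test_update_delete(void) { /* worker body elided */ }

int main(void)
{
	run_named(test_update_delete);	/* prints the callee's name */
	return 0;
}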
*/ key = -1; diff --git a/tools/testing/selftests/bpf/test_tcpbpf.h b/tools/testing/selftests/bpf/test_tcpbpf.h new file mode 100644 index 000000000000..2fe43289943c --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcpbpf.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _TEST_TCPBPF_H +#define _TEST_TCPBPF_H + +struct tcpbpf_globals { + __u32 event_map; + __u32 total_retrans; + __u32 data_segs_in; + __u32 data_segs_out; + __u32 bad_cb_test_rv; + __u32 good_cb_test_rv; + __u64 bytes_received; + __u64 bytes_acked; +}; +#endif diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/test_tcpbpf_kern.c new file mode 100644 index 000000000000..57119ad57a3f --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stddef.h> +#include <string.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/ip.h> +#include <linux/in6.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/tcp.h> +#include <netinet/in.h> +#include "bpf_helpers.h" +#include "bpf_endian.h" +#include "test_tcpbpf.h" + +struct bpf_map_def SEC("maps") global_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct tcpbpf_globals), + .max_entries = 2, +}; + +static inline void update_event_map(int event) +{ + __u32 key = 0; + struct tcpbpf_globals g, *gp; + + gp = bpf_map_lookup_elem(&global_map, &key); + if (gp == NULL) { + struct tcpbpf_globals g = {0}; + + g.event_map |= (1 << event); + bpf_map_update_elem(&global_map, &key, &g, + BPF_ANY); + } else { + g = *gp; + g.event_map |= (1 << event); + bpf_map_update_elem(&global_map, &key, &g, + BPF_ANY); + } +} + +int _version SEC("version") = 1; + +SEC("sockops") +int bpf_testcb(struct bpf_sock_ops *skops) +{ + int rv = -1; + int bad_call_rv = 0; + int good_call_rv = 0; + int op; + int v = 0; + + op = (int) skops->op; + + update_event_map(op); + + switch (op) { + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + /* Test failure to set largest cb flag (assumes not defined) */ + bad_call_rv = bpf_sock_ops_cb_flags_set(skops, 0x80); + /* Set callback */ + good_call_rv = bpf_sock_ops_cb_flags_set(skops, + BPF_SOCK_OPS_STATE_CB_FLAG); + /* Update results */ + { + __u32 key = 0; + struct tcpbpf_globals g, *gp; + + gp = bpf_map_lookup_elem(&global_map, &key); + if (!gp) + break; + g = *gp; + g.bad_cb_test_rv = bad_call_rv; + g.good_cb_test_rv = good_call_rv; + bpf_map_update_elem(&global_map, &key, &g, + BPF_ANY); + } + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + skops->sk_txhash = 0x12345f; + v = 0xff; + rv = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, + sizeof(v)); + break; + case BPF_SOCK_OPS_RTO_CB: + break; + case BPF_SOCK_OPS_RETRANS_CB: + break; + case BPF_SOCK_OPS_STATE_CB: + if (skops->args[1] == BPF_TCP_CLOSE) { + __u32 key = 0; + struct tcpbpf_globals g, *gp; + + gp = bpf_map_lookup_elem(&global_map, &key); + if (!gp) + break; + g = *gp; + g.total_retrans = skops->total_retrans; + g.data_segs_in = skops->data_segs_in; + g.data_segs_out = skops->data_segs_out; + g.bytes_received = skops->bytes_received; + g.bytes_acked = skops->bytes_acked; + bpf_map_update_elem(&global_map, &key, &g, + BPF_ANY); + } + break; + default: + rv = -1; + } + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c new file mode 100644 
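update_event_map() above copies the value out of the map, sets the bit, and writes the whole struct back, publishing all fields with a single bpf_map_update_elem() call. Since an array-map lookup returns a pointer into the map value itself, the bit could also be set in place; a sketch of that alternative, assuming the same global_map and headers as the program above:

static inline void update_event_map_inplace(int event)
{
	__u32 key = 0;
	struct tcpbpf_globals *gp;

	gp = bpf_map_lookup_elem(&global_map, &key);
	if (gp)
		/* writes land directly in the shared map value */
		gp->event_map |= (1 << event);
}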
index 000000000000..95a370f3d378 --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <errno.h> +#include <signal.h> +#include <string.h> +#include <assert.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> +#include <linux/bpf.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "bpf_util.h" +#include <linux/perf_event.h> +#include "test_tcpbpf.h" + +static int bpf_find_map(const char *test, struct bpf_object *obj, + const char *name) +{ + struct bpf_map *map; + + map = bpf_object__find_map_by_name(obj, name); + if (!map) { + printf("%s:FAIL:map '%s' not found\n", test, name); + return -1; + } + return bpf_map__fd(map); +} + +#define SYSTEM(CMD) \ + do { \ + if (system(CMD)) { \ + printf("system(%s) FAILS!\n", CMD); \ + } \ + } while (0) + +int main(int argc, char **argv) +{ + const char *file = "test_tcpbpf_kern.o"; + struct tcpbpf_globals g = {0}; + int cg_fd, prog_fd, map_fd; + bool debug_flag = false; + int error = EXIT_FAILURE; + struct bpf_object *obj; + char cmd[100], *dir; + struct stat buffer; + __u32 key = 0; + int pid; + int rv; + + if (argc > 1 && strcmp(argv[1], "-d") == 0) + debug_flag = true; + + dir = "/tmp/cgroupv2/foo"; + + if (stat(dir, &buffer) != 0) { + SYSTEM("mkdir -p /tmp/cgroupv2"); + SYSTEM("mount -t cgroup2 none /tmp/cgroupv2"); + SYSTEM("mkdir -p /tmp/cgroupv2/foo"); + } + pid = (int) getpid(); + sprintf(cmd, "echo %d >> /tmp/cgroupv2/foo/cgroup.procs", pid); + SYSTEM(cmd); + + cg_fd = open(dir, O_DIRECTORY, O_RDONLY); + if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) { + printf("FAILED: load_bpf_file failed for: %s\n", file); + goto err; + } + + rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0); + if (rv) { + printf("FAILED: bpf_prog_attach: %d (%s)\n", + rv, strerror(errno)); + goto err; + } + + SYSTEM("./tcp_server.py"); + + map_fd = bpf_find_map(__func__, obj, "global_map"); + if (map_fd < 0) + goto err; + + rv = bpf_map_lookup_elem(map_fd, &key, &g); + if (rv != 0) { + printf("FAILED: bpf_map_lookup_elem returns %d\n", rv); + goto err; + } + + if (g.bytes_received != 501 || g.bytes_acked != 1002 || + g.data_segs_in != 1 || g.data_segs_out != 1 || + (g.event_map ^ 0x47e) != 0 || g.bad_cb_test_rv != 0x80 || + g.good_cb_test_rv != 0) { + printf("FAILED: Wrong stats\n"); + if (debug_flag) { + printf("\n"); + printf("bytes_received: %d (expecting 501)\n", + (int)g.bytes_received); + printf("bytes_acked: %d (expecting 1002)\n", + (int)g.bytes_acked); + printf("data_segs_in: %d (expecting 1)\n", + g.data_segs_in); + printf("data_segs_out: %d (expecting 1)\n", + g.data_segs_out); + printf("event_map: 0x%x (at least 0x47e)\n", + g.event_map); + printf("bad_cb_test_rv: 0x%x (expecting 0x80)\n", + g.bad_cb_test_rv); + printf("good_cb_test_rv:0x%x (expecting 0)\n", + g.good_cb_test_rv); + } + goto err; + } + printf("PASSED!\n"); + error = 0; +err: + bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); + return error; + +} diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index fb82d29ee863..697bd83de295 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -21,6 +21,7 @@ #include <stddef.h> #include <stdbool.h> #include <sched.h> +#include <limits.h> #include
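The expected constants in the check above follow from the scripts added earlier in this patch: the recorded connection is evidently the client's, which sends 1000 bytes and reads 500 back, so bytes_received = 500 + 1 = 501 and bytes_acked = 1000 + 2 = 1002, the extra 1 and 2 being the sequence space consumed by FIN and SYN+FIN. The event_map constant decodes against the sock_ops op numbering in the uapi header above (the callback naming here is inferred from that enum):

#include <stdio.h>

int main(void)
{
	/* Bits 1-6 are the pre-existing timeout/rwnd/connect/
	 * established/ECN callbacks (0x07e); bit 10 is the new
	 * BPF_SOCK_OPS_STATE_CB (0x400). 0x07e | 0x400 == 0x47e.
	 */
	unsigned int event_map = 0x47e;
	int op;

	for (op = 0; op < 32; op++)
		if (event_map & (1u << op))
			printf("sock_ops op %d fired\n", op);
	return 0;
}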
<sys/capability.h> #include <sys/resource.h> @@ -111,7 +112,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "DIV32 by 0, zero check 2", @@ -123,7 +124,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "DIV64 by 0, zero check", @@ -135,7 +136,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "MOD32 by 0, zero check 1", @@ -147,7 +148,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "MOD32 by 0, zero check 2", @@ -159,7 +160,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "MOD64 by 0, zero check", @@ -171,13 +172,245 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 42, + }, + { + "DIV32 by 0, zero check ok, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 2), + BPF_MOV32_IMM(BPF_REG_2, 16), + BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 8, + }, + { + "DIV32 by 0, zero check 1, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV32 by 0, zero check 2, cls", + .insns = { + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV64 by 0, zero check, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, .retval = 0, }, { + "MOD32 by 0, zero check ok, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 3), + BPF_MOV32_IMM(BPF_REG_2, 5), + BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 2, + }, + { + "MOD32 by 0, zero check 1, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "MOD32 by 0, zero check 2, cls", + .insns = { + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "MOD64 by 0, zero check 1, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, 2), + BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 2, + }, + { + "MOD64 by 0, zero check 2, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, -1), + BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = -1, + }, + /* Just make sure that JITs used 
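The retval updates above encode a runtime contract rather than a verifier rejection: a division by zero now yields 0 in the destination register, and a modulo by zero leaves the destination unchanged, which is exactly what the cls variants check (DIV by 0 returns 0, MOD by 0 returns the untouched 1 or -1). The same semantics in plain C:

#include <stdint.h>

static uint32_t bpf_alu_div32(uint32_t dst, uint32_t src)
{
	return src ? dst / src : 0;	/* div by 0 => 0 */
}

static uint32_t bpf_alu_mod32(uint32_t dst, uint32_t src)
{
	return src ? dst % src : dst;	/* mod by 0 => dst unchanged */
}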
udiv/umod as otherwise we get + * an exception from INT_MIN/-1 overflow similarly as with div + * by zero. + */ + { + "DIV32 overflow, check 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV32 overflow, check 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), + BPF_ALU32_IMM(BPF_DIV, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV64 overflow, check 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_LD_IMM64(BPF_REG_0, LLONG_MIN), + BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV64 overflow, check 2", + .insns = { + BPF_LD_IMM64(BPF_REG_0, LLONG_MIN), + BPF_ALU64_IMM(BPF_DIV, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "MOD32 overflow, check 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), + BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = INT_MIN, + }, + { + "MOD32 overflow, check 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), + BPF_ALU32_IMM(BPF_MOD, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = INT_MIN, + }, + { + "MOD64 overflow, check 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_LD_IMM64(BPF_REG_2, LLONG_MIN), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), + BPF_ALU64_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "MOD64 overflow, check 2", + .insns = { + BPF_LD_IMM64(BPF_REG_2, LLONG_MIN), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), + BPF_ALU64_IMM(BPF_MOD, BPF_REG_2, -1), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "xor32 zero extend check", + .insns = { + BPF_MOV32_IMM(BPF_REG_2, -1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 0xffff), + BPF_ALU32_REG(BPF_XOR, BPF_REG_2, BPF_REG_2), + BPF_MOV32_IMM(BPF_REG_0, 2), + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0, 1), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { "empty prog", .insns = { }, - .errstr = "last insn is not an exit or jmp", + .errstr = "unknown opcode 00", .result = REJECT, }, { @@ -374,7 +607,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "BPF_ARSH not supported for 32 bit ALU", + .errstr = "unknown opcode c4", }, { "arsh32 on reg", @@ -385,7 +618,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "BPF_ARSH not supported for 32 bit ALU", + .errstr = "unknown opcode cc", }, { "arsh64 on imm", @@ -501,7 +734,7 @@ static struct bpf_test tests[] = { BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0), BPF_EXIT_INSN(), }, - .errstr = "BPF_CALL uses reserved", + .errstr = "unknown 
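The overflow cases above pass because eBPF's DIV/MOD are unsigned: reinterpreted as unsigned, the divisor 0xffffffff exceeds the dividend 0x80000000, so the quotient is 0 and the remainder is the whole dividend, which reads back as INT_MIN. Unlike signed hardware division, there is no INT_MIN/-1 trap. Checked in plain C:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t dst = 0x80000000u;	/* INT_MIN's bit pattern */
	uint32_t src = 0xffffffffu;	/* -1's bit pattern */

	assert(dst / src == 0);			/* DIV32 overflow => 0 */
	assert(dst % src == 0x80000000u);	/* MOD32 => INT_MIN */
	return 0;
}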
opcode 8d", .result = REJECT, }, { @@ -691,7 +924,7 @@ static struct bpf_test tests[] = { BPF_RAW_INSN(0, 0, 0, 0, 0), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM", + .errstr = "unknown opcode 00", .result = REJECT, }, { @@ -709,7 +942,7 @@ static struct bpf_test tests[] = { BPF_RAW_INSN(-1, 0, 0, 0, 0), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_ALU opcode f0", + .errstr = "unknown opcode ff", .result = REJECT, }, { @@ -718,7 +951,7 @@ static struct bpf_test tests[] = { BPF_RAW_INSN(-1, -1, -1, -1, -1), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_ALU opcode f0", + .errstr = "unknown opcode ff", .result = REJECT, }, { @@ -7543,7 +7776,7 @@ static struct bpf_test tests[] = { }, BPF_EXIT_INSN(), }, - .errstr = "BPF_END uses reserved fields", + .errstr = "unknown opcode d7", .result = REJECT, }, { @@ -8766,6 +8999,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 1, }, { "check deducing bounds from const, 3", @@ -8963,6 +9197,90 @@ static struct bpf_test tests[] = { .retval = 1, }, { + "calls: div by 0 in subprog", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(BPF_REG_2, 0), + BPF_MOV32_IMM(BPF_REG_3, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_3, BPF_REG_2), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "calls: multiple ret types in subprog 1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "R0 invalid mem access 'inv'", + }, + { + "calls: multiple ret types in subprog 2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 9), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_W, 
BPF_REG_0, BPF_REG_6, + offsetof(struct __sk_buff, data)), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 64), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map1 = { 16 }, + .result = REJECT, + .errstr = "R0 min value is outside of the array range", + }, + { "calls: overlapping caller/callee", .insns = { BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 0),
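The "calls: div by 0 in subprog" test above combines the two behaviors this patch exercises: a bpf-to-bpf call into a subprogram whose division by zero simply evaluates to 0 at runtime, after which the caller carries on and returns 1. Roughly the same shape in restricted C (illustrative only; the test encodes the call with raw instructions, and compiler support for bpf-to-bpf calls is assumed):

#include <linux/bpf.h>
#include "bpf_helpers.h"

static __attribute__((noinline)) int div_subprog(int divisor)
{
	return 1 / divisor;	/* divisor == 0: evaluates to 0 in eBPF */
}

SEC("classifier")
int caller(struct __sk_buff *skb)
{
	volatile int zero = 0;	/* keep the compiler from folding */

	return div_subprog(zero) + 1;	/* loads and returns 1 */
}

char _license[] SEC("license") = "GPL";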