From 8a76145a2ec2a81dfe34d7ac42e8c242f095e8c8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 5 Oct 2022 21:24:51 -0700 Subject: bpf: explicitly define BPF_FUNC_xxx integer values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Historically enum bpf_func_id's BPF_FUNC_xxx enumerators relied on implicit sequential values being assigned by compiler. This is convenient, as new BPF helpers are always added at the very end, but it also has its downsides, some of them being: - with over 200 helpers now it's very hard to know what's each helper's ID, which is often important to know when working with BPF assembly (e.g., by dumping raw bpf assembly instructions with llvm-objdump -d command). it's possible to work around this by looking into vmlinux.h, dumping /sys/btf/kernel/vmlinux, looking at libbpf-provided bpf_helper_defs.h, etc. But it always feels like an unnecessary step and one should be able to quickly figure this out from UAPI header. - when backporting and cherry-picking only some BPF helpers onto older kernels it's important to be able to skip some enum values for helpers that weren't backported, but preserve absolute integer IDs to keep BPF helper IDs stable so that BPF programs stay portable across upstream and backported kernels. While neither problem is insurmountable, they come up frequently enough and are annoying enough to warrant improving the situation. And for the backporting the problem can easily go unnoticed for a while, especially if backport is done with people not very familiar with BPF subsystem overall. Anyways, it's easy to fix this by making sure that __BPF_FUNC_MAPPER macro provides explicit helper IDs. Unfortunately that would potentially break existing users that use UAPI-exposed __BPF_FUNC_MAPPER and are expected to pass macro that accepts only symbolic helper identifier (e.g., map_lookup_elem for bpf_map_lookup_elem() helper). As such, we need to introduce a new macro (___BPF_FUNC_MAPPER) which would specify both identifier and integer ID, but in such a way as to allow existing __BPF_FUNC_MAPPER be expressed in terms of new ___BPF_FUNC_MAPPER macro. And that's what this patch is doing. To avoid duplication and allow __BPF_FUNC_MAPPER stay *exactly* the same, ___BPF_FUNC_MAPPER accepts arbitrary "context" arguments, which can be used to pass any extra macros, arguments, and whatnot. In our case we use this to pass original user-provided macro that expects single argument and __BPF_FUNC_MAPPER is using it's own three-argument __BPF_FUNC_MAPPER_APPLY intermediate macro to impedance-match new and old "callback" macros. Once we resolve this, we use new ___BPF_FUNC_MAPPER to define enum bpf_func_id with explicit values. The other users of __BPF_FUNC_MAPPER in kernel (namely in kernel/bpf/disasm.c) are kept exactly the same both as demonstration that backwards compat works, but also to avoid unnecessary code churn. Note that new ___BPF_FUNC_MAPPER() doesn't forcefully insert comma between values, as that might not be appropriate in all possible cases where ___BPF_FUNC_MAPPER might be used by users. This doesn't reduce usability, as it's trivial to insert that comma inside "callback" macro. To validate all the manually specified IDs are exactly right, we used BTF to compare before and after values: $ bpftool btf dump file ~/linux-build/default/vmlinux | rg bpf_func_id -A 211 > after.txt $ git stash # stach UAPI changes $ make -j90 ... re-building kernel without UAPI changes ... $ bpftool btf dump file ~/linux-build/default/vmlinux | rg bpf_func_id -A 211 > before.txt $ diff -u before.txt after.txt --- before.txt 2022-10-05 10:48:18.119195916 -0700 +++ after.txt 2022-10-05 10:46:49.446615025 -0700 @@ -1,4 +1,4 @@ -[14576] ENUM 'bpf_func_id' encoding=UNSIGNED size=4 vlen=211 +[9560] ENUM 'bpf_func_id' encoding=UNSIGNED size=4 vlen=211 'BPF_FUNC_unspec' val=0 'BPF_FUNC_map_lookup_elem' val=1 'BPF_FUNC_map_update_elem' val=2 As can be seen from diff above, the only thing that changed was resulting BTF type ID of ENUM bpf_func_id, not any of the enumerators, their names or integer values. The only other place that needed fixing was scripts/bpf_doc.py used to generate man pages and bpf_helper_defs.h header for libbpf and selftests. That script is tightly-coupled to exact shape of ___BPF_FUNC_MAPPER macro definition, so had to be trivially adapted. Cc: Quentin Monnet Reported-by: Andrea Terzolo Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Acked-by: Jiri Olsa Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221006042452.2089843-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 432 ++++++++++++++++++++++++----------------------- 1 file changed, 219 insertions(+), 213 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 51b9aa640ad2..17f61338f8f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5436,225 +5436,231 @@ union bpf_attr { * larger than the size of the ring buffer, or which cannot fit * within a struct bpf_dynptr. */ -#define __BPF_FUNC_MAPPER(FN) \ - FN(unspec), \ - FN(map_lookup_elem), \ - FN(map_update_elem), \ - FN(map_delete_elem), \ - FN(probe_read), \ - FN(ktime_get_ns), \ - FN(trace_printk), \ - FN(get_prandom_u32), \ - FN(get_smp_processor_id), \ - FN(skb_store_bytes), \ - FN(l3_csum_replace), \ - FN(l4_csum_replace), \ - FN(tail_call), \ - FN(clone_redirect), \ - FN(get_current_pid_tgid), \ - FN(get_current_uid_gid), \ - FN(get_current_comm), \ - FN(get_cgroup_classid), \ - FN(skb_vlan_push), \ - FN(skb_vlan_pop), \ - FN(skb_get_tunnel_key), \ - FN(skb_set_tunnel_key), \ - FN(perf_event_read), \ - FN(redirect), \ - FN(get_route_realm), \ - FN(perf_event_output), \ - FN(skb_load_bytes), \ - FN(get_stackid), \ - FN(csum_diff), \ - FN(skb_get_tunnel_opt), \ - FN(skb_set_tunnel_opt), \ - FN(skb_change_proto), \ - FN(skb_change_type), \ - FN(skb_under_cgroup), \ - FN(get_hash_recalc), \ - FN(get_current_task), \ - FN(probe_write_user), \ - FN(current_task_under_cgroup), \ - FN(skb_change_tail), \ - FN(skb_pull_data), \ - FN(csum_update), \ - FN(set_hash_invalid), \ - FN(get_numa_node_id), \ - FN(skb_change_head), \ - FN(xdp_adjust_head), \ - FN(probe_read_str), \ - FN(get_socket_cookie), \ - FN(get_socket_uid), \ - FN(set_hash), \ - FN(setsockopt), \ - FN(skb_adjust_room), \ - FN(redirect_map), \ - FN(sk_redirect_map), \ - FN(sock_map_update), \ - FN(xdp_adjust_meta), \ - FN(perf_event_read_value), \ - FN(perf_prog_read_value), \ - FN(getsockopt), \ - FN(override_return), \ - FN(sock_ops_cb_flags_set), \ - FN(msg_redirect_map), \ - FN(msg_apply_bytes), \ - FN(msg_cork_bytes), \ - FN(msg_pull_data), \ - FN(bind), \ - FN(xdp_adjust_tail), \ - FN(skb_get_xfrm_state), \ - FN(get_stack), \ - FN(skb_load_bytes_relative), \ - FN(fib_lookup), \ - FN(sock_hash_update), \ - FN(msg_redirect_hash), \ - FN(sk_redirect_hash), \ - FN(lwt_push_encap), \ - FN(lwt_seg6_store_bytes), \ - FN(lwt_seg6_adjust_srh), \ - FN(lwt_seg6_action), \ - FN(rc_repeat), \ - FN(rc_keydown), \ - FN(skb_cgroup_id), \ - FN(get_current_cgroup_id), \ - FN(get_local_storage), \ - FN(sk_select_reuseport), \ - FN(skb_ancestor_cgroup_id), \ - FN(sk_lookup_tcp), \ - FN(sk_lookup_udp), \ - FN(sk_release), \ - FN(map_push_elem), \ - FN(map_pop_elem), \ - FN(map_peek_elem), \ - FN(msg_push_data), \ - FN(msg_pop_data), \ - FN(rc_pointer_rel), \ - FN(spin_lock), \ - FN(spin_unlock), \ - FN(sk_fullsock), \ - FN(tcp_sock), \ - FN(skb_ecn_set_ce), \ - FN(get_listener_sock), \ - FN(skc_lookup_tcp), \ - FN(tcp_check_syncookie), \ - FN(sysctl_get_name), \ - FN(sysctl_get_current_value), \ - FN(sysctl_get_new_value), \ - FN(sysctl_set_new_value), \ - FN(strtol), \ - FN(strtoul), \ - FN(sk_storage_get), \ - FN(sk_storage_delete), \ - FN(send_signal), \ - FN(tcp_gen_syncookie), \ - FN(skb_output), \ - FN(probe_read_user), \ - FN(probe_read_kernel), \ - FN(probe_read_user_str), \ - FN(probe_read_kernel_str), \ - FN(tcp_send_ack), \ - FN(send_signal_thread), \ - FN(jiffies64), \ - FN(read_branch_records), \ - FN(get_ns_current_pid_tgid), \ - FN(xdp_output), \ - FN(get_netns_cookie), \ - FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), \ - FN(ktime_get_boot_ns), \ - FN(seq_printf), \ - FN(seq_write), \ - FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), \ - FN(ringbuf_output), \ - FN(ringbuf_reserve), \ - FN(ringbuf_submit), \ - FN(ringbuf_discard), \ - FN(ringbuf_query), \ - FN(csum_level), \ - FN(skc_to_tcp6_sock), \ - FN(skc_to_tcp_sock), \ - FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), \ - FN(get_task_stack), \ - FN(load_hdr_opt), \ - FN(store_hdr_opt), \ - FN(reserve_hdr_opt), \ - FN(inode_storage_get), \ - FN(inode_storage_delete), \ - FN(d_path), \ - FN(copy_from_user), \ - FN(snprintf_btf), \ - FN(seq_printf_btf), \ - FN(skb_cgroup_classid), \ - FN(redirect_neigh), \ - FN(per_cpu_ptr), \ - FN(this_cpu_ptr), \ - FN(redirect_peer), \ - FN(task_storage_get), \ - FN(task_storage_delete), \ - FN(get_current_task_btf), \ - FN(bprm_opts_set), \ - FN(ktime_get_coarse_ns), \ - FN(ima_inode_hash), \ - FN(sock_from_file), \ - FN(check_mtu), \ - FN(for_each_map_elem), \ - FN(snprintf), \ - FN(sys_bpf), \ - FN(btf_find_by_name_kind), \ - FN(sys_close), \ - FN(timer_init), \ - FN(timer_set_callback), \ - FN(timer_start), \ - FN(timer_cancel), \ - FN(get_func_ip), \ - FN(get_attach_cookie), \ - FN(task_pt_regs), \ - FN(get_branch_snapshot), \ - FN(trace_vprintk), \ - FN(skc_to_unix_sock), \ - FN(kallsyms_lookup_name), \ - FN(find_vma), \ - FN(loop), \ - FN(strncmp), \ - FN(get_func_arg), \ - FN(get_func_ret), \ - FN(get_func_arg_cnt), \ - FN(get_retval), \ - FN(set_retval), \ - FN(xdp_get_buff_len), \ - FN(xdp_load_bytes), \ - FN(xdp_store_bytes), \ - FN(copy_from_user_task), \ - FN(skb_set_tstamp), \ - FN(ima_file_hash), \ - FN(kptr_xchg), \ - FN(map_lookup_percpu_elem), \ - FN(skc_to_mptcp_sock), \ - FN(dynptr_from_mem), \ - FN(ringbuf_reserve_dynptr), \ - FN(ringbuf_submit_dynptr), \ - FN(ringbuf_discard_dynptr), \ - FN(dynptr_read), \ - FN(dynptr_write), \ - FN(dynptr_data), \ - FN(tcp_raw_gen_syncookie_ipv4), \ - FN(tcp_raw_gen_syncookie_ipv6), \ - FN(tcp_raw_check_syncookie_ipv4), \ - FN(tcp_raw_check_syncookie_ipv6), \ - FN(ktime_get_tai_ns), \ - FN(user_ringbuf_drain), \ +#define ___BPF_FUNC_MAPPER(FN, ctx...) \ + FN(unspec, 0, ##ctx) \ + FN(map_lookup_elem, 1, ##ctx) \ + FN(map_update_elem, 2, ##ctx) \ + FN(map_delete_elem, 3, ##ctx) \ + FN(probe_read, 4, ##ctx) \ + FN(ktime_get_ns, 5, ##ctx) \ + FN(trace_printk, 6, ##ctx) \ + FN(get_prandom_u32, 7, ##ctx) \ + FN(get_smp_processor_id, 8, ##ctx) \ + FN(skb_store_bytes, 9, ##ctx) \ + FN(l3_csum_replace, 10, ##ctx) \ + FN(l4_csum_replace, 11, ##ctx) \ + FN(tail_call, 12, ##ctx) \ + FN(clone_redirect, 13, ##ctx) \ + FN(get_current_pid_tgid, 14, ##ctx) \ + FN(get_current_uid_gid, 15, ##ctx) \ + FN(get_current_comm, 16, ##ctx) \ + FN(get_cgroup_classid, 17, ##ctx) \ + FN(skb_vlan_push, 18, ##ctx) \ + FN(skb_vlan_pop, 19, ##ctx) \ + FN(skb_get_tunnel_key, 20, ##ctx) \ + FN(skb_set_tunnel_key, 21, ##ctx) \ + FN(perf_event_read, 22, ##ctx) \ + FN(redirect, 23, ##ctx) \ + FN(get_route_realm, 24, ##ctx) \ + FN(perf_event_output, 25, ##ctx) \ + FN(skb_load_bytes, 26, ##ctx) \ + FN(get_stackid, 27, ##ctx) \ + FN(csum_diff, 28, ##ctx) \ + FN(skb_get_tunnel_opt, 29, ##ctx) \ + FN(skb_set_tunnel_opt, 30, ##ctx) \ + FN(skb_change_proto, 31, ##ctx) \ + FN(skb_change_type, 32, ##ctx) \ + FN(skb_under_cgroup, 33, ##ctx) \ + FN(get_hash_recalc, 34, ##ctx) \ + FN(get_current_task, 35, ##ctx) \ + FN(probe_write_user, 36, ##ctx) \ + FN(current_task_under_cgroup, 37, ##ctx) \ + FN(skb_change_tail, 38, ##ctx) \ + FN(skb_pull_data, 39, ##ctx) \ + FN(csum_update, 40, ##ctx) \ + FN(set_hash_invalid, 41, ##ctx) \ + FN(get_numa_node_id, 42, ##ctx) \ + FN(skb_change_head, 43, ##ctx) \ + FN(xdp_adjust_head, 44, ##ctx) \ + FN(probe_read_str, 45, ##ctx) \ + FN(get_socket_cookie, 46, ##ctx) \ + FN(get_socket_uid, 47, ##ctx) \ + FN(set_hash, 48, ##ctx) \ + FN(setsockopt, 49, ##ctx) \ + FN(skb_adjust_room, 50, ##ctx) \ + FN(redirect_map, 51, ##ctx) \ + FN(sk_redirect_map, 52, ##ctx) \ + FN(sock_map_update, 53, ##ctx) \ + FN(xdp_adjust_meta, 54, ##ctx) \ + FN(perf_event_read_value, 55, ##ctx) \ + FN(perf_prog_read_value, 56, ##ctx) \ + FN(getsockopt, 57, ##ctx) \ + FN(override_return, 58, ##ctx) \ + FN(sock_ops_cb_flags_set, 59, ##ctx) \ + FN(msg_redirect_map, 60, ##ctx) \ + FN(msg_apply_bytes, 61, ##ctx) \ + FN(msg_cork_bytes, 62, ##ctx) \ + FN(msg_pull_data, 63, ##ctx) \ + FN(bind, 64, ##ctx) \ + FN(xdp_adjust_tail, 65, ##ctx) \ + FN(skb_get_xfrm_state, 66, ##ctx) \ + FN(get_stack, 67, ##ctx) \ + FN(skb_load_bytes_relative, 68, ##ctx) \ + FN(fib_lookup, 69, ##ctx) \ + FN(sock_hash_update, 70, ##ctx) \ + FN(msg_redirect_hash, 71, ##ctx) \ + FN(sk_redirect_hash, 72, ##ctx) \ + FN(lwt_push_encap, 73, ##ctx) \ + FN(lwt_seg6_store_bytes, 74, ##ctx) \ + FN(lwt_seg6_adjust_srh, 75, ##ctx) \ + FN(lwt_seg6_action, 76, ##ctx) \ + FN(rc_repeat, 77, ##ctx) \ + FN(rc_keydown, 78, ##ctx) \ + FN(skb_cgroup_id, 79, ##ctx) \ + FN(get_current_cgroup_id, 80, ##ctx) \ + FN(get_local_storage, 81, ##ctx) \ + FN(sk_select_reuseport, 82, ##ctx) \ + FN(skb_ancestor_cgroup_id, 83, ##ctx) \ + FN(sk_lookup_tcp, 84, ##ctx) \ + FN(sk_lookup_udp, 85, ##ctx) \ + FN(sk_release, 86, ##ctx) \ + FN(map_push_elem, 87, ##ctx) \ + FN(map_pop_elem, 88, ##ctx) \ + FN(map_peek_elem, 89, ##ctx) \ + FN(msg_push_data, 90, ##ctx) \ + FN(msg_pop_data, 91, ##ctx) \ + FN(rc_pointer_rel, 92, ##ctx) \ + FN(spin_lock, 93, ##ctx) \ + FN(spin_unlock, 94, ##ctx) \ + FN(sk_fullsock, 95, ##ctx) \ + FN(tcp_sock, 96, ##ctx) \ + FN(skb_ecn_set_ce, 97, ##ctx) \ + FN(get_listener_sock, 98, ##ctx) \ + FN(skc_lookup_tcp, 99, ##ctx) \ + FN(tcp_check_syncookie, 100, ##ctx) \ + FN(sysctl_get_name, 101, ##ctx) \ + FN(sysctl_get_current_value, 102, ##ctx) \ + FN(sysctl_get_new_value, 103, ##ctx) \ + FN(sysctl_set_new_value, 104, ##ctx) \ + FN(strtol, 105, ##ctx) \ + FN(strtoul, 106, ##ctx) \ + FN(sk_storage_get, 107, ##ctx) \ + FN(sk_storage_delete, 108, ##ctx) \ + FN(send_signal, 109, ##ctx) \ + FN(tcp_gen_syncookie, 110, ##ctx) \ + FN(skb_output, 111, ##ctx) \ + FN(probe_read_user, 112, ##ctx) \ + FN(probe_read_kernel, 113, ##ctx) \ + FN(probe_read_user_str, 114, ##ctx) \ + FN(probe_read_kernel_str, 115, ##ctx) \ + FN(tcp_send_ack, 116, ##ctx) \ + FN(send_signal_thread, 117, ##ctx) \ + FN(jiffies64, 118, ##ctx) \ + FN(read_branch_records, 119, ##ctx) \ + FN(get_ns_current_pid_tgid, 120, ##ctx) \ + FN(xdp_output, 121, ##ctx) \ + FN(get_netns_cookie, 122, ##ctx) \ + FN(get_current_ancestor_cgroup_id, 123, ##ctx) \ + FN(sk_assign, 124, ##ctx) \ + FN(ktime_get_boot_ns, 125, ##ctx) \ + FN(seq_printf, 126, ##ctx) \ + FN(seq_write, 127, ##ctx) \ + FN(sk_cgroup_id, 128, ##ctx) \ + FN(sk_ancestor_cgroup_id, 129, ##ctx) \ + FN(ringbuf_output, 130, ##ctx) \ + FN(ringbuf_reserve, 131, ##ctx) \ + FN(ringbuf_submit, 132, ##ctx) \ + FN(ringbuf_discard, 133, ##ctx) \ + FN(ringbuf_query, 134, ##ctx) \ + FN(csum_level, 135, ##ctx) \ + FN(skc_to_tcp6_sock, 136, ##ctx) \ + FN(skc_to_tcp_sock, 137, ##ctx) \ + FN(skc_to_tcp_timewait_sock, 138, ##ctx) \ + FN(skc_to_tcp_request_sock, 139, ##ctx) \ + FN(skc_to_udp6_sock, 140, ##ctx) \ + FN(get_task_stack, 141, ##ctx) \ + FN(load_hdr_opt, 142, ##ctx) \ + FN(store_hdr_opt, 143, ##ctx) \ + FN(reserve_hdr_opt, 144, ##ctx) \ + FN(inode_storage_get, 145, ##ctx) \ + FN(inode_storage_delete, 146, ##ctx) \ + FN(d_path, 147, ##ctx) \ + FN(copy_from_user, 148, ##ctx) \ + FN(snprintf_btf, 149, ##ctx) \ + FN(seq_printf_btf, 150, ##ctx) \ + FN(skb_cgroup_classid, 151, ##ctx) \ + FN(redirect_neigh, 152, ##ctx) \ + FN(per_cpu_ptr, 153, ##ctx) \ + FN(this_cpu_ptr, 154, ##ctx) \ + FN(redirect_peer, 155, ##ctx) \ + FN(task_storage_get, 156, ##ctx) \ + FN(task_storage_delete, 157, ##ctx) \ + FN(get_current_task_btf, 158, ##ctx) \ + FN(bprm_opts_set, 159, ##ctx) \ + FN(ktime_get_coarse_ns, 160, ##ctx) \ + FN(ima_inode_hash, 161, ##ctx) \ + FN(sock_from_file, 162, ##ctx) \ + FN(check_mtu, 163, ##ctx) \ + FN(for_each_map_elem, 164, ##ctx) \ + FN(snprintf, 165, ##ctx) \ + FN(sys_bpf, 166, ##ctx) \ + FN(btf_find_by_name_kind, 167, ##ctx) \ + FN(sys_close, 168, ##ctx) \ + FN(timer_init, 169, ##ctx) \ + FN(timer_set_callback, 170, ##ctx) \ + FN(timer_start, 171, ##ctx) \ + FN(timer_cancel, 172, ##ctx) \ + FN(get_func_ip, 173, ##ctx) \ + FN(get_attach_cookie, 174, ##ctx) \ + FN(task_pt_regs, 175, ##ctx) \ + FN(get_branch_snapshot, 176, ##ctx) \ + FN(trace_vprintk, 177, ##ctx) \ + FN(skc_to_unix_sock, 178, ##ctx) \ + FN(kallsyms_lookup_name, 179, ##ctx) \ + FN(find_vma, 180, ##ctx) \ + FN(loop, 181, ##ctx) \ + FN(strncmp, 182, ##ctx) \ + FN(get_func_arg, 183, ##ctx) \ + FN(get_func_ret, 184, ##ctx) \ + FN(get_func_arg_cnt, 185, ##ctx) \ + FN(get_retval, 186, ##ctx) \ + FN(set_retval, 187, ##ctx) \ + FN(xdp_get_buff_len, 188, ##ctx) \ + FN(xdp_load_bytes, 189, ##ctx) \ + FN(xdp_store_bytes, 190, ##ctx) \ + FN(copy_from_user_task, 191, ##ctx) \ + FN(skb_set_tstamp, 192, ##ctx) \ + FN(ima_file_hash, 193, ##ctx) \ + FN(kptr_xchg, 194, ##ctx) \ + FN(map_lookup_percpu_elem, 195, ##ctx) \ + FN(skc_to_mptcp_sock, 196, ##ctx) \ + FN(dynptr_from_mem, 197, ##ctx) \ + FN(ringbuf_reserve_dynptr, 198, ##ctx) \ + FN(ringbuf_submit_dynptr, 199, ##ctx) \ + FN(ringbuf_discard_dynptr, 200, ##ctx) \ + FN(dynptr_read, 201, ##ctx) \ + FN(dynptr_write, 202, ##ctx) \ + FN(dynptr_data, 203, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv4, 204, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv6, 205, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv4, 206, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ + FN(ktime_get_tai_ns, 208, ##ctx) \ + FN(user_ringbuf_drain, 209, ##ctx) \ /* */ +/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't + * know or care about integer value that is now passed as second argument + */ +#define __BPF_FUNC_MAPPER_APPLY(name, value, FN) FN(name), +#define __BPF_FUNC_MAPPER(FN) ___BPF_FUNC_MAPPER(__BPF_FUNC_MAPPER_APPLY, FN) + /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call */ -#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x +#define __BPF_ENUM_FN(x, y) BPF_FUNC_ ## x = y, enum bpf_func_id { - __BPF_FUNC_MAPPER(__BPF_ENUM_FN) + ___BPF_FUNC_MAPPER(__BPF_ENUM_FN) __BPF_FUNC_MAX_ID, }; #undef __BPF_ENUM_FN -- cgit v1.2.3 From 0ff57171d6d225558c81a69439d5323e35b40549 Mon Sep 17 00:00:00 2001 From: Vinayak Yadawad Date: Wed, 7 Sep 2022 18:14:48 +0530 Subject: cfg80211: Update Transition Disable policy during port authorization In case of 4way handshake offload, transition disable policy updated by the AP during EAPOL 3/4 is not updated to the upper layer. This results in mismatch between transition disable policy between the upper layer and the driver. This patch addresses this issue by updating transition disable policy as part of port authorization indication. Signed-off-by: Vinayak Yadawad Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 2 +- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 3 +++ net/wireless/core.h | 5 ++++- net/wireless/nl80211.c | 8 +++++++- net/wireless/nl80211.h | 3 ++- net/wireless/sme.c | 12 ++++++++---- net/wireless/util.c | 4 +++- 8 files changed, 31 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index bf184c0e64cb..3f2336062217 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -6268,7 +6268,7 @@ done: brcmf_dbg(CONN, "Report roaming result\n"); if (profile->use_fwsup == BRCMF_PROFILE_FWSUP_1X && profile->is_ft) { - cfg80211_port_authorized(ndev, profile->bssid, GFP_KERNEL); + cfg80211_port_authorized(ndev, profile->bssid, NULL, 0, GFP_KERNEL); brcmf_dbg(CONN, "Report port authorized\n"); } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 659dd1bee70f..11a370e64143 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7683,6 +7683,8 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * * @dev: network device * @bssid: the BSSID of the AP + * @td_bitmap: transition disable policy + * @td_bitmap_len: Length of transition disable policy * @gfp: allocation flags * * This function should be called by a driver that supports 4 way handshake @@ -7693,7 +7695,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * indicate the 802.11 association. */ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, - gfp_t gfp); + const u8* td_bitmap, u8 td_bitmap_len, gfp_t gfp); /** * cfg80211_disconnected - notify cfg80211 that connection was dropped diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c32e7616a366..c14a91bbca7c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2749,6 +2749,8 @@ enum nl80211_commands { * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the ack RX * timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates * the incoming frame RX timestamp. + * @NL80211_ATTR_TD_BITMAP: Transition Disable bitmap, for subsequent + * (re)associations. * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3276,6 +3278,7 @@ enum nl80211_attrs { NL80211_ATTR_TX_HW_TIMESTAMP, NL80211_ATTR_RX_HW_TIMESTAMP, + NL80211_ATTR_TD_BITMAP, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/core.h b/net/wireless/core.h index 775e16cb99ed..af85d8909935 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -271,6 +271,8 @@ struct cfg80211_event { } ij; struct { u8 bssid[ETH_ALEN]; + const u8 *td_bitmap; + u8 td_bitmap_len; } pa; }; }; @@ -409,7 +411,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info); -void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_autodisconnect_wk(struct work_struct *work); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1d0277758d0e..fe368af39554 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -17942,7 +17942,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, } void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid) + struct net_device *netdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len) { struct sk_buff *msg; void *hdr; @@ -17962,6 +17963,11 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) goto nla_put_failure; + if ((td_bitmap_len > 0) && td_bitmap) + if (nla_put(msg, NL80211_ATTR_TD_BITMAP, + td_bitmap_len, td_bitmap)) + goto nla_put_failure; + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 855d540ddfb9..ba9457e94c43 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -83,7 +83,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_roam_info *info, gfp_t gfp); void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid); + struct net_device *netdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len); void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index f94497e9db43..4b5b6ee0fe01 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1251,7 +1251,8 @@ out: } EXPORT_SYMBOL(cfg80211_roamed); -void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len) { ASSERT_WDEV_LOCK(wdev); @@ -1264,11 +1265,11 @@ void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) return; nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, - bssid); + bssid, td_bitmap, td_bitmap_len); } void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, - gfp_t gfp) + const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -1278,12 +1279,15 @@ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, if (WARN_ON(!bssid)) return; - ev = kzalloc(sizeof(*ev), gfp); + ev = kzalloc(sizeof(*ev) + td_bitmap_len, gfp); if (!ev) return; ev->type = EVENT_PORT_AUTHORIZED; memcpy(ev->pa.bssid, bssid, ETH_ALEN); + ev->pa.td_bitmap = ((u8 *)ev) + sizeof(*ev); + ev->pa.td_bitmap_len = td_bitmap_len; + memcpy((void *)ev->pa.td_bitmap, td_bitmap, td_bitmap_len); /* * Use the wdev event list so that if there are pending diff --git a/net/wireless/util.c b/net/wireless/util.c index 01493568a21d..f09d528e5199 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -988,7 +988,9 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: - __cfg80211_port_authorized(wdev, ev->pa.bssid); + __cfg80211_port_authorized(wdev, ev->pa.bssid, + ev->pa.td_bitmap, + ev->pa.td_bitmap_len); break; } wdev_unlock(wdev); -- cgit v1.2.3 From 404c76783f322120266a4ef659abe418dc74f5c6 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 20 Oct 2022 17:20:03 +0200 Subject: ethtool: Add support for 800Gbps link modes Add support for 800Gbps speed, link modes of 100Gbps per lane. As mentioned in slide 21 in IEEE documentation [1], all adopted 802.3df copper and optical PMDs baselines using 100G/lane will be supported. Add the relevant PMDs which are mentioned in slide 5 in IEEE documentation [1] and were approved on 10-2022 [2]: BP - KR8 Cu Cable - CR8 MMF 50m - VR8 MMF 100m - SR8 SMF 500m - DR8 SMF 2km - DR8-2 [1]: https://www.ieee802.org/3/df/public/22_10/22_1004/shrikhande_3df_01a_221004.pdf [2]: https://ieee802.org/3/df/KeyMotions_3df_221005.pdf Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 11 ++++++++++- include/uapi/linux/ethtool.h | 8 ++++++++ net/ethtool/common.c | 14 ++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 2c8bf438ea61..5d08c627a516 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -13,7 +13,7 @@ */ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 93, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 99, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -49,6 +49,8 @@ const char *phy_speed_to_str(int speed) return "200Gbps"; case SPEED_400000: return "400Gbps"; + case SPEED_800000: + return "800Gbps"; case SPEED_UNKNOWN: return "Unknown"; default: @@ -157,6 +159,13 @@ EXPORT_SYMBOL_GPL(phy_interface_num_ports); .bit = ETHTOOL_LINK_MODE_ ## b ## _BIT} static const struct phy_setting settings[] = { + /* 800G */ + PHY_SETTING( 800000, FULL, 800000baseCR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseKR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseDR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseDR8_2_Full ), + PHY_SETTING( 800000, FULL, 800000baseSR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseVR8_Full ), /* 400G */ PHY_SETTING( 400000, FULL, 400000baseCR8_Full ), PHY_SETTING( 400000, FULL, 400000baseKR8_Full ), diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index dc2aa3d75b39..f341de2ae612 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1737,6 +1737,13 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, + ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93, + ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94, + ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95, + ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96, + ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97, + ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98, + /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; @@ -1848,6 +1855,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_100000 100000 #define SPEED_200000 200000 #define SPEED_400000 400000 +#define SPEED_800000 800000 #define SPEED_UNKNOWN -1 diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 566adf85e658..ee3e02da0013 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -202,6 +202,12 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(100, FX, Half), __DEFINE_LINK_MODE_NAME(100, FX, Full), __DEFINE_LINK_MODE_NAME(10, T1L, Full), + __DEFINE_LINK_MODE_NAME(800000, CR8, Full), + __DEFINE_LINK_MODE_NAME(800000, KR8, Full), + __DEFINE_LINK_MODE_NAME(800000, DR8, Full), + __DEFINE_LINK_MODE_NAME(800000, DR8_2, Full), + __DEFINE_LINK_MODE_NAME(800000, SR8, Full), + __DEFINE_LINK_MODE_NAME(800000, VR8, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -238,6 +244,8 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_X 1 #define __LINK_MODE_LANES_FX 1 #define __LINK_MODE_LANES_T1L 1 +#define __LINK_MODE_LANES_VR8 8 +#define __LINK_MODE_LANES_DR8_2 8 #define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ @@ -352,6 +360,12 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(100, FX, Half), __DEFINE_LINK_MODE_PARAMS(100, FX, Full), __DEFINE_LINK_MODE_PARAMS(10, T1L, Full), + __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); -- cgit v1.2.3 From 3a07327d10a09379315c844c63f27941f5081e0a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 25 Oct 2022 13:48:15 +0200 Subject: netfilter: nft_inner: support for inner tunnel header matching This new expression allows you to match on the inner headers that are encapsulated by any of the existing tunneling protocols. This expression parses the inner packet to set the link, network and transport offsets, so the existing expressions (with a few updates) can be reused to match on the inner headers. The inner expression supports for different tunnel combinations such as: - ethernet frame over IPv4/IPv6 packet, eg. VxLAN. - IPv4/IPv6 packet over IPv4/IPv6 packet, eg. IPIP. - IPv4/IPv6 packet over IPv4/IPv6 + transport header, eg. GRE. - transport header (ESP or SCTP) over transport header (usually UDP) The following fields are used to describe the tunnel protocol: - flags, which describe how to parse the inner headers: NFT_PAYLOAD_CTX_INNER_TUN, the tunnel provides its own header. NFT_PAYLOAD_CTX_INNER_ETHER, the ethernet frame is available as inner header. NFT_PAYLOAD_CTX_INNER_NH, the network header is available as inner header. NFT_PAYLOAD_CTX_INNER_TH, the transport header is available as inner header. For example, VxLAN sets on all of these flags. While GRE only sets on NFT_PAYLOAD_CTX_INNER_NH and NFT_PAYLOAD_CTX_INNER_TH. Then, ESP over UDP only sets on NFT_PAYLOAD_CTX_INNER_TH. The tunnel description is composed of the following attributes: - header size: in case the tunnel comes with its own header, eg. VxLAN. - type: this provides a hint to userspace on how to delinearize the rule. This is useful for VxLAN and Geneve since they run over UDP, since transport does not provide a hint. This is also useful in case hardware offload is ever supported. The type is not currently interpreted by the kernel. - expression: currently only payload supported. Follow up patch adds also inner meta support which is required by autogenerated dependencies. The exthdr expression should be supported too at some point. There is a new inner_ops operation that needs to be set on to allow to use an existing expression from the inner expression. This patch adds a new NFT_PAYLOAD_TUN_HEADER base which allows to match on the tunnel header fields, eg. vxlan vni. The payload expression is embedded into nft_inner private area and this private data area is passed to the payload inner eval function via direct call. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 + include/net/netfilter/nf_tables_core.h | 24 +++ include/uapi/linux/netfilter/nf_tables.h | 26 +++ net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_api.c | 37 ++++ net/netfilter/nf_tables_core.c | 1 + net/netfilter/nft_inner.c | 336 +++++++++++++++++++++++++++++++ net/netfilter/nft_payload.c | 89 +++++++- 8 files changed, 518 insertions(+), 2 deletions(-) create mode 100644 net/netfilter/nft_inner.c (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f6db510689a8..2dbfe7524a7e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -375,6 +375,10 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) return (void *)expr->data; } +struct nft_expr_info; + +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, @@ -864,6 +868,7 @@ struct nft_expr_type { const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; + const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 83d763631f81..be2b2b5d0a52 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -19,6 +19,7 @@ extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; extern struct nft_expr_type nft_last_type; extern struct nft_expr_type nft_objref_type; +extern struct nft_expr_type nft_inner_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; @@ -139,4 +140,27 @@ void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); + +enum { + NFT_PAYLOAD_CTX_INNER_TUN = (1 << 0), + NFT_PAYLOAD_CTX_INNER_LL = (1 << 1), + NFT_PAYLOAD_CTX_INNER_NH = (1 << 2), + NFT_PAYLOAD_CTX_INNER_TH = (1 << 3), +}; + +struct nft_inner_tun_ctx { + u16 inner_tunoff; + u16 inner_lloff; + u16 inner_nhoff; + u16 inner_thoff; + __be16 llproto; + u8 l4proto; + u8 flags; +}; + +int nft_payload_inner_offset(const struct nft_pktinfo *pkt); +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx); + #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 466fd3f4447c..05a15dce8271 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -760,6 +760,7 @@ enum nft_payload_bases { NFT_PAYLOAD_NETWORK_HEADER, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_PAYLOAD_INNER_HEADER, + NFT_PAYLOAD_TUN_HEADER, }; /** @@ -779,6 +780,31 @@ enum nft_payload_csum_flags { NFT_PAYLOAD_L4CSUM_PSEUDOHDR = (1 << 0), }; +enum nft_inner_type { + NFT_INNER_UNSPEC = 0, + NFT_INNER_VXLAN, +}; + +enum nft_inner_flags { + NFT_INNER_HDRSIZE = (1 << 0), + NFT_INNER_LL = (1 << 1), + NFT_INNER_NH = (1 << 2), + NFT_INNER_TH = (1 << 3), +}; +#define NFT_INNER_MASK (NFT_INNER_HDRSIZE | NFT_INNER_LL | \ + NFT_INNER_NH | NFT_INNER_TH) + +enum nft_inner_attributes { + NFTA_INNER_UNSPEC, + NFTA_INNER_NUM, + NFTA_INNER_TYPE, + NFTA_INNER_FLAGS, + NFTA_INNER_HDRSIZE, + NFTA_INNER_EXPR, + __NFTA_INNER_MAX +}; +#define NFTA_INNER_MAX (__NFTA_INNER_MAX - 1) + /** * enum nft_payload_attributes - nf_tables payload expression netlink attributes * diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 7a6b518ba2b4..1d4db1943936 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -86,7 +86,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ - nft_counter.o nft_objref.o \ + nft_counter.o nft_objref.o nft_inner.o \ nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58d9cbc9ccdc..6b79f5e18f08 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2857,6 +2857,43 @@ err1: return err; } +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info) +{ + struct nlattr *tb[NFTA_EXPR_MAX + 1]; + const struct nft_expr_type *type; + int err; + + err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla, + nft_expr_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_EXPR_DATA]) + return -EINVAL; + + type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); + if (IS_ERR(type)) + return PTR_ERR(type); + + if (!type->inner_ops) + return -EOPNOTSUPP; + + err = nla_parse_nested_deprecated(info->tb, type->maxattr, + tb[NFTA_EXPR_DATA], + type->policy, NULL); + if (err < 0) + goto err_nla_parse; + + info->attr = nla; + info->ops = type->inner_ops; + + return 0; + +err_nla_parse: + return err; +} + static int nf_tables_newexpr(const struct nft_ctx *ctx, const struct nft_expr_info *expr_info, struct nft_expr *expr) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 6dcead50208c..709a736c301c 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -341,6 +341,7 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_last_type, &nft_counter_type, &nft_objref_type, + &nft_inner_type, }; static struct nft_object_type *nft_basic_objects[] = { diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c new file mode 100644 index 000000000000..1e4079b5b431 --- /dev/null +++ b/net/netfilter/nft_inner.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Pablo Neira Ayuso + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Same layout as nft_expr but it embeds the private expression data area. */ +struct __nft_expr { + const struct nft_expr_ops *ops; + union { + struct nft_payload payload; + } __attribute__((aligned(__alignof__(u64)))); +}; + +enum { + NFT_INNER_EXPR_PAYLOAD, +}; + +struct nft_inner { + u8 flags; + u8 hdrsize; + u8 type; + u8 expr_type; + + struct __nft_expr expr; +}; + +static int nft_inner_parse_l2l3(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 off) +{ + __be16 llproto, outer_llproto; + u32 nhoff, thoff; + + if (priv->flags & NFT_INNER_LL) { + struct vlan_ethhdr *veth, _veth; + struct ethhdr *eth, _eth; + u32 hdrsize; + + eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth); + if (!eth) + return -1; + + switch (eth->h_proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + llproto = eth->h_proto; + hdrsize = sizeof(_eth); + break; + case htons(ETH_P_8021Q): + veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth); + if (!eth) + return -1; + + outer_llproto = veth->h_vlan_encapsulated_proto; + llproto = veth->h_vlan_proto; + hdrsize = sizeof(_veth); + break; + default: + return -1; + } + + ctx->inner_lloff = off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL; + off += hdrsize; + } else { + struct iphdr *iph; + u32 _version; + + iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version); + if (!iph) + return -1; + + switch (iph->version) { + case 4: + llproto = htons(ETH_P_IP); + break; + case 6: + llproto = htons(ETH_P_IPV6); + break; + default: + return -1; + } + } + + ctx->llproto = llproto; + if (llproto == htons(ETH_P_8021Q)) + llproto = outer_llproto; + + nhoff = off; + + switch (llproto) { + case htons(ETH_P_IP): { + struct iphdr *iph, _iph; + + iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return -1; + + if (iph->ihl < 5 || iph->version != 4) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff + (iph->ihl * 4); + if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) { + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = iph->protocol; + } + } + break; + case htons(ETH_P_IPV6): { + struct ipv6hdr *ip6h, _ip6h; + int fh_flags = IP6_FH_F_AUTH; + unsigned short fragoff; + int l4proto; + + ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h); + if (!ip6h) + return -1; + + if (ip6h->version != 6) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff; + l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags); + if (l4proto < 0 || thoff > U16_MAX) + return -1; + + if (fragoff == 0) { + thoff = nhoff + sizeof(_ip6h); + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = l4proto; + } + } + break; + default: + return -1; + } + + return 0; +} + +static int nft_inner_parse_tunhdr(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 *off) +{ + if (pkt->tprot != IPPROTO_UDP || + pkt->tprot != IPPROTO_GRE) + return -1; + + ctx->inner_tunoff = *off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; + *off += priv->hdrsize; + + return 0; +} + +static int nft_inner_parse(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx ctx = {}; + u32 off = pkt->inneroff; + + if (priv->flags & NFT_INNER_HDRSIZE && + nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0) + return -1; + + if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { + if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0) + return -1; + } else if (priv->flags & NFT_INNER_TH) { + ctx.inner_thoff = off; + ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH; + } + + *tun_ctx = ctx; + + return 0; +} + +static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_inner *priv = nft_expr_priv(expr); + struct nft_inner_tun_ctx tun_ctx = {}; + + if (nft_payload_inner_offset(pkt) < 0) + goto err; + + if (nft_inner_parse(priv, pkt, &tun_ctx) < 0) + goto err; + + switch (priv->expr_type) { + case NFT_INNER_EXPR_PAYLOAD: + nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); + break; + default: + WARN_ON_ONCE(1); + goto err; + } + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = { + [NFTA_INNER_NUM] = { .type = NLA_U32 }, + [NFTA_INNER_FLAGS] = { .type = NLA_U32 }, + [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 }, + [NFTA_INNER_TYPE] = { .type = NLA_U32 }, + [NFTA_INNER_EXPR] = { .type = NLA_NESTED }, +}; + +struct nft_expr_info { + const struct nft_expr_ops *ops; + const struct nlattr *attr; + struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; +}; + +static int nft_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_inner *priv = nft_expr_priv(expr); + u32 flags, hdrsize, type, num; + struct nft_expr_info expr_info; + int err; + + if (!tb[NFTA_INNER_FLAGS] || + !tb[NFTA_INNER_HDRSIZE] || + !tb[NFTA_INNER_TYPE] || + !tb[NFTA_INNER_EXPR]) + return -EINVAL; + + flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS])); + if (flags & ~NFT_INNER_MASK) + return -EOPNOTSUPP; + + num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM])); + if (num != 0) + return -EOPNOTSUPP; + + hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE])); + type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE])); + + if (type > U8_MAX) + return -EINVAL; + + if (flags & NFT_INNER_HDRSIZE) { + if (hdrsize == 0 || hdrsize > 64) + return -EOPNOTSUPP; + } + + priv->flags = flags; + priv->hdrsize = hdrsize; + priv->type = type; + + err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info); + if (err < 0) + return err; + + priv->expr.ops = expr_info.ops; + + if (!strcmp(expr_info.ops->type->name, "payload")) + priv->expr_type = NFT_INNER_EXPR_PAYLOAD; + else + return -EINVAL; + + err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr, + (const struct nlattr * const*)expr_info.tb); + if (err < 0) + return err; + + return 0; +} + +static int nft_inner_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_inner *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) || + nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) || + nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) || + nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize))) + goto nla_put_failure; + + if (nft_expr_dump(skb, NFTA_INNER_EXPR, + (struct nft_expr *)&priv->expr) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_expr_ops nft_inner_ops = { + .type = &nft_inner_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)), + .eval = nft_inner_eval, + .init = nft_inner_init, + .dump = nft_inner_dump, +}; + +struct nft_expr_type nft_inner_type __read_mostly = { + .name = "inner", + .ops = &nft_inner_ops, + .policy = nft_inner_policy, + .maxattr = NFTA_INNER_MAX, + .owner = THIS_MODULE, +}; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 84b490d6cc75..9d2ac764a14c 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -144,7 +144,7 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) return 0; } -static int nft_payload_inner_offset(const struct nft_pktinfo *pkt) +int nft_payload_inner_offset(const struct nft_pktinfo *pkt) { if (!(pkt->flags & NFT_PKTINFO_INNER) && __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0) @@ -587,6 +587,92 @@ const struct nft_expr_ops nft_payload_fast_ops = { .offload = nft_payload_offload, }; +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + u32 *dest = ®s->data[priv->dreg]; + int offset; + + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + + switch (priv->base) { + case NFT_PAYLOAD_TUN_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN)) + goto err; + + offset = tun_ctx->inner_tunoff; + break; + case NFT_PAYLOAD_LL_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL)) + goto err; + + offset = tun_ctx->inner_lloff; + break; + case NFT_PAYLOAD_NETWORK_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH)) + goto err; + + offset = tun_ctx->inner_nhoff; + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) + goto err; + + offset = tun_ctx->inner_thoff; + break; + default: + WARN_ON_ONCE(1); + goto err; + } + offset += priv->offset; + + if (skb_copy_bits(skb, offset, dest, priv->len) < 0) + goto err; + + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_payload_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_payload *priv = nft_expr_priv(expr); + u32 base; + + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + switch (base) { + case NFT_PAYLOAD_TUN_HEADER: + case NFT_PAYLOAD_LL_HEADER: + case NFT_PAYLOAD_NETWORK_HEADER: + case NFT_PAYLOAD_TRANSPORT_HEADER: + break; + default: + return -EOPNOTSUPP; + } + + priv->base = base; + priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + + return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); +} + +static const struct nft_expr_ops nft_payload_inner_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), + .init = nft_payload_inner_init, + .dump = nft_payload_dump, + /* direct call to nft_payload_inner_eval(). */ +}; + static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); @@ -930,6 +1016,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, struct nft_expr_type nft_payload_type __read_mostly = { .name = "payload", .select_ops = nft_payload_select_ops, + .inner_ops = &nft_payload_inner_ops, .policy = nft_payload_policy, .maxattr = NFTA_PAYLOAD_MAX, .owner = THIS_MODULE, -- cgit v1.2.3 From 0db14b95660b63dceeb7e89f2e3ffa97d331fce0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:34 +0200 Subject: netfilter: nft_inner: add geneve support Geneve tunnel header may contain options, parse geneve header and update offset to point to the link layer header according to the opt_len field. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nft_inner.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 05a15dce8271..e4b739d57480 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -783,6 +783,7 @@ enum nft_payload_csum_flags { enum nft_inner_type { NFT_INNER_UNSPEC = 0, NFT_INNER_VXLAN, + NFT_INNER_GENEVE, }; enum nft_inner_flags { diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index c43a2fe0ceb7..19fdc8c70cd1 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -181,6 +182,22 @@ static int nft_inner_parse_tunhdr(const struct nft_inner *priv, ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; *off += priv->hdrsize; + switch (priv->type) { + case NFT_INNER_GENEVE: { + struct genevehdr *gnvh, _gnvh; + + gnvh = skb_header_pointer(pkt->skb, pkt->inneroff, + sizeof(_gnvh), &_gnvh); + if (!gnvh) + return -1; + + *off += gnvh->opt_len * 4; + } + break; + default: + break; + } + return 0; } -- cgit v1.2.3 From c4bcfb38a95edb1021a53f2d0356a78120ecfbe4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:50 -0700 Subject: bpf: Implement cgroup storage available to non-cgroup-attached bpf progs Similar to sk/inode/task storage, implement similar cgroup local storage. There already exists a local storage implementation for cgroup-attached bpf programs. See map type BPF_MAP_TYPE_CGROUP_STORAGE and helper bpf_get_local_storage(). But there are use cases such that non-cgroup attached bpf progs wants to access cgroup local storage data. For example, tc egress prog has access to sk and cgroup. It is possible to use sk local storage to emulate cgroup local storage by storing data in socket. But this is a waste as it could be lots of sockets belonging to a particular cgroup. Alternatively, a separate map can be created with cgroup id as the key. But this will introduce additional overhead to manipulate the new map. A cgroup local storage, similar to existing sk/inode/task storage, should help for this use case. The life-cycle of storage is managed with the life-cycle of the cgroup struct. i.e. the storage is destroyed along with the owning cgroup with a call to bpf_cgrp_storage_free() when cgroup itself is deleted. The userspace map operations can be done by using a cgroup fd as a key passed to the lookup, update and delete operations. Typically, the following code is used to get the current cgroup: struct task_struct *task = bpf_get_current_task_btf(); ... task->cgroups->dfl_cgrp ... and in structure task_struct definition: struct task_struct { .... struct css_set __rcu *cgroups; .... } With sleepable program, accessing task->cgroups is not protected by rcu_read_lock. So the current implementation only supports non-sleepable program and supporting sleepable program will be the next step together with adding rcu_read_lock protection for rcu tagged structures. Since map name BPF_MAP_TYPE_CGROUP_STORAGE has been used for old cgroup local storage support, the new map name BPF_MAP_TYPE_CGRP_STORAGE is used for cgroup storage available to non-cgroup-attached bpf programs. The old cgroup storage supports bpf_get_local_storage() helper to get the cgroup data. The new cgroup storage helper bpf_cgrp_storage_get() can provide similar functionality. While old cgroup storage pre-allocates storage memory, the new mechanism can also pre-allocate with a user space bpf_map_update_elem() call to avoid potential run-time memory allocation failure. Therefore, the new cgroup storage can provide all functionality w.r.t. the old one. So in uapi bpf.h, the old BPF_MAP_TYPE_CGROUP_STORAGE is alias to BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED to indicate the old cgroup storage can be deprecated since the new one can provide the same functionality. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042850.673791-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++ include/linux/bpf_types.h | 1 + include/linux/cgroup-defs.h | 4 + include/uapi/linux/bpf.h | 50 ++++++++- kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_cgrp_storage.c | 247 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 6 + kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 13 ++- kernel/cgroup/cgroup.c | 1 + kernel/trace/bpf_trace.c | 4 + scripts/bpf_doc.py | 2 + tools/include/uapi/linux/bpf.h | 50 ++++++++- 13 files changed, 385 insertions(+), 5 deletions(-) create mode 100644 kernel/bpf/bpf_cgrp_storage.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a5dbac8f5aba..9fd68b0b3e9c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2041,6 +2041,7 @@ struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); +void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, @@ -2295,6 +2296,10 @@ static inline bool has_current_bpf_ctx(void) static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) { } + +static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -2535,6 +2540,8 @@ extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2c6a4f2562a7..d4ee3ccd3753 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -86,6 +86,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_CGRP_STORAGE, cgrp_storage_map_ops) #endif #ifdef CONFIG_CGROUP_BPF BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8f481d1b159a..c466fdc3a32a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -504,6 +504,10 @@ struct cgroup { /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_local_storage __rcu *bpf_cgrp_storage; +#endif + /* All ancestors including self */ struct cgroup *ancestors[]; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 17f61338f8f8..94659f6b3395 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -922,7 +922,14 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, - BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated. + */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, @@ -935,6 +942,7 @@ enum bpf_map_type { BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, }; /* Note that tracing related programs such as @@ -5435,6 +5443,44 @@ union bpf_attr { * **-E2BIG** if user-space has tried to publish a sample which is * larger than the size of the ring buffer, or which cannot fit * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5647,6 +5693,8 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ FN(ktime_get_tai_ns, 208, ##ctx) \ FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 341c94f208f4..3a12e6b400a2 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -25,7 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif ifeq ($(CONFIG_CGROUPS),y) -obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c new file mode 100644 index 000000000000..309403800f82 --- /dev/null +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + */ + +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(cgroup_cache); + +static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); + +static void bpf_cgrp_storage_lock(void) +{ + migrate_disable(); + this_cpu_inc(bpf_cgrp_storage_busy); +} + +static void bpf_cgrp_storage_unlock(void) +{ + this_cpu_dec(bpf_cgrp_storage_busy); + migrate_enable(); +} + +static bool bpf_cgrp_storage_trylock(void) +{ + migrate_disable(); + if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { + this_cpu_dec(bpf_cgrp_storage_busy); + migrate_enable(); + return false; + } + return true; +} + +static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) +{ + struct cgroup *cg = owner; + + return &cg->bpf_cgrp_storage; +} + +void bpf_cgrp_storage_free(struct cgroup *cgroup) +{ + struct bpf_local_storage *local_storage; + bool free_cgroup_storage = false; + unsigned long flags; + + rcu_read_lock(); + local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + bpf_cgrp_storage_lock(); + raw_spin_lock_irqsave(&local_storage->lock, flags); + free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_cgrp_storage_unlock(); + rcu_read_unlock(); + + if (free_cgroup_storage) + kfree_rcu(local_storage, rcu); +} + +static struct bpf_local_storage_data * +cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit) +{ + struct bpf_local_storage *cgroup_storage; + struct bpf_local_storage_map *smap; + + cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage, + bpf_rcu_lock_held()); + if (!cgroup_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit); +} + +static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct cgroup *cgroup; + int fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return ERR_CAST(cgroup); + + bpf_cgrp_storage_lock(); + sdata = cgroup_storage_lookup(cgroup, map, true); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return sdata ? sdata->data : NULL; +} + +static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct cgroup *cgroup; + int fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + + bpf_cgrp_storage_lock(); + sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, + value, map_flags, GFP_ATOMIC); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return PTR_ERR_OR_ZERO(sdata); +} + +static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = cgroup_storage_lookup(cgroup, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata), true); + return 0; +} + +static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct cgroup *cgroup; + int err, fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + + bpf_cgrp_storage_lock(); + err = cgroup_storage_delete(cgroup, map); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return err; +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + return bpf_local_storage_map_alloc(attr, &cgroup_cache); +} + +static void cgroup_storage_map_free(struct bpf_map *map) +{ + bpf_local_storage_map_free(map, &cgroup_cache, NULL); +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, + void *, value, u64, flags, gfp_t, gfp_flags) +{ + struct bpf_local_storage_data *sdata; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + if (!cgroup) + return (unsigned long)NULL; + + if (!bpf_cgrp_storage_trylock()) + return (unsigned long)NULL; + + sdata = cgroup_storage_lookup(cgroup, map, true); + if (sdata) + goto unlock; + + /* only allocate new storage, when the cgroup is refcounted */ + if (!percpu_ref_is_dying(&cgroup->self.refcnt) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, + value, BPF_NOEXIST, gfp_flags); + +unlock: + bpf_cgrp_storage_unlock(); + return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; +} + +BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) +{ + int ret; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!cgroup) + return -EINVAL; + + if (!bpf_cgrp_storage_trylock()) + return -EBUSY; + + ret = cgroup_storage_delete(cgroup, map); + bpf_cgrp_storage_unlock(); + return ret; +} + +BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) +const struct bpf_map_ops cgrp_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_cgrp_storage_lookup_elem, + .map_update_elem = bpf_cgrp_storage_update_elem, + .map_delete_elem = bpf_cgrp_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_id = &cgroup_storage_map_btf_ids[0], + .map_owner_storage_ptr = cgroup_storage_ptr, +}; + +const struct bpf_func_proto bpf_cgrp_storage_get_proto = { + .func = bpf_cgrp_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_cgroup_btf_id[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { + .func = bpf_cgrp_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_cgroup_btf_id[0], +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a6b04faed282..124fd199ce5c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1663,6 +1663,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_dynptr_write_proto; case BPF_FUNC_dynptr_data: return &bpf_dynptr_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; +#endif default: break; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a0b4266196a8..11df90962101 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1016,7 +1016,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && map->map_type != BPF_MAP_TYPE_INODE_STORAGE && - map->map_type != BPF_MAP_TYPE_TASK_STORAGE) + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ddc1452cf023..8f849a763b79 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6350,6 +6350,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_task_storage_delete) goto error; break; + case BPF_MAP_TYPE_CGRP_STORAGE: + if (func_id != BPF_FUNC_cgrp_storage_get && + func_id != BPF_FUNC_cgrp_storage_delete) + goto error; + break; case BPF_MAP_TYPE_BLOOM_FILTER: if (func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_map_push_elem) @@ -6462,6 +6467,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) goto error; break; + case BPF_FUNC_cgrp_storage_get: + case BPF_FUNC_cgrp_storage_delete: + if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) + goto error; + break; default: break; } @@ -14139,7 +14149,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_task_storage_get || insn->imm == BPF_FUNC_sk_storage_get || - insn->imm == BPF_FUNC_inode_storage_get) { + insn->imm == BPF_FUNC_inode_storage_get || + insn->imm == BPF_FUNC_cgrp_storage_get) { if (env->prog->aux->sleepable) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); else diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 764bdd5fd8d1..12001928511b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5245,6 +5245,7 @@ static void css_free_rwork_fn(struct work_struct *work) atomic_dec(&cgrp->root->nr_cgrps); cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); + bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index eed1bd952c3a..ce0228c72a93 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1455,6 +1455,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index c0e6690be82a..fdb0aff8cb5a 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -685,6 +685,7 @@ class PrinterHelpers(Printer): 'struct udp6_sock', 'struct unix_sock', 'struct task_struct', + 'struct cgroup', 'struct __sk_buff', 'struct sk_msg_md', @@ -742,6 +743,7 @@ class PrinterHelpers(Printer): 'struct udp6_sock', 'struct unix_sock', 'struct task_struct', + 'struct cgroup', 'struct path', 'struct btf_ptr', 'struct inode', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 17f61338f8f8..94659f6b3395 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -922,7 +922,14 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, - BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated. + */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, @@ -935,6 +942,7 @@ enum bpf_map_type { BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, }; /* Note that tracing related programs such as @@ -5435,6 +5443,44 @@ union bpf_attr { * **-E2BIG** if user-space has tried to publish a sample which is * larger than the size of the ring buffer, or which cannot fit * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5647,6 +5693,8 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ FN(ktime_get_tai_ns, 208, ##ctx) \ FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't -- cgit v1.2.3 From 29c1c44646aec5d5134f2365259a84becc1ee7d3 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:14 +0000 Subject: tcp: add u32 counter in tcp_sock and an SNMP counter for PLB A u32 counter is added to tcp_sock for counting the number of PLB triggered rehashes for a TCP connection. An SNMP counter is also added to count overall PLB triggered rehash events for a host. These counters are hooked up to PLB implementation for DCTCP. TCP_NLA_REHASH is added to SCM_TIMESTAMPING_OPT_STATS that reports the rehash attempts triggered due to PLB or timeouts. This gives a historical view of sustained congestion or timeouts experienced by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/uapi/linux/snmp.h | 1 + include/uapi/linux/tcp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_plb.c | 2 ++ 6 files changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 41b1da621a45..ca7f05a130d2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -423,6 +423,7 @@ struct tcp_sock { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; + u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4d7470036a8b..6600cb0164c2 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -292,6 +292,7 @@ enum LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */ LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */ + LINUX_MIB_TCPPLBREHASH, /* TCPPLBRehash */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8fc09e8638b3..c9abe86eda5f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -315,6 +315,7 @@ enum { TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ TCP_NLA_TTL, /* TTL or hop limit of a packet received */ + TCP_NLA_REHASH, /* PLB and timeout triggered rehash attempts */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 5386f460bd20..f88daace9de3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -297,6 +297,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), + SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef14efa1fb70..1da7c53b6cb5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3176,6 +3176,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; + tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rack.mstamp = 0; @@ -3973,6 +3974,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } @@ -4049,6 +4051,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); + nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c index f4ced370acad..bb1a08fda113 100644 --- a/net/ipv4/tcp_plb.c +++ b/net/ipv4/tcp_plb.c @@ -79,6 +79,8 @@ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) sk_rethink_txhash(sk); plb->consec_cong_rounds = 0; + tcp_sk(sk)->plb_rehash++; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); } EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); -- cgit v1.2.3 From 71fc704768f601ed3fa36310822a5e03f310f781 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:15 +0000 Subject: tcp: add rcv_wnd and plb_rehash to TCP_INFO rcv_wnd can be useful to diagnose TCP performance where receiver window becomes the bottleneck. rehash reports the PLB and timeout triggered rehash attempts by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 5 +++++ net/ipv4/tcp.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index c9abe86eda5f..879eeb0a084b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -284,6 +284,11 @@ struct tcp_info { __u32 tcpi_snd_wnd; /* peer's advertised receive window after * scaling (bytes) */ + __u32 tcpi_rcv_wnd; /* local advertised receive window after + * scaling (bytes) + */ + + __u32 tcpi_rehash; /* PLB or timeout triggered rehash attempts */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1da7c53b6cb5..de8f0cd7cb32 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3940,6 +3940,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_rcv_wnd = tp->rcv_wnd; + info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } -- cgit v1.2.3 From 58ba426388d9fe56aa638f555b01d6e63cada88c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 27 Oct 2022 17:10:14 -0400 Subject: net/packet: add PACKET_FANOUT_FLAG_IGNORE_OUTGOING Extend packet socket option PACKET_IGNORE_OUTGOING to fanout groups. The socket option sets ptype.ignore_outgoing, which makes dev_queue_xmit_nit skip the socket. When the socket joins a fanout group, the option is not reflected in the struct ptype of the group. dev_queue_xmit_nit only tests the fanout ptype, so the flag is ignored once a socket joins a fanout group. Inheriting the option from a socket would change established behavior. Different sockets in the group can set different flags, and can also change them at runtime. Testing in packet_rcv_fanout defeats the purpose of the original patch, which is to avoid skb_clone in dev_queue_xmit_nit (esp. for MSG_ZEROCOPY packets). Instead, introduce a new fanout group flag with the same behavior. Tested with https://github.com/wdebruij/kerneltools/blob/master/tests/test_psock_fanout_ignore_outgoing.c Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20221027211014.3581513-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index c07caf7b40db..a8516b3594a4 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -70,6 +70,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_EBPF 7 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_UNIQUEID 0x2000 +#define PACKET_FANOUT_FLAG_IGNORE_OUTGOING 0x4000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 struct tpacket_stats { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8c5b3da0c29f..44f20cf8a0c0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1777,6 +1777,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; + match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; -- cgit v1.2.3 From 738136a0e3757a8534df3ad97d6ff6d7f429f6c1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Oct 2022 14:25:53 -0700 Subject: netlink: split up copies in the ack construction Clean up the use of unsafe_memcpy() by adding a flexible array at the end of netlink message header and splitting up the header and data copies. Reviewed-by: Kees Cook Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 21 +++++++++++++++++++++ include/uapi/linux/netlink.h | 2 ++ net/netlink/af_netlink.c | 29 ++++++++++++++++++++--------- 3 files changed, 43 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netlink.h b/include/net/netlink.h index 4418b1981e31..784b4688fc6f 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -931,6 +931,27 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se return __nlmsg_put(skb, portid, seq, type, payload, flags); } +/** + * nlmsg_append - Add more data to a nlmsg in a skb + * @skb: socket buffer to store message in + * @size: length of message payload + * + * Append data to an existing nlmsg, used when constructing a message + * with multiple fixed-format headers (which is rare). + * Returns NULL if the tailroom of the skb is insufficient to store + * the extra payload. + */ +static inline void *nlmsg_append(struct sk_buff *skb, u32 size) +{ + if (unlikely(skb_tailroom(skb) < NLMSG_ALIGN(size))) + return NULL; + + if (NLMSG_ALIGN(size) - size) + memset(skb_tail_pointer(skb) + size, 0, + NLMSG_ALIGN(size) - size); + return __skb_put(skb, NLMSG_ALIGN(size)); +} + /** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index e2ae82e3f9f7..5da0da59bf01 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -48,6 +48,7 @@ struct sockaddr_nl { * @nlmsg_flags: Additional flags * @nlmsg_seq: Sequence number * @nlmsg_pid: Sending process port ID + * @nlmsg_data: Message payload */ struct nlmsghdr { __u32 nlmsg_len; @@ -55,6 +56,7 @@ struct nlmsghdr { __u16 nlmsg_flags; __u32 nlmsg_seq; __u32 nlmsg_pid; + __u8 nlmsg_data[]; }; /* Flags values */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f0c94d394ab1..b10d5e50b99d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2499,19 +2499,24 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); - if (!skb) { - NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; - sk_error_report(NETLINK_CB(in_skb).sk); - return; - } + if (!skb) + goto err_bad_put; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - NLMSG_ERROR, payload, flags); + NLMSG_ERROR, sizeof(*errmsg), flags); + if (!rep) + goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; - unsafe_memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) - ? nlh->nlmsg_len : sizeof(*nlh), - /* Bounds checked by the skb layer. */); + errmsg->msg = *nlh; + + if (!(flags & NLM_F_CAPPED)) { + if (!nlmsg_append(skb, nlmsg_len(nlh))) + goto err_bad_put; + + memcpy(errmsg->msg.nlmsg_data, nlh->nlmsg_data, + nlmsg_len(nlh)); + } if (tlvlen) netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack); @@ -2519,6 +2524,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, nlmsg_end(skb, rep); nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); + + return; + +err_bad_put: + NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; + sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); -- cgit v1.2.3 From ec32c0c42d0a7208571c4c77f140bffaa4e5e304 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 1 Nov 2022 10:48:29 +0100 Subject: net: dcb: add new pcp selector to app object Add new PCP selector for the 8021Qaz APP managed object. As the PCP selector is not part of the 8021Qaz standard, a new non-std extension attribute DCB_ATTR_DCB_APP has been introduced. Also two helper functions to translate between selector and app attribute type has been added. The new selector has been given a value of 255, to minimize the risk of future overlap of std- and non-std attributes. The new DCB_ATTR_DCB_APP is sent alongside the ieee std attribute in the app table. This means that the dcb_app struct can now both contain std- and non-std app attributes. Currently there is no overlap between the selector values of the two attributes. The purpose of adding the PCP selector, is to be able to offload PCP-based queue classification to the 8021Q Priority Code Point table, see 6.9.3 of IEEE Std 802.1Q-2018. PCP and DEI is encoded in the protocol field as 8*dei+pcp, so that a mapping of PCP 2 and DEI 1 to priority 3 is encoded as {255, 10, 3}. Signed-off-by: Daniel Machon Reviewed-by: Petr Machata Signed-off-by: Paolo Abeni --- include/uapi/linux/dcbnl.h | 6 +++++ net/dcb/dcbnl.c | 58 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index a791a94013a6..dc7ef96207ca 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -218,6 +218,9 @@ struct cee_pfc { #define IEEE_8021QAZ_APP_SEL_ANY 4 #define IEEE_8021QAZ_APP_SEL_DSCP 5 +/* Non-std selector values */ +#define DCB_APP_SEL_PCP 255 + /* This structure contains the IEEE 802.1Qaz APP managed object. This * object is also used for the CEE std as well. * @@ -247,6 +250,8 @@ struct dcb_app { __u16 protocol; }; +#define IEEE_8021QAZ_APP_SEL_MAX 255 + /** * struct dcb_peer_app_info - APP feature information sent by the peer * @@ -425,6 +430,7 @@ enum ieee_attrs { enum ieee_attrs_app { DCB_ATTR_IEEE_APP_UNSPEC, DCB_ATTR_IEEE_APP, + DCB_ATTR_DCB_APP, __DCB_ATTR_IEEE_APP_MAX }; #define DCB_ATTR_IEEE_APP_MAX (__DCB_ATTR_IEEE_APP_MAX - 1) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index dc4fb699b56c..0c17bb28ae60 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -179,6 +179,38 @@ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { static LIST_HEAD(dcb_app_list); static DEFINE_SPINLOCK(dcb_lock); +static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) +{ + switch (selector) { + case IEEE_8021QAZ_APP_SEL_ETHERTYPE: + case IEEE_8021QAZ_APP_SEL_STREAM: + case IEEE_8021QAZ_APP_SEL_DGRAM: + case IEEE_8021QAZ_APP_SEL_ANY: + case IEEE_8021QAZ_APP_SEL_DSCP: + return DCB_ATTR_IEEE_APP; + case DCB_APP_SEL_PCP: + return DCB_ATTR_DCB_APP; + default: + return DCB_ATTR_IEEE_APP_UNSPEC; + } +} + +static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type) +{ + switch (type) { + case DCB_ATTR_IEEE_APP: + case DCB_ATTR_DCB_APP: + return true; + default: + return false; + } +} + +static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector) +{ + return dcbnl_app_attr_type_get(selector) == type; +} + static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq, u32 flags, struct nlmsghdr **nlhp) { @@ -1116,8 +1148,9 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { - err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app), - &itr->app); + enum ieee_attrs_app type = + dcbnl_app_attr_type_get(itr->app.selector); + err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); return -EMSGSIZE; @@ -1493,9 +1526,10 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, int rem; nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { + enum ieee_attrs_app type = nla_type(attr); struct dcb_app *app_data; - if (nla_type(attr) != DCB_ATTR_IEEE_APP) + if (!dcbnl_app_attr_type_validate(type)) continue; if (nla_len(attr) < sizeof(struct dcb_app)) { @@ -1504,6 +1538,13 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, } app_data = nla_data(attr); + + if (!dcbnl_app_selector_validate(type, + app_data->selector)) { + err = -EINVAL; + goto err; + } + if (ops->ieee_setapp) err = ops->ieee_setapp(netdev, app_data); else @@ -1554,11 +1595,20 @@ static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, int rem; nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { + enum ieee_attrs_app type = nla_type(attr); struct dcb_app *app_data; - if (nla_type(attr) != DCB_ATTR_IEEE_APP) + if (!dcbnl_app_attr_type_validate(type)) continue; + app_data = nla_data(attr); + + if (!dcbnl_app_selector_validate(type, + app_data->selector)) { + err = -EINVAL; + goto err; + } + if (ops->ieee_delapp) err = ops->ieee_delapp(netdev, app_data); else -- cgit v1.2.3 From 6182d5875c330a5a611687caa05f47752455720c Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 1 Nov 2022 10:48:30 +0100 Subject: net: dcb: add new apptrust attribute Add new apptrust extension attributes to the 8021Qaz APP managed object. Two new attributes, DCB_ATTR_DCB_APP_TRUST_TABLE and DCB_ATTR_DCB_APP_TRUST, has been added. Trusted selectors are passed in the nested attribute DCB_ATTR_DCB_APP_TRUST, in order of precedence. The new attributes are meant to allow drivers, whose hw supports the notion of trust, to be able to set whether a particular app selector is trusted - and in which order. Signed-off-by: Daniel Machon Reviewed-by: Petr Machata Signed-off-by: Paolo Abeni --- include/net/dcbnl.h | 4 +++ include/uapi/linux/dcbnl.h | 2 ++ net/dcb/dcbnl.c | 76 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 2b2d86fb3131..8841ab6c2de7 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -109,6 +109,10 @@ struct dcbnl_rtnl_ops { /* buffer settings */ int (*dcbnl_getbuffer)(struct net_device *, struct dcbnl_buffer *); int (*dcbnl_setbuffer)(struct net_device *, struct dcbnl_buffer *); + + /* apptrust */ + int (*dcbnl_setapptrust)(struct net_device *, u8 *, int); + int (*dcbnl_getapptrust)(struct net_device *, u8 *, int *); }; #endif /* __NET_DCBNL_H__ */ diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index dc7ef96207ca..99047223ab26 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -410,6 +410,7 @@ enum dcbnl_attrs { * @DCB_ATTR_IEEE_PEER_ETS: peer ETS configuration - get only * @DCB_ATTR_IEEE_PEER_PFC: peer PFC configuration - get only * @DCB_ATTR_IEEE_PEER_APP: peer APP tlv - get only + * @DCB_ATTR_DCB_APP_TRUST_TABLE: selector trust table */ enum ieee_attrs { DCB_ATTR_IEEE_UNSPEC, @@ -423,6 +424,7 @@ enum ieee_attrs { DCB_ATTR_IEEE_QCN, DCB_ATTR_IEEE_QCN_STATS, DCB_ATTR_DCB_BUFFER, + DCB_ATTR_DCB_APP_TRUST_TABLE, __DCB_ATTR_IEEE_MAX }; #define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 0c17bb28ae60..cec0632f96db 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -166,6 +166,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)}, + [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED}, }; /* DCB number of traffic classes nested attributes. */ @@ -1062,9 +1063,9 @@ nla_put_failure: /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { - struct nlattr *ieee, *app; - struct dcb_app_type *itr; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; + struct nlattr *ieee, *app, *apptrust; + struct dcb_app_type *itr; int dcbx; int err; @@ -1166,6 +1167,30 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); + if (ops->dcbnl_getapptrust) { + u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; + int nselectors, i; + + apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); + if (!apptrust) + return -EMSGSIZE; + + err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); + if (!err) { + for (i = 0; i < nselectors; i++) { + enum ieee_attrs_app type = + dcbnl_app_attr_type_get(selectors[i]); + err = nla_put_u8(skb, type, selectors[i]); + if (err) { + nla_nest_cancel(skb, apptrust); + return err; + } + } + } + + nla_nest_end(skb, apptrust); + } + /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; @@ -1554,6 +1579,53 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, } } + if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { + u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; + struct nlattr *attr; + int nselectors = 0; + int rem; + + if (!ops->dcbnl_setapptrust) { + err = -EOPNOTSUPP; + goto err; + } + + nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE], + rem) { + enum ieee_attrs_app type = nla_type(attr); + u8 selector; + int i; + + if (!dcbnl_app_attr_type_validate(type) || + nla_len(attr) != 1 || + nselectors >= sizeof(selectors)) { + err = -EINVAL; + goto err; + } + + selector = nla_get_u8(attr); + + if (!dcbnl_app_selector_validate(type, selector)) { + err = -EINVAL; + goto err; + } + + /* Duplicate selector ? */ + for (i = 0; i < nselectors; i++) { + if (selectors[i] == selector) { + err = -EINVAL; + goto err; + } + } + + selectors[nselectors++] = selector; + } + + err = ops->dcbnl_setapptrust(netdev, selectors, nselectors); + if (err) + goto err; + } + err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0); -- cgit v1.2.3 From a35ec8e38cdd1766f29924ca391a01de20163931 Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Tue, 1 Nov 2022 21:39:21 +0200 Subject: bridge: Add MAC Authentication Bypass (MAB) support Hosts that support 802.1X authentication are able to authenticate themselves by exchanging EAPOL frames with an authenticator (Ethernet bridge, in this case) and an authentication server. Access to the network is only granted by the authenticator to successfully authenticated hosts. The above is implemented in the bridge using the "locked" bridge port option. When enabled, link-local frames (e.g., EAPOL) can be locally received by the bridge, but all other frames are dropped unless the host is authenticated. That is, unless the user space control plane installed an FDB entry according to which the source address of the frame is located behind the locked ingress port. The entry can be dynamic, in which case learning needs to be enabled so that the entry will be refreshed by incoming traffic. There are deployments in which not all the devices connected to the authenticator (the bridge) support 802.1X. Such devices can include printers and cameras. One option to support such deployments is to unlock the bridge ports connecting these devices, but a slightly more secure option is to use MAB. When MAB is enabled, the MAC address of the connected device is used as the user name and password for the authentication. For MAB to work, the user space control plane needs to be notified about MAC addresses that are trying to gain access so that they will be compared against an allow list. This can be implemented via the regular learning process with the sole difference that learned FDB entries are installed with a new "locked" flag indicating that the entry cannot be used to authenticate the device. The flag cannot be set by user space, but user space can clear the flag by replacing the entry, thereby authenticating the device. Locked FDB entries implement the following semantics with regards to roaming, aging and forwarding: 1. Roaming: Locked FDB entries can roam to unlocked (authorized) ports, in which case the "locked" flag is cleared. FDB entries cannot roam to locked ports regardless of MAB being enabled or not. Therefore, locked FDB entries are only created if an FDB entry with the given {MAC, VID} does not already exist. This behavior prevents unauthenticated devices from disrupting traffic destined to already authenticated devices. 2. Aging: Locked FDB entries age and refresh by incoming traffic like regular entries. 3. Forwarding: Locked FDB entries forward traffic like regular entries. If user space detects an unauthorized MAC behind a locked port and wishes to prevent traffic with this MAC DA from reaching the host, it can do so using tc or a different mechanism. Enable the above behavior using a new bridge port option called "mab". It can only be enabled on a bridge port that is both locked and has learning enabled. Locked FDB entries are flushed from the port once MAB is disabled. A new option is added because there are pure 802.1X deployments that are not interested in notifications about locked FDB entries. Signed-off-by: Hans J. Schultz Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + include/uapi/linux/neighbour.h | 8 +++++++- net/bridge/br_fdb.c | 24 ++++++++++++++++++++++++ net/bridge/br_input.c | 21 +++++++++++++++++++-- net/bridge/br_netlink.c | 21 ++++++++++++++++++++- net/bridge/br_private.h | 3 ++- net/core/rtnetlink.c | 5 +++++ 8 files changed, 79 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index d62ef428e3aa..1668ac4d7adc 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -59,6 +59,7 @@ struct br_ip_list { #define BR_MRP_LOST_IN_CONT BIT(19) #define BR_TX_FWD_OFFLOAD BIT(20) #define BR_PORT_LOCKED BIT(21) +#define BR_PORT_MAB BIT(22) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5e7a1041df3a..d92b3f79eba3 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -561,6 +561,7 @@ enum { IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, IFLA_BRPORT_LOCKED, + IFLA_BRPORT_MAB, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index a998bf761635..5e67a7eaf4a7 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -52,7 +52,8 @@ enum { #define NTF_STICKY (1 << 6) #define NTF_ROUTER (1 << 7) /* Extended flags under NDA_FLAGS_EXT: */ -#define NTF_EXT_MANAGED (1 << 0) +#define NTF_EXT_MANAGED (1 << 0) +#define NTF_EXT_LOCKED (1 << 1) /* * Neighbor Cache Entry States. @@ -86,6 +87,11 @@ enum { * NTF_EXT_MANAGED flagged neigbor entries are managed by the kernel on behalf * of a user space control plane, and automatically refreshed so that (if * possible) they remain in NUD_REACHABLE state. + * + * NTF_EXT_LOCKED flagged bridge FDB entries are entries generated by the + * bridge in response to a host trying to communicate via a locked bridge port + * with MAB enabled. Their purpose is to notify user space that a host requires + * authentication. */ struct nda_cacheinfo { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e7f4fccb6adb..3b83af4458b8 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -105,6 +105,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; + u32 ext_flags = 0; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) @@ -125,11 +126,16 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_flags |= NTF_EXT_LEARNED; if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; + if (test_bit(BR_FDB_LOCKED, &fdb->flags)) + ext_flags |= NTF_EXT_LOCKED; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) goto nla_put_failure; + if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags)) + goto nla_put_failure; + ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); @@ -171,6 +177,7 @@ static inline size_t fdb_nlmsg_size(void) return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + + nla_total_size(sizeof(u32)) /* NDA_FLAGS_EXT */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ @@ -879,6 +886,11 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, &fdb->flags))) clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); + /* Clear locked flag when roaming to an + * unlocked port. + */ + if (unlikely(test_bit(BR_FDB_LOCKED, &fdb->flags))) + clear_bit(BR_FDB_LOCKED, &fdb->flags); } if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) @@ -1082,6 +1094,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + if (test_and_clear_bit(BR_FDB_LOCKED, &fdb->flags)) + modified = true; + if (fdb_handle_notify(fdb, notify)) modified = true; @@ -1150,6 +1165,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; struct net_bridge *br = NULL; + u32 ext_flags = 0; int err = 0; trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags); @@ -1178,6 +1194,14 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], vg = nbp_vlan_group(p); } + if (tb[NDA_FLAGS_EXT]) + ext_flags = nla_get_u32(tb[NDA_FLAGS_EXT]); + + if (ext_flags & NTF_EXT_LOCKED) { + NL_SET_ERR_MSG_MOD(extack, "Cannot add FDB entry with \"locked\" flag set"); + return -EINVAL; + } + if (tb[NDA_FDB_EXT_ATTRS]) { attr = tb[NDA_FDB_EXT_ATTRS]; err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 68b3e850bcb9..d04d2205ad4e 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -109,9 +109,26 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_fdb_entry *fdb_src = br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); - if (!fdb_src || READ_ONCE(fdb_src->dst) != p || - test_bit(BR_FDB_LOCAL, &fdb_src->flags)) + if (!fdb_src) { + /* FDB miss. Create locked FDB entry if MAB is enabled + * and drop the packet. + */ + if (p->flags & BR_PORT_MAB) + br_fdb_update(br, p, eth_hdr(skb)->h_source, + vid, BIT(BR_FDB_LOCKED)); goto drop; + } else if (READ_ONCE(fdb_src->dst) != p || + test_bit(BR_FDB_LOCAL, &fdb_src->flags)) { + /* FDB mismatch. Drop the packet without roaming. */ + goto drop; + } else if test_bit(BR_FDB_LOCKED, &fdb_src->flags) { + /* FDB match, but entry is locked. Refresh it and drop + * the packet. + */ + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, + BIT(BR_FDB_LOCKED)); + goto drop; + } } nbp_switchdev_frame_mark(p, skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d087fd4c784a..4316cc82ae17 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -188,6 +188,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + + nla_total_size(1) /* IFLA_BRPORT_MAB */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -274,7 +275,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || - nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED))) + nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) || + nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -876,6 +878,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, + [IFLA_BRPORT_MAB] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, }; @@ -943,6 +946,22 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); + br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB); + + if ((p->flags & BR_PORT_MAB) && + (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) { + NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled"); + p->flags = old_flags; + return -EINVAL; + } else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) { + struct net_bridge_fdb_flush_desc desc = { + .flags = BIT(BR_FDB_LOCKED), + .flags_mask = BIT(BR_FDB_LOCKED), + .port_ifindex = p->dev->ifindex, + }; + + br_fdb_flush(p->br, &desc); + } changed_mask = old_flags ^ p->flags; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 06e5f6faa431..4ce8b8e5ae0b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -251,7 +251,8 @@ enum { BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, BR_FDB_NOTIFY, - BR_FDB_NOTIFY_INACTIVE + BR_FDB_NOTIFY_INACTIVE, + BR_FDB_LOCKED, }; struct net_bridge_fdb_key { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d2f27548fc0b..b64fffeb3844 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4051,6 +4051,11 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, return err; } + if (tb[NDA_FLAGS_EXT]) { + netdev_info(dev, "invalid flags given to default FDB implementation\n"); + return err; + } + if (vid) { netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; -- cgit v1.2.3 From dca56c3038c34a3e5acfe0aadb1f2bc9d724ae79 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:11 +0100 Subject: net: expose devlink port over rtnetlink Expose devlink port handle related to netdev over rtnetlink. Introduce a new nested IFLA attribute to carry the info. Call into devlink code to fill-up the nest with existing devlink attributes that are used over devlink netlink. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 14 ++++++++++++++ include/uapi/linux/if_link.h | 2 ++ net/core/devlink.c | 18 ++++++++++++++++++ net/core/rtnetlink.c | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7befad57afd4..fa6e936af1a5 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1873,6 +1873,9 @@ int devlink_compat_phys_port_name_get(struct net_device *dev, int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid); +int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port); +size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port); + #else static inline struct devlink *devlink_try_get(struct devlink *devlink) @@ -1909,6 +1912,17 @@ devlink_compat_switch_id_get(struct net_device *dev, return -EOPNOTSUPP; } +static inline int +devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) +{ + return 0; +} + +static inline size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d92b3f79eba3..1021a7e47a86 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -372,6 +372,8 @@ enum { IFLA_TSO_MAX_SEGS, IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ + IFLA_DEVLINK_PORT, + __IFLA_MAX }; diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a454d0045e5..2dcf2bcc3527 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -880,6 +880,24 @@ nla_put_failure: return -EMSGSIZE; } +int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) +{ + if (devlink_nl_put_handle(msg, devlink_port->devlink)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + return -EMSGSIZE; + return 0; +} + +size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) +{ + struct devlink *devlink = devlink_port->devlink; + + return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ +} + struct devlink_reload_combination { enum devlink_reload_action action; enum devlink_reload_limit limit; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b64fffeb3844..64289bc98887 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "dev.h" @@ -1038,6 +1039,16 @@ static size_t rtnl_proto_down_size(const struct net_device *dev) return size; } +static size_t rtnl_devlink_port_size(const struct net_device *dev) +{ + size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */ + + if (dev->devlink_port) + size += devlink_nl_port_handle_size(dev->devlink_port); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1091,6 +1102,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + + rtnl_devlink_port_size(dev) + 0; } @@ -1728,6 +1740,30 @@ nla_put_failure: return -EMSGSIZE; } +static int rtnl_fill_devlink_port(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *devlink_port_nest; + int ret; + + devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT); + if (!devlink_port_nest) + return -EMSGSIZE; + + if (dev->devlink_port) { + ret = devlink_nl_port_handle_fill(skb, dev->devlink_port); + if (ret < 0) + goto nest_cancel; + } + + nla_nest_end(skb, devlink_port_nest); + return 0; + +nest_cancel: + nla_nest_cancel(skb, devlink_port_nest); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1865,6 +1901,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, dev->dev.parent->bus->name)) goto nla_put_failure; + if (rtnl_fill_devlink_port(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 9a0f830f80265bd1ef816e1541ac24bee80e9a3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:01:25 -0700 Subject: ethtool: linkstate: add a statistic for PHY down events The previous attempt to augment carrier_down (see Link) was not met with much enthusiasm so let's do the simple thing of exposing what some devices already maintain. Add a common ethtool statistic for link going down. Currently users have to maintain per-driver mapping to extract the right stat from the vendor-specific ethtool -S stats. carrier_down does not fit the bill because it counts a lot of software related false positives. Add the statistic to the extended link state API to steer vendors towards implementing all of it. Implement for bnxt and all Linux-controlled PHYs. mlx5 and (possibly) enic also have a counter for this but I leave the implementation to their maintainers. Link: https://lore.kernel.org/r/20220520004500.2250674-1-kuba@kernel.org Reviewed-by: Florian Fainelli Reviewed-by: Michael Chan Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20221104190125.684910-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/networking/ethtool-netlink.rst | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 15 ++++++++++++++ drivers/net/phy/phy.c | 1 + include/linux/ethtool.h | 17 ++++++++++++++++ include/linux/phy.h | 3 +++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/linkstate.c | 24 ++++++++++++++++++++++- 7 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index d578b8bcd8a4..bede24ef44fd 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -491,6 +491,7 @@ Kernel response contents: ``ETHTOOL_A_LINKSTATE_SQI_MAX`` u32 Max support SQI value ``ETHTOOL_A_LINKSTATE_EXT_STATE`` u8 link extended state ``ETHTOOL_A_LINKSTATE_EXT_SUBSTATE`` u8 link extended substate + ``ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT`` u32 count of link down events ==================================== ====== ============================ For most NIC drivers, the value of ``ETHTOOL_A_LINKSTATE_LINK`` returns diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index cc89e5eabcb9..d8f0351df954 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4112,6 +4112,20 @@ static void bnxt_get_rmon_stats(struct net_device *dev, *ranges = bnxt_rmon_ranges; } +static void bnxt_get_link_ext_stats(struct net_device *dev, + struct ethtool_link_ext_stats *stats) +{ + struct bnxt *bp = netdev_priv(dev); + u64 *rx; + + if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS_EXT)) + return; + + rx = bp->rx_port_stats_ext.sw_stats; + stats->link_down_events = + *(rx + BNXT_RX_STATS_EXT_OFFSET(link_down_events)); +} + void bnxt_ethtool_free(struct bnxt *bp) { kfree(bp->test_info); @@ -4161,6 +4175,7 @@ const struct ethtool_ops bnxt_ethtool_ops = { .get_eeprom = bnxt_get_eeprom, .set_eeprom = bnxt_set_eeprom, .get_link = bnxt_get_link, + .get_link_ext_stats = bnxt_get_link_ext_stats, .get_eee = bnxt_get_eee, .set_eee = bnxt_set_eee, .get_module_info = bnxt_get_module_info, diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e741d8aebffe..e5b6cb1a77f9 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -67,6 +67,7 @@ static void phy_link_down(struct phy_device *phydev) { phydev->phy_link_change(phydev, false); phy_led_trigger_change_speed(phydev); + WRITE_ONCE(phydev->link_down_events, phydev->link_down_events + 1); } static const char *phy_pause_str(struct phy_device *phydev) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 99dc7bfbcd3c..5c51c7fda32a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -125,6 +125,20 @@ struct ethtool_link_ext_state_info { }; }; +struct ethtool_link_ext_stats { + /* Custom Linux statistic for PHY level link down events. + * In a simpler world it should be equal to netdev->carrier_down_count + * unfortunately netdev also counts local reconfigurations which don't + * actually take the physical link down, not to mention NC-SI which, + * if present, keeps the link up regardless of host state. + * This statistic counts when PHY _actually_ went down, or lost link. + * + * Note that we need u64 for ethtool_stats_init() and comparisons + * to ETHTOOL_STAT_NOT_SET, but only u32 is exposed to the user. + */ + u64 link_down_events; +}; + /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table @@ -481,6 +495,7 @@ struct ethtool_module_power_mode_params { * do not attach ext_substate attribute to netlink message). If link_ext_state * and link_ext_substate are unknown, return -ENODATA. If not implemented, * link_ext_state and link_ext_substate will not be sent to userspace. + * @get_link_ext_stats: Read extra link-related counters. * @get_eeprom_len: Read range of EEPROM addresses for validation of * @get_eeprom and @set_eeprom requests. * Returns 0 if device does not support EEPROM access. @@ -652,6 +667,8 @@ struct ethtool_ops { u32 (*get_link)(struct net_device *); int (*get_link_ext_state)(struct net_device *, struct ethtool_link_ext_state_info *); + void (*get_link_ext_stats)(struct net_device *dev, + struct ethtool_link_ext_stats *stats); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); diff --git a/include/linux/phy.h b/include/linux/phy.h index ddf66198f751..9a3752c0c444 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -600,6 +600,7 @@ struct macsec_ops; * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine + * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * @@ -723,6 +724,8 @@ struct phy_device { int pma_extable; + unsigned int link_down_events; + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index bb57084ac524..aaf7c6963d61 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -262,6 +262,7 @@ enum { ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index fb676f349455..2158c17a0b32 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -13,6 +13,7 @@ struct linkstate_reply_data { int link; int sqi; int sqi_max; + struct ethtool_link_ext_stats link_stats; bool link_ext_state_provided; struct ethtool_link_ext_state_info ethtool_link_ext_state_info; }; @@ -22,7 +23,7 @@ struct linkstate_reply_data { const struct nla_policy ethnl_linkstate_get_policy[] = { [ETHTOOL_A_LINKSTATE_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static int linkstate_get_sqi(struct net_device *dev) @@ -107,6 +108,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, goto out; } + ethtool_stats_init((u64 *)&data->link_stats, + sizeof(data->link_stats) / 8); + + if (req_base->flags & ETHTOOL_FLAG_STATS) { + if (dev->phydev) + data->link_stats.link_down_events = + READ_ONCE(dev->phydev->link_down_events); + + if (dev->ethtool_ops->get_link_ext_stats) + dev->ethtool_ops->get_link_ext_stats(dev, + &data->link_stats); + } + ret = 0; out: ethnl_ops_complete(dev); @@ -134,6 +148,9 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, if (data->ethtool_link_ext_state_info.__link_ext_substate) len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */ + if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET) + len += nla_total_size(sizeof(u32)); + return len; } @@ -166,6 +183,11 @@ static int linkstate_fill_reply(struct sk_buff *skb, return -EMSGSIZE; } + if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET) + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, + data->link_stats.link_down_events)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3 From a21b06e7319129994f339ed47f512bbe57b77f5b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:17 -0500 Subject: net: sched: add helper support in act_ct This patch is to add helper support in act_ct for OVS actions=ct(alg=xxx) offloading, which is corresponding to Commit cae3a2627520 ("openvswitch: Allow attaching helpers to ct action") in OVS kernel part. The difference is when adding TC actions family and proto cannot be got from the filter/match, other than helper name in tb[TCA_CT_HELPER_NAME], we also need to send the family in tb[TCA_CT_HELPER_FAMILY] and the proto in tb[TCA_CT_HELPER_PROTO] to kernel. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/tc_act/tc_ct.h | 1 + include/uapi/linux/tc_act/tc_ct.h | 3 ++ net/sched/act_ct.c | 89 +++++++++++++++++++++++++++++++++++---- 3 files changed, 85 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 8250d6f0a462..b24ea2d9400b 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -10,6 +10,7 @@ #include struct tcf_ct_params { + struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h index 5fb1d7ac1027..6c5200f0ed38 100644 --- a/include/uapi/linux/tc_act/tc_ct.h +++ b/include/uapi/linux/tc_act/tc_ct.h @@ -22,6 +22,9 @@ enum { TCA_CT_NAT_PORT_MIN, /* be16 */ TCA_CT_NAT_PORT_MAX, /* be16 */ TCA_CT_PAD, + TCA_CT_HELPER_NAME, /* string */ + TCA_CT_HELPER_FAMILY, /* u8 */ + TCA_CT_HELPER_PROTO, /* u8 */ __TCA_CT_MAX }; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 193a460a9d7f..da0b7f665277 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -33,6 +33,7 @@ #include #include #include +#include #include static struct workqueue_struct *act_ct_wq; @@ -655,7 +656,7 @@ struct tc_ct_action_net { /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, - u16 zone_id, bool force) + struct tcf_ct_params *p) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -665,11 +666,19 @@ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, return false; if (!net_eq(net, read_pnet(&ct->ct_net))) goto drop_ct; - if (nf_ct_zone(ct)->id != zone_id) + if (nf_ct_zone(ct)->id != p->zone) goto drop_ct; + if (p->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != p->helper) + goto drop_ct; + } /* Force conntrack entry direction. */ - if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { + if ((p->ct_action & TCA_CT_ACT_FORCE) && + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { if (nf_ct_is_confirmed(ct)) nf_ct_kill(ct); @@ -832,6 +841,13 @@ out_free: static void tcf_ct_params_free(struct tcf_ct_params *params) { + if (params->helper) { +#if IS_ENABLED(CONFIG_NF_NAT) + if (params->ct_action & TCA_CT_ACT_NAT) + nf_nat_helper_put(params->helper); +#endif + nf_conntrack_helper_put(params->helper); + } if (params->ct_ft) tcf_ct_flow_table_put(params->ct_ft); if (params->tmpl) @@ -1026,13 +1042,14 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct net *net = dev_net(skb->dev); - bool cached, commit, clear, force; enum ip_conntrack_info ctinfo; struct tcf_ct *c = to_ct(a); struct nf_conn *tmpl = NULL; struct nf_hook_state state; + bool cached, commit, clear; int nh_ofs, err, retval; struct tcf_ct_params *p; + bool add_helper = false; bool skip_add = false; bool defrag = false; struct nf_conn *ct; @@ -1043,7 +1060,6 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, retval = READ_ONCE(c->tcf_action); commit = p->ct_action & TCA_CT_ACT_COMMIT; clear = p->ct_action & TCA_CT_ACT_CLEAR; - force = p->ct_action & TCA_CT_ACT_FORCE; tmpl = p->tmpl; tcf_lastuse_update(&c->tcf_tm); @@ -1086,7 +1102,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, * actually run the packet through conntrack twice unless it's for a * different zone. */ - cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); + cached = tcf_ct_skb_nfct_cached(net, skb, p); if (!cached) { if (tcf_ct_flow_table_lookup(p, skb, family)) { skip_add = true; @@ -1119,6 +1135,22 @@ do_nat: if (err != NF_ACCEPT) goto drop; + if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) { + err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC); + if (err) + goto drop; + add_helper = true; + if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) { + if (!nfct_seqadj_ext_add(ct)) + goto drop; + } + } + + if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) { + if (nf_ct_helper(skb, ct, ctinfo, family) != NF_ACCEPT) + goto drop; + } + if (commit) { tcf_ct_act_set_mark(ct, p->mark, p->mark_mask); tcf_ct_act_set_labels(ct, p->labels, p->labels_mask); @@ -1167,6 +1199,9 @@ static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 }, [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 }, + [TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN }, + [TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 }, + [TCA_CT_HELPER_PROTO] = { .type = NLA_U8 }, }; static int tcf_ct_fill_params_nat(struct tcf_ct_params *p, @@ -1256,8 +1291,9 @@ static int tcf_ct_fill_params(struct net *net, { struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); struct nf_conntrack_zone zone; + int err, family, proto, len; struct nf_conn *tmpl; - int err; + char *name; p->zone = NF_CT_DEFAULT_ZONE_ID; @@ -1318,10 +1354,31 @@ static int tcf_ct_fill_params(struct net *net, NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template"); return -ENOMEM; } - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); p->tmpl = tmpl; + if (tb[TCA_CT_HELPER_NAME]) { + name = nla_data(tb[TCA_CT_HELPER_NAME]); + len = nla_len(tb[TCA_CT_HELPER_NAME]); + if (len > 16 || name[len - 1] != '\0') { + NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name."); + err = -EINVAL; + goto err; + } + family = tb[TCA_CT_HELPER_FAMILY] ? nla_get_u8(tb[TCA_CT_HELPER_FAMILY]) : AF_INET; + proto = tb[TCA_CT_HELPER_PROTO] ? nla_get_u8(tb[TCA_CT_HELPER_PROTO]) : IPPROTO_TCP; + err = nf_ct_add_helper(tmpl, name, family, proto, + p->ct_action & TCA_CT_ACT_NAT, &p->helper); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to add helper"); + goto err; + } + } + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); return 0; +err: + nf_ct_put(p->tmpl); + p->tmpl = NULL; + return err; } static int tcf_ct_init(struct net *net, struct nlattr *nla, @@ -1490,6 +1547,19 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) return 0; } +static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper) +{ + if (!helper) + return 0; + + if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) || + nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) || + nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum)) + return -1; + + return 0; +} + static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -1542,6 +1612,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, if (tcf_ct_dump_nat(skb, p)) goto nla_put_failure; + if (tcf_ct_dump_helper(skb, p->helper)) + goto nla_put_failure; + skip_dump: if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; -- cgit v1.2.3 From 9bb053490f1a5a0914eb9f7b4116a0e4a95d4f8e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 7 Nov 2022 15:04:18 -0800 Subject: bpf: Add hwtstamp field for the sockops prog The bpf-tc prog has already been able to access the skb_hwtstamps(skb)->hwtstamp. This patch extends the same hwtstamp access to the sockops prog. In sockops, the skb is also available to the bpf prog during the BPF_SOCK_OPS_PARSE_HDR_OPT_CB event. There is a use case that the hwtstamp will be useful to the sockops prog to better measure the one-way-delay when the sender has put the tx timestamp in the tcp header option. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221107230420.4192307-2-martin.lau@linux.dev --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 39 +++++++++++++++++++++++++++++++-------- tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 33 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94659f6b3395..fb4c911d2a03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6445,6 +6445,7 @@ struct bpf_sock_ops { * the outgoing header has not * been written yet. */ + __u64 skb_hwtstamp; }; /* Definitions for bpf_sock_ops_cb_flags */ diff --git a/net/core/filter.c b/net/core/filter.c index cb3b635e35be..cd667cdbdb26 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8925,6 +8925,10 @@ static bool sock_ops_is_valid_access(int off, int size, bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sock_ops, skb_hwtstamp): + if (size != sizeof(__u64)) + return false; + break; default: if (size != size_default) return false; @@ -9108,21 +9112,21 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, return insn; } -static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, +static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, struct bpf_insn *insn) { /* si->dst_reg = skb_shinfo(SKB); */ #ifdef NET_SKBUFF_DATA_USES_OFFSET *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - BPF_REG_AX, si->src_reg, + BPF_REG_AX, skb_reg, offsetof(struct sk_buff, end)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), - si->dst_reg, si->src_reg, + dst_reg, skb_reg, offsetof(struct sk_buff, head)); - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX); #else *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - si->dst_reg, si->src_reg, + dst_reg, skb_reg, offsetof(struct sk_buff, end)); #endif @@ -9515,7 +9519,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, gso_segs): - insn = bpf_convert_shinfo_access(si, insn); + insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, @@ -9523,7 +9527,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, target_size)); break; case offsetof(struct __sk_buff, gso_size): - insn = bpf_convert_shinfo_access(si, insn); + insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, @@ -9550,7 +9554,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); - insn = bpf_convert_shinfo_access(si, insn); + insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, @@ -10400,6 +10404,25 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, tcp_flags), si->dst_reg, si->dst_reg, off); break; + case offsetof(struct bpf_sock_ops, skb_hwtstamp): { + struct bpf_insn *jmp_on_null_skb; + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + /* Reserve one insn to test skb == NULL */ + jmp_on_null_skb = insn++; + insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + hwtstamps, 8, + target_size)); + *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, + insn - jmp_on_null_skb - 1); + break; + } } return insn - insn_buf; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 94659f6b3395..fb4c911d2a03 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6445,6 +6445,7 @@ struct bpf_sock_ops { * the outgoing header has not * been written yet. */ + __u64 skb_hwtstamp; }; /* Definitions for bpf_sock_ops_cb_flags */ -- cgit v1.2.3 From f0c5941ff5b255413d31425bb327c2aec3625673 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:25 +0530 Subject: bpf: Support bpf_list_head in map values Add the support on the map side to parse, recognize, verify, and build metadata table for a new special field of the type struct bpf_list_head. To parameterize the bpf_list_head for a certain value type and the list_node member it will accept in that value type, we use BTF declaration tags. The definition of bpf_list_head in a map value will be done as follows: struct foo { struct bpf_list_node node; int data; }; struct map_value { struct bpf_list_head head __contains(foo, node); }; Then, the bpf_list_head only allows adding to the list 'head' using the bpf_list_node 'node' for the type struct foo. The 'contains' annotation is a BTF declaration tag composed of four parts, "contains:name:node" where the name is then used to look up the type in the map BTF, with its kind hardcoded to BTF_KIND_STRUCT during the lookup. The node defines name of the member in this type that has the type struct bpf_list_node, which is actually used for linking into the linked list. For now, 'kind' part is hardcoded as struct. This allows building intrusive linked lists in BPF, using container_of to obtain pointer to entry, while being completely type safe from the perspective of the verifier. The verifier knows exactly the type of the nodes, and knows that list helpers return that type at some fixed offset where the bpf_list_node member used for this list exists. The verifier also uses this information to disallow adding types that are not accepted by a certain list. For now, no elements can be added to such lists. Support for that is coming in future patches, hence draining and freeing items is done with a TODO that will be resolved in a future patch. Note that the bpf_list_head_free function moves the list out to a local variable under the lock and releases it, doing the actual draining of the list items outside the lock. While this helps with not holding the lock for too long pessimizing other concurrent list operations, it is also necessary for deadlock prevention: unless every function called in the critical section would be notrace, a fentry/fexit program could attach and call bpf_map_update_elem again on the map, leading to the same lock being acquired if the key matches and lead to a deadlock. While this requires some special effort on part of the BPF programmer to trigger and is highly unlikely to occur in practice, it is always better if we can avoid such a condition. While notrace would prevent this, doing the draining outside the lock has advantages of its own, hence it is used to also fix the deadlock related problem. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 +++++ include/uapi/linux/bpf.h | 10 +++ kernel/bpf/btf.c | 145 ++++++++++++++++++++++++++++++++++++++++- kernel/bpf/helpers.c | 32 +++++++++ kernel/bpf/syscall.c | 22 ++++++- kernel/bpf/verifier.c | 7 ++ tools/include/uapi/linux/bpf.h | 10 +++ 7 files changed, 239 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f08eb2d27de0..05f98e9e5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -175,6 +175,7 @@ enum btf_field_type { BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, + BPF_LIST_HEAD = (1 << 4), }; struct btf_field_kptr { @@ -184,11 +185,18 @@ struct btf_field_kptr { u32 btf_id; }; +struct btf_field_list_head { + struct btf *btf; + u32 value_btf_id; + u32 node_offset; +}; + struct btf_field { u32 offset; enum btf_field_type type; union { struct btf_field_kptr kptr; + struct btf_field_list_head list_head; }; }; @@ -266,6 +274,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; + case BPF_LIST_HEAD: + return "bpf_list_head"; default: WARN_ON_ONCE(1); return "unknown"; @@ -282,6 +292,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); + case BPF_LIST_HEAD: + return sizeof(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -298,6 +310,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); + case BPF_LIST_HEAD: + return __alignof__(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -403,6 +417,9 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock); + int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fb4c911d2a03..6580448e9f77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6888,6 +6888,16 @@ struct bpf_dynptr { __u64 :64; } __attribute__((aligned(8))); +struct bpf_list_head { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 12361d7b2498..c0d73d71c539 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3205,9 +3205,15 @@ enum { struct btf_field_info { enum btf_field_type type; u32 off; - struct { - u32 type_id; - } kptr; + union { + struct { + u32 type_id; + } kptr; + struct { + const char *node_name; + u32 value_btf_id; + } list_head; + }; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, @@ -3261,6 +3267,63 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, return BTF_FIELD_FOUND; } +static const char *btf_find_decl_tag_value(const struct btf *btf, + const struct btf_type *pt, + int comp_idx, const char *tag_key) +{ + int i; + + for (i = 1; i < btf_nr_types(btf); i++) { + const struct btf_type *t = btf_type_by_id(btf, i); + int len = strlen(tag_key); + + if (!btf_type_is_decl_tag(t)) + continue; + if (pt != btf_type_by_id(btf, t->type) || + btf_type_decl_tag(t)->component_idx != comp_idx) + continue; + if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) + continue; + return __btf_name_by_offset(btf, t->name_off) + len; + } + return NULL; +} + +static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, + const struct btf_type *t, int comp_idx, + u32 off, int sz, struct btf_field_info *info) +{ + const char *value_type; + const char *list_node; + s32 id; + + if (!__btf_type_is_struct(t)) + return BTF_FIELD_IGNORE; + if (t->size != sz) + return BTF_FIELD_IGNORE; + value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); + if (!value_type) + return -EINVAL; + list_node = strstr(value_type, ":"); + if (!list_node) + return -EINVAL; + value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN); + if (!value_type) + return -ENOMEM; + id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); + kfree(value_type); + if (id < 0) + return id; + list_node++; + if (str_is_empty(list_node)) + return -EINVAL; + info->type = BPF_LIST_HEAD; + info->off = off; + info->list_head.value_btf_id = id; + info->list_head.node_name = list_node; + return BTF_FIELD_FOUND; +} + static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, int *align, int *sz) { @@ -3284,6 +3347,12 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } + if (field_mask & BPF_LIST_HEAD) { + if (!strcmp(name, "bpf_list_head")) { + type = BPF_LIST_HEAD; + goto end; + } + } /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & BPF_KPTR) { type = BPF_KPTR_REF; @@ -3339,6 +3408,12 @@ static int btf_find_struct_field(const struct btf *btf, if (ret < 0) return ret; break; + case BPF_LIST_HEAD: + ret = btf_find_list_head(btf, t, member_type, i, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } @@ -3393,6 +3468,12 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, if (ret < 0) return ret; break; + case BPF_LIST_HEAD: + ret = btf_find_list_head(btf, var, var_type, -1, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } @@ -3491,11 +3572,52 @@ end_btf: return ret; } +static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) +{ + const struct btf_type *t, *n = NULL; + const struct btf_member *member; + u32 offset; + int i; + + t = btf_type_by_id(btf, info->list_head.value_btf_id); + /* We've already checked that value_btf_id is a struct type. We + * just need to figure out the offset of the list_node, and + * verify its type. + */ + for_each_member(i, t, member) { + if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off))) + continue; + /* Invalid BTF, two members with same name */ + if (n) + return -EINVAL; + n = btf_type_by_id(btf, member->type); + if (!__btf_type_is_struct(n)) + return -EINVAL; + if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off))) + return -EINVAL; + offset = __btf_member_bit_offset(n, member); + if (offset % 8) + return -EINVAL; + offset /= 8; + if (offset % __alignof__(struct bpf_list_node)) + return -EINVAL; + + field->list_head.btf = (struct btf *)btf; + field->list_head.value_btf_id = info->list_head.value_btf_id; + field->list_head.node_offset = offset; + } + if (!n) + return -ENOENT; + return 0; +} + struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size) { struct btf_field_info info_arr[BTF_FIELDS_MAX]; struct btf_record *rec; + u32 next_off = 0; int ret, i, cnt; ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); @@ -3517,6 +3639,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type ret = -EFAULT; goto end; } + if (info_arr[i].off < next_off) { + ret = -EEXIST; + goto end; + } + next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type); rec->field_mask |= info_arr[i].type; rec->fields[i].offset = info_arr[i].off; @@ -3539,12 +3666,24 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type if (ret < 0) goto end; break; + case BPF_LIST_HEAD: + ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; default: ret = -EFAULT; goto end; } rec->cnt++; } + + /* bpf_list_head requires bpf_spin_lock */ + if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) { + ret = -EINVAL; + goto end; + } + return rec; end: btf_record_free(rec); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 283f55bbeb70..7bc71995f17c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1706,6 +1706,38 @@ bpf_base_func_proto(enum bpf_func_id func_id) } } +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock) +{ + struct list_head *head = list_head, *orig_head = list_head; + + BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); + BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); + + /* Do the actual list draining outside the lock to not hold the lock for + * too long, and also prevent deadlocks if tracing programs end up + * executing on entry/exit of functions called inside the critical + * section, and end up doing map ops that call bpf_list_head_free for + * the same map value again. + */ + __bpf_spin_lock_irqsave(spin_lock); + if (!head->next || list_empty(head)) + goto unlock; + head = head->next; +unlock: + INIT_LIST_HEAD(orig_head); + __bpf_spin_unlock_irqrestore(spin_lock); + + while (head != orig_head) { + void *obj = head; + + obj -= field->list_head.node_offset; + head = head->next; + /* TODO: Rework later */ + kfree(obj); + } +} + BTF_SET8_START(tracing_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85532d301124..fdbae52f463f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -536,6 +536,9 @@ void btf_record_free(struct btf_record *rec) module_put(rec->fields[i].kptr.module); btf_put(rec->fields[i].kptr.btf); break; + case BPF_LIST_HEAD: + /* Nothing to release for bpf_list_head */ + break; default: WARN_ON_ONCE(1); continue; @@ -578,6 +581,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) goto free; } break; + case BPF_LIST_HEAD: + /* Nothing to acquire for bpf_list_head */ + break; default: ret = -EFAULT; WARN_ON_ONCE(1); @@ -637,6 +643,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_KPTR_REF: field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); break; + case BPF_LIST_HEAD: + if (WARN_ON_ONCE(rec->spin_lock_off < 0)) + continue; + bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); + break; default: WARN_ON_ONCE(1); continue; @@ -965,7 +976,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; - map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR, + map->record = btf_parse_fields(btf, value_type, + BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1012,6 +1024,14 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, goto free_map_tab; } break; + case BPF_LIST_HEAD: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; default: /* Fail if map_type checks are missing for a field type */ ret = -EOPNOTSUPP; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 07c0259dfc1a..a50018e2d4a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12814,6 +12814,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (btf_record_has_field(map->record, BPF_LIST_HEAD)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_list_head yet\n"); + return -EINVAL; + } + } + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fb4c911d2a03..6580448e9f77 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6888,6 +6888,16 @@ struct bpf_dynptr { __u64 :64; } __attribute__((aligned(8))); +struct bpf_list_head { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3 From 8daa8fde3fc3f069ff0b5c87079a5c1df7743113 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 14 Oct 2022 23:45:59 +0200 Subject: netfilter: nf_tables: Introduce NFT_MSG_GETRULE_RESET Analogous to NFT_MSG_GETOBJ_RESET, but for rules: Reset stateful expressions like counters or quotas. The latter two are the only consumers, adjust their 'dump' callbacks to respect the parameter introduced earlier. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 49 +++++++++++++++++++++----------- net/netfilter/nft_counter.c | 2 +- net/netfilter/nft_dynset.c | 4 +-- net/netfilter/nft_inner.c | 2 +- net/netfilter/nft_quota.c | 2 +- 7 files changed, 41 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c557a57fb0f1..e69ce23566ea 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -383,7 +383,7 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e4b739d57480..cfa844da1ce6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -97,6 +97,7 @@ enum nft_verdicts { * @NFT_MSG_NEWFLOWTABLE: add new flow table (enum nft_flowtable_attributes) * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes) * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETRULE_RESET: get rules and reset stateful expressions (enum nft_obj_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -124,6 +125,7 @@ enum nf_tables_msg_types { NFT_MSG_NEWFLOWTABLE, NFT_MSG_GETFLOWTABLE, NFT_MSG_DELFLOWTABLE, + NFT_MSG_GETRULE_RESET, NFT_MSG_MAX, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 741a0e386406..80e613405f6f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2759,7 +2759,7 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { }; static int nf_tables_fill_expr_info(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) goto nla_put_failure; @@ -2769,7 +2769,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb, NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; - if (expr->ops->dump(skb, expr, false) < 0) + if (expr->ops->dump(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, data); } @@ -2781,14 +2781,14 @@ nla_put_failure: }; int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; - if (nf_tables_fill_expr_info(skb, expr) < 0) + if (nf_tables_fill_expr_info(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; @@ -3034,7 +3034,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule, u64 handle) + const struct nft_rule *rule, u64 handle, + bool reset) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; @@ -3067,7 +3068,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, if (list == NULL) goto nla_put_failure; nft_rule_for_each_expr(expr, next, rule) { - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, list); @@ -3118,7 +3119,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, - ctx->chain, rule, handle); + ctx->chain, rule, handle, false); if (err < 0) { kfree_skb(skb); goto err; @@ -3139,7 +3140,8 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, unsigned int *idx, struct netlink_callback *cb, const struct nft_table *table, - const struct nft_chain *chain) + const struct nft_chain *chain, + bool reset) { struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; @@ -3166,7 +3168,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, handle) < 0) + table, chain, rule, handle, reset) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -3189,6 +3191,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; + bool reset = false; + + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; rcu_read_lock(); nft_net = nft_pernet(net); @@ -3213,14 +3219,15 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (!nft_is_active(net, chain)) continue; __nf_tables_dump_rules(skb, &idx, - cb, table, chain); + cb, table, chain, reset); break; } goto done; } list_for_each_entry_rcu(chain, &table->chains, list) { - if (__nf_tables_dump_rules(skb, &idx, cb, table, chain)) + if (__nf_tables_dump_rules(skb, &idx, + cb, table, chain, reset)) goto done; } @@ -3291,6 +3298,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; + bool reset = false; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { @@ -3327,9 +3335,12 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, if (!skb2) return -ENOMEM; + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; + err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule, 0); + family, table, chain, rule, 0, reset); if (err < 0) goto err_fill_rule_info; @@ -4104,7 +4115,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); - if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0) + if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) goto nla_put_failure; nla_nest_end(skb, nest); @@ -4115,7 +4126,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, for (i = 0; i < set->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, - set->exprs[i]) < 0) + set->exprs[i], false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -4946,7 +4957,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); - if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0) + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0) return -1; return 0; @@ -4957,7 +4968,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -8311,6 +8322,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, + [NFT_MSG_GETRULE_RESET] = { + .call = nf_tables_getrule, + .type = NFNL_CB_RCU, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, [NFT_MSG_DELRULE] = { .call = nf_tables_delrule, .type = NFNL_CB_BATCH, diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 06482fb9c145..dccc68a5135a 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -206,7 +206,7 @@ static int nft_counter_dump(struct sk_buff *skb, { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - return nft_counter_do_dump(skb, priv, false); + return nft_counter_do_dump(skb, priv, reset); } static int nft_counter_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 01c61e090639..274579b1696e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -380,7 +380,7 @@ static int nft_dynset_dump(struct sk_buff *skb, if (priv->set->num_exprs == 0) { if (priv->num_exprs == 1) { if (nft_expr_dump(skb, NFTA_DYNSET_EXPR, - priv->expr_array[0])) + priv->expr_array[0], reset)) goto nla_put_failure; } else if (priv->num_exprs > 1) { struct nlattr *nest; @@ -391,7 +391,7 @@ static int nft_dynset_dump(struct sk_buff *skb, for (i = 0; i < priv->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, - priv->expr_array[i])) + priv->expr_array[i], reset)) goto nla_put_failure; } nla_nest_end(skb, nest); diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 6d96b826db4e..28e2873ba24e 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -359,7 +359,7 @@ static int nft_inner_dump(struct sk_buff *skb, goto nla_put_failure; if (nft_expr_dump(skb, NFTA_INNER_EXPR, - (struct nft_expr *)&priv->expr) < 0) + (struct nft_expr *)&priv->expr, reset) < 0) goto nla_put_failure; return 0; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index b1a1217bca4c..123578e28917 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -222,7 +222,7 @@ static int nft_quota_dump(struct sk_buff *skb, { struct nft_quota *priv = nft_expr_priv(expr); - return nft_quota_do_dump(skb, priv, false); + return nft_quota_do_dump(skb, priv, reset); } static void nft_quota_destroy(const struct nft_ctx *ctx, -- cgit v1.2.3 From 32637e33003f36e75e9147788cc0e2f21706ef99 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 8 Nov 2022 15:06:00 +0100 Subject: bpf: Expand map key argument of bpf_redirect_map to u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For queueing packets in XDP we want to add a new redirect map type with support for 64-bit indexes. To prepare fore this, expand the width of the 'key' argument to the bpf_redirect_map() helper. Since BPF registers are always 64-bit, this should be safe to do after the fact. Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-3-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/filter.h | 12 ++++++------ include/uapi/linux/bpf.h | 2 +- kernel/bpf/cpumap.c | 4 ++-- kernel/bpf/devmap.c | 4 ++-- kernel/bpf/verifier.c | 2 +- net/core/filter.c | 4 ++-- net/xdp/xskmap.c | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 49f9d2bec401..54462dd28824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -135,7 +135,7 @@ struct bpf_map_ops { struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ - int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure diff --git a/include/linux/filter.h b/include/linux/filter.h index 787d35dbf5b0..bf701976056e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -643,13 +643,13 @@ struct bpf_nh_params { }; struct bpf_redirect_info { - u32 flags; - u32 tgt_index; + u64 tgt_index; void *tgt_value; struct bpf_map *map; + u32 flags; + u32 kern_flags; u32 map_id; enum bpf_map_type map_type; - u32 kern_flags; struct bpf_nh_params nh; }; @@ -1504,7 +1504,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { @@ -1515,7 +1515,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; - ri->tgt_value = lookup_elem(map, ifindex); + ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program @@ -1527,7 +1527,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind return flags & action_mask; } - ri->tgt_index = ifindex; + ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6580448e9f77..ab86145df760 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2647,7 +2647,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 6b6a78c04b90..e0b2d016f0bf 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -667,9 +667,9 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f9a87dcc5535..d01e4c55b376 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -992,14 +992,14 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } -static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } -static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e74f460dfd0..be24774961ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14384,7 +14384,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, - (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, (int (*)(struct bpf_map *map, bpf_callback_t callback_fn, diff --git a/net/core/filter.c b/net/core/filter.c index 37fad5a9b752..754dd01354d8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4414,10 +4414,10 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, u64, flags) { - return map->ops->map_redirect(map, ifindex, flags); + return map->ops->map_redirect(map, key, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index acc8e52a4f5f..771d0fa90ef5 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -231,9 +231,9 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + return __bpf_xdp_redirect_map(map, index, flags, 0, __xsk_map_lookup_elem); } -- cgit v1.2.3 From f20a0a0519f35a3a34236ed2d38b067c5cb22b9f Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 17 Nov 2022 02:18:28 +0900 Subject: ethtool: doc: clarify what drivers can implement in their get_drvinfo() Many of the drivers which implement ethtool_ops::get_drvinfo() will prints the .driver, .version or .bus_info of struct ethtool_drvinfo. To have a glance of current state, do: $ git grep -W "get_drvinfo(struct" Printing in those three fields is useless because: - since [1], the driver version should be the kernel version (at least for upstream drivers). Arguably, out of tree drivers might still want to set a custom version, but out of tree is not our focus. - since [2], the core is able to provide default values for .driver and .bus_info. In summary, drivers may provide .fw_version and .erom_version, the rest is expected to be done by the core. In struct ethtool_ops doc from linux/ethtool: rephrase field get_drvinfo() doc to discourage developers from implementing this callback. In struct ethtool_drvinfo doc from uapi/linux/ethtool.h: remove the paragraph mentioning what drivers should do. Rationale: no need to repeat what is already written in struct ethtool_ops doc. But add a note that .fw_version and .erom_version are driver defined. Also update the dummy driver and simply remove the callback in order not to confuse the newcomers: most of the drivers will not need this callback function any more. [1] commit 6a7e25c7fb48 ("net/core: Replace driver version to be kernel version") Link: https://git.kernel.org/torvalds/linux/c/6a7e25c7fb48 [2] commit edaf5df22cb8 ("ethtool: ethtool_get_drvinfo: populate drvinfo fields even if callback exits") Link: https://git.kernel.org/netdev/net-next/c/edaf5df22cb8 Reviewed-by: Leon Romanovsky Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20221116171828.4093-1-mailhol.vincent@wanadoo.fr Signed-off-by: Jakub Kicinski --- drivers/net/dummy.c | 7 ------- include/linux/ethtool.h | 8 ++++---- include/uapi/linux/ethtool.h | 10 ++++------ 3 files changed, 8 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index aa0fc00faecb..c4b1b0aa438a 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -99,14 +99,7 @@ static const struct net_device_ops dummy_netdev_ops = { .ndo_change_carrier = dummy_change_carrier, }; -static void dummy_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *info) -{ - strscpy(info->driver, DRV_NAME, sizeof(info->driver)); -} - static const struct ethtool_ops dummy_ethtool_ops = { - .get_drvinfo = dummy_get_drvinfo, .get_ts_info = ethtool_op_get_ts_info, }; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 5c51c7fda32a..9e0a76fc7de9 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -473,10 +473,10 @@ struct ethtool_module_power_mode_params { * parameter. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. - * @get_drvinfo: Report driver/device information. Should only set the - * @driver, @version, @fw_version and @bus_info fields. If not - * implemented, the @driver and @bus_info fields will be filled in - * according to the netdev's parent device. + * @get_drvinfo: Report driver/device information. Modern drivers no + * longer have to implement this callback. Most fields are + * correctly filled in by the core using system information, or + * populated using other driver operations. * @get_regs_len: Get buffer length required for @get_regs * @get_regs: Get device registers * @get_wol: Report whether Wake-on-Lan is enabled diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f341de2ae612..58e587ba0450 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -159,8 +159,10 @@ static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) * in its bus driver structure (e.g. pci_driver::name). Must * not be an empty string. * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; may be an empty string - * @erom_version: Expansion ROM version string; may be an empty string + * @fw_version: Firmware version string; driver defined; may be an + * empty string + * @erom_version: Expansion ROM version string; driver defined; may be + * an empty string * @bus_info: Device bus address. This should match the dev_name() * string for the underlying bus device, if there is one. May be * an empty string. @@ -179,10 +181,6 @@ static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) * * Users can use the %ETHTOOL_GSSET_INFO command to get the number of * strings in any string set (from Linux 2.6.34). - * - * Drivers should set at most @driver, @version, @fw_version and - * @bus_info in their get_drvinfo() implementation. The ethtool - * core fills in the other fields using other driver operations. */ struct ethtool_drvinfo { __u32 cmd; -- cgit v1.2.3 From cd502236835b678738810ecd501c85a3a7a11150 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:15 +0100 Subject: devlink: Introduce new attribute 'tx_priority' to devlink-rate To fully utilize offload capabilities of Intel 100G card QoS capabilities new attribute 'tx_priority' needs to be introduced. This attribute allows for usage of strict priority arbiter among siblings. This arbitration scheme attempts to schedule nodes based on their priority as long as the nodes remain within their bandwidth limit. Introduce new attribute in devlink-rate that will allow for configuration of strict priority. New attribute is optional. Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 6 ++++++ include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 611a23a3deb2..90d59d673cb1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -114,6 +114,8 @@ struct devlink_rate { refcount_t refcnt; }; }; + + u32 tx_priority; }; struct devlink_port { @@ -1511,10 +1513,14 @@ struct devlink_ops { u64 tx_share, struct netlink_ext_ack *extack); int (*rate_leaf_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_node_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 2f24b53a87a5..1a9214d35ef5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -607,6 +607,7 @@ enum devlink_attr { DEVLINK_ATTR_SELFTESTS, /* nested */ + DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 7f789bbcbbd7..bf6d3a3c28bb 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1203,6 +1203,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, devlink_rate->tx_max, DEVLINK_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, + devlink_rate->tx_priority)) + goto nla_put_failure; if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) @@ -1936,6 +1939,7 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, { struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; + u32 priority; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { @@ -1964,6 +1968,20 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, devlink_rate->tx_max = rate; } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { + priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, + priority, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, + priority, info->extack); + + if (err) + return err; + devlink_rate->tx_priority = priority; + } + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, @@ -1995,6 +2013,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], + "TX priority set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); @@ -2009,6 +2033,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], + "TX priority set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; @@ -9187,6 +9217,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, }; static const struct genl_small_ops devlink_nl_ops[] = { -- cgit v1.2.3 From 6e2d7e84fcfee62d70e62aa9e469d10b8b6a7dc7 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:16 +0100 Subject: devlink: Introduce new attribute 'tx_weight' to devlink-rate To fully utilize offload capabilities of Intel 100G card QoS capabilities new attribute 'tx_weight' needs to be introduced. This attribute allows for usage of Weighted Fair Queuing arbitration scheme among siblings. This arbitration scheme can be used simultaneously with the strict priority. Introduce new attribute in devlink-rate that will allow for configuration of Weighted Fair Queueing. New attribute is optional. Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 5 +++++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 90d59d673cb1..366b23d3f973 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -116,6 +116,7 @@ struct devlink_rate { }; u32 tx_priority; + u32 tx_weight; }; struct devlink_port { @@ -1515,12 +1516,16 @@ struct devlink_ops { u64 tx_max, struct netlink_ext_ack *extack); int (*rate_leaf_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); int (*rate_node_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1a9214d35ef5..498d0d5d0957 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -608,6 +608,8 @@ enum devlink_attr { DEVLINK_ATTR_SELFTESTS, /* nested */ DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ + DEVLINK_ATTR_RATE_TX_WEIGHT, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index bf6d3a3c28bb..525bdf426163 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1206,6 +1206,11 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, devlink_rate->tx_priority)) goto nla_put_failure; + + if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, + devlink_rate->tx_weight)) + goto nla_put_failure; + if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) @@ -1940,6 +1945,7 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u32 priority; + u32 weight; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { @@ -1982,6 +1988,20 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, devlink_rate->tx_priority = priority; } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { + weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, + weight, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, + weight, info->extack); + + if (err) + return err; + devlink_rate->tx_weight = weight; + } + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, @@ -2019,6 +2039,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX priority set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], + "TX weight set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); @@ -2039,6 +2065,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX priority set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], + "TX weight set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; @@ -9218,6 +9250,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, }; static const struct genl_small_ops devlink_nl_ops[] = { -- cgit v1.2.3 From c73a72f4cbb47672c8cc7f7d7aba52f1cb15baca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Nov 2022 19:39:03 -0800 Subject: netlink: remove the flex array from struct nlmsghdr I've added a flex array to struct nlmsghdr in commit 738136a0e375 ("netlink: split up copies in the ack construction") to allow accessing the data easily. It leads to warnings with clang, if user space wraps this structure into another struct and the flex array is not at the end of the container. Reviewed-by: Kees Cook Reviewed-by: David Ahern Link: https://lore.kernel.org/all/20221114023927.GA685@u2004-local/ Link: https://lore.kernel.org/r/20221118033903.1651026-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/netlink.h | 2 -- net/netlink/af_netlink.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 5da0da59bf01..e2ae82e3f9f7 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -48,7 +48,6 @@ struct sockaddr_nl { * @nlmsg_flags: Additional flags * @nlmsg_seq: Sequence number * @nlmsg_pid: Sending process port ID - * @nlmsg_data: Message payload */ struct nlmsghdr { __u32 nlmsg_len; @@ -56,7 +55,6 @@ struct nlmsghdr { __u16 nlmsg_flags; __u32 nlmsg_seq; __u32 nlmsg_pid; - __u8 nlmsg_data[]; }; /* Flags values */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9ebdf3262015..d73091f6bb0f 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2514,7 +2514,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, if (!nlmsg_append(skb, nlmsg_len(nlh))) goto err_bad_put; - memcpy(errmsg->msg.nlmsg_data, nlh->nlmsg_data, + memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh), nlmsg_len(nlh)); } -- cgit v1.2.3 From 72b43bde38de4aa05e6a7fa12d7965f48180deb6 Mon Sep 17 00:00:00 2001 From: Ji Rongfeng Date: Fri, 18 Nov 2022 16:18:18 +0800 Subject: bpf: Update bpf_{g,s}etsockopt() documentation * append missing optnames to the end * simplify bpf_getsockopt()'s doc Signed-off-by: Ji Rongfeng Link: https://lore.kernel.org/r/DU0P192MB15479B86200B1216EC90E162D6099@DU0P192MB1547.EURP192.PROD.OUTLOOK.COM Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 21 ++++++++++++--------- tools/include/uapi/linux/bpf.h | 23 +++++++++++++---------- 2 files changed, 25 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ab86145df760..f89de51a45db 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2584,14 +2584,19 @@ union bpf_attr { * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, - * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. * Return * 0 on success, or a negative error in case of failure. * @@ -2808,12 +2813,10 @@ union bpf_attr { * and **BPF_CGROUP_INET6_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. - * It supports the following *level*\ s: - * - * * **IPPROTO_TCP**, which supports *optname* - * **TCP_CONGESTION**. - * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. * Return * 0 on success, or a negative error in case of failure. * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6580448e9f77..f89de51a45db 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2584,14 +2584,19 @@ union bpf_attr { * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, - * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. * Return * 0 on success, or a negative error in case of failure. * @@ -2647,7 +2652,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain @@ -2808,12 +2813,10 @@ union bpf_attr { * and **BPF_CGROUP_INET6_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. - * It supports the following *level*\ s: - * - * * **IPPROTO_TCP**, which supports *optname* - * **TCP_CONGESTION**. - * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. * Return * 0 on success, or a negative error in case of failure. * -- cgit v1.2.3 From bff3d0534804452e19c097ae6b4eb4b4d846d67f Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Fri, 4 Nov 2022 18:18:35 +0100 Subject: netfilter: conntrack: add sctp DATA_SENT state SCTP conntrack currently assumes that the SCTP endpoints will probe secondary paths using HEARTBEAT before sending traffic. But, according to RFC 9260, SCTP endpoints can send any traffic on any of the confirmed paths after SCTP association is up. SCTP endpoints that sends INIT will confirm all peer addresses that upper layer configures, and the SCTP endpoint that receives COOKIE_ECHO will only confirm the address it sent the INIT_ACK to. So, we can have a situation where the INIT sender can start to use secondary paths without the need to send HEARTBEAT. This patch allows DATA/SACK packets to create new connection tracking entry. A new state has been added to indicate that a DATA/SACK chunk has been seen in the original direction - SCTP_CONNTRACK_DATA_SENT. State transitions mostly follows the HEARTBEAT_SENT, except on receiving HEARTBEAT/HEARTBEAT_ACK/DATA/SACK in the reply direction. State transitions in original direction: - DATA_SENT behaves similar to HEARTBEAT_SENT for all chunks, except that it remains in DATA_SENT on receving HEARTBEAT, HEARTBEAT_ACK/DATA/SACK chunks State transitions in reply direction: - DATA_SENT behaves similar to HEARTBEAT_SENT for all chunks, except that it moves to HEARTBEAT_ACKED on receiving HEARTBEAT/HEARTBEAT_ACK/DATA/SACK chunks Note: This patch still doesn't solve the problem when the SCTP endpoint decides to use primary paths for association establishment but uses a secondary path for association shutdown. We still have to depend on timeout for connections to expire in such a case. Signed-off-by: Sriram Yagnaraman Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_sctp.h | 1 + include/uapi/linux/netfilter/nfnetlink_cttimeout.h | 1 + net/netfilter/nf_conntrack_proto_sctp.c | 104 ++++++++++++--------- net/netfilter/nf_conntrack_standalone.c | 8 ++ 4 files changed, 71 insertions(+), 43 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h index edc6ddab0de6..c742469afe21 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h @@ -16,6 +16,7 @@ enum sctp_conntrack { SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, SCTP_CONNTRACK_HEARTBEAT_SENT, SCTP_CONNTRACK_HEARTBEAT_ACKED, + SCTP_CONNTRACK_DATA_SENT, SCTP_CONNTRACK_MAX }; diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h index 6b20fb22717b..94e74034706d 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h @@ -95,6 +95,7 @@ enum ctattr_timeout_sctp { CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, CTA_TIMEOUT_SCTP_HEARTBEAT_SENT, CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, + CTA_TIMEOUT_SCTP_DATA_SENT, __CTA_TIMEOUT_SCTP_MAX }; #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 5a936334b517..d88b92a8ffca 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -60,6 +60,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, + [SCTP_CONNTRACK_DATA_SENT] = 30 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 @@ -74,6 +75,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED +#define sDS SCTP_CONNTRACK_DATA_SENT #define sIV SCTP_CONNTRACK_MAX /* @@ -90,15 +92,16 @@ COOKIE WAIT - We have seen an INIT chunk in the original direction, or als COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. -SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. -HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to - that of the HEARTBEAT chunk. Secondary connection is - established. +HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK/DATA/SACK in the direction + opposite to that of the HEARTBEAT/DATA chunk. Secondary connection + is established. +DATA_SENT - We have seen a DATA/SACK in a new flow. */ /* TODO @@ -112,36 +115,38 @@ cookie echoed to closed. */ /* SCTP conntrack state transitions */ -static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][12][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, -/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */ +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA, sCW}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS, sCL}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA, sCL}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}, +/* data/sack */ {sDS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS} }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, -/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL, sIV}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR, sIV}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA, sIV}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA, sIV}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA, sIV}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA, sIV}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sHA}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA}, +/* data/sack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA}, } }; @@ -253,6 +258,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir, pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; + case SCTP_CID_DATA: + case SCTP_CID_SACK: + pr_debug("SCTP_CID_DATA/SACK"); + i = 11; + break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", @@ -306,7 +316,9 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; - } else if (sch->type == SCTP_CID_HEARTBEAT) { + } else if (sch->type == SCTP_CID_HEARTBEAT || + sch->type == SCTP_CID_DATA || + sch->type == SCTP_CID_SACK) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; @@ -392,19 +404,19 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; - } - - /* Check the verification tag (Sec 8.5) */ - if (!test_bit(SCTP_CID_INIT, map) && - !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && - !test_bit(SCTP_CID_COOKIE_ECHO, map) && - !test_bit(SCTP_CID_ABORT, map) && - !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && - !test_bit(SCTP_CID_HEARTBEAT, map) && - !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && - sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); - goto out; + } else { + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, map) && + !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && + !test_bit(SCTP_CID_COOKIE_ECHO, map) && + !test_bit(SCTP_CID_ABORT, map) && + !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + !test_bit(SCTP_CID_HEARTBEAT, map) && + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && + sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out; + } } old_state = new_state = SCTP_CONNTRACK_NONE; @@ -464,6 +476,11 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } + } else if (sch->type == SCTP_CID_DATA || sch->type == SCTP_CID_SACK) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } } old_state = ct->proto.sctp.state; @@ -684,6 +701,7 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_DATA_SENT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 4ffe84c5a82c..a884c06abf09 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -602,6 +602,7 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, @@ -892,6 +893,12 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT] = { + .procname = "nf_conntrack_sctp_timeout_data_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { @@ -1036,6 +1043,7 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); XASSIGN(HEARTBEAT_ACKED, sn); + XASSIGN(DATA_SENT, sn); #undef XASSIGN #endif } -- cgit v1.2.3 From e9374524950512a1769f610a868fcdf89ea59b8e Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Tue, 22 Nov 2022 20:30:57 +0100 Subject: netfilter: ipset: Add support for new bitmask parameter Add a new parameter to complement the existing 'netmask' option. The main difference between netmask and bitmask is that bitmask takes any arbitrary ip address as input, it does not have to be a valid netmask. The name of the new parameter is 'bitmask'. This lets us mask out arbitrary bits in the ip address, for example: ipset create set1 hash:ip bitmask 255.128.255.0 ipset create set2 hash:ip,port family inet6 bitmask ffff::ff80 Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 10 ++++ include/uapi/linux/netfilter/ipset/ip_set.h | 2 + net/netfilter/ipset/ip_set_hash_gen.h | 71 +++++++++++++++++++++++++---- net/netfilter/ipset/ip_set_hash_ip.c | 19 ++++---- net/netfilter/ipset/ip_set_hash_ipport.c | 24 +++++++++- net/netfilter/ipset/ip_set_hash_netnet.c | 26 +++++++++-- 6 files changed, 126 insertions(+), 26 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ada1296c87d5..ab934ad951a8 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -515,6 +515,16 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, *skbinfo = ext->skbinfo; } +static inline void +nf_inet_addr_mask_inplace(union nf_inet_addr *a1, + const union nf_inet_addr *mask) +{ + a1->all[0] &= mask->all[0]; + a1->all[1] &= mask->all[1]; + a1->all[2] &= mask->all[2]; + a1->all[3] &= mask->all[3]; +} + #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 79e5d68b87af..333807efd32b 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -85,6 +85,7 @@ enum { IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO, /* 9 */ IPSET_ATTR_MARK, /* 10 */ IPSET_ATTR_MARKMASK, /* 11 */ + IPSET_ATTR_BITMASK, /* 12 */ /* Reserve empty slots */ IPSET_ATTR_CADT_MAX = 16, /* Create-only specific attributes */ @@ -153,6 +154,7 @@ enum ipset_errno { IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, IPSET_ERR_SKBINFO, + IPSET_ERR_BITMASK_NETMASK_EXCL, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 3adc291d9ce1..fdb5225a3e5c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -159,6 +159,17 @@ htable_size(u8 hbits) (SET_WITH_TIMEOUT(set) && \ ip_set_timeout_expired(ext_timeout(d, set))) +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) +static const union nf_inet_addr onesmask = { + .all[0] = 0xffffffff, + .all[1] = 0xffffffff, + .all[2] = 0xffffffff, + .all[3] = 0xffffffff +}; + +static const union nf_inet_addr zeromask = {}; +#endif + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -283,8 +294,9 @@ struct htype { u32 markmask; /* markmask value for mark mask to store */ #endif u8 bucketsize; /* max elements in an array block */ -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) u8 netmask; /* netmask value for subnets to store */ + union nf_inet_addr bitmask; /* stores bitmask */ #endif struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ @@ -459,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) /* Resizing changes htable_bits, so we ignore it */ return x->maxelem == y->maxelem && a->timeout == b->timeout && -#ifdef IP_SET_HASH_WITH_NETMASK - x->netmask == y->netmask && +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + nf_inet_addr_cmp(&x->bitmask, &y->bitmask) && #endif #ifdef IP_SET_HASH_WITH_MARKMASK x->markmask == y->markmask && @@ -1264,9 +1276,21 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_BITMASK + /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask + * and not bitmask. These two are mutually exclusive. */ + if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) { + if (set->family == NFPROTO_IPV4) { + if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip)) + goto nla_put_failure; + } else if (set->family == NFPROTO_IPV6) { + if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6)) + goto nla_put_failure; + } + } +#endif #ifdef IP_SET_HASH_WITH_NETMASK - if (h->netmask != HOST_MASK && - nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) goto nla_put_failure; #endif #ifdef IP_SET_HASH_WITH_MARKMASK @@ -1429,8 +1453,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u32 markmask; #endif u8 hbits; -#ifdef IP_SET_HASH_WITH_NETMASK - u8 netmask; +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + int ret __attribute__((unused)) = 0; + u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + union nf_inet_addr bitmask = onesmask; #endif size_t hsize; struct htype *h; @@ -1468,7 +1494,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_NETMASK - netmask = set->family == NFPROTO_IPV4 ? 32 : 128; if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); @@ -1476,6 +1501,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, (set->family == NFPROTO_IPV6 && netmask > 128) || netmask == 0) return -IPSET_ERR_INVALID_NETMASK; + + /* we convert netmask to bitmask and store it */ + if (set->family == NFPROTO_IPV4) + bitmask.ip = ip_set_netmask(netmask); + else + ip6_netmask(&bitmask, netmask); + } +#endif + +#ifdef IP_SET_HASH_WITH_BITMASK + if (tb[IPSET_ATTR_BITMASK]) { + /* bitmask and netmask do the same thing, allow only one of these options */ + if (tb[IPSET_ATTR_NETMASK]) + return -IPSET_ERR_BITMASK_NETMASK_EXCL; + + if (set->family == NFPROTO_IPV4) { + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip); + if (ret || !bitmask.ip) + return -IPSET_ERR_INVALID_NETMASK; + } else if (set->family == NFPROTO_IPV6) { + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask); + if (ret || ipv6_addr_any(&bitmask.in6)) + return -IPSET_ERR_INVALID_NETMASK; + } + + if (nf_inet_addr_cmp(&bitmask, &zeromask)) + return -IPSET_ERR_INVALID_NETMASK; } #endif @@ -1518,7 +1570,8 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, for (i = 0; i < ahash_numof_locks(hbits); i++) spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + h->bitmask = bitmask; h->netmask = netmask; #endif #ifdef IP_SET_HASH_WITH_MARKMASK diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index dd30c03d5a23..556f74268a28 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -24,7 +24,8 @@ /* 2 Comments support */ /* 3 Forceadd support */ /* 4 skbinfo support */ -#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */ +/* 5 bucketsize, initval support */ +#define IPSET_TYPE_REV_MAX 6 /* bitmask support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -34,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ #define HTYPE hash_ip #define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -86,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); - ip &= ip_set_netmask(h->netmask); + ip &= h->bitmask.ip; if (ip == 0) return -EINVAL; @@ -119,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - ip &= ip_set_hostmask(h->netmask); + ip &= ntohl(h->bitmask.ip); e.ip = htonl(ip); if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; @@ -187,12 +189,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static void -hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip6_netmask(ip, prefix); -} - static bool hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { @@ -229,7 +225,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; @@ -268,7 +264,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; @@ -295,6 +291,7 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 7303138e46be..2ffbd0b78a8c 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -26,7 +26,8 @@ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ +/* 6 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -35,6 +36,8 @@ MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ #define HTYPE hash_ipport +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -92,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -129,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -253,12 +264,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -298,6 +314,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -356,6 +376,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3d09eefe998a..cdfb78c6e0d3 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -23,7 +23,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ /* 2 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ +/* 3 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 4 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith "); @@ -33,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net"); /* Type specific function prefix */ #define HTYPE hash_netnet #define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK #define IPSET_NET_COUNT 2 /* IPv4 variants */ @@ -153,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); - e.ip[0] &= ip_set_netmask(e.cidr[0]); - e.ip[1] &= ip_set_netmask(e.cidr[1]); + e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip); + e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -213,8 +216,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) { - e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); - e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1])); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; @@ -404,6 +407,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -414,6 +422,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + const struct hash_netnet6 *h = set->data; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -453,6 +462,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -IPSET_ERR_HASH_ELEM; + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -484,6 +498,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, -- cgit v1.2.3 From af6397c9ee2b42988c912dcad2fca1f43d5c1c99 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 28 Nov 2022 12:36:44 -0800 Subject: devlink: support directly reading from region memory To read from a region, user space must currently request a new snapshot of the region and then read from that snapshot. This can sometimes be overkill if user space only reads a tiny portion. They first create the snapshot, then request a read, then destroy the snapshot. For regions which have a single underlying "contents", it makes sense to allow supporting direct reading of the region data. Extend the DEVLINK_CMD_REGION_READ to allow direct reading from a region if requested via the new DEVLINK_ATTR_REGION_DIRECT. If this attribute is set, then perform a direct read instead of using a snapshot. Direct read is mutually exclusive with DEVLINK_ATTR_REGION_SNAPSHOT_ID, and care is taken to ensure that we reject commands which provide incorrect attributes. Regions must enable support for direct read by implementing the .read() callback function. If a region does not support such direct reads, a suitable extended error message is reported. Signed-off-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- .../networking/devlink/devlink-region.rst | 13 ++++ include/net/devlink.h | 16 +++++ include/uapi/linux/devlink.h | 2 + net/core/devlink.c | 80 +++++++++++++++++----- 4 files changed, 94 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst index f06dca9a1eb6..9232cd7da301 100644 --- a/Documentation/networking/devlink/devlink-region.rst +++ b/Documentation/networking/devlink/devlink-region.rst @@ -31,6 +31,15 @@ in its ``devlink_region_ops`` structure. If snapshot id is not set in the ``DEVLINK_CMD_REGION_NEW`` request kernel will allocate one and send the snapshot information to user space. +Regions may optionally allow directly reading from their contents without a +snapshot. Direct read requests are not atomic. In particular a read request +of size 256 bytes or larger will be split into multiple chunks. If atomic +access is required, use a snapshot. A driver wishing to enable this for a +region should implement the ``.read`` callback in the ``devlink_region_ops`` +structure. User space can request a direct read by using the +``DEVLINK_ATTR_REGION_DIRECT`` attribute instead of specifying a snapshot +id. + example usage ------------- @@ -65,6 +74,10 @@ example usage $ devlink region read pci/0000:00:05.0/fw-health snapshot 1 address 0 length 16 0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30 + # Read from the region without a snapshot + $ devlink region read pci/0000:00:05.0/fw-health address 16 length 16 + 0000000000000010 0000 0000 ffff ff04 0029 8c00 0028 8cc8 + As regions are likely very device or driver specific, no generic regions are defined. See the driver-specific documentation files for information on the specific regions a driver supports. diff --git a/include/net/devlink.h b/include/net/devlink.h index 074a79b8933f..02528f736f65 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -650,6 +650,10 @@ struct devlink_info_req; * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. * @priv: Pointer to driver private data for the region operation */ struct devlink_region_ops { @@ -659,6 +663,10 @@ struct devlink_region_ops { const struct devlink_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; @@ -670,6 +678,10 @@ struct devlink_region_ops { * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. * @priv: Pointer to driver private data for the region operation */ struct devlink_port_region_ops { @@ -679,6 +691,10 @@ struct devlink_port_region_ops { const struct devlink_port_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink_port *port, + const struct devlink_port_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 498d0d5d0957..70191d96af89 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -610,6 +610,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ DEVLINK_ATTR_RATE_TX_WEIGHT, /* u32 */ + DEVLINK_ATTR_REGION_DIRECT, /* flag */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 6c05cfaa571d..298041a44aa8 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6515,6 +6515,26 @@ devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, return 0; } +static int +devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, struct netlink_ext_ack *extack) +{ + struct devlink_region *region = cb_priv; + + return region->port_ops->read(region->port, region->port_ops, extack, + curr_offset, chunk_size, chunk); +} + +static int +devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, struct netlink_ext_ack *extack) +{ + struct devlink_region *region = cb_priv; + + return region->ops->read(region->devlink, region->ops, extack, + curr_offset, chunk_size, chunk); +} + static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -6523,12 +6543,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; struct devlink_port *port = NULL; - struct devlink_snapshot *snapshot; + devlink_chunk_fill_t *region_cb; struct devlink_region *region; const char *region_name; struct devlink *devlink; unsigned int index; - u32 snapshot_id; + void *region_cb_priv; void *hdr; int err; @@ -6546,12 +6566,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } - if (!attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { - NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); - err = -EINVAL; - goto out_unlock; - } - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); @@ -6577,12 +6591,43 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, } snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; - snapshot_id = nla_get_u32(snapshot_attr); - snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); - if (!snapshot) { - NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); - err = -EINVAL; - goto out_unlock; + if (!snapshot_attr) { + if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { + NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); + err = -EINVAL; + goto out_unlock; + } + + if (!region->ops->read) { + NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read"); + err = -EOPNOTSUPP; + goto out_unlock; + } + + if (port) + region_cb = &devlink_region_port_direct_fill; + else + region_cb = &devlink_region_direct_fill; + region_cb_priv = region; + } else { + struct devlink_snapshot *snapshot; + u32 snapshot_id; + + if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { + NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot"); + err = -EINVAL; + goto out_unlock; + } + + snapshot_id = nla_get_u32(snapshot_attr); + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) { + NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); + err = -EINVAL; + goto out_unlock; + } + region_cb = &devlink_region_snapshot_fill; + region_cb_priv = snapshot; } if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && @@ -6633,9 +6678,9 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - err = devlink_nl_region_read_fill(skb, &devlink_region_snapshot_fill, - snapshot, start_offset, end_offset, - &ret_offset, cb->extack); + err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv, + start_offset, end_offset, &ret_offset, + cb->extack); if (err && err != -EMSGSIZE) goto nla_put_failure; @@ -9280,6 +9325,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, }; static const struct genl_small_ops devlink_nl_ops[] = { -- cgit v1.2.3 From f8c9dfbd875b17fee59c7f1aa35a4944d4e6d810 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 30 Nov 2022 15:06:28 +0100 Subject: mptcp: add pm listener events This patch adds two new MPTCP netlink event types for PM listening socket create and close, named MPTCP_EVENT_LISTENER_CREATED and MPTCP_EVENT_LISTENER_CLOSED. Add a new function mptcp_event_pm_listener() to push the new events with family, port and addr to userspace. Invoke mptcp_event_pm_listener() with MPTCP_EVENT_LISTENER_CREATED in mptcp_listen() and mptcp_pm_nl_create_listen_socket(), invoke it with MPTCP_EVENT_LISTENER_CLOSED in __mptcp_close_ssk(). Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 9 ++++++++ net/mptcp/pm_netlink.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.c | 3 +++ net/mptcp/protocol.h | 2 ++ 4 files changed, 71 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index dfe19bf13f4c..32af2d278cb4 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -160,6 +160,12 @@ struct mptcp_info { * daddr4 | daddr6, sport, dport, backup, if_idx * [, error] * The priority of a subflow has changed. 'error' should not be set. + * + * MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 + * A new PM listener is created. + * + * MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 + * A PM listener is closed. */ enum mptcp_event_type { MPTCP_EVENT_UNSPEC = 0, @@ -174,6 +180,9 @@ enum mptcp_event_type { MPTCP_EVENT_SUB_CLOSED = 11, MPTCP_EVENT_SUB_PRIORITY = 13, + + MPTCP_EVENT_LISTENER_CREATED = 15, + MPTCP_EVENT_LISTENER_CLOSED = 16, }; enum mptcp_event_attr { diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index d66fbd558263..eef69d0e44ec 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1029,6 +1029,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; + mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); + return 0; } @@ -2152,6 +2154,58 @@ nla_put_failure: kfree_skb(skb); } +void mptcp_event_pm_listener(const struct sock *ssk, + enum mptcp_event_type event) +{ + const struct inet_sock *issk = inet_sk(ssk); + struct net *net = sock_net(ssk); + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, event); + if (!nlh) + goto nla_put_failure; + + if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) + goto nla_put_failure; + + if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) + goto nla_put_failure; + + switch (ssk->sk_family) { + case AF_INET: + if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) + goto nla_put_failure; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: { + const struct ipv6_pinfo *np = inet6_sk(ssk); + + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + goto nla_put_failure; + break; + } +#endif + default: + WARN_ON_ONCE(1); + goto nla_put_failure; + } + + genlmsg_end(skb, nlh); + mptcp_nl_mcast_send(net, skb, GFP_KERNEL); + return; + +nla_put_failure: + kfree_skb(skb); +} + void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { @@ -2197,6 +2251,9 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, if (mptcp_event_sub_closed(skb, msk, ssk) < 0) goto nla_put_failure; break; + case MPTCP_EVENT_LISTENER_CREATED: + case MPTCP_EVENT_LISTENER_CLOSED: + break; } genlmsg_end(skb, nlh); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b0d387be500a..f6f93957275b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2355,6 +2355,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, tcp_set_state(ssk, TCP_CLOSE); mptcp_subflow_queue_clean(ssk); inet_csk_listen_stop(ssk); + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); } __tcp_close(ssk, 0); @@ -3647,6 +3648,8 @@ static int mptcp_listen(struct socket *sock, int backlog) if (!err) mptcp_copy_inaddrs(sock->sk, ssock->sk); + mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); + unlock: release_sock(sock->sk); return err; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 8b4379a2cd85..955fb3d88eb3 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -839,6 +839,8 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info); void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); +void mptcp_event_pm_listener(const struct sock *ssk, + enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, -- cgit v1.2.3 From d14f28b8c1de668bab863bf5892a49c824cb110d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:27 +0200 Subject: xfrm: add new packet offload flag In the next patches, the xfrm core code will be extended to support new type of offload - packet offload. In that mode, both policy and state should be specially configured in order to perform whole offloaded data path. Full offload takes care of encryption, decryption, encapsulation and other operations with headers. As this mode is new for XFRM policy flow, we can "start fresh" with flag bits and release first and second bit for future use. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 +++++++ include/uapi/linux/xfrm.h | 6 ++++++ net/xfrm/xfrm_device.c | 3 +++ net/xfrm/xfrm_user.c | 2 ++ 4 files changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e0cc6791c001..b39d24fa2ef0 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -131,12 +131,19 @@ enum { XFRM_DEV_OFFLOAD_OUT, }; +enum { + XFRM_DEV_OFFLOAD_UNSPECIFIED, + XFRM_DEV_OFFLOAD_CRYPTO, + XFRM_DEV_OFFLOAD_PACKET, +}; + struct xfrm_dev_offload { struct net_device *dev; netdevice_tracker dev_tracker; struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; + u8 type : 2; }; struct xfrm_mode { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4f84ea7ee14c..23543c33fee8 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -519,6 +519,12 @@ struct xfrm_user_offload { */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 +/* Two bits above are relevant for state path only, while + * offload is used for both policy and state flows. + * + * In policy offload mode, they are free and can be safely reused. + */ +#define XFRM_OFFLOAD_PACKET 4 struct xfrm_userpolicy_default { #define XFRM_USERPOLICY_UNSPEC 0 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 21269e8f2db4..3b0c1ca8d4bb 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -291,12 +291,15 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, else xso->dir = XFRM_DEV_OFFLOAD_OUT; + xso->type = XFRM_DEV_OFFLOAD_CRYPTO; + err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { xso->dev = NULL; xso->dir = 0; xso->real_dev = NULL; netdev_put(dev, &xso->dev_tracker); + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; if (err != -EOPNOTSUPP) { NL_SET_ERR_MSG(extack, "Device failed to offload this state"); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0eb4696661c8..c3b8c1532718 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -956,6 +956,8 @@ static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb) xuo->ifindex = xso->dev->ifindex; if (xso->dir == XFRM_DEV_OFFLOAD_IN) xuo->flags = XFRM_OFFLOAD_INBOUND; + if (xso->type == XFRM_DEV_OFFLOAD_PACKET) + xuo->flags |= XFRM_OFFLOAD_PACKET; return 0; } -- cgit v1.2.3 From 7112a04664bfc10ae4709b2079fe3991cbd1fe18 Mon Sep 17 00:00:00 2001 From: Sudheer Mogilappagari Date: Thu, 1 Dec 2022 16:25:55 -0800 Subject: ethtool: add netlink based get rss support Add netlink based support for "ethtool -x [context x]" command by implementing ETHTOOL_MSG_RSS_GET netlink message. This is equivalent to functionality provided via ETHTOOL_GRSSH in ioctl path. It sends RSS table, hash key and hash function of an interface to user space. This patch implements existing functionality available in ioctl path and enables addition of new RSS context based parameters in future. Signed-off-by: Sudheer Mogilappagari Link: https://lore.kernel.org/r/20221202002555.241580-1-sudheer.mogilappagari@intel.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 31 +++++- include/uapi/linux/ethtool_netlink.h | 14 +++ net/ethtool/Makefile | 2 +- net/ethtool/netlink.c | 7 ++ net/ethtool/netlink.h | 2 + net/ethtool/rss.c | 153 +++++++++++++++++++++++++++ 6 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 net/ethtool/rss.c (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index bede24ef44fd..f10f8eb44255 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -222,6 +222,7 @@ Userspace to kernel: ``ETHTOOL_MSG_MODULE_GET`` get transceiver module parameters ``ETHTOOL_MSG_PSE_SET`` set PSE parameters ``ETHTOOL_MSG_PSE_GET`` get PSE parameters + ``ETHTOOL_MSG_RSS_GET`` get RSS settings ===================================== ================================= Kernel to userspace: @@ -263,6 +264,7 @@ Kernel to userspace: ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY`` PHC virtual clocks info ``ETHTOOL_MSG_MODULE_GET_REPLY`` transceiver module parameters ``ETHTOOL_MSG_PSE_GET_REPLY`` PSE parameters + ``ETHTOOL_MSG_RSS_GET_REPLY`` RSS settings ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1687,6 +1689,33 @@ to control PoDL PSE Admin functions. This option is implementing ``IEEE 802.3-2018`` 30.15.1.2.1 acPoDLPSEAdminControl. See ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` for supported values. +RSS_GET +======= + +Get indirection table, hash key and hash function info associated with a +RSS context of an interface similar to ``ETHTOOL_GRSSH`` ioctl request. + +Request contents: + +===================================== ====== ========================== + ``ETHTOOL_A_RSS_HEADER`` nested request header + ``ETHTOOL_A_RSS_CONTEXT`` u32 context number +===================================== ====== ========================== + +Kernel response contents: + +===================================== ====== ========================== + ``ETHTOOL_A_RSS_HEADER`` nested reply header + ``ETHTOOL_A_RSS_HFUNC`` u32 RSS hash func + ``ETHTOOL_A_RSS_INDIR`` binary Indir table bytes + ``ETHTOOL_A_RSS_HKEY`` binary Hash key bytes +===================================== ====== ========================== + +ETHTOOL_A_RSS_HFUNC attribute is bitmap indicating the hash function +being used. Current supported options are toeplitz, xor or crc32. +ETHTOOL_A_RSS_INDIR attribute returns RSS indrection table where each byte +indicates queue number. + Request translation =================== @@ -1768,7 +1797,7 @@ are netlink only. ``ETHTOOL_GMODULEEEPROM`` ``ETHTOOL_MSG_MODULE_EEPROM_GET`` ``ETHTOOL_GEEE`` ``ETHTOOL_MSG_EEE_GET`` ``ETHTOOL_SEEE`` ``ETHTOOL_MSG_EEE_SET`` - ``ETHTOOL_GRSSH`` n/a + ``ETHTOOL_GRSSH`` ``ETHTOOL_MSG_RSS_GET`` ``ETHTOOL_SRSSH`` n/a ``ETHTOOL_GTUNABLE`` n/a ``ETHTOOL_STUNABLE`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index aaf7c6963d61..5799a9db034e 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -51,6 +51,7 @@ enum { ETHTOOL_MSG_MODULE_SET, ETHTOOL_MSG_PSE_GET, ETHTOOL_MSG_PSE_SET, + ETHTOOL_MSG_RSS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -97,6 +98,7 @@ enum { ETHTOOL_MSG_MODULE_GET_REPLY, ETHTOOL_MSG_MODULE_NTF, ETHTOOL_MSG_PSE_GET_REPLY, + ETHTOOL_MSG_RSS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -880,6 +882,18 @@ enum { ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) }; +enum { + ETHTOOL_A_RSS_UNSPEC, + ETHTOOL_A_RSS_HEADER, + ETHTOOL_A_RSS_CONTEXT, /* u32 */ + ETHTOOL_A_RSS_HFUNC, /* u32 */ + ETHTOOL_A_RSS_INDIR, /* binary */ + ETHTOOL_A_RSS_HKEY, /* binary */ + + __ETHTOOL_A_RSS_CNT, + ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 72ab0944262a..228f13df2e18 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -4,7 +4,7 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o -ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ +ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o \ diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 1a4c11356c96..aee98be6237f 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -287,6 +287,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, + [ETHTOOL_MSG_RSS_GET] = ðnl_rss_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -1040,6 +1041,12 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_pse_set_policy, .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_RSS_GET, + .doit = ethnl_default_doit, + .policy = ethnl_rss_get_policy, + .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 1bfd374f9718..3753787ba233 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -346,6 +346,7 @@ extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct ethnl_request_ops ethnl_pse_request_ops; +extern const struct ethnl_request_ops ethnl_rss_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -386,6 +387,7 @@ extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; +extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_CONTEXT + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c new file mode 100644 index 000000000000..ebe6145aed3f --- /dev/null +++ b/net/ethtool/rss.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct rss_req_info { + struct ethnl_req_info base; + u32 rss_context; +}; + +struct rss_reply_data { + struct ethnl_reply_data base; + u32 indir_size; + u32 hkey_size; + u32 hfunc; + u32 *indir_table; + u8 *hkey; +}; + +#define RSS_REQINFO(__req_base) \ + container_of(__req_base, struct rss_req_info, base) + +#define RSS_REPDATA(__reply_base) \ + container_of(__reply_base, struct rss_reply_data, base) + +const struct nla_policy ethnl_rss_get_policy[] = { + [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32 }, +}; + +static int +rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rss_req_info *request = RSS_REQINFO(req_info); + + if (tb[ETHTOOL_A_RSS_CONTEXT]) + request->rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]); + + return 0; +} + +static int +rss_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, struct genl_info *info) +{ + struct rss_reply_data *data = RSS_REPDATA(reply_base); + struct rss_req_info *request = RSS_REQINFO(req_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_ops *ops; + u32 total_size, indir_bytes; + u8 dev_hfunc = 0; + u8 *rss_config; + int ret; + + ops = dev->ethtool_ops; + if (!ops->get_rxfh) + return -EOPNOTSUPP; + + /* Some drivers don't handle rss_context */ + if (request->rss_context && !ops->get_rxfh_context) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + data->indir_size = 0; + data->hkey_size = 0; + if (ops->get_rxfh_indir_size) + data->indir_size = ops->get_rxfh_indir_size(dev); + if (ops->get_rxfh_key_size) + data->hkey_size = ops->get_rxfh_key_size(dev); + + indir_bytes = data->indir_size * sizeof(u32); + total_size = indir_bytes + data->hkey_size; + rss_config = kzalloc(total_size, GFP_KERNEL); + if (!rss_config) { + ret = -ENOMEM; + goto out_ops; + } + + if (data->indir_size) + data->indir_table = (u32 *)rss_config; + + if (data->hkey_size) + data->hkey = rss_config + indir_bytes; + + if (request->rss_context) + ret = ops->get_rxfh_context(dev, data->indir_table, data->hkey, + &dev_hfunc, request->rss_context); + else + ret = ops->get_rxfh(dev, data->indir_table, data->hkey, + &dev_hfunc); + + if (ret) + goto out_ops; + + data->hfunc = dev_hfunc; +out_ops: + ethnl_ops_complete(dev); + return ret; +} + +static int +rss_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct rss_reply_data *data = RSS_REPDATA(reply_base); + int len; + + len = nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ + nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ + nla_total_size(data->hkey_size); /* _RSS_HKEY */ + + return len; +} + +static int +rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct rss_reply_data *data = RSS_REPDATA(reply_base); + + if (nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc) || + nla_put(skb, ETHTOOL_A_RSS_INDIR, + sizeof(u32) * data->indir_size, data->indir_table) || + nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey)) + return -EMSGSIZE; + + return 0; +} + +static void rss_cleanup_data(struct ethnl_reply_data *reply_base) +{ + const struct rss_reply_data *data = RSS_REPDATA(reply_base); + + kfree(data->indir_table); +} + +const struct ethnl_request_ops ethnl_rss_request_ops = { + .request_cmd = ETHTOOL_MSG_RSS_GET, + .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY, + .hdr_attr = ETHTOOL_A_RSS_HEADER, + .req_info_size = sizeof(struct rss_req_info), + .reply_data_size = sizeof(struct rss_reply_data), + + .parse_request = rss_parse_request, + .prepare_data = rss_prepare_data, + .reply_size = rss_reply_size, + .fill_reply = rss_fill_reply, + .cleanup_data = rss_cleanup_data, +}; -- cgit v1.2.3 From da65e9ff3bf614d2836e38e1d405c7073e6ba3b7 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:15 +0200 Subject: devlink: Expose port function commands to control RoCE Expose port function commands to enable / disable RoCE, this is used to control the port RoCE device capabilities. When RoCE is disabled for a function of the port, function cannot create any RoCE specific resources (e.g GID table). It also saves system memory utilization. For example disabling RoCE enable a VF/SF saves 1 Mbytes of system memory per function. Example of a PCI VF port which supports function configuration: Set RoCE of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce enable $ devlink port function set pci/0000:06:00.0/2 roce disable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce disable Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-port.rst | 34 ++++++- include/net/devlink.h | 18 ++++ include/uapi/linux/devlink.h | 10 ++ net/core/devlink.c | 113 ++++++++++++++++++++++ 4 files changed, 174 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index 2c637f4aae8e..c3302d23e480 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -110,7 +110,7 @@ devlink ports for both the controllers. Function configuration ====================== -A user can configure the function attribute before enumerating the PCI +Users can configure one or more function attributes before enumerating the PCI function. Usually it means, user should configure function attribute before a bus specific device for the function is created. However, when SRIOV is enabled, virtual function devices are created on the PCI bus. @@ -122,6 +122,9 @@ A user may set the hardware address of the function using `devlink port function set hw_addr` command. For Ethernet port function this means a MAC address. +Users may also set the RoCE capability of the function using +`devlink port function set roce` command. + Function attributes =================== @@ -162,6 +165,35 @@ device created for the PCI VF/SF. function: hw_addr 00:00:00:00:88:88 +RoCE capability setup +--------------------- +Not all PCI VFs/SFs require RoCE capability. + +When RoCE capability is disabled, it saves system memory per PCI VF/SF. + +When user disables RoCE capability for a VF/SF, user application cannot send or +receive any RoCE packets through this VF/SF and RoCE GID table for this PCI +will be empty. + +When RoCE capability is disabled in the device using port function attribute, +VF/SF driver cannot override it. + +- Get RoCE capability of the VF device:: + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 roce enable + +- Set RoCE capability of the VF device:: + + $ devlink port function set pci/0000:06:00.0/2 roce disable + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 roce disable + Subfunction ============ diff --git a/include/net/devlink.h b/include/net/devlink.h index 5f6eca5e4a40..ce4c65d2f2e7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1451,6 +1451,24 @@ struct devlink_ops { int (*port_function_hw_addr_set)(struct devlink_port *port, const u8 *hw_addr, int hw_addr_len, struct netlink_ext_ack *extack); + /** + * @port_fn_roce_get: Port function's roce get function. + * + * Query RoCE state of a function managed by the devlink port. + * Return -EOPNOTSUPP if port function RoCE handling is not supported. + */ + int (*port_fn_roce_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + /** + * @port_fn_roce_set: Port function's roce set function. + * + * Enable/Disable the RoCE state of a function managed by the devlink + * port. + * Return -EOPNOTSUPP if port function RoCE handling is not supported. + */ + int (*port_fn_roce_set)(struct devlink_port *devlink_port, + bool enable, struct netlink_ext_ack *extack); /** * port_new() - Add a new port function of a specified flavor * @devlink: Devlink instance diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 70191d96af89..6cc2925bd478 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -658,11 +658,21 @@ enum devlink_resource_unit { DEVLINK_RESOURCE_UNIT_ENTRY, }; +enum devlink_port_fn_attr_cap { + DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, + + /* Add new caps above */ + __DEVLINK_PORT_FN_ATTR_CAPS_MAX, +}; + +#define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT) + enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ DEVLINK_PORT_FN_ATTR_STATE, /* u8 */ DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */ + DEVLINK_PORT_FN_ATTR_CAPS, /* bitfield32 */ __DEVLINK_PORT_FUNCTION_ATTR_MAX, DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 035249c5dd17..8c0ad52431c5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -195,11 +195,16 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); +#define DEVLINK_PORT_FN_CAPS_VALID_MASK \ + (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) + static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, DEVLINK_PORT_FN_STATE_ACTIVE), + [DEVLINK_PORT_FN_ATTR_CAPS] = + NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), }; static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { @@ -680,6 +685,60 @@ devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, return 0; } +static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, + u32 cap, bool is_enable) +{ + caps->selector |= cap; + if (is_enable) + caps->value |= cap; +} + +static int devlink_port_fn_roce_fill(const struct devlink_ops *ops, + struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!ops->port_fn_roce_get) + return 0; + + err = ops->port_fn_roce_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); + return 0; +} + +static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, + struct devlink_port *devlink_port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + struct nla_bitfield32 caps = {}; + int err; + + err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack); + if (err) + return err; + + if (!caps.selector) + return 0; + err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, + caps.selector); + if (err) + return err; + + *msg_updated = true; + return 0; +} + static int devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, struct genl_info *info, @@ -1263,6 +1322,35 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops, return 0; } +static int +devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + return ops->port_fn_roce_set(devlink_port, enable, extack); +} + +static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nla_bitfield32 caps; + u32 caps_value; + int err; + + caps = nla_get_bitfield32(attr); + caps_value = caps.value & caps.selector; + if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { + err = devlink_port_fn_roce_set(devlink_port, + caps_value & DEVLINK_PORT_FN_CAP_ROCE, + extack); + if (err) + return err; + } + return 0; +} + static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) @@ -1281,6 +1369,10 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por &msg_updated); if (err) goto out; + err = devlink_port_fn_caps_fill(ops, port, msg, extack, + &msg_updated); + if (err) + goto out; err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated); out: if (err || !msg_updated) @@ -1653,6 +1745,7 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink_port->devlink->ops; + struct nlattr *attr; if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && !ops->port_function_hw_addr_set) { @@ -1665,6 +1758,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, "Function does not support state setting"); return -EOPNOTSUPP; } + attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; + if (attr) { + struct nla_bitfield32 caps; + + caps = nla_get_bitfield32(attr); + if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && + !ops->port_fn_roce_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support RoCE function attribute"); + return -EOPNOTSUPP; + } + } return 0; } @@ -1692,6 +1797,14 @@ static int devlink_port_function_set(struct devlink_port *port, if (err) return err; } + + attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; + if (attr) { + err = devlink_port_fn_caps_set(port, attr, extack); + if (err) + return err; + } + /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * Those can be applied first before activating the state. -- cgit v1.2.3 From a8ce7b26a51efc4d7753b23d639ae092878a6193 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:18 +0200 Subject: devlink: Expose port function commands to control migratable Expose port function commands to enable / disable migratable capability, this is used to set the port function as migratable. Live migration is the process of transferring a live virtual machine from one physical host to another without disrupting its normal operation. In order for a VM to be able to perform LM, all the VM components must be able to perform migration. e.g.: to be migratable. In order for VF to be migratable, VF must be bound to VFIO driver with migration support. When migratable capability is enabled for a function of the port, the device is making the necessary preparations for the function to be migratable, which might include disabling features which cannot be migrated. Example of LM with migratable function configuration: Set migratable of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable disable $ devlink port function set pci/0000:06:00.0/2 migratable enable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable enable Bind VF to VFIO driver with migration support: $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/unbind $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/bind Attach VF to the VM. Start the VM. Perform LM. Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Acked-by: Shannon Nelson Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-port.rst | 46 +++++++++++++++++++ include/net/devlink.h | 21 +++++++++ include/uapi/linux/devlink.h | 3 ++ net/core/devlink.c | 55 +++++++++++++++++++++++ 4 files changed, 125 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index c3302d23e480..3da590953ce8 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -125,6 +125,9 @@ this means a MAC address. Users may also set the RoCE capability of the function using `devlink port function set roce` command. +Users may also set the function as migratable using +'devlink port function set migratable' command. + Function attributes =================== @@ -194,6 +197,49 @@ VF/SF driver cannot override it. function: hw_addr 00:00:00:00:00:00 roce disable +migratable capability setup +--------------------------- +Live migration is the process of transferring a live virtual machine +from one physical host to another without disrupting its normal +operation. + +User who want PCI VFs to be able to perform live migration need to +explicitly enable the VF migratable capability. + +When user enables migratable capability for a VF, and the HV binds the VF to VFIO driver +with migration support, the user can migrate the VM with this VF from one HV to a +different one. + +However, when migratable capability is enable, device will disable features which cannot +be migrated. Thus migratable cap can impose limitations on a VF so let the user decide. + +Example of LM with migratable function configuration: +- Get migratable capability of the VF device:: + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 migratable disable + +- Set migratable capability of the VF device:: + + $ devlink port function set pci/0000:06:00.0/2 migratable enable + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 migratable enable + +- Bind VF to VFIO driver with migration support:: + + $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/unbind + $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override + $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/bind + +Attach VF to the VM. +Start the VM. +Perform live migration. + Subfunction ============ diff --git a/include/net/devlink.h b/include/net/devlink.h index ce4c65d2f2e7..0f376a28b9c4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1469,6 +1469,27 @@ struct devlink_ops { */ int (*port_fn_roce_set)(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack); + /** + * @port_fn_migratable_get: Port function's migratable get function. + * + * Query migratable state of a function managed by the devlink port. + * Return -EOPNOTSUPP if port function migratable handling is not + * supported. + */ + int (*port_fn_migratable_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + /** + * @port_fn_migratable_set: Port function's migratable set function. + * + * Enable/Disable migratable state of a function managed by the devlink + * port. + * Return -EOPNOTSUPP if port function migratable handling is not + * supported. + */ + int (*port_fn_migratable_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); /** * port_new() - Add a new port function of a specified flavor * @devlink: Devlink instance diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6cc2925bd478..3782d4219ac9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -660,12 +660,15 @@ enum devlink_resource_unit { enum devlink_port_fn_attr_cap { DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, + DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT, /* Add new caps above */ __DEVLINK_PORT_FN_ATTR_CAPS_MAX, }; #define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT) +#define DEVLINK_PORT_FN_CAP_MIGRATABLE \ + _BITUL(DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT) enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, diff --git a/net/core/devlink.c b/net/core/devlink.c index 8c0ad52431c5..ab40ebcb4aea 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -715,6 +715,29 @@ static int devlink_port_fn_roce_fill(const struct devlink_ops *ops, return 0; } +static int devlink_port_fn_migratable_fill(const struct devlink_ops *ops, + struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!ops->port_fn_migratable_get || + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) + return 0; + + err = ops->port_fn_migratable_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); + return 0; +} + static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, struct devlink_port *devlink_port, struct sk_buff *msg, @@ -728,6 +751,10 @@ static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, if (err) return err; + err = devlink_port_fn_migratable_fill(ops, devlink_port, &caps, extack); + if (err) + return err; + if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, @@ -1322,6 +1349,15 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops, return 0; } +static int +devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + return ops->port_fn_migratable_set(devlink_port, enable, extack); +} + static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) @@ -1348,6 +1384,13 @@ static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, if (err) return err; } + if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { + err = devlink_port_fn_mig_set(devlink_port, caps_value & + DEVLINK_PORT_FN_CAP_MIGRATABLE, + extack); + if (err) + return err; + } return 0; } @@ -1769,6 +1812,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, "Port doesn't support RoCE function attribute"); return -EOPNOTSUPP; } + if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { + if (!ops->port_fn_migratable_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support migratable function attribute"); + return -EOPNOTSUPP; + } + if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "migratable function attribute supported for VFs only"); + return -EOPNOTSUPP; + } + } } return 0; } -- cgit v1.2.3 From 270605317366e4535d8d9fc3d9da1ad0fb3c9d45 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:37 +0530 Subject: bpf: Rework process_dynptr_func Recently, user ringbuf support introduced a PTR_TO_DYNPTR register type for use in callback state, because in case of user ringbuf helpers, there is no dynptr on the stack that is passed into the callback. To reflect such a state, a special register type was created. However, some checks have been bypassed incorrectly during the addition of this feature. First, for arg_type with MEM_UNINIT flag which initialize a dynptr, they must be rejected for such register type. Secondly, in the future, there are plans to add dynptr helpers that operate on the dynptr itself and may change its offset and other properties. In all of these cases, PTR_TO_DYNPTR shouldn't be allowed to be passed to such helpers, however the current code simply returns 0. The rejection for helpers that release the dynptr is already handled. For fixing this, we take a step back and rework existing code in a way that will allow fitting in all classes of helpers and have a coherent model for dealing with the variety of use cases in which dynptr is used. First, for ARG_PTR_TO_DYNPTR, it can either be set alone or together with a DYNPTR_TYPE_* constant that denotes the only type it accepts. Next, helpers which initialize a dynptr use MEM_UNINIT to indicate this fact. To make the distinction clear, use MEM_RDONLY flag to indicate that the helper only operates on the memory pointed to by the dynptr, not the dynptr itself. In C parlance, it would be equivalent to taking the dynptr as a point to const argument. When either of these flags are not present, the helper is allowed to mutate both the dynptr itself and also the memory it points to. Currently, the read only status of the memory is not tracked in the dynptr, but it would be trivial to add this support inside dynptr state of the register. With these changes and renaming PTR_TO_DYNPTR to CONST_PTR_TO_DYNPTR to better reflect its usage, it can no longer be passed to helpers that initialize a dynptr, i.e. bpf_dynptr_from_mem, bpf_ringbuf_reserve_dynptr. A note to reviewers is that in code that does mark_stack_slots_dynptr, and unmark_stack_slots_dynptr, we implicitly rely on the fact that PTR_TO_STACK reg is the only case that can reach that code path, as one cannot pass CONST_PTR_TO_DYNPTR to helpers that don't set MEM_RDONLY. In both cases such helpers won't be setting that flag. The next patch will add a couple of selftest cases to make sure this doesn't break. Fixes: 205715673844 ("bpf: Add bpf_user_ringbuf_drain() helper") Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +- include/uapi/linux/bpf.h | 8 +- kernel/bpf/helpers.c | 18 +- kernel/bpf/verifier.c | 227 +++++++++++++++------ scripts/bpf_doc.py | 1 + tools/include/uapi/linux/bpf.h | 8 +- .../selftests/bpf/prog_tests/user_ringbuf.c | 4 +- 7 files changed, 191 insertions(+), 79 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4920ac252754..3de24cfb7a3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -775,7 +775,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_DYNPTR, /* reg points to a dynptr */ + CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -2828,7 +2828,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f89de51a45db..464ca3f01fe7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5293,7 +5293,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5303,7 +5303,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5313,7 +5313,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. * @@ -5414,7 +5414,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6ed5875b97a3..fa1093d4ad6b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1404,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1414,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1438,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = bpf_dynptr_get_size(ptr); @@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { int err; @@ -1506,12 +1506,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR, + .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { int err; @@ -1532,14 +1532,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { int err; @@ -1560,7 +1560,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb742ac75844..a880776bd999 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -592,7 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", - [PTR_TO_DYNPTR] = "dynptr_ptr", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -725,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) return type == BPF_DYNPTR_TYPE_RINGBUF; } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type, + bool first_slot); + +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, + struct bpf_reg_state *sreg2, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(sreg1, type, true); + __mark_dynptr_reg(sreg2, type, false); +} + +static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(reg, type, true); +} + + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { @@ -746,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - state->stack[spi].spilled_ptr.dynptr.first_slot = true; - state->stack[spi].spilled_ptr.dynptr.type = type; - state->stack[spi - 1].spilled_ptr.dynptr.type = type; + mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ @@ -756,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (id < 0) return id; - state->stack[spi].spilled_ptr.id = id; - state->stack[spi - 1].spilled_ptr.id = id; + state->stack[spi].spilled_ptr.ref_obj_id = id; + state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } return 0; @@ -779,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re } /* Invalidate any slices associated with this dynptr */ - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - release_reference(env, state->stack[spi].spilled_ptr.id); - state->stack[spi].spilled_ptr.id = 0; - state->stack[spi - 1].spilled_ptr.id = 0; - } - - state->stack[spi].spilled_ptr.dynptr.first_slot = false; - state->stack[spi].spilled_ptr.dynptr.type = 0; - state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) + WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); - int i; + int spi, i; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return true; @@ -813,9 +832,14 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; int i; + /* This already represents first slot of initialized bpf_dynptr */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return true; + + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || !state->stack[spi].spilled_ptr.dynptr.first_slot) return false; @@ -834,15 +858,19 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; - int spi = get_spi(reg->off); + int spi; /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; dynptr_type = arg_to_dynptr_type(arg_type); - - return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + if (reg->type == CONST_PTR_TO_DYNPTR) { + return reg->dynptr.type == dynptr_type; + } else { + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + } } /* The reg state of a pointer or a bounded scalar was saved when @@ -1354,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); - /* This helper doesn't clear reg->id */ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { @@ -1419,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, + bool first_slot) +{ + /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for + * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply + * set it unconditionally as it is ignored for STACK_DYNPTR anyway. + */ + __mark_reg_known_zero(reg); + reg->type = CONST_PTR_TO_DYNPTR; + reg->dynptr.type = type; + reg->dynptr.first_slot = first_slot; +} + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { if (base_type(reg->type) == PTR_TO_MAP_VALUE) { @@ -5857,19 +5895,58 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } +/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK + * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. + * + * In both cases we deal with the first 8 bytes, but need to mark the next 8 + * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of + * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. + * + * Mutability of bpf_dynptr is at two levels, one is at the level of struct + * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct + * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can + * mutate the view of the dynptr and also possibly destroy it. In the latter + * case, it cannot mutate the bpf_dynptr itself but it can still mutate the + * memory that dynptr points to. + * + * The verifier will keep track both levels of mutation (bpf_dynptr's in + * reg->type and the memory's in reg->dynptr.type), but there is no support for + * readonly dynptr view yet, hence only the first case is tracked and checked. + * + * This is consistent with how C applies the const modifier to a struct object, + * where the pointer itself inside bpf_dynptr becomes const but not what it + * points to. + * + * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument + * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + */ int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - /* We only need to check for initialized / uninitialized helper - * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the - * assumption is that if it is, that a helper function - * initialized the dynptr on behalf of the BPF program. + /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an + * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): + */ + if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { + verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + return -EFAULT; + } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for + * constructing a mutable bpf_dynptr object. + * + * Currently, this is only possible with PTR_TO_STACK + * pointing to a region of at least 16 bytes which doesn't + * contain an existing bpf_dynptr. + * + * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be + * mutated or destroyed. However, the memory it points to + * may be mutated. + * + * None - Points to a initialized dynptr that can be mutated and + * destroyed, including mutation of the memory it points + * to. */ - if (base_type(reg->type) == PTR_TO_DYNPTR) - return 0; if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -5885,7 +5962,13 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, } meta->uninit_dynptr_regno = regno; - } else { + } else /* MEM_RDONLY and None case from above */ { + /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ + if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { + verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + return -EINVAL; + } + if (!is_dynptr_reg_valid_init(env, reg)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", @@ -5893,7 +5976,8 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } - if (!is_dynptr_type_expected(env, reg, arg_type)) { + /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { const char *err_extra = ""; switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -6056,7 +6140,7 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } } static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, - PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + CONST_PTR_TO_DYNPTR, } }; @@ -6241,12 +6325,16 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->ref_obj_id; - return state->stack[spi].spilled_ptr.id; + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.ref_obj_id; } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -6311,11 +6399,22 @@ skip_type_check: if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.id) { - verbose(env, "arg %d is an unacquired reference\n", regno); + /* Only dynptr created on stack can be released, thus + * the get_spi and stack state checks for spilled_ptr + * should only be done before process_dynptr_func for + * PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK) { + spi = get_spi(reg->off); + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.ref_obj_id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else { + verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } } else if (!reg->ref_obj_id && !register_is_null(reg)) { @@ -7289,11 +7388,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, { /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void * callback_ctx, u64 flags); - * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); - callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; - __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ @@ -7687,7 +7785,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr + * is safe to do directly. + */ if (meta.uninit_dynptr_regno) { + if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); + return -EFAULT; + } /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, @@ -7705,15 +7811,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { + if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); + return -EFAULT; + } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - else if (meta.ref_obj_id) + } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); - /* meta.ref_obj_id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. - */ - else if (register_is_null(®s[meta.release_regno])) + } else if (register_is_null(®s[meta.release_regno])) { + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. + */ err = 0; + } if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -7787,11 +7902,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - if (base_type(reg->type) != PTR_TO_DYNPTR) - /* Find the id of the dynptr we're - * tracking the reference of - */ - meta.ref_obj_id = stack_slot_get_id(env, reg); + meta.ref_obj_id = dynptr_ref_obj_id(env, reg); break; } } @@ -8848,12 +8959,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_DYNPTR: if (reg->type != PTR_TO_STACK && - reg->type != PTR_TO_DYNPTR) { + reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); return -EINVAL; } - ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR, NULL); + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); if (ret < 0) return ret; break; diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index fdb0aff8cb5a..e8d90829f23e 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -752,6 +752,7 @@ class PrinterHelpers(Printer): 'struct bpf_timer', 'struct mptcp_sock', 'struct bpf_dynptr', + 'const struct bpf_dynptr', 'struct iphdr', 'struct ipv6hdr', } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f89de51a45db..464ca3f01fe7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5293,7 +5293,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5303,7 +5303,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5313,7 +5313,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. * @@ -5414,7 +5414,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index 02b18d018b36..aefa0a474e58 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -673,8 +673,8 @@ static struct { {"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"}, {"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"}, {"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"}, - {"user_ringbuf_callback_discard_dynptr", "arg 1 is an unacquired reference"}, - {"user_ringbuf_callback_submit_dynptr", "arg 1 is an unacquired reference"}, + {"user_ringbuf_callback_discard_dynptr", "cannot release unowned const bpf_dynptr"}, + {"user_ringbuf_callback_submit_dynptr", "cannot release unowned const bpf_dynptr"}, {"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"}, }; -- cgit v1.2.3 From b534dc46c8ae0165b1b2509be24dbea4fa9c4011 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 7 Dec 2022 09:37:01 -0500 Subject: net_tstamp: add SOF_TIMESTAMPING_OPT_ID_TCP Add an option to initialize SOF_TIMESTAMPING_OPT_ID for TCP from write_seq sockets instead of snd_una. This should have been the behavior from the start. Because processes may now exist that rely on the established behavior, do not change behavior of the existing option, but add the right behavior with a new flag. It is encouraged to always set SOF_TIMESTAMPING_OPT_ID_TCP on stream sockets along with the existing SOF_TIMESTAMPING_OPT_ID. Intuitively the contract is that the counter is zero after the setsockopt, so that the next write N results in a notification for the last byte N - 1. On idle sockets snd_una == write_seq and this holds for both. But on sockets with data in transmission, snd_una records the unacked offset in the stream. This depends on the ACK response from the peer. A process cannot learn this in a race free manner (ioctl SIOCOUTQ is one racy approach). write_seq records the offset at the last byte written by the process. This is a better starting point. It matches the intuitive contract in all circumstances, unaffected by external behavior. The new timestamp flag necessitates increasing sk_tsflags to 32 bits. Move the field in struct sock to avoid growing the socket (for some common CONFIG variants). The UAPI interface so_timestamping.flags is already int, so 32 bits wide. Reported-by: Sotirios Delimanolis Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20221207143701.29861-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/networking/timestamping.rst | 32 ++++++++++++++++++++++++++++++- include/net/sock.h | 6 +++--- include/uapi/linux/net_tstamp.h | 3 ++- net/core/sock.c | 9 ++++++++- net/ethtool/common.c | 1 + 5 files changed, 45 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst index be4eb1242057..f17c01834a12 100644 --- a/Documentation/networking/timestamping.rst +++ b/Documentation/networking/timestamping.rst @@ -179,7 +179,8 @@ SOF_TIMESTAMPING_OPT_ID: identifier and returns that along with the timestamp. The identifier is derived from a per-socket u32 counter (that wraps). For datagram sockets, the counter increments with each sent packet. For stream - sockets, it increments with every byte. + sockets, it increments with every byte. For stream sockets, also set + SOF_TIMESTAMPING_OPT_ID_TCP, see the section below. The counter starts at zero. It is initialized the first time that the socket option is enabled. It is reset each time the option is @@ -192,6 +193,35 @@ SOF_TIMESTAMPING_OPT_ID: among all possibly concurrently outstanding timestamp requests for that socket. +SOF_TIMESTAMPING_OPT_ID_TCP: + Pass this modifier along with SOF_TIMESTAMPING_OPT_ID for new TCP + timestamping applications. SOF_TIMESTAMPING_OPT_ID defines how the + counter increments for stream sockets, but its starting point is + not entirely trivial. This option fixes that. + + For stream sockets, if SOF_TIMESTAMPING_OPT_ID is set, this should + always be set too. On datagram sockets the option has no effect. + + A reasonable expectation is that the counter is reset to zero with + the system call, so that a subsequent write() of N bytes generates + a timestamp with counter N-1. SOF_TIMESTAMPING_OPT_ID_TCP + implements this behavior under all conditions. + + SOF_TIMESTAMPING_OPT_ID without modifier often reports the same, + especially when the socket option is set when no data is in + transmission. If data is being transmitted, it may be off by the + length of the output queue (SIOCOUTQ). + + The difference is due to being based on snd_una versus write_seq. + snd_una is the offset in the stream acknowledged by the peer. This + depends on factors outside of process control, such as network RTT. + write_seq is the last byte written by the process. This offset is + not affected by external inputs. + + The difference is subtle and unlikely to be noticed when configured + at initial socket creation, when no data is queued or sent. But + SOF_TIMESTAMPING_OPT_ID_TCP behavior is more robust regardless of + when the socket option is set. SOF_TIMESTAMPING_OPT_CMSG: Support recv() cmsg for all timestamped packets. Control messages diff --git a/include/net/sock.h b/include/net/sock.h index 6d207e7c4ad0..ecea3dcc2217 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -503,10 +503,10 @@ struct sock { #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif - u16 sk_tsflags; - u8 sk_shutdown; atomic_t sk_tskey; atomic_t sk_zckey; + u32 sk_tsflags; + u8 sk_shutdown; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, @@ -1899,7 +1899,7 @@ static inline void sock_replace_proto(struct sock *sk, struct proto *proto) struct sockcm_cookie { u64 transmit_time; u32 mark; - u16 tsflags; + u32 tsflags; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 55501e5e7ac8..a2c66b3d7f0f 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -31,8 +31,9 @@ enum { SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14), SOF_TIMESTAMPING_BIND_PHC = (1 << 15), + SOF_TIMESTAMPING_OPT_ID_TCP = (1 << 16), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_BIND_PHC, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID_TCP, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/sock.c b/net/core/sock.c index 4571914a4aa8..b0ab841e0aed 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -901,13 +901,20 @@ int sock_set_timestamping(struct sock *sk, int optname, if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; + if (val & SOF_TIMESTAMPING_OPT_ID_TCP && + !(val & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) + atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); + else + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 21cfe8557205..6f399afc2ff2 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -417,6 +417,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = { [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo", [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw", [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc", + [const_ilog2(SOF_TIMESTAMPING_OPT_ID_TCP)] = "option-id-tcp", }; static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT); -- cgit v1.2.3 From 1933ea365aa7a48ce26bea2ea09c9f7cc48cc668 Mon Sep 17 00:00:00 2001 From: wangchuanlei Date: Tue, 6 Dec 2022 20:38:57 -0500 Subject: net: openvswitch: Add support to count upcall packets Add support to count upall packets, when kmod of openvswitch upcall to count the number of packets for upcall succeed and failed, which is a better way to see how many packets upcalled on every interfaces. Signed-off-by: wangchuanlei Acked-by: Eelco Chaudron Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 14 +++++++++++ net/openvswitch/datapath.c | 41 ++++++++++++++++++++++++++++++++ net/openvswitch/vport.c | 50 ++++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport.h | 16 +++++++++++++ 4 files changed, 121 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 94066f87e9ee..c5d62ee82567 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -277,11 +277,25 @@ enum ovs_vport_attr { OVS_VPORT_ATTR_PAD, OVS_VPORT_ATTR_IFINDEX, OVS_VPORT_ATTR_NETNSID, + OVS_VPORT_ATTR_UPCALL_STATS, __OVS_VPORT_ATTR_MAX }; #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) +/** + * enum ovs_vport_upcall_attr - attributes for %OVS_VPORT_UPCALL* commands + * @OVS_VPORT_UPCALL_SUCCESS: 64-bit upcall success packets. + * @OVS_VPORT_UPCALL_FAIL: 64-bit upcall fail packets. + */ +enum ovs_vport_upcall_attr { + OVS_VPORT_UPCALL_ATTR_SUCCESS, + OVS_VPORT_UPCALL_ATTR_FAIL, + __OVS_VPORT_UPCALL_ATTR_MAX +}; + +#define OVS_VPORT_UPCALL_ATTR_MAX (__OVS_VPORT_UPCALL_ATTR_MAX - 1) + enum { OVS_VXLAN_EXT_UNSPEC, OVS_VXLAN_EXT_GBP, /* Flag or __u32 */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 861dfb8daf4a..932bcf766d63 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -209,6 +209,26 @@ static struct vport *new_vport(const struct vport_parms *parms) return vport; } +static void ovs_vport_update_upcall_stats(struct sk_buff *skb, + const struct dp_upcall_info *upcall_info, + bool upcall_result) +{ + struct vport *p = OVS_CB(skb)->input_vport; + struct vport_upcall_stats_percpu *stats; + + if (upcall_info->cmd != OVS_PACKET_CMD_MISS && + upcall_info->cmd != OVS_PACKET_CMD_ACTION) + return; + + stats = this_cpu_ptr(p->upcall_stats); + u64_stats_update_begin(&stats->syncp); + if (upcall_result) + u64_stats_inc(&stats->n_success); + else + u64_stats_inc(&stats->n_fail); + u64_stats_update_end(&stats->syncp); +} + void ovs_dp_detach_port(struct vport *p) { ASSERT_OVSL(); @@ -216,6 +236,9 @@ void ovs_dp_detach_port(struct vport *p) /* First drop references to device. */ hlist_del_rcu(&p->dp_hash_node); + /* Free percpu memory */ + free_percpu(p->upcall_stats); + /* Then destroy it. */ ovs_vport_del(p); } @@ -305,6 +328,8 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); + + ovs_vport_update_upcall_stats(skb, upcall_info, !err); if (err) goto err; @@ -1826,6 +1851,12 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_destroy_portids; } + vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); + if (!vport->upcall_stats) { + err = -ENOMEM; + goto err_destroy_portids; + } + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_NEW); BUG_ON(err < 0); @@ -2098,6 +2129,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, OVS_VPORT_ATTR_PAD)) goto nla_put_failure; + if (ovs_vport_get_upcall_stats(vport, skb)) + goto nla_put_failure; + if (ovs_vport_get_upcall_portids(vport, skb)) goto nla_put_failure; @@ -2279,6 +2313,12 @@ restart: goto exit_unlock_free; } + vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); + if (!vport->upcall_stats) { + err = -ENOMEM; + goto exit_unlock_free; + } + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW, GFP_KERNEL); @@ -2508,6 +2548,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, + [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED }, }; static const struct genl_small_ops dp_vport_genl_ops[] = { diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 82a74f998966..7e0f5c45b512 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -284,6 +284,56 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) stats->tx_packets = dev_stats->tx_packets; } +/** + * ovs_vport_get_upcall_stats - retrieve upcall stats + * + * @vport: vport from which to retrieve the stats. + * @skb: sk_buff where upcall stats should be appended. + * + * Retrieves upcall stats for the given device. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb) +{ + struct nlattr *nla; + int i; + + __u64 tx_success = 0; + __u64 tx_fail = 0; + + for_each_possible_cpu(i) { + const struct vport_upcall_stats_percpu *stats; + unsigned int start; + + stats = per_cpu_ptr(vport->upcall_stats, i); + do { + start = u64_stats_fetch_begin(&stats->syncp); + tx_success += u64_stats_read(&stats->n_success); + tx_fail += u64_stats_read(&stats->n_fail); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + } + + nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS); + if (!nla) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_SUCCESS, tx_success, + OVS_VPORT_ATTR_PAD)) { + nla_nest_cancel(skb, nla); + return -EMSGSIZE; + } + + if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_FAIL, tx_fail, + OVS_VPORT_ATTR_PAD)) { + nla_nest_cancel(skb, nla); + return -EMSGSIZE; + } + nla_nest_end(skb, nla); + + return 0; +} + /** * ovs_vport_get_options - retrieve device options * diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 6ff45e8a0868..3e71ca8ad8a7 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -32,6 +32,8 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name); void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); +int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb); + int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); @@ -65,6 +67,7 @@ struct vport_portids { * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. + * @upcall_stats: Upcall stats of every ports. * @detach_list: list used for detaching vport in net-exit call. * @rcu: RCU callback head for deferred destruction. */ @@ -78,6 +81,7 @@ struct vport { struct hlist_node hash_node; struct hlist_node dp_hash_node; const struct vport_ops *ops; + struct vport_upcall_stats_percpu __percpu *upcall_stats; struct list_head detach_list; struct rcu_head rcu; @@ -137,6 +141,18 @@ struct vport_ops { struct list_head list; }; +/** + * struct vport_upcall_stats_percpu - per-cpu packet upcall statistics for + * a given vport. + * @n_success: Number of packets that upcall to userspace succeed. + * @n_fail: Number of packets that upcall to userspace failed. + */ +struct vport_upcall_stats_percpu { + struct u64_stats_sync syncp; + u64_stats_t n_success; + u64_stats_t n_fail; +}; + struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); -- cgit v1.2.3 From b22bbdd17a5af2cc99cf42048970b9355dcbec8b Mon Sep 17 00:00:00 2001 From: Andrew Melnychenko Date: Wed, 7 Dec 2022 13:35:54 +0200 Subject: uapi/linux/if_tun.h: Added new offload types for USO4/6. Added 2 additional offlloads for USO(IPv4 & IPv6). Separate offloads are required for Windows VM guests, g.e. Windows may set USO rx only for IPv4. Signed-off-by: Andrew Melnychenko Signed-off-by: David S. Miller --- include/uapi/linux/if_tun.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index b6d7b868f290..287cdc81c939 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -90,6 +90,8 @@ #define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ #define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ #define TUN_F_UFO 0x10 /* I can handle UFO packets */ +#define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */ +#define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */ /* Protocol info prepended to the packets (when IFF_NO_PI is not set) */ #define TUN_PKT_STRIP 0x0001 -- cgit v1.2.3 From 34061b348ae912b9783fae7fabae69d46977a036 Mon Sep 17 00:00:00 2001 From: Andrew Melnychenko Date: Wed, 7 Dec 2022 13:35:56 +0200 Subject: uapi/linux/virtio_net.h: Added USO types. Added new GSO type for USO: VIRTIO_NET_HDR_GSO_UDP_L4. Feature VIRTIO_NET_F_HOST_USO allows to enable NETIF_F_GSO_UDP_L4. Separated VIRTIO_NET_F_GUEST_USO4 & VIRTIO_NET_F_GUEST_USO6 features required for Windows guests. Signed-off-by: Andrew Melnychenko Signed-off-by: David S. Miller --- include/uapi/linux/virtio_net.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 6cb842ea8979..b4062bed186a 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,9 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ #define VIRTIO_NET_F_NOTF_COAL 53 /* Device supports notifications coalescing */ +#define VIRTIO_NET_F_GUEST_USO4 54 /* Guest can handle USOv4 in. */ +#define VIRTIO_NET_F_GUEST_USO6 55 /* Guest can handle USOv6 in. */ +#define VIRTIO_NET_F_HOST_USO 56 /* Host can handle USO in. */ #define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */ #define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ @@ -130,6 +133,7 @@ struct virtio_net_hdr_v1 { #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_UDP_L4 5 /* GSO frame, IPv4& IPv6 UDP (USO) */ #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ __u8 gso_type; __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ -- cgit v1.2.3 From 47c50853bb9c5c35bf5c3eb00b712310ec2969e2 Mon Sep 17 00:00:00 2001 From: Igor Skalkin Date: Mon, 24 Oct 2022 15:40:33 +0200 Subject: virtio_bt: Fix alignment in configuration struct The current version of the configuration structure has unaligned 16-bit fields, but according to the specification [1], access to the configuration space must be aligned. Add a second, aligned version of the configuration structure and a new feature bit indicating that this version is being used. [1] https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.pdf Signed-off-by: Igor Skalkin Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/virtio_bt.c | 16 +++++++++++++--- include/uapi/linux/virtio_bt.h | 8 ++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c index 2bbeb65b1cd3..c570c45d1480 100644 --- a/drivers/bluetooth/virtio_bt.c +++ b/drivers/bluetooth/virtio_bt.c @@ -313,7 +313,12 @@ static int virtbt_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_BT_F_VND_HCI)) { __u16 vendor; - virtio_cread(vdev, struct virtio_bt_config, vendor, &vendor); + if (virtio_has_feature(vdev, VIRTIO_BT_F_CONFIG_V2)) + virtio_cread(vdev, struct virtio_bt_config_v2, + vendor, &vendor); + else + virtio_cread(vdev, struct virtio_bt_config, + vendor, &vendor); switch (vendor) { case VIRTIO_BT_CONFIG_VENDOR_ZEPHYR: @@ -346,8 +351,12 @@ static int virtbt_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_BT_F_MSFT_EXT)) { __u16 msft_opcode; - virtio_cread(vdev, struct virtio_bt_config, - msft_opcode, &msft_opcode); + if (virtio_has_feature(vdev, VIRTIO_BT_F_CONFIG_V2)) + virtio_cread(vdev, struct virtio_bt_config_v2, + msft_opcode, &msft_opcode); + else + virtio_cread(vdev, struct virtio_bt_config, + msft_opcode, &msft_opcode); hci_set_msft_opcode(hdev, msft_opcode); } @@ -402,6 +411,7 @@ static const unsigned int virtbt_features[] = { VIRTIO_BT_F_VND_HCI, VIRTIO_BT_F_MSFT_EXT, VIRTIO_BT_F_AOSP_EXT, + VIRTIO_BT_F_CONFIG_V2, }; static struct virtio_driver virtbt_driver = { diff --git a/include/uapi/linux/virtio_bt.h b/include/uapi/linux/virtio_bt.h index a7bd48daa9a9..af798f4c9680 100644 --- a/include/uapi/linux/virtio_bt.h +++ b/include/uapi/linux/virtio_bt.h @@ -9,6 +9,7 @@ #define VIRTIO_BT_F_VND_HCI 0 /* Indicates vendor command support */ #define VIRTIO_BT_F_MSFT_EXT 1 /* Indicates MSFT vendor support */ #define VIRTIO_BT_F_AOSP_EXT 2 /* Indicates AOSP vendor support */ +#define VIRTIO_BT_F_CONFIG_V2 3 /* Use second version configuration */ enum virtio_bt_config_type { VIRTIO_BT_CONFIG_TYPE_PRIMARY = 0, @@ -28,4 +29,11 @@ struct virtio_bt_config { __u16 msft_opcode; } __attribute__((packed)); +struct virtio_bt_config_v2 { + __u8 type; + __u8 alignment; + __u16 vendor; + __u16 msft_opcode; +}; + #endif /* _UAPI_LINUX_VIRTIO_BT_H */ -- cgit v1.2.3 From 6afaae6d12f54accf194e73ece617d1f456e438d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:29 +0200 Subject: bridge: mcast: Allow user space to add (*, G) with a source list and filter mode Add new netlink attributes to the RTM_NEWMDB request that allow user space to add (*, G) with a source list and filter mode. The RTM_NEWMDB message can already dump such entries (created by the kernel) so there is no need to add dump support. However, the message contains a different set of attributes depending if it is a request or a response. The naming and structure of the new attributes try to follow the existing ones used in the response. Request: [ struct nlmsghdr ] [ struct br_port_msg ] [ MDBA_SET_ENTRY ] struct br_mdb_entry [ MDBA_SET_ENTRY_ATTRS ] [ MDBE_ATTR_SOURCE ] struct in_addr / struct in6_addr [ MDBE_ATTR_SRC_LIST ] // new [ MDBE_SRC_LIST_ENTRY ] [ MDBE_SRCATTR_ADDRESS ] struct in_addr / struct in6_addr [ ...] [ MDBE_ATTR_GROUP_MODE ] // new u8 Response: [ struct nlmsghdr ] [ struct br_port_msg ] [ MDBA_MDB ] [ MDBA_MDB_ENTRY ] [ MDBA_MDB_ENTRY_INFO ] struct br_mdb_entry [ MDBA_MDB_EATTR_TIMER ] u32 [ MDBA_MDB_EATTR_SOURCE ] struct in_addr / struct in6_addr [ MDBA_MDB_EATTR_RTPROT ] u8 [ MDBA_MDB_EATTR_SRC_LIST ] [ MDBA_MDB_SRCLIST_ENTRY ] [ MDBA_MDB_SRCATTR_ADDRESS ] struct in_addr / struct in6_addr [ MDBA_MDB_SRCATTR_TIMER ] u8 [...] [ MDBA_MDB_EATTR_GROUP_MODE ] u8 Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 20 +++++++ net/bridge/br_mdb.c | 130 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index a86a7e7b811f..0d9fe73fc48c 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -723,10 +723,30 @@ enum { enum { MDBE_ATTR_UNSPEC, MDBE_ATTR_SOURCE, + MDBE_ATTR_SRC_LIST, + MDBE_ATTR_GROUP_MODE, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) +/* per mdb entry source */ +enum { + MDBE_SRC_LIST_UNSPEC, + MDBE_SRC_LIST_ENTRY, + __MDBE_SRC_LIST_MAX, +}; +#define MDBE_SRC_LIST_MAX (__MDBE_SRC_LIST_MAX - 1) + +/* per mdb entry per source attributes + * these are embedded in MDBE_SRC_LIST_ENTRY + */ +enum { + MDBE_SRCATTR_UNSPEC, + MDBE_SRCATTR_ADDRESS, + __MDBE_SRCATTR_MAX, +}; +#define MDBE_SRCATTR_MAX (__MDBE_SRCATTR_MAX - 1) + /* Embedded inside LINK_XSTATS_TYPE_BRIDGE */ enum { BRIDGE_XSTATS_UNSPEC, diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index e9a4b7e247e7..61d46b0a31b6 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -663,10 +663,25 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } +static const struct nla_policy +br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = { + [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY, + sizeof(struct in_addr), + sizeof(struct in6_addr)), +}; + +static const struct nla_policy +br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = { + [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol), +}; + static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), + [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, + MCAST_INCLUDE), + [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol), }; static bool is_valid_mdb_entry(struct br_mdb_entry *entry, @@ -1051,6 +1066,76 @@ static int __br_mdb_add(const struct br_mdb_config *cfg, return ret; } +static int br_mdb_config_src_entry_init(struct nlattr *src_entry, + struct br_mdb_src_entry *src, + __be16 proto, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[MDBE_SRCATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry, + br_mdbe_src_list_entry_pol, extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS)) + return -EINVAL; + + if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack)) + return -EINVAL; + + src->addr.proto = proto; + nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS], + nla_len(tb[MDBE_SRCATTR_ADDRESS])); + + return 0; +} + +static int br_mdb_config_src_list_init(struct nlattr *src_list, + struct br_mdb_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nlattr *src_entry; + int rem, err; + int i = 0; + + nla_for_each_nested(src_entry, src_list, rem) + cfg->num_src_entries++; + + if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)", + PG_SRC_ENT_LIMIT - 1); + return -EINVAL; + } + + cfg->src_entries = kcalloc(cfg->num_src_entries, + sizeof(struct br_mdb_src_entry), GFP_KERNEL); + if (!cfg->src_entries) + return -ENOMEM; + + nla_for_each_nested(src_entry, src_list, rem) { + err = br_mdb_config_src_entry_init(src_entry, + &cfg->src_entries[i], + cfg->entry->addr.proto, + extack); + if (err) + goto err_src_entry_init; + i++; + } + + return 0; + +err_src_entry_init: + kfree(cfg->src_entries); + return err; +} + +static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg) +{ + kfree(cfg->src_entries); +} + static int br_mdb_config_attrs_init(struct nlattr *set_attrs, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) @@ -1070,6 +1155,44 @@ static int br_mdb_config_attrs_init(struct nlattr *set_attrs, __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs); + if (mdb_attrs[MDBE_ATTR_GROUP_MODE]) { + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(&cfg->group)) { + NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries"); + return -EINVAL; + } + cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]); + } else { + cfg->filter_mode = MCAST_EXCLUDE; + } + + if (mdb_attrs[MDBE_ATTR_SRC_LIST]) { + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(&cfg->group)) { + NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries"); + return -EINVAL; + } + if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) { + NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode"); + return -EINVAL; + } + err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST], + cfg, extack); + if (err) + return err; + } + + if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) { + NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list"); + return -EINVAL; + } + return 0; } @@ -1162,6 +1285,11 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, return 0; } +static void br_mdb_config_fini(struct br_mdb_config *cfg) +{ + br_mdb_config_src_list_fini(cfg); +} + static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1220,6 +1348,7 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, } out: + br_mdb_config_fini(&cfg); return err; } @@ -1295,6 +1424,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, err = __br_mdb_del(&cfg); } + br_mdb_config_fini(&cfg); return err; } -- cgit v1.2.3 From 1d7b66a7d9754c229d12c53ad6a1effe88c16ca4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:30 +0200 Subject: bridge: mcast: Allow user space to specify MDB entry routing protocol Add the 'MDBE_ATTR_RTPORT' attribute to allow user space to specify the routing protocol of the MDB port group entry. Enforce a minimum value of 'RTPROT_STATIC' to prevent user space from using protocol values that should only be set by the kernel (e.g., 'RTPROT_KERNEL'). Maintain backward compatibility by defaulting to 'RTPROT_STATIC'. The protocol is already visible to user space in RTM_NEWMDB responses and notifications via the 'MDBA_MDB_EATTR_RTPROT' attribute. The routing protocol allows a routing daemon to distinguish between entries configured by it and those configured by the administrator. Once MDB flush is supported, the protocol can be used as a criterion according to which the flush is performed. Examples: # bridge mdb add dev br0 port dummy10 grp 239.1.1.1 permanent proto kernel Error: integer out of range. # bridge mdb add dev br0 port dummy10 grp 239.1.1.1 permanent proto static # bridge mdb add dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.1 permanent proto zebra # bridge mdb add dev br0 port dummy10 grp 239.1.1.2 permanent source_list 198.51.100.1,198.51.100.2 filter_mode include proto 250 # bridge -d mdb show dev br0 port dummy10 grp 239.1.1.2 src 198.51.100.2 permanent filter_mode include proto 250 dev br0 port dummy10 grp 239.1.1.2 src 198.51.100.1 permanent filter_mode include proto 250 dev br0 port dummy10 grp 239.1.1.2 permanent filter_mode include source_list 198.51.100.2/0.00,198.51.100.1/0.00 proto 250 dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.1 permanent filter_mode include proto zebra dev br0 port dummy10 grp 239.1.1.1 permanent filter_mode exclude proto static Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 15 +++++++++++++-- net/bridge/br_private.h | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 0d9fe73fc48c..d9de241d90f9 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -725,6 +725,7 @@ enum { MDBE_ATTR_SOURCE, MDBE_ATTR_SRC_LIST, MDBE_ATTR_GROUP_MODE, + MDBE_ATTR_RTPROT, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 61d46b0a31b6..72d4e53193e5 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -682,6 +682,7 @@ static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, MCAST_INCLUDE), [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol), + [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), }; static bool is_valid_mdb_entry(struct br_mdb_entry *entry, @@ -823,7 +824,7 @@ static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, - MCAST_INCLUDE, RTPROT_STATIC); + MCAST_INCLUDE, cfg->rt_protocol); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (S, G) port group"); return -ENOMEM; @@ -881,6 +882,7 @@ static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg, sg_cfg.group = sg_ip; sg_cfg.src_entry = true; sg_cfg.filter_mode = MCAST_INCLUDE; + sg_cfg.rt_protocol = cfg->rt_protocol; return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack); } @@ -982,7 +984,7 @@ static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, - cfg->filter_mode, RTPROT_STATIC); + cfg->filter_mode, cfg->rt_protocol); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (*, G) port group"); return -ENOMEM; @@ -1193,6 +1195,14 @@ static int br_mdb_config_attrs_init(struct nlattr *set_attrs, return -EINVAL; } + if (mdb_attrs[MDBE_ATTR_RTPROT]) { + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups"); + return -EINVAL; + } + cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]); + } + return 0; } @@ -1212,6 +1222,7 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, memset(cfg, 0, sizeof(*cfg)); cfg->filter_mode = MCAST_EXCLUDE; + cfg->rt_protocol = RTPROT_STATIC; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 368f5f6fa42b..cdc9e040f1f6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -106,6 +106,7 @@ struct br_mdb_config { u8 filter_mode; struct br_mdb_src_entry *src_entries; int num_src_entries; + u8 rt_protocol; }; #endif -- cgit v1.2.3