Diffstat (limited to 'net')
169 files changed, 3987 insertions, 3006 deletions
diff --git a/net/Kconfig b/net/Kconfig
index 7d39c1773eb4..2fb25b534df5 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -324,7 +324,7 @@ config CGROUP_NET_CLASSID

 config NET_RX_BUSY_POLL
 	bool
-	default y if !PREEMPT_RT
+	default y if !PREEMPT_RT || (PREEMPT_RT && !NETCONSOLE)

 config BQL
 	bool
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index e79e3a415ca9..2321bd2f9964 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -561,29 +561,6 @@ __bpf_kfunc int bpf_modify_return_test(int a, int *b)
 	return a + *b;
 }

-__bpf_kfunc u64 bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d)
-{
-	return a + b + c + d;
-}
-
-__bpf_kfunc int bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b)
-{
-	return a + b;
-}
-
-__bpf_kfunc struct sock *bpf_kfunc_call_test3(struct sock *sk)
-{
-	return sk;
-}
-
-long noinline bpf_kfunc_call_test4(signed char a, short b, int c, long d)
-{
-	/* Provoke the compiler to assume that the caller has sign-extended a,
-	 * b and c on platforms where this is required (e.g. s390x).
-	 */
-	return (long)a + (long)b + (long)c + d;
-}
-
 int noinline bpf_fentry_shadow_test(int a)
 {
 	return a + 1;
@@ -606,32 +583,6 @@ struct prog_test_ref_kfunc {
 	refcount_t cnt;
 };

-static struct prog_test_ref_kfunc prog_test_struct = {
-	.a = 42,
-	.b = 108,
-	.next = &prog_test_struct,
-	.cnt = REFCOUNT_INIT(1),
-};
-
-__bpf_kfunc struct prog_test_ref_kfunc *
-bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr)
-{
-	refcount_inc(&prog_test_struct.cnt);
-	return &prog_test_struct;
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p)
-{
-	WARN_ON_ONCE(1);
-}
-
-__bpf_kfunc struct prog_test_member *
-bpf_kfunc_call_memb_acquire(void)
-{
-	WARN_ON_ONCE(1);
-	return NULL;
-}
-
 __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
 {
 	refcount_dec(&p->cnt);
@@ -641,134 +592,6 @@ __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p)
 {
 }

-__bpf_kfunc void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p)
-{
-	WARN_ON_ONCE(1);
-}
-
-static int *__bpf_kfunc_call_test_get_mem(struct prog_test_ref_kfunc *p, const int size)
-{
-	if (size > 2 * sizeof(int))
-		return NULL;
-
-	return (int *)p;
-}
-
-__bpf_kfunc int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p,
-						  const int rdwr_buf_size)
-{
-	return __bpf_kfunc_call_test_get_mem(p, rdwr_buf_size);
-}
-
-__bpf_kfunc int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p,
-						    const int rdonly_buf_size)
-{
-	return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size);
-}
-
-/* the next 2 ones can't be really used for testing expect to ensure
- * that the verifier rejects the call.
- * Acquire functions must return struct pointers, so these ones are
- * failing.
- */
-__bpf_kfunc int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p,
-						    const int rdonly_buf_size)
-{
-	return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size);
-}
-
-__bpf_kfunc void bpf_kfunc_call_int_mem_release(int *p)
-{
-}
-
-struct prog_test_pass1 {
-	int x0;
-	struct {
-		int x1;
-		struct {
-			int x2;
-			struct {
-				int x3;
-			};
-		};
-	};
-};
-
-struct prog_test_pass2 {
-	int len;
-	short arr1[4];
-	struct {
-		char arr2[4];
-		unsigned long arr3[8];
-	} x;
-};
-
-struct prog_test_fail1 {
-	void *p;
-	int x;
-};
-
-struct prog_test_fail2 {
-	int x8;
-	struct prog_test_pass1 x;
-};
-
-struct prog_test_fail3 {
-	int len;
-	char arr1[2];
-	char arr2[];
-};
-
-__bpf_kfunc void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb)
-{
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p)
-{
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p)
-{
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p)
-{
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p)
-{
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p)
-{
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz)
-{
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len)
-{
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
-{
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
-{
-	/* p != NULL, but p->cnt could be 0 */
-}
-
-__bpf_kfunc void bpf_kfunc_call_test_destructive(void)
-{
-}
-
-__bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused)
-{
-	return arg;
-}
-
 __diag_pop();

 BTF_SET8_START(bpf_test_modify_return_ids)
@@ -782,32 +605,8 @@ static const struct btf_kfunc_id_set bpf_test_modify_return_set = {
 };

 BTF_SET8_START(test_sk_check_kfunc_ids)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test2)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test3)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test4)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_kfunc_call_memb1_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdwr_mem, KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdonly_mem, KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_acq_rdonly_mem, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_kfunc_call_int_mem_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass_ctx)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass1)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset)
 BTF_SET8_END(test_sk_check_kfunc_ids)

 static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
@@ -1415,11 +1214,10 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 		}

 		frag = &sinfo->frags[sinfo->nr_frags++];
-		__skb_frag_set_page(frag, page);

 		data_len = min_t(u32, kattr->test.data_size_in - size,
 				 PAGE_SIZE);
-		skb_frag_size_set(frag, data_len);
+		skb_frag_fill_page_desc(frag, page, 0, data_len);

 		if (copy_from_user(page_address(page), data_in + size,
 				   data_len)) {
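The hunk above is the first of several in this merge (gro.c, pktgen.c and skbuff.c follow the same pattern) that replace the trio __skb_frag_set_page()/skb_frag_off_set()/skb_frag_size_set() with a single skb_frag_fill_page_desc() call. A minimal sketch of the new calling convention; example_attach_page() is a hypothetical caller, not taken from the diff:

#include <linux/skbuff.h>

/* Attach one page of payload to the next free frag slot.  All three
 * descriptor fields (page, offset, size) are now filled by one helper.
 */
static void example_attach_page(struct sk_buff *skb, struct page *page,
				unsigned int len)
{
	struct skb_shared_info *sinfo = skb_shinfo(skb);
	skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags];

	skb_frag_fill_page_desc(frag, page, 0, len);
	sinfo->nr_frags++;

	/* Accounting still belongs to the caller. */
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;
}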
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 8eca8a5c80c6..9a5ea06236bd 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -39,6 +39,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	u16 vid = 0;

 	memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
+	br_tc_skb_miss_set(skb, false);

 	rcu_read_lock();
 	nf_ops = rcu_dereference(nf_br_ops);
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 84d6dd5e5b1a..6116eba1bd89 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -203,6 +203,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	struct net_bridge_port *prev = NULL;
 	struct net_bridge_port *p;

+	br_tc_skb_miss_set(skb, pkt_type != BR_PKT_BROADCAST);
+
 	list_for_each_entry_rcu(p, &br->port_list, list) {
 		/* Do not flood unicast traffic to ports that turn it off, nor
 		 * other traffic if flood off, except for traffic we originate
@@ -295,6 +297,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
 		allow_mode_include = false;
 	} else {
 		p = NULL;
+		br_tc_skb_miss_set(skb, true);
 	}

 	while (p || rp) {
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index fc17b9fd93e6..c34a0b0901b0 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -334,6 +334,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 		return RX_HANDLER_CONSUMED;

 	memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
+	br_tc_skb_miss_set(skb, false);

 	p = br_port_get_rcu(skb->dev);
 	if (p->flags & BR_VLAN_TUNNEL)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 2119729ded2b..a63b32c1638e 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -15,6 +15,7 @@
 #include <linux/u64_stats_sync.h>
 #include <net/route.h>
 #include <net/ip6_fib.h>
+#include <net/pkt_cls.h>
 #include <linux/if_vlan.h>
 #include <linux/rhashtable.h>
 #include <linux/refcount.h>
@@ -754,6 +755,32 @@ void br_boolopt_multi_get(const struct net_bridge *br,
 			  struct br_boolopt_multi *bm);
 void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on);

+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss)
+{
+	struct tc_skb_ext *ext;
+
+	if (!tc_skb_ext_tc_enabled())
+		return;
+
+	ext = skb_ext_find(skb, TC_SKB_EXT);
+	if (ext) {
+		ext->l2_miss = miss;
+		return;
+	}
+	if (!miss)
+		return;
+	ext = tc_skb_ext_alloc(skb);
+	if (!ext)
+		return;
+	ext->l2_miss = true;
+}
+#else
+static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss)
+{
+}
+#endif
+
 /* br_device.c */
 void br_dev_setup(struct net_device *dev);
 void br_dev_delete(struct net_device *dev, struct list_head *list);
diff --git a/net/core/Makefile b/net/core/Makefile
index 8f367813bc68..731db2eaa610 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -13,7 +13,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
 	 neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
 	 sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
 	 fib_notifier.o xdp.o flow_offload.o gro.o \
-	 netdev-genl.o netdev-genl-gen.o
+	 netdev-genl.o netdev-genl-gen.o gso.o

 obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
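The bridge hunks above record in the tc skb extension whether a packet hit or missed the FDB/MDB; the flow_dissector.c change further down consumes the bit. A sketch of what a reader of that state looks like, assuming CONFIG_NET_TC_SKB_EXT and mirroring the skb_flow_dissect_meta() hunk below; example_skb_l2_missed() is hypothetical:

#include <linux/skbuff.h>
#include <net/pkt_cls.h>

/* Returns true if the bridge flagged this skb as an L2 miss. */
static bool example_skb_l2_missed(const struct sk_buff *skb)
{
	const struct tc_skb_ext *ext;

	/* Static key keeps the lookup free when tc isn't using exts. */
	if (!tc_skb_ext_tc_enabled())
		return false;

	ext = skb_ext_find(skb, TC_SKB_EXT);
	return ext && ext->l2_miss;
}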
diff --git a/net/core/dev.c b/net/core/dev.c
index c29f3e1db3ca..e4ff0adf5523 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -758,29 +758,43 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 }
 EXPORT_SYMBOL(dev_get_by_name_rcu);

+/* Deprecated for new users, call netdev_get_by_name() instead */
+struct net_device *dev_get_by_name(struct net *net, const char *name)
+{
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = dev_get_by_name_rcu(net, name);
+	dev_hold(dev);
+	rcu_read_unlock();
+	return dev;
+}
+EXPORT_SYMBOL(dev_get_by_name);
+
 /**
- * dev_get_by_name - find a device by its name
+ * netdev_get_by_name() - find a device by its name
  * @net: the applicable net namespace
  * @name: name to find
+ * @tracker: tracking object for the acquired reference
+ * @gfp: allocation flags for the tracker
  *
  * Find an interface by name. This can be called from any
  * context and does its own locking. The returned handle has
- * the usage count incremented and the caller must use dev_put() to
+ * the usage count incremented and the caller must use netdev_put() to
  * release it when it is no longer needed. %NULL is returned if no
  * matching device is found.
  */
-
-struct net_device *dev_get_by_name(struct net *net, const char *name)
+struct net_device *netdev_get_by_name(struct net *net, const char *name,
+				      netdevice_tracker *tracker, gfp_t gfp)
 {
 	struct net_device *dev;

-	rcu_read_lock();
-	dev = dev_get_by_name_rcu(net, name);
-	dev_hold(dev);
-	rcu_read_unlock();
+	dev = dev_get_by_name(net, name);
+	if (dev)
+		netdev_tracker_alloc(dev, tracker, gfp);
 	return dev;
 }
-EXPORT_SYMBOL(dev_get_by_name);
+EXPORT_SYMBOL(netdev_get_by_name);

 /**
  * __dev_get_by_index - find a device by its ifindex
@@ -831,29 +845,42 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 }
 EXPORT_SYMBOL(dev_get_by_index_rcu);

+/* Deprecated for new users, call netdev_get_by_index() instead */
+struct net_device *dev_get_by_index(struct net *net, int ifindex)
+{
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
+	dev_hold(dev);
+	rcu_read_unlock();
+	return dev;
+}
+EXPORT_SYMBOL(dev_get_by_index);
+
 /**
- * dev_get_by_index - find a device by its ifindex
+ * netdev_get_by_index() - find a device by its ifindex
  * @net: the applicable net namespace
  * @ifindex: index of device
+ * @tracker: tracking object for the acquired reference
+ * @gfp: allocation flags for the tracker
  *
  * Search for an interface by index. Returns NULL if the device
  * is not found or a pointer to the device. The device returned has
  * had a reference added and the pointer is safe until the user calls
- * dev_put to indicate they have finished with it.
+ * netdev_put() to indicate they have finished with it.
  */
-
-struct net_device *dev_get_by_index(struct net *net, int ifindex)
+struct net_device *netdev_get_by_index(struct net *net, int ifindex,
+				       netdevice_tracker *tracker, gfp_t gfp)
 {
 	struct net_device *dev;

-	rcu_read_lock();
-	dev = dev_get_by_index_rcu(net, ifindex);
-	dev_hold(dev);
-	rcu_read_unlock();
+	dev = dev_get_by_index(net, ifindex);
+	if (dev)
+		netdev_tracker_alloc(dev, tracker, gfp);
 	return dev;
 }
-EXPORT_SYMBOL(dev_get_by_index);
+EXPORT_SYMBOL(netdev_get_by_index);

 /**
  * dev_get_by_napi_id - find a device by napi_id
@@ -3209,7 +3236,7 @@ static u16 skb_tx_hash(const struct net_device *dev,
 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 }

-static void skb_warn_bad_offload(const struct sk_buff *skb)
+void skb_warn_bad_offload(const struct sk_buff *skb)
 {
 	static const netdev_features_t null_features;
 	struct net_device *dev = skb->dev;
@@ -3338,74 +3365,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 	return vlan_get_protocol_and_depth(skb, type, depth);
 }

-/* openvswitch calls this on rx path, so we need a different check.
- */
-static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
-{
-	if (tx_path)
-		return skb->ip_summed != CHECKSUM_PARTIAL &&
-		       skb->ip_summed != CHECKSUM_UNNECESSARY;
-
-	return skb->ip_summed == CHECKSUM_NONE;
-}
-
-/**
- * __skb_gso_segment - Perform segmentation on skb.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- * @tx_path: whether it is called in TX path
- *
- * This function segments the given skb and returns a list of segments.
- *
- * It may return NULL if the skb requires no segmentation. This is
- * only possible when GSO is used for verifying header integrity.
- *
- * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
- */
-struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
-				  netdev_features_t features, bool tx_path)
-{
-	struct sk_buff *segs;
-
-	if (unlikely(skb_needs_check(skb, tx_path))) {
-		int err;
-
-		/* We're going to init ->check field in TCP or UDP header */
-		err = skb_cow_head(skb, 0);
-		if (err < 0)
-			return ERR_PTR(err);
-	}
-
-	/* Only report GSO partial support if it will enable us to
-	 * support segmentation on this frame without needing additional
-	 * work.
-	 */
-	if (features & NETIF_F_GSO_PARTIAL) {
-		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
-		struct net_device *dev = skb->dev;
-
-		partial_features |= dev->features & dev->gso_partial_features;
-		if (!skb_gso_ok(skb, features | partial_features))
-			features &= ~NETIF_F_GSO_PARTIAL;
-	}
-
-	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
-		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
-
-	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
-	SKB_GSO_CB(skb)->encap_level = 0;
-
-	skb_reset_mac_header(skb);
-	skb_reset_mac_len(skb);
-
-	segs = skb_mac_gso_segment(skb, features);
-
-	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
-		skb_warn_bad_offload(skb);
-
-	return segs;
-}
-EXPORT_SYMBOL(__skb_gso_segment);

 /* Take action when hardware reception checksum errors are detected. */
 #ifdef CONFIG_BUG
@@ -6199,7 +6158,8 @@ restart:
 	if (!napi)
 		goto out;

-	preempt_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
 	for (;;) {
 		int work = 0;
@@ -6241,7 +6201,8 @@ count:
 		if (unlikely(need_resched())) {
 			if (napi_poll)
 				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
-			preempt_enable();
+			if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+				preempt_enable();
 			rcu_read_unlock();
 			cond_resched();
 			if (loop_end(loop_end_arg, start_time))
@@ -6252,7 +6213,8 @@ count:
 	}
 	if (napi_poll)
 		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
-	preempt_enable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
 out:
 	rcu_read_unlock();
 }
@@ -8819,6 +8781,8 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 		return -EINVAL;
 	if (!netif_device_present(dev))
 		return -ENODEV;
+	if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len))
+		return 0;
 	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 	if (err)
 		return err;
@@ -10570,8 +10534,10 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
 {
 	WARN_ON(dev->reg_state == NETREG_REGISTERED);

-	dev->gro_flush_timeout = 20000;
-	dev->napi_defer_hard_irqs = 1;
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		dev->gro_flush_timeout = 20000;
+		dev->napi_defer_hard_irqs = 1;
+	}
 }
 EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
@@ -10632,7 +10598,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;

-	ref_tracker_dir_init(&dev->refcnt_tracker, 128);
+	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
 #ifdef CONFIG_PCPU_DEV_REFCNT
 	dev->pcpu_refcnt = alloc_percpu(int);
 	if (!dev->pcpu_refcnt)
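The dev.c hunks above split dev_get_by_name()/dev_get_by_index() into deprecated untracked variants and new netdev_get_by_*() lookups that register a reference tracker at acquisition time. A migration sketch under the usual assumptions (a caller that stores the device long-term); struct example_state is hypothetical:

#include <linux/netdevice.h>

struct example_state {
	struct net_device *dev;
	netdevice_tracker dev_tracker;
};

static int example_attach(struct example_state *st, struct net *net,
			  const char *name)
{
	/* New style: the tracker is allocated together with the ref... */
	st->dev = netdev_get_by_name(net, name, &st->dev_tracker, GFP_KERNEL);
	if (!st->dev)
		return -ENODEV;
	return 0;
}

static void example_detach(struct example_state *st)
{
	/* ...and must be released with netdev_put(), not dev_put(). */
	netdev_put(st->dev, &st->dev_tracker);
	st->dev = NULL;
}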
diff --git a/net/core/filter.c b/net/core/filter.c
index d9ce04ca22ce..968139f4a1ac 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6916,6 +6916,8 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 				      FIELD));				\
 	} while (0)

+	BTF_TYPE_EMIT(struct bpf_tcp_sock);
+
 	switch (si->off) {
 	case offsetof(struct bpf_tcp_sock, rtt_min):
 		BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
@@ -11721,3 +11723,66 @@ static int __init bpf_kfunc_init(void)
 	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
 }
 late_initcall(bpf_kfunc_init);
+
+/* Disables missing prototype warnings */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
+ *
+ * The function expects a non-NULL pointer to a socket, and invokes the
+ * protocol specific socket destroy handlers.
+ *
+ * The helper can only be called from BPF contexts that have acquired the socket
+ * locks.
+ *
+ * Parameters:
+ * @sock: Pointer to socket to be destroyed
+ *
+ * Return:
+ * On error, may return EPROTONOSUPPORT, EINVAL.
+ * EPROTONOSUPPORT if protocol specific destroy handler is not supported.
+ * 0 otherwise
+ */
+__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
+{
+	struct sock *sk = (struct sock *)sock;
+
+	/* The locking semantics that allow for synchronous execution of the
+	 * destroy handlers are only supported for TCP and UDP.
+	 * Supporting protocols will need to acquire sock lock in the BPF context
+	 * prior to invoking this kfunc.
+	 */
+	if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP &&
+					   sk->sk_protocol != IPPROTO_UDP))
+		return -EOPNOTSUPP;
+
+	return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
+}
+
+__diag_pop()
+
+BTF_SET8_START(bpf_sk_iter_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
+BTF_SET8_END(bpf_sk_iter_kfunc_ids)
+
+static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+	if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) &&
+	    prog->expected_attach_type != BPF_TRACE_ITER)
+		return -EACCES;
+	return 0;
+}
+
+static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &bpf_sk_iter_kfunc_ids,
+	.filter = tracing_iter_filter,
+};
+
+static int init_subsystem(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 25fb0bbc310f..85a2d0d9bd39 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -27,6 +27,7 @@
 #include <linux/tcp.h>
 #include <linux/ptp_classify.h>
 #include <net/flow_dissector.h>
+#include <net/pkt_cls.h>
 #include <scsi/fc/fc_fcoe.h>
 #include <uapi/linux/batadv_packet.h>
 #include <linux/bpf.h>
@@ -241,6 +242,15 @@ void skb_flow_dissect_meta(const struct sk_buff *skb,
 				      FLOW_DISSECTOR_KEY_META,
 				      target_container);
 	meta->ingress_ifindex = skb->skb_iif;
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+	if (tc_skb_ext_tc_enabled()) {
+		struct tc_skb_ext *ext;
+
+		ext = skb_ext_find(skb, TC_SKB_EXT);
+		if (ext)
+			meta->l2_miss = ext->l2_miss;
+	}
+#endif
 }
 EXPORT_SYMBOL(skb_flow_dissect_meta);

@@ -548,6 +558,30 @@ __skb_flow_dissect_arp(const struct sk_buff *skb,
 }

 static enum flow_dissect_ret
+__skb_flow_dissect_cfm(const struct sk_buff *skb,
+		       struct flow_dissector *flow_dissector,
+		       void *target_container, const void *data,
+		       int nhoff, int hlen)
+{
+	struct flow_dissector_key_cfm *key, *hdr, _hdr;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM))
+		return FLOW_DISSECT_RET_OUT_GOOD;
+
+	hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr);
+	if (!hdr)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM,
+					target_container);
+
+	key->mdl_ver = hdr->mdl_ver;
+	key->opcode = hdr->opcode;
+
+	return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
+static enum flow_dissect_ret
 __skb_flow_dissect_gre(const struct sk_buff *skb,
 		       struct flow_dissector_key_control *key_control,
 		       struct flow_dissector *flow_dissector,
@@ -1390,6 +1424,12 @@ proto_again:
 		break;
 	}

+	case htons(ETH_P_CFM):
+		fdret = __skb_flow_dissect_cfm(skb, flow_dissector,
+					       target_container, data,
+					       nhoff, hlen);
+		break;
+
 	default:
 		fdret = FLOW_DISSECT_RET_OUT_BAD;
 		break;
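The bpf_sock_destroy() kfunc added in net/core/filter.c above is restricted by tracing_iter_filter() to BPF_TRACE_ITER programs, which hold the socket lock while iterating. A hedged sketch of a consumer, loosely modelled on the selftest style rather than copied from it (the matched port is an arbitrary example value):

// SPDX-License-Identifier: GPL-2.0
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* Declared as a kfunc exported through vmlinux BTF. */
int bpf_sock_destroy(struct sock_common *sk) __ksym;

SEC("iter/tcp")
int destroy_example_port(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;

	if (!sk_common)
		return 0;

	/* The TCP iterator holds the socket lock here, satisfying the
	 * kfunc's locking contract.
	 */
	if (sk_common->skc_num == 4500)
		bpf_sock_destroy(sk_common);

	return 0;
}

char _license[] SEC("license") = "GPL";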
diff --git a/net/core/gro.c b/net/core/gro.c
index 2d84165cb4f1..0759277dc14e 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -10,7 +10,7 @@
 #define GRO_MAX_HEAD (MAX_HEADER + 128)

 static DEFINE_SPINLOCK(offload_lock);
-static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base);
+struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base);
 /* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
 int gro_normal_batch __read_mostly = 8;

@@ -92,63 +92,6 @@ void dev_remove_offload(struct packet_offload *po)
 }
 EXPORT_SYMBOL(dev_remove_offload);

-/**
- * skb_eth_gso_segment - segmentation handler for ethernet protocols.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- * @type: Ethernet Protocol ID
- */
-struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
-				    netdev_features_t features, __be16 type)
-{
-	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
-	struct packet_offload *ptype;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(ptype, &offload_base, list) {
-		if (ptype->type == type && ptype->callbacks.gso_segment) {
-			segs = ptype->callbacks.gso_segment(skb, features);
-			break;
-		}
-	}
-	rcu_read_unlock();
-
-	return segs;
-}
-EXPORT_SYMBOL(skb_eth_gso_segment);
-
-/**
- * skb_mac_gso_segment - mac layer segmentation handler.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- */
-struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
-				    netdev_features_t features)
-{
-	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
-	struct packet_offload *ptype;
-	int vlan_depth = skb->mac_len;
-	__be16 type = skb_network_protocol(skb, &vlan_depth);
-
-	if (unlikely(!type))
-		return ERR_PTR(-EINVAL);
-
-	__skb_pull(skb, vlan_depth);
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(ptype, &offload_base, list) {
-		if (ptype->type == type && ptype->callbacks.gso_segment) {
-			segs = ptype->callbacks.gso_segment(skb, features);
-			break;
-		}
-	}
-	rcu_read_unlock();
-
-	__skb_push(skb, skb->data - skb_mac_header(skb));
-
-	return segs;
-}
-EXPORT_SYMBOL(skb_mac_gso_segment);

 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 {
@@ -239,9 +182,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)

 		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

-		__skb_frag_set_page(frag, page);
-		skb_frag_off_set(frag, first_offset);
-		skb_frag_size_set(frag, first_size);
+		skb_frag_fill_page_desc(frag, page, first_offset, first_size);

 		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
 		/* We dont need to clear skbinfo->nr_frags here */
@@ -363,6 +304,24 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 }
 EXPORT_SYMBOL(napi_gro_flush);

+static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb,
+					     const struct sk_buff *p,
+					     unsigned long diffs)
+{
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+	struct tc_skb_ext *skb_ext;
+	struct tc_skb_ext *p_ext;
+
+	skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+	p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+	diffs |= (!!p_ext) ^ (!!skb_ext);
+	if (!diffs && unlikely(skb_ext))
+		diffs |= p_ext->chain ^ skb_ext->chain;
+#endif
+	return diffs;
+}
+
 static void gro_list_prepare(const struct list_head *head,
 			     const struct sk_buff *skb)
 {
@@ -397,23 +356,11 @@ static void gro_list_prepare(const struct list_head *head,
 		 * avoid trying too hard to skip each of them individually
 		 */
 		if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
-#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
-			struct tc_skb_ext *skb_ext;
-			struct tc_skb_ext *p_ext;
-#endif
-
 			diffs |= p->sk != skb->sk;
 			diffs |= skb_metadata_dst_cmp(p, skb);
 			diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
-
-#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
-			skb_ext = skb_ext_find(skb, TC_SKB_EXT);
-			p_ext = skb_ext_find(p, TC_SKB_EXT);
-
-			diffs |= (!!p_ext) ^ (!!skb_ext);
-			if (!diffs && unlikely(skb_ext))
-				diffs |= p_ext->chain ^ skb_ext->chain;
-#endif
+			diffs |= gro_list_prepare_tc_ext(skb, p, diffs);
 		}

 		NAPI_GRO_CB(p)->same_flow = !diffs;
@@ -460,6 +407,14 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 	}
 }

+static void gro_try_pull_from_frag0(struct sk_buff *skb)
+{
+	int grow = skb_gro_offset(skb) - skb_headlen(skb);
+
+	if (grow > 0)
+		gro_pull_from_frag0(skb, grow);
+}
+
 static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
 {
 	struct sk_buff *oldest;
@@ -489,7 +444,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	struct sk_buff *pp = NULL;
 	enum gro_result ret;
 	int same_flow;
-	int grow;

 	if (netif_elide_gro(skb->dev))
 		goto normal;
@@ -564,17 +518,14 @@ found_ptype:
 	else
 		gro_list->count++;

+	/* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */
+	gro_try_pull_from_frag0(skb);
 	NAPI_GRO_CB(skb)->age = jiffies;
 	NAPI_GRO_CB(skb)->last = skb;
 	if (!skb_is_gso(skb))
 		skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 	list_add(&skb->list, &gro_list->list);
 	ret = GRO_HELD;
-
-pull:
-	grow = skb_gro_offset(skb) - skb_headlen(skb);
-	if (grow > 0)
-		gro_pull_from_frag0(skb, grow);
 ok:
 	if (gro_list->count) {
 		if (!test_bit(bucket, &napi->gro_bitmask))
@@ -587,7 +538,8 @@ ok:

 normal:
 	ret = GRO_NORMAL;
-	goto pull;
+	gro_try_pull_from_frag0(skb);
+	goto ok;
 }

 struct packet_offload *gro_find_receive_by_type(__be16 type)
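With offload_base no longer static, both the GRO callbacks here and the GSO segmentation path that moves to net/core/gso.c walk the same list, populated via dev_add_offload(). A hedged registration sketch; the example protocol, its 0x88b5 local-experimental ethertype, and the stub handler are assumptions for illustration:

#include <linux/netdevice.h>
#include <net/gro.h>

static struct sk_buff *example_gso_segment(struct sk_buff *skb,
					   netdev_features_t features)
{
	/* A real handler would split skb per gso_size; stub only. */
	return ERR_PTR(-EPROTONOSUPPORT);
}

static struct packet_offload example_offload __read_mostly = {
	.type = cpu_to_be16(0x88b5),
	.callbacks = {
		.gso_segment = example_gso_segment,
	},
};

static int __init example_offload_init(void)
{
	/* Links into offload_base, searched by skb_mac_gso_segment(). */
	dev_add_offload(&example_offload);
	return 0;
}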
diff --git a/net/core/gso.c b/net/core/gso.c
new file mode 100644
index 000000000000..9e1803bfc9c6
--- /dev/null
+++ b/net/core/gso.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/skbuff.h>
+#include <linux/sctp.h>
+#include <net/gso.h>
+#include <net/gro.h>
+
+/**
+ * skb_eth_gso_segment - segmentation handler for ethernet protocols.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @type: Ethernet Protocol ID
+ */
+struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
+				    netdev_features_t features, __be16 type)
+{
+	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+	struct packet_offload *ptype;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, &offload_base, list) {
+		if (ptype->type == type && ptype->callbacks.gso_segment) {
+			segs = ptype->callbacks.gso_segment(skb, features);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return segs;
+}
+EXPORT_SYMBOL(skb_eth_gso_segment);
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+				    netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+	struct packet_offload *ptype;
+	int vlan_depth = skb->mac_len;
+	__be16 type = skb_network_protocol(skb, &vlan_depth);
+
+	if (unlikely(!type))
+		return ERR_PTR(-EINVAL);
+
+	__skb_pull(skb, vlan_depth);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, &offload_base, list) {
+		if (ptype->type == type && ptype->callbacks.gso_segment) {
+			segs = ptype->callbacks.gso_segment(skb, features);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	__skb_push(skb, skb->data - skb_mac_header(skb));
+
+	return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static bool skb_needs_check(const struct sk_buff *skb, bool tx_path)
+{
+	if (tx_path)
+		return skb->ip_summed != CHECKSUM_PARTIAL &&
+		       skb->ip_summed != CHECKSUM_UNNECESSARY;
+
+	return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ *
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+				  netdev_features_t features, bool tx_path)
+{
+	struct sk_buff *segs;
+
+	if (unlikely(skb_needs_check(skb, tx_path))) {
+		int err;
+
+		/* We're going to init ->check field in TCP or UDP header */
+		err = skb_cow_head(skb, 0);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	/* Only report GSO partial support if it will enable us to
+	 * support segmentation on this frame without needing additional
+	 * work.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL) {
+		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+		struct net_device *dev = skb->dev;
+
+		partial_features |= dev->features & dev->gso_partial_features;
+		if (!skb_gso_ok(skb, features | partial_features))
+			features &= ~NETIF_F_GSO_PARTIAL;
+	}
+
+	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
+		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
+
+	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+	SKB_GSO_CB(skb)->encap_level = 0;
+
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+
+	segs = skb_mac_gso_segment(skb, features);
+
+	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
+		skb_warn_bad_offload(skb);
+
+	return segs;
+}
+EXPORT_SYMBOL(__skb_gso_segment);
+
+/**
+ * skb_gso_transport_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_transport_seglen is used to determine the real size of the
+ * individual segments, including Layer4 headers (TCP/UDP).
+ *
+ * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
+ */
+static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+{
+	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+	unsigned int thlen = 0;
+
+	if (skb->encapsulation) {
+		thlen = skb_inner_transport_header(skb) -
+			skb_transport_header(skb);
+
+		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+			thlen += inner_tcp_hdrlen(skb);
+	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+		thlen = tcp_hdrlen(skb);
+	} else if (unlikely(skb_is_gso_sctp(skb))) {
+		thlen = sizeof(struct sctphdr);
+	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+		thlen = sizeof(struct udphdr);
+	}
+	/* UFO sets gso_size to the size of the fragmentation
+	 * payload, i.e. the size of the L4 (UDP) header is already
+	 * accounted for.
+	 */
+	return thlen + shinfo->gso_size;
+}
+
+/**
+ * skb_gso_network_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_network_seglen is used to determine the real size of the
+ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
+ *
+ * The MAC/L2 header is not accounted for.
+ */
+static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+{
+	unsigned int hdr_len = skb_transport_header(skb) -
+			       skb_network_header(skb);
+
+	return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_mac_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_mac_seglen is used to determine the real size of the
+ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
+ * headers (TCP/UDP).
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+	return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
+ *
+ * There are a couple of instances where we have a GSO skb, and we
+ * want to determine what size it would be after it is segmented.
+ *
+ * We might want to check:
+ * - L3+L4+payload size (e.g. IP forwarding)
+ * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
+ *
+ * This is a helper to do that correctly considering GSO_BY_FRAGS.
+ *
+ * @skb: GSO skb
+ *
+ * @seg_len: The segmented length (from skb_gso_*_seglen). In the
+ *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
+ *
+ * @max_len: The maximum permissible length.
+ *
+ * Returns true if the segmented length <= max length.
+ */
+static inline bool skb_gso_size_check(const struct sk_buff *skb,
+				      unsigned int seg_len,
+				      unsigned int max_len) {
+	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+	const struct sk_buff *iter;
+
+	if (shinfo->gso_size != GSO_BY_FRAGS)
+		return seg_len <= max_len;
+
+	/* Undo this so we can re-use header sizes */
+	seg_len -= GSO_BY_FRAGS;
+
+	skb_walk_frags(skb, iter) {
+		if (seg_len + skb_headlen(iter) > max_len)
+			return false;
+	}
+
+	return true;
+}
+
+/**
+ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
+ *
+ * @skb: GSO skb
+ * @mtu: MTU to validate against
+ *
+ * skb_gso_validate_network_len validates if a given skb will fit a
+ * wanted MTU once split. It considers L3 headers, L4 headers, and the
+ * payload.
+ */
+bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
+{
+	return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
+
+/**
+ * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
+ *
+ * @skb: GSO skb
+ * @len: length to validate against
+ *
+ * skb_gso_validate_mac_len validates if a given skb will fit a wanted
+ * length once split, including L2, L3 and L4 headers and the payload.
+ */
+bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
+{
+	return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
+
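The new net/core/gso.c consolidates the validators that forwarding paths rely on. A minimal sketch of the classic use case these helpers exist for (compare the MTU check in ip_forward()); example_mtu_ok() is illustrative, not from the diff:

#include <linux/skbuff.h>
#include <net/gso.h>

static bool example_mtu_ok(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return true;

	/* An oversized GSO skb may still fit once segmented: check the
	 * per-segment L3+L4+payload size, honouring GSO_BY_FRAGS.
	 */
	return skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu);
}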
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3e3598cd49f2..f4183c4c1ec8 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(get_net_ns_by_id);
 /* init code that must occur even if setup_net() is not called. */
 static __net_init void preinit_net(struct net *net)
 {
-	ref_tracker_dir_init(&net->notrefcnt_tracker, 128);
+	ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
 }

 /*
@@ -322,7 +322,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	LIST_HEAD(net_exit_list);

 	refcount_set(&net->ns.count, 1);
-	ref_tracker_dir_init(&net->refcnt_tracker, 128);
+	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");

 	refcount_set(&net->passive, 1);
 	get_random_bytes(&net->hash_mix, sizeof(u32));
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index de17ca2f7dbf..ea9231378aa6 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -8,7 +8,7 @@

 #include "netdev-genl-gen.h"

-#include <linux/netdev.h>
+#include <uapi/linux/netdev.h>

 /* NETDEV_CMD_DEV_GET - do */
 static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index 74d74fc23167..7b370c073e7d 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -9,7 +9,7 @@
 #include <net/netlink.h>
 #include <net/genetlink.h>

-#include <linux/netdev.h>
+#include <uapi/linux/netdev.h>

 int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e6a739b1afa9..543007f159f9 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -690,7 +690,7 @@ int netpoll_setup(struct netpoll *np)
 		err = -ENODEV;
 		goto unlock;
 	}
-	dev_hold(ndev);
+	netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL);

 	if (netdev_master_upper_dev_get(ndev)) {
 		np_err(np, "%s is a slave device, aborting\n", np->dev_name);
@@ -783,12 +783,11 @@ put_noaddr:
 	err = __netpoll_setup(np, ndev);
 	if (err)
 		goto put;
-	netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL);
 	rtnl_unlock();
 	return 0;

 put:
-	dev_put(ndev);
+	netdev_put(ndev, &np->dev_tracker);
 unlock:
 	rtnl_unlock();
 	return err;
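The net_namespace.c hunks above (and the alloc_netdev_mqs() change earlier) add a human-readable name as the third argument of ref_tracker_dir_init(), so leak reports can identify the directory. A hedged sketch of initializing a tracker directory under that new signature; struct example_obj and the quarantine depth of 16 are assumptions:

#include <linux/ref_tracker.h>

struct example_obj {
	struct ref_tracker_dir refcnt_tracker;
};

static void example_obj_init(struct example_obj *obj)
{
	/* Quarantine up to 16 released trackers for later reports;
	 * the name tags this directory in ref_tracker output.
	 */
	ref_tracker_dir_init(&obj->refcnt_tracker, 16, "example_obj");
}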
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 760238196db1..f56b8d697014 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2785,14 +2785,17 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
 				break;
 			}
 			get_page(pkt_dev->page);
-			skb_frag_set_page(skb, i, pkt_dev->page);
-			skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0);
+
 			/*last fragment, fill rest of data*/
 			if (i == (frags - 1))
-				skb_frag_size_set(&skb_shinfo(skb)->frags[i],
-						  (datalen < PAGE_SIZE ? datalen : PAGE_SIZE));
+				skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
+							pkt_dev->page, 0,
+							(datalen < PAGE_SIZE ?
+							 datalen : PAGE_SIZE));
 			else
-				skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len);
+				skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
+							pkt_dev->page, 0, frag_len);
+
 			datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
 			skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
 			skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 41de3a2f29e1..2c61fb912772 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -961,24 +961,27 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 			 nla_total_size(sizeof(struct ifla_vf_rate)) +
 			 nla_total_size(sizeof(struct ifla_vf_link_state)) +
 			 nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
-			 nla_total_size(0) + /* nest IFLA_VF_STATS */
-			 /* IFLA_VF_STATS_RX_PACKETS */
-			 nla_total_size_64bit(sizeof(__u64)) +
-			 /* IFLA_VF_STATS_TX_PACKETS */
-			 nla_total_size_64bit(sizeof(__u64)) +
-			 /* IFLA_VF_STATS_RX_BYTES */
-			 nla_total_size_64bit(sizeof(__u64)) +
-			 /* IFLA_VF_STATS_TX_BYTES */
-			 nla_total_size_64bit(sizeof(__u64)) +
-			 /* IFLA_VF_STATS_BROADCAST */
-			 nla_total_size_64bit(sizeof(__u64)) +
-			 /* IFLA_VF_STATS_MULTICAST */
-			 nla_total_size_64bit(sizeof(__u64)) +
-			 /* IFLA_VF_STATS_RX_DROPPED */
-			 nla_total_size_64bit(sizeof(__u64)) +
-			 /* IFLA_VF_STATS_TX_DROPPED */
-			 nla_total_size_64bit(sizeof(__u64)) +
 			 nla_total_size(sizeof(struct ifla_vf_trust)));
+
+		if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
+			size += num_vfs *
+				(nla_total_size(0) + /* nest IFLA_VF_STATS */
+				 /* IFLA_VF_STATS_RX_PACKETS */
+				 nla_total_size_64bit(sizeof(__u64)) +
+				 /* IFLA_VF_STATS_TX_PACKETS */
+				 nla_total_size_64bit(sizeof(__u64)) +
+				 /* IFLA_VF_STATS_RX_BYTES */
+				 nla_total_size_64bit(sizeof(__u64)) +
+				 /* IFLA_VF_STATS_TX_BYTES */
+				 nla_total_size_64bit(sizeof(__u64)) +
+				 /* IFLA_VF_STATS_BROADCAST */
+				 nla_total_size_64bit(sizeof(__u64)) +
+				 /* IFLA_VF_STATS_MULTICAST */
+				 nla_total_size_64bit(sizeof(__u64)) +
+				 /* IFLA_VF_STATS_RX_DROPPED */
+				 nla_total_size_64bit(sizeof(__u64)) +
+				 /* IFLA_VF_STATS_TX_DROPPED */
+				 nla_total_size_64bit(sizeof(__u64)));
+		}
 		return size;
 	} else
 		return 0;
@@ -1270,7 +1273,8 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
 static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 					       struct net_device *dev,
 					       int vfs_num,
-					       struct nlattr *vfinfo)
+					       struct nlattr *vfinfo,
+					       u32 ext_filter_mask)
 {
 	struct ifla_vf_rss_query_en vf_rss_query_en;
 	struct nlattr *vf, *vfstats, *vfvlanlist;
@@ -1376,33 +1380,35 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 		goto nla_put_vf_failure;
 	}
 	nla_nest_end(skb, vfvlanlist);
-	memset(&vf_stats, 0, sizeof(vf_stats));
-	if (dev->netdev_ops->ndo_get_vf_stats)
-		dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
-						  &vf_stats);
-	vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS);
-	if (!vfstats)
-		goto nla_put_vf_failure;
-	if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
-			      vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
-	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
-			      vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
-	    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
-			      vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
-	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
-			      vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
-	    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
-			      vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
-	    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
-			      vf_stats.multicast, IFLA_VF_STATS_PAD) ||
-	    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
-			      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
-	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
-			      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
-		nla_nest_cancel(skb, vfstats);
-		goto nla_put_vf_failure;
+	if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
+		memset(&vf_stats, 0, sizeof(vf_stats));
+		if (dev->netdev_ops->ndo_get_vf_stats)
+			dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
+							  &vf_stats);
+		vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS);
+		if (!vfstats)
+			goto nla_put_vf_failure;
+		if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
+				      vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
+		    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
+				      vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
+		    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
+				      vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
+		    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
+				      vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
+		    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
+				      vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
+		    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
+				      vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+		    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+				      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+		    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+				      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
+			nla_nest_cancel(skb, vfstats);
+			goto nla_put_vf_failure;
+		}
+		nla_nest_end(skb, vfstats);
 	}
-	nla_nest_end(skb, vfstats);
 	nla_nest_end(skb, vf);
 	return 0;
@@ -1435,7 +1441,7 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
 		return -EMSGSIZE;

 	for (i = 0; i < num_vfs; i++) {
-		if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
+		if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask))
 			return -EMSGSIZE;
 	}
@@ -2377,45 +2383,43 @@ static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
 static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
 			    struct netlink_ext_ack *extack)
 {
-	if (dev) {
-		if (tb[IFLA_ADDRESS] &&
-		    nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
-			return -EINVAL;
+	if (tb[IFLA_ADDRESS] &&
+	    nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
+		return -EINVAL;

-		if (tb[IFLA_BROADCAST] &&
-		    nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
-			return -EINVAL;
+	if (tb[IFLA_BROADCAST] &&
+	    nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
+		return -EINVAL;

-		if (tb[IFLA_GSO_MAX_SIZE] &&
-		    nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) {
-			NL_SET_ERR_MSG(extack, "too big gso_max_size");
-			return -EINVAL;
-		}
+	if (tb[IFLA_GSO_MAX_SIZE] &&
+	    nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) {
+		NL_SET_ERR_MSG(extack, "too big gso_max_size");
+		return -EINVAL;
+	}

-		if (tb[IFLA_GSO_MAX_SEGS] &&
-		    (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS ||
-		     nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) {
-			NL_SET_ERR_MSG(extack, "too big gso_max_segs");
-			return -EINVAL;
-		}
+	if (tb[IFLA_GSO_MAX_SEGS] &&
+	    (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS ||
+	     nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) {
+		NL_SET_ERR_MSG(extack, "too big gso_max_segs");
+		return -EINVAL;
+	}

-		if (tb[IFLA_GRO_MAX_SIZE] &&
-		    nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) {
-			NL_SET_ERR_MSG(extack, "too big gro_max_size");
-			return -EINVAL;
-		}
+	if (tb[IFLA_GRO_MAX_SIZE] &&
+	    nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) {
+		NL_SET_ERR_MSG(extack, "too big gro_max_size");
+		return -EINVAL;
+	}

-		if (tb[IFLA_GSO_IPV4_MAX_SIZE] &&
-		    nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) {
-			NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size");
-			return -EINVAL;
-		}
+	if (tb[IFLA_GSO_IPV4_MAX_SIZE] &&
+	    nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) {
+		NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size");
+		return -EINVAL;
+	}

-		if (tb[IFLA_GRO_IPV4_MAX_SIZE] &&
-		    nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) {
-			NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size");
-			return -EINVAL;
-		}
+	if (tb[IFLA_GRO_IPV4_MAX_SIZE] &&
+	    nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) {
+		NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size");
+		return -EINVAL;
 	}

 	if (tb[IFLA_AF_SPEC]) {
@@ -2736,10 +2740,6 @@ static int do_setlink(const struct sk_buff *skb,
 	char ifname[IFNAMSIZ];
 	int err;

-	err = validate_linkmsg(dev, tb, extack);
-	if (err < 0)
-		return err;
-
 	if (tb[IFLA_IFNAME])
 		nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
@@ -3156,6 +3156,10 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto errout;
 	}

+	err = validate_linkmsg(dev, tb, extack);
+	if (err < 0)
+		goto errout;
+
 	err = do_setlink(skb, dev, ifm, extack, tb, 0);
 errout:
 	return err;
@@ -3399,6 +3403,9 @@ static int rtnl_group_changelink(const struct sk_buff *skb,

 	for_each_netdev_safe(net, dev, aux) {
 		if (dev->group == group) {
+			err = validate_linkmsg(dev, tb, extack);
+			if (err < 0)
+				return err;
 			err = do_setlink(skb, dev, ifm, extack, tb, 0);
 			if (err < 0)
 				return err;
@@ -3556,10 +3563,6 @@ replay:
 		m_ops = master_dev->rtnl_link_ops;
 	}

-	err = validate_linkmsg(dev, tb, extack);
-	if (err < 0)
-		return err;
-
 	if (tb[IFLA_LINKINFO]) {
 		err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
 						  tb[IFLA_LINKINFO],
@@ -3623,6 +3626,10 @@ replay:
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;

+		err = validate_linkmsg(dev, tb, extack);
+		if (err < 0)
+			return err;
+
 		if (linkinfo[IFLA_INFO_DATA]) {
 			if (!ops || ops != dev->rtnl_link_ops ||
 			    !ops->changelink)
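With rtnl_fill_vfinfo() now honouring RTEXT_FILTER_SKIP_STATS, a dump that does not need per-VF counters can shrink considerably. A userspace sketch of building such a request via IFLA_EXT_MASK, assuming standard rtnetlink headers; the fixed-layout request struct is a simplification for illustration:

#include <linux/rtnetlink.h>
#include <linux/if_link.h>
#include <sys/socket.h>
#include <string.h>

static struct {
	struct nlmsghdr nlh;
	struct ifinfomsg ifm;
	char attrbuf[RTA_SPACE(sizeof(__u32))];
} req;

static void example_prepare_link_dump(void)
{
	struct rtattr *rta = (struct rtattr *)req.attrbuf;
	__u32 mask = RTEXT_FILTER_VF | RTEXT_FILTER_SKIP_STATS;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifm)) +
			    RTA_SPACE(sizeof(mask));
	req.nlh.nlmsg_type = RTM_GETLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.ifm.ifi_family = AF_UNSPEC;

	/* Ask for VF info but opt out of the IFLA_VF_STATS nests. */
	rta->rta_type = IFLA_EXT_MASK;
	rta->rta_len = RTA_LENGTH(sizeof(mask));
	memcpy(RTA_DATA(rta), &mask, sizeof(mask));
}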
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cea28d30abb5..fee2b1c105fe 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -67,6 +67,7 @@
 #include <net/dst.h>
 #include <net/sock.h>
 #include <net/checksum.h>
+#include <net/gso.h>
 #include <net/ip6_checksum.h>
 #include <net/xfrm.h>
 #include <net/mpls.h>
@@ -92,15 +93,7 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
 static struct kmem_cache *skbuff_ext_cache __ro_after_init;
 #endif

-/* skb_small_head_cache and related code is only supported
- * for CONFIG_SLAB and CONFIG_SLUB.
- * As soon as SLOB is removed from the kernel, we can clean up this.
- */
-#if !defined(CONFIG_SLOB)
-# define HAVE_SKB_SMALL_HEAD_CACHE 1
-#endif
-
-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
 static struct kmem_cache *skb_small_head_cache __ro_after_init;

 #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
@@ -117,7 +110,6 @@ static struct kmem_cache *skb_small_head_cache __ro_after_init;

 #define SKB_SMALL_HEAD_HEADROOM						\
 	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
-#endif /* HAVE_SKB_SMALL_HEAD_CACHE */

 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
 EXPORT_SYMBOL(sysctl_max_skb_frags);
@@ -562,7 +554,6 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
 	void *obj;

 	obj_size = SKB_HEAD_ALIGN(*size);
-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
 	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
 	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
 		obj = kmem_cache_alloc_node(skb_small_head_cache,
@@ -576,7 +567,6 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
 		obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node);
 		goto out;
 	}
-#endif
 	*size = obj_size = kmalloc_size_roundup(obj_size);
 	/*
 	 * Try a regular allocation, when that fails and we're not entitled
@@ -898,11 +888,9 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)

 static void skb_kfree_head(void *head, unsigned int end_offset)
 {
-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
 	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
 		kmem_cache_free(skb_small_head_cache, head);
 	else
-#endif
 		kfree(head);
 }
@@ -2160,7 +2148,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
 	if (likely(skb_end_offset(skb) == saved_end_offset))
 		return 0;

-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
 	/* We can not change skb->end if the original or new value
 	 * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head().
 	 */
@@ -2174,7 +2161,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
 		WARN_ON_ONCE(1);
 		return 0;
 	}
-#endif

 	shinfo = skb_shinfo(skb);
@@ -4203,13 +4189,13 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
 EXPORT_SYMBOL(skb_find_text);

 int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
-			 int offset, size_t size)
+			 int offset, size_t size, size_t max_frags)
 {
 	int i = skb_shinfo(skb)->nr_frags;

 	if (skb_can_coalesce(skb, i, page, offset)) {
 		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
-	} else if (i < MAX_SKB_FRAGS) {
+	} else if (i < max_frags) {
 		skb_zcopy_downgrade_managed(skb);
 		get_page(page);
 		skb_fill_page_desc_noacc(skb, i, page, offset, size);
@@ -4249,10 +4235,9 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
 	struct page *page;

 	page = virt_to_head_page(frag_skb->head);
-	__skb_frag_set_page(&head_frag, page);
-	skb_frag_off_set(&head_frag, frag_skb->data -
-			 (unsigned char *)page_address(page));
-	skb_frag_size_set(&head_frag, skb_headlen(frag_skb));
+	skb_frag_fill_page_desc(&head_frag, page, frag_skb->data -
+				(unsigned char *)page_address(page),
+				skb_headlen(frag_skb));
 	return head_frag;
 }
@@ -4768,7 +4753,6 @@ void __init skb_init(void)
 						0,
 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 						NULL);
-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
 	/* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.
 	 * struct skb_shared_info is located at the end of skb->head,
 	 * and should not be copied to/from user.
@@ -4780,7 +4764,6 @@ void __init skb_init(void)
 						0,
 						SKB_SMALL_HEAD_HEADROOM,
 						NULL);
-#endif
 	skb_extensions_init();
 }
@@ -5784,147 +5767,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 }
 EXPORT_SYMBOL_GPL(skb_scrub_packet);

-/**
- * skb_gso_transport_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_transport_seglen is used to determine the real size of the
- * individual segments, including Layer4 headers (TCP/UDP).
- *
- * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
- */
-static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
-{
-	const struct skb_shared_info *shinfo = skb_shinfo(skb);
-	unsigned int thlen = 0;
-
-	if (skb->encapsulation) {
-		thlen = skb_inner_transport_header(skb) -
-			skb_transport_header(skb);
-
-		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
-			thlen += inner_tcp_hdrlen(skb);
-	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
-		thlen = tcp_hdrlen(skb);
-	} else if (unlikely(skb_is_gso_sctp(skb))) {
-		thlen = sizeof(struct sctphdr);
-	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
-		thlen = sizeof(struct udphdr);
-	}
-	/* UFO sets gso_size to the size of the fragmentation
-	 * payload, i.e. the size of the L4 (UDP) header is already
-	 * accounted for.
-	 */
-	return thlen + shinfo->gso_size;
-}
-
-/**
- * skb_gso_network_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_network_seglen is used to determine the real size of the
- * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
- *
- * The MAC/L2 header is not accounted for.
- */
-static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
-{
-	unsigned int hdr_len = skb_transport_header(skb) -
-			       skb_network_header(skb);
-
-	return hdr_len + skb_gso_transport_seglen(skb);
-}
-
-/**
- * skb_gso_mac_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_mac_seglen is used to determine the real size of the
- * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
- * headers (TCP/UDP).
- */
-static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
-{
-	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
-
-	return hdr_len + skb_gso_transport_seglen(skb);
-}
-
-/**
- * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
- *
- * There are a couple of instances where we have a GSO skb, and we
- * want to determine what size it would be after it is segmented.
- *
- * We might want to check:
- * - L3+L4+payload size (e.g. IP forwarding)
- * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
- *
- * This is a helper to do that correctly considering GSO_BY_FRAGS.
- *
- * @skb: GSO skb
- *
- * @seg_len: The segmented length (from skb_gso_*_seglen). In the
- *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
- *
- * @max_len: The maximum permissible length.
- *
- * Returns true if the segmented length <= max length.
- */
-static inline bool skb_gso_size_check(const struct sk_buff *skb,
-				      unsigned int seg_len,
-				      unsigned int max_len) {
-	const struct skb_shared_info *shinfo = skb_shinfo(skb);
-	const struct sk_buff *iter;
-
-	if (shinfo->gso_size != GSO_BY_FRAGS)
-		return seg_len <= max_len;
-
-	/* Undo this so we can re-use header sizes */
-	seg_len -= GSO_BY_FRAGS;
-
-	skb_walk_frags(skb, iter) {
-		if (seg_len + skb_headlen(iter) > max_len)
-			return false;
-	}
-
-	return true;
-}
-
-/**
- * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
- *
- * @skb: GSO skb
- * @mtu: MTU to validate against
- *
- * skb_gso_validate_network_len validates if a given skb will fit a
- * wanted MTU once split. It considers L3 headers, L4 headers, and the
- * payload.
- */
-bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
-{
-	return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
-}
-EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
-
-/**
- * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
- *
- * @skb: GSO skb
- * @len: length to validate against
- *
- * skb_gso_validate_mac_len validates if a given skb will fit a wanted
- * length once split, including L2, L3 and L4 headers and the payload.
- */
-bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
-{
-	return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
-}
-EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
-
 static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
 {
 	int mac_len, meta_len;
@@ -6912,3 +6754,91 @@ nodefer:	__kfree_skb(skb);
 	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 }
+
+static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
+				 size_t offset, size_t len)
+{
+	const char *kaddr;
+	__wsum csum;
+
+	kaddr = kmap_local_page(page);
+	csum = csum_partial(kaddr + offset, len, 0);
+	kunmap_local(kaddr);
+	skb->csum = csum_block_add(skb->csum, csum, skb->len);
+}
+
+/**
+ * skb_splice_from_iter - Splice (or copy) pages to skbuff
+ * @skb: The buffer to add pages to
+ * @iter: Iterator representing the pages to be added
+ * @maxsize: Maximum amount of pages to be added
+ * @gfp: Allocation flags
+ *
+ * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
+ * extracts pages from an iterator and adds them to the socket buffer if
+ * possible, copying them to fragments if not possible (such as if they're slab
+ * pages).
+ *
+ * Returns the amount of data spliced/copied or -EMSGSIZE if there's
+ * insufficient space in the buffer to transfer anything.
+ */
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+			     ssize_t maxsize, gfp_t gfp)
+{
+	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+	struct page *pages[8], **ppages = pages;
+	ssize_t spliced = 0, ret = 0;
+	unsigned int i;
+
+	while (iter->count > 0) {
+		ssize_t space, nr, len;
+		size_t off;
+
+		ret = -EMSGSIZE;
+		space = frag_limit - skb_shinfo(skb)->nr_frags;
+		if (space < 0)
+			break;
+
+		/* We might be able to coalesce without increasing nr_frags */
+		nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
+
+		len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
+		if (len <= 0) {
+			ret = len ?: -EIO;
+			break;
+		}
+
+		i = 0;
+		do {
+			struct page *page = pages[i++];
+			size_t part = min_t(size_t, PAGE_SIZE - off, len);
+
+			ret = -EIO;
+			if (WARN_ON_ONCE(!sendpage_ok(page)))
+				goto out;
+
+			ret = skb_append_pagefrags(skb, page, off, part,
+						   frag_limit);
+			if (ret < 0) {
+				iov_iter_revert(iter, len);
+				goto out;
+			}
+
+			if (skb->ip_summed == CHECKSUM_NONE)
+				skb_splice_csum_page(skb, page, off, part);
+
+			off = 0;
+			spliced += part;
+			maxsize -= part;
+			len -= part;
+		} while (len > 0);
+
+		if (maxsize <= 0)
+			break;
+	}
+
+out:
+	skb_len_add(skb, spliced);
+	return spliced ?: ret;
+}
+EXPORT_SYMBOL(skb_splice_from_iter);
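skb_splice_from_iter() is the common backend for MSG_SPLICE_PAGES senders. A hedged sketch of how a protocol's sendmsg path might consume the flag with this helper; example_sendmsg_splice() and its locking context are assumptions, not taken from the diff:

#include <linux/skbuff.h>
#include <linux/socket.h>
#include <net/sock.h>

static ssize_t example_sendmsg_splice(struct sock *sk, struct msghdr *msg,
				      struct sk_buff *skb, size_t size)
{
	ssize_t copied;

	if (!(msg->msg_flags & MSG_SPLICE_PAGES))
		return -EINVAL;

	/* Splice pages by reference where possible; the helper also
	 * folds their checksum into skb->csum for CHECKSUM_NONE skbs.
	 */
	copied = skb_splice_from_iter(skb, &msg->msg_iter, size,
				      sk->sk_allocation);
	if (copied < 0)
		return copied;

	sk_wmem_queued_add(sk, copied);
	return copied;
}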
+ */ +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, + ssize_t maxsize, gfp_t gfp) +{ + size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + struct page *pages[8], **ppages = pages; + ssize_t spliced = 0, ret = 0; + unsigned int i; + + while (iter->count > 0) { + ssize_t space, nr, len; + size_t off; + + ret = -EMSGSIZE; + space = frag_limit - skb_shinfo(skb)->nr_frags; + if (space < 0) + break; + + /* We might be able to coalesce without increasing nr_frags */ + nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); + + len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); + if (len <= 0) { + ret = len ?: -EIO; + break; + } + + i = 0; + do { + struct page *page = pages[i++]; + size_t part = min_t(size_t, PAGE_SIZE - off, len); + + ret = -EIO; + if (WARN_ON_ONCE(!sendpage_ok(page))) + goto out; + + ret = skb_append_pagefrags(skb, page, off, part, + frag_limit); + if (ret < 0) { + iov_iter_revert(iter, len); + goto out; + } + + if (skb->ip_summed == CHECKSUM_NONE) + skb_splice_csum_page(skb, page, off, part); + + off = 0; + spliced += part; + maxsize -= part; + len -= part; + } while (len > 0); + + if (maxsize <= 0) + break; + } + +out: + skb_len_add(skb, spliced); + return spliced ?: ret; +} +EXPORT_SYMBOL(skb_splice_from_iter); diff --git a/net/core/sock.c b/net/core/sock.c index 6e5662ca00fe..5f1747c12004 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -114,6 +114,9 @@ #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> +#include <linux/mroute.h> +#include <linux/mroute6.h> +#include <linux/icmpv6.h> #include <linux/uaccess.h> @@ -138,6 +141,7 @@ #include <net/tcp.h> #include <net/busy_poll.h> +#include <net/phonet/phonet.h> #include <linux/ethtool.h> @@ -1246,6 +1250,13 @@ set_sndbuf: clear_bit(SOCK_PASSCRED, &sock->flags); break; + case SO_PASSPIDFD: + if (valbool) + set_bit(SOCK_PASSPIDFD, &sock->flags); + else + clear_bit(SOCK_PASSPIDFD, &sock->flags); + break; + case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: @@ -1726,6 +1737,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; + case SO_PASSPIDFD: + v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); + break; + case SO_PEERCRED: { struct ucred peercred; @@ -1741,6 +1756,39 @@ int sk_getsockopt(struct sock *sk, int level, int optname, goto lenout; } + case SO_PEERPIDFD: + { + struct pid *peer_pid; + struct file *pidfd_file = NULL; + int pidfd; + + if (len > sizeof(pidfd)) + len = sizeof(pidfd); + + spin_lock(&sk->sk_peer_lock); + peer_pid = get_pid(sk->sk_peer_pid); + spin_unlock(&sk->sk_peer_lock); + + if (!peer_pid) + return -ESRCH; + + pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); + put_pid(peer_pid); + if (pidfd < 0) + return pidfd; + + if (copy_to_sockptr(optval, &pidfd, len) || + copy_to_sockptr(optlen, &len, sizeof(int))) { + put_unused_fd(pidfd); + fput(pidfd_file); + + return -EFAULT; + } + + fd_install(pidfd, pidfd_file); + return 0; + } + case SO_PEERGROUPS: { const struct cred *cred; @@ -4100,3 +4148,63 @@ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); + +/* Copy 'size' bytes from userspace and return `size` back to userspace */ +int sock_ioctl_inout(struct sock *sk, unsigned int cmd, + void __user *arg, void *karg, size_t size) +{ + int ret; + + if (copy_from_user(karg, arg, size)) + return -EFAULT; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, 
karg); + if (ret) + return ret; + + if (copy_to_user(arg, karg, size)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(sock_ioctl_inout); + +/* This is the most common ioctl prep function, where the result (4 bytes) is + * copied back to userspace if the ioctl() returns successfully. No input is + * copied from userspace as input argument. + */ +static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int ret, karg = 0; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); + if (ret) + return ret; + + return put_user(karg, (int __user *)arg); +} + +/* A wrapper around sock ioctls, which copies the data from userspace + * (depending on the protocol/ioctl), and copies back the result to userspace. + * The main motivation for this function is to pass kernel memory to the + * protocol ioctl callbacks, instead of userspace memory. + */ +int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int rc = 1; + + if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) + rc = ipmr_sk_ioctl(sk, cmd, arg); + else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) + rc = ip6mr_sk_ioctl(sk, cmd, arg); + else if (sk_is_phonet(sk)) + rc = phonet_sk_ioctl(sk, cmd, arg); + + /* If ioctl was processed, returns its value */ + if (rc <= 0) + return rc; + + /* Otherwise call the default handler */ + return sock_ioctl_out(sk, cmd, arg); +} +EXPORT_SYMBOL(sk_ioctl); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 9ddc3a9e89e4..1f748ed1279d 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -292,7 +292,7 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); -int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); +int dccp_ioctl(struct sock *sk, int cmd, int *karg); int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b0ebf853cb07..f331e5977a84 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -362,7 +362,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, EXPORT_SYMBOL_GPL(dccp_poll); -int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) +int dccp_ioctl(struct sock *sk, int cmd, int *karg) { int rc = -ENOTCONN; @@ -373,17 +373,17 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); + *karg = sk_wmem_alloc_get(sk); /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and * always 0, comparably to UDP. */ - rc = put_user(amount, (int __user *)arg); + rc = 0; } break; case SIOCINQ: { struct sk_buff *skb; - unsigned long amount = 0; + *karg = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { @@ -391,9 +391,9 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) * We will only return the amount of this packet since * that is all that will be read. 
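The dccp_ioctl() hunks here show the new calling convention end to end. As a condensed sketch, a protocol ioctl written against the kernel-memory style looks like the following; sk_ioctl() and sock_ioctl_out() own all userspace copying, and "myproto" is a hypothetical protocol, not something added by this series:

/* Sketch: SIOCOUTQ/SIOCINQ under the int *karg convention. The core
 * copies *karg back to userspace on a zero return, so the callback
 * never touches __user pointers itself.
 */
static int myproto_ioctl(struct sock *sk, int cmd, int *karg)
{
        switch (cmd) {
        case SIOCOUTQ:
                *karg = sk_wmem_alloc_get(sk); /* bytes queued for send */
                return 0;
        case SIOCINQ: {
                struct sk_buff *skb;

                spin_lock_bh(&sk->sk_receive_queue.lock);
                skb = skb_peek(&sk->sk_receive_queue);
                *karg = skb ? skb->len : 0;
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                return 0;
        }
        default:
                return -ENOIOCTLCMD;
        }
}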
*/ - amount = skb->len; + *karg = skb->len; } - rc = put_user(amount, (int __user *)arg); + rc = 0; } break; default: diff --git a/net/devlink/health.c b/net/devlink/health.c index 0839706d5741..194340a8bb86 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -480,7 +480,7 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, int err; WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); + ASSERT_DEVLINK_REGISTERED(devlink); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index cd0254968076..1f00f874471f 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -447,18 +447,18 @@ static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, caps->value |= cap; } -static int devlink_port_fn_roce_fill(const struct devlink_ops *ops, - struct devlink_port *devlink_port, +static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; - if (!ops->port_fn_roce_get) + if (!devlink_port->ops->port_fn_roce_get) return 0; - err = ops->port_fn_roce_get(devlink_port, &is_enable, extack); + err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable, + extack); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -469,19 +469,19 @@ static int devlink_port_fn_roce_fill(const struct devlink_ops *ops, return 0; } -static int devlink_port_fn_migratable_fill(const struct devlink_ops *ops, - struct devlink_port *devlink_port, +static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; - if (!ops->port_fn_migratable_get || + if (!devlink_port->ops->port_fn_migratable_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; - err = ops->port_fn_migratable_get(devlink_port, &is_enable, extack); + err = devlink_port->ops->port_fn_migratable_get(devlink_port, + &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -492,8 +492,7 @@ static int devlink_port_fn_migratable_fill(const struct devlink_ops *ops, return 0; } -static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, - struct devlink_port *devlink_port, +static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) @@ -501,11 +500,11 @@ static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, struct nla_bitfield32 caps = {}; int err; - err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack); + err = devlink_port_fn_roce_fill(devlink_port, &caps, extack); if (err) return err; - err = devlink_port_fn_migratable_fill(ops, devlink_port, &caps, extack); + err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack); if (err) return err; @@ -691,8 +690,7 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; } -static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops, - struct devlink_port *port, +static int devlink_port_fn_hw_addr_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) @@ -701,10 +699,10 @@ static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops, int hw_addr_len; int err; - if (!ops->port_function_hw_addr_get) + if (!port->ops->port_fn_hw_addr_get) return 0; - err = ops->port_function_hw_addr_get(port, hw_addr, 
&hw_addr_len, + err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len, extack); if (err) { if (err == -EOPNOTSUPP) @@ -789,8 +787,7 @@ devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } -static int devlink_port_fn_state_fill(const struct devlink_ops *ops, - struct devlink_port *port, +static int devlink_port_fn_state_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) @@ -799,10 +796,10 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops, enum devlink_port_fn_state state; int err; - if (!ops->port_fn_state_get) + if (!port->ops->port_fn_state_get) return 0; - err = ops->port_fn_state_get(port, &state, &opstate, extack); + err = port->ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -829,18 +826,16 @@ static int devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { - const struct devlink_ops *ops = devlink_port->devlink->ops; - - return ops->port_fn_migratable_set(devlink_port, enable, extack); + return devlink_port->ops->port_fn_migratable_set(devlink_port, enable, + extack); } static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { - const struct devlink_ops *ops = devlink_port->devlink->ops; - - return ops->port_fn_roce_set(devlink_port, enable, extack); + return devlink_port->ops->port_fn_roce_set(devlink_port, enable, + extack); } static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, @@ -874,7 +869,6 @@ static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { - const struct devlink_ops *ops; struct nlattr *function_attr; bool msg_updated = false; int err; @@ -883,16 +877,13 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por if (!function_attr) return -EMSGSIZE; - ops = port->devlink->ops; - err = devlink_port_fn_hw_addr_fill(ops, port, msg, extack, - &msg_updated); + err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated); if (err) goto out; - err = devlink_port_fn_caps_fill(ops, port, msg, extack, - &msg_updated); + err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated); if (err) goto out; - err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated); + err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); @@ -1137,14 +1128,13 @@ static int devlink_port_type_set(struct devlink_port *devlink_port, { int err; - if (!devlink_port->devlink->ops->port_type_set) + if (!devlink_port->ops->port_type_set) return -EOPNOTSUPP; if (port_type == devlink_port->type) return 0; - err = devlink_port->devlink->ops->port_type_set(devlink_port, - port_type); + err = devlink_port->ops->port_type_set(devlink_port, port_type); if (err) return err; @@ -1157,7 +1147,6 @@ static int devlink_port_function_hw_addr_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { - const struct devlink_ops *ops = port->devlink->ops; const u8 *hw_addr; int hw_addr_len; @@ -1178,7 +1167,7 @@ static int devlink_port_function_hw_addr_set(struct devlink_port *port, } } - return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len, + return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len, extack); } @@ -1187,22 +1176,20 @@ static int 
devlink_port_fn_state_set(struct devlink_port *port, struct netlink_ext_ack *extack) { enum devlink_port_fn_state state; - const struct devlink_ops *ops; state = nla_get_u8(attr); - ops = port->devlink->ops; - return ops->port_fn_state_set(port, state, extack); + return port->ops->port_fn_state_set(port, state, extack); } static int devlink_port_function_validate(struct devlink_port *devlink_port, struct nlattr **tb, struct netlink_ext_ack *extack) { - const struct devlink_ops *ops = devlink_port->devlink->ops; + const struct devlink_port_ops *ops = devlink_port->ops; struct nlattr *attr; if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && - !ops->port_function_hw_addr_set) { + !ops->port_fn_hw_addr_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], "Port doesn't support function attributes"); return -EOPNOTSUPP; @@ -1320,7 +1307,7 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) return -EINVAL; - if (!devlink->ops->port_split) + if (!devlink_port->ops->port_split) return -EOPNOTSUPP; count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); @@ -1339,8 +1326,8 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, return -EINVAL; } - return devlink->ops->port_split(devlink, devlink_port, count, - info->extack); + return devlink_port->ops->port_split(devlink, devlink_port, count, + info->extack); } static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, @@ -1349,40 +1336,9 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; - if (!devlink->ops->port_unsplit) + if (!devlink_port->ops->port_unsplit) return -EOPNOTSUPP; - return devlink->ops->port_unsplit(devlink, devlink_port, info->extack); -} - -static int devlink_port_new_notify(struct devlink *devlink, - unsigned int port_index, - struct genl_info *info) -{ - struct devlink_port *devlink_port; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - lockdep_assert_held(&devlink->lock); - devlink_port = devlink_port_get_by_index(devlink, port_index); - if (!devlink_port) { - err = -ENODEV; - goto out; - } - - err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, - info->snd_portid, info->snd_seq, 0, NULL); - if (err) - goto out; - - return genlmsg_reply(msg, info); - -out: - nlmsg_free(msg); - return err; + return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack); } static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, @@ -1391,10 +1347,11 @@ static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, struct netlink_ext_ack *extack = info->extack; struct devlink_port_new_attrs new_attrs = {}; struct devlink *devlink = info->user_ptr[0]; - unsigned int new_port_index; + struct devlink_port *devlink_port; + struct sk_buff *msg; int err; - if (!devlink->ops->port_new || !devlink->ops->port_del) + if (!devlink->ops->port_new) return -EOPNOTSUPP; if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || @@ -1423,36 +1380,43 @@ static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, new_attrs.sfnum_valid = true; } - err = devlink->ops->port_new(devlink, &new_attrs, extack, - &new_port_index); + err = devlink->ops->port_new(devlink, &new_attrs, + extack, &devlink_port); if (err) return err; - err = devlink_port_new_notify(devlink, new_port_index, info); - if (err && err != -ENODEV) { - /* Fail to send the response; 
destroy newly created port. */ - devlink->ops->port_del(devlink, new_port_index, extack); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto err_out_port_del; } + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0, NULL); + if (WARN_ON_ONCE(err)) + goto err_out_msg_free; + err = genlmsg_reply(msg, info); + if (err) + goto err_out_port_del; + return 0; + +err_out_msg_free: + nlmsg_free(msg); +err_out_port_del: + devlink_port->ops->port_del(devlink, devlink_port, NULL); return err; } static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, struct genl_info *info) { + struct devlink_port *devlink_port = info->user_ptr[1]; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; - unsigned int port_index; - if (!devlink->ops->port_del) + if (!devlink_port->ops->port_del) return -EOPNOTSUPP; - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_INDEX)) { - NL_SET_ERR_MSG(extack, "Port index is not specified"); - return -EINVAL; - } - port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - - return devlink->ops->port_del(devlink, port_index, extack); + return devlink_port->ops->port_del(devlink, devlink_port, extack); } static int @@ -6384,6 +6348,7 @@ const struct genl_small_ops devlink_nl_ops[56] = { .cmd = DEVLINK_CMD_PORT_DEL, .doit = devlink_nl_cmd_port_del_doit, .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_LINECARD_GET, @@ -6772,7 +6737,10 @@ void devlink_notify_unregister(struct devlink *devlink) static void devlink_port_type_warn(struct work_struct *work) { - WARN(true, "Type was not set for devlink port."); + struct devlink_port *port = container_of(to_delayed_work(work), + struct devlink_port, + type_warn_dw); + dev_warn(port->devlink->dev, "Type was not set for devlink port."); } static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) @@ -6809,7 +6777,7 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) * @devlink: devlink * @devlink_port: devlink port * - * Initialize essencial stuff that is needed for functions + * Initialize essential stuff that is needed for functions * that may be called before devlink port registration. * Call to this function is optional and not needed * in case the driver does not use such functions. @@ -6830,7 +6798,7 @@ EXPORT_SYMBOL_GPL(devlink_port_init); * * @devlink_port: devlink port * - * Deinitialize essencial stuff that is in use for functions + * Deinitialize essential stuff that is in use for functions * that may be called after devlink port unregistration. * Call to this function is optional and not needed * in case the driver does not use such functions. @@ -6841,12 +6809,15 @@ void devlink_port_fini(struct devlink_port *devlink_port) } EXPORT_SYMBOL_GPL(devlink_port_fini); +static const struct devlink_port_ops devlink_port_dummy_ops = {}; + /** - * devl_port_register() - Register devlink port + * devl_port_register_with_ops() - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port + * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure @@ -6854,9 +6825,10 @@ EXPORT_SYMBOL_GPL(devlink_port_fini); * Note that the caller should take care of zeroing the devlink_port * structure. 
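Alongside the kernel-doc above, a hedged sketch of how a driver adopts the per-port ops; every foo_* identifier is hypothetical driver code, and the NULL-ops fallback is taken from the hunk that follows:

/* Sketch: split/unsplit handlers now hang off the port rather than the
 * devlink instance, so differently-capable ports can carry different ops.
 */
static const struct devlink_port_ops foo_port_ops = {
        .port_split   = foo_port_split,   /* hypothetical handler */
        .port_unsplit = foo_port_unsplit, /* hypothetical handler */
};

static int foo_register_port(struct devlink *dl, struct devlink_port *dlp,
                             unsigned int index)
{
        /* Passing NULL ops falls back to devlink_port_dummy_ops. */
        return devl_port_register_with_ops(dl, dlp, index, &foo_port_ops);
}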
*/ -int devl_port_register(struct devlink *devlink, - struct devlink_port *devlink_port, - unsigned int port_index) +int devl_port_register_with_ops(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index, + const struct devlink_port_ops *ops) { int err; @@ -6867,6 +6839,7 @@ int devl_port_register(struct devlink *devlink, devlink_port_init(devlink, devlink_port); devlink_port->registered = true; devlink_port->index = port_index; + devlink_port->ops = ops ? ops : &devlink_port_dummy_ops; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); @@ -6878,14 +6851,15 @@ int devl_port_register(struct devlink *devlink, devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } -EXPORT_SYMBOL_GPL(devl_port_register); +EXPORT_SYMBOL_GPL(devl_port_register_with_ops); /** - * devlink_port_register - Register devlink port + * devlink_port_register_with_ops - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port + * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure @@ -6895,18 +6869,20 @@ EXPORT_SYMBOL_GPL(devl_port_register); * * Context: Takes and release devlink->lock <mutex>. */ -int devlink_port_register(struct devlink *devlink, - struct devlink_port *devlink_port, - unsigned int port_index) +int devlink_port_register_with_ops(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index, + const struct devlink_port_ops *ops) { int err; devl_lock(devlink); - err = devl_port_register(devlink, devlink_port, port_index); + err = devl_port_register_with_ops(devlink, devlink_port, + port_index, ops); devl_unlock(devlink); return err; } -EXPORT_SYMBOL_GPL(devlink_port_register); +EXPORT_SYMBOL_GPL(devlink_port_register_with_ops); /** * devl_port_unregister() - Unregister devlink port diff --git a/net/dsa/port.c b/net/dsa/port.c index 71ba30538411..0ce8fd311c78 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1603,6 +1603,21 @@ dsa_port_phylink_mac_select_pcs(struct phylink_config *config, return pcs; } +static int dsa_port_phylink_mac_prepare(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + int err = 0; + + if (ds->ops->phylink_mac_prepare) + err = ds->ops->phylink_mac_prepare(ds, dp->index, mode, + interface); + + return err; +} + static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) @@ -1616,6 +1631,21 @@ static void dsa_port_phylink_mac_config(struct phylink_config *config, ds->ops->phylink_mac_config(ds, dp->index, mode, state); } +static int dsa_port_phylink_mac_finish(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + int err = 0; + + if (ds->ops->phylink_mac_finish) + err = ds->ops->phylink_mac_finish(ds, dp->index, mode, + interface); + + return err; +} + static void dsa_port_phylink_mac_an_restart(struct phylink_config *config) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); @@ -1671,7 +1701,9 @@ static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { 
.validate = dsa_port_phylink_validate, .mac_select_pcs = dsa_port_phylink_mac_select_pcs, .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state, + .mac_prepare = dsa_port_phylink_mac_prepare, .mac_config = dsa_port_phylink_mac_config, + .mac_finish = dsa_port_phylink_mac_finish, .mac_an_restart = dsa_port_phylink_mac_an_restart, .mac_link_down = dsa_port_phylink_mac_link_down, .mac_link_up = dsa_port_phylink_mac_link_up, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 6bb778e10461..4a51e0ec295c 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1436,15 +1436,26 @@ static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) { - struct ethtool_wolinfo wol; + struct ethtool_wolinfo wol, cur_wol; int ret; - if (!dev->ethtool_ops->set_wol) + if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) return -EOPNOTSUPP; + memset(&cur_wol, 0, sizeof(struct ethtool_wolinfo)); + cur_wol.cmd = ETHTOOL_GWOL; + dev->ethtool_ops->get_wol(dev, &cur_wol); + if (copy_from_user(&wol, useraddr, sizeof(wol))) return -EFAULT; + if (wol.wolopts & ~cur_wol.supported) + return -EINVAL; + + if (wol.wolopts == cur_wol.wolopts && + !memcmp(wol.sopass, cur_wol.sopass, sizeof(wol.sopass))) + return 0; + ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) return ret; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 08120095cc68..39a459b0111b 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -96,6 +96,8 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, int ret; if (!header) { + if (!require_dev) + return 0; NL_SET_ERR_MSG(extack, "request header missing"); return -EINVAL; } @@ -113,7 +115,8 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); - dev = dev_get_by_index(net, ifindex); + dev = netdev_get_by_index(net, ifindex, &req_info->dev_tracker, + GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_DEV_INDEX], @@ -123,13 +126,14 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, /* if both ifindex and ifname are passed, they must match */ if (devname_attr && strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) { - dev_put(dev); + netdev_put(dev, &req_info->dev_tracker); NL_SET_ERR_MSG_ATTR(extack, header, "ifindex and name do not match"); return -ENODEV; } } else if (devname_attr) { - dev = dev_get_by_name(net, nla_data(devname_attr)); + dev = netdev_get_by_name(net, nla_data(devname_attr), + &req_info->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, devname_attr, "no device matches name"); @@ -142,8 +146,6 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, } req_info->dev = dev; - if (dev) - netdev_tracker_alloc(dev, &req_info->dev_tracker, GFP_KERNEL); req_info->flags = flags; return 0; } diff --git a/net/handshake/genl.c b/net/handshake/genl.c index 9f29efb1493e..233be5cbfec9 100644 --- a/net/handshake/genl.c +++ b/net/handshake/genl.c @@ -8,7 +8,7 @@ #include "genl.h" -#include <linux/handshake.h> +#include <uapi/linux/handshake.h> /* HANDSHAKE_CMD_ACCEPT - do */ static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = { diff --git a/net/handshake/genl.h b/net/handshake/genl.h index 2c1f1aa6a02a..ae72a596f6cc 100644 --- a/net/handshake/genl.h +++ b/net/handshake/genl.h @@ -9,7 +9,7 @@ #include <net/netlink.h> #include 
<net/genetlink.h> -#include <linux/handshake.h> +#include <uapi/linux/handshake.h> int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info); int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 5a236aae2366..306f942c3b28 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -531,6 +531,11 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res) goto err_add_master; + /* HSR forwarding offload supported in lower device? */ + if ((slave[0]->features & NETIF_F_HW_HSR_FWD) && + (slave[1]->features & NETIF_F_HW_HSR_FWD)) + hsr->fwd_offloaded = true; + res = register_netdevice(hsr_dev); if (res) goto err_unregister; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 5584c80a5c79..6851e33df7d1 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -208,6 +208,7 @@ struct hsr_priv { u8 net_id; /* for PRP, it occupies most significant 3 bits * of lan_id */ + bool fwd_offloaded; /* Forwarding offloaded to HW */ unsigned char sup_multicast_addr[ETH_ALEN] __aligned(sizeof(u16)); /* Align to u16 boundary to avoid unaligned access * in ether_addr_equal diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index b70e6bbf6021..e5742f2a2d52 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -131,9 +131,14 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct hsr_port *master; int res; - res = dev_set_promiscuity(dev, 1); - if (res) - return res; + /* Don't use promiscuous mode for offload since L2 frame forward + * happens at the offloaded hardware. + */ + if (!port->hsr->fwd_offloaded) { + res = dev_set_promiscuity(dev, 1); + if (res) + return res; + } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; @@ -152,7 +157,9 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, fail_rx_handler: netdev_upper_dev_unlink(dev, hsr_dev); fail_upper_dev_link: - dev_set_promiscuity(dev, -1); + if (!port->hsr->fwd_offloaded) + dev_set_promiscuity(dev, -1); + return res; } diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 1fa2fe041ec0..9c124705120d 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -162,7 +162,7 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; - return sk->sk_prot->ioctl(sk, cmd, arg); + return sk_ioctl(sk, cmd, (void __user *)arg); } } @@ -531,22 +531,21 @@ out: return err; } -static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int dgram_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); + *karg = sk_wmem_alloc_get(sk); - return put_user(amount, (int __user *)arg); + return 0; } case SIOCINQ: { struct sk_buff *skb; - unsigned long amount; - amount = 0; + *karg = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) { @@ -554,10 +553,10 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) * of this packet since that is all * that will be read. 
*/ - amount = skb->len - ieee802154_hdr_length(skb); + *karg = skb->len - ieee802154_hdr_length(skb); } spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); + return 0; } } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4a76ebf793b8..38e649fb4474 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -100,6 +100,7 @@ #include <net/ip_fib.h> #include <net/inet_connection_sock.h> #include <net/gro.h> +#include <net/gso.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> @@ -732,6 +733,20 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet_stream_connect); +void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk) +{ + sock_rps_record_flow(newsk); + WARN_ON(!((1 << newsk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_RECV | + TCPF_CLOSE_WAIT | TCPF_CLOSE))); + + if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) + set_bit(SOCK_SUPPORT_ZC, &newsock->flags); + sock_graft(newsk, newsock); + + newsock->state = SS_CONNECTED; +} + /* * Accept a pending connection. The TCP layer now gives BSD semantics. */ @@ -745,24 +760,12 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags, /* IPV6_ADDRFORM can change sk->sk_prot under us. */ sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern); if (!sk2) - goto do_err; + return err; lock_sock(sk2); - - sock_rps_record_flow(sk2); - WARN_ON(!((1 << sk2->sk_state) & - (TCPF_ESTABLISHED | TCPF_SYN_RECV | - TCPF_CLOSE_WAIT | TCPF_CLOSE))); - - if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) - set_bit(SOCK_SUPPORT_ZC, &newsock->flags); - sock_graft(sk2, newsock); - - newsock->state = SS_CONNECTED; - err = 0; + __inet_accept(sock, newsock, sk2); release_sock(sk2); -do_err: - return err; + return 0; } EXPORT_SYMBOL(inet_accept); @@ -829,6 +832,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } EXPORT_SYMBOL(inet_sendmsg); +void inet_splice_eof(struct socket *sock) +{ + const struct proto *prot; + struct sock *sk = sock->sk; + + if (unlikely(inet_send_prepare(sk))) + return; + + /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ + prot = READ_ONCE(sk->sk_prot); + if (prot->splice_eof) + prot->splice_eof(sock); +} +EXPORT_SYMBOL_GPL(inet_splice_eof); + ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { @@ -980,7 +998,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: if (sk->sk_prot->ioctl) - err = sk->sk_prot->ioctl(sk, cmd, arg); + err = sk_ioctl(sk, cmd, (void __user *)arg); else err = -ENOIOCTLCMD; break; @@ -1048,6 +1066,7 @@ const struct proto_ops inet_stream_ops = { #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, @@ -1082,6 +1101,7 @@ const struct proto_ops inet_dgram_ops = { .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT @@ -1113,6 +1133,7 @@ static const struct proto_ops inet_sockraw_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index ee848be59e65..10e96ed6c9e3 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -17,6 +17,7 @@ #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> +#include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c index 6c37c4f98cca..98b90107b5ab 100644 --- a/net/ipv4/fou_nl.c +++ b/net/ipv4/fou_nl.c @@ -8,7 +8,7 @@ #include "fou_nl.h" -#include <linux/fou.h> +#include <uapi/linux/fou.h> /* Global operation policy for fou */ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = { diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h index dbd0780a5d34..63a6c4ed803d 100644 --- a/net/ipv4/fou_nl.h +++ b/net/ipv4/fou_nl.h @@ -9,7 +9,7 @@ #include <net/netlink.h> #include <net/genetlink.h> -#include <linux/fou.h> +#include <uapi/linux/fou.h> /* Global operation policy for fou */ extern const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1]; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 2b9cb5398335..311e70bfce40 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -11,6 +11,7 @@ #include <net/protocol.h> #include <net/gre.h> #include <net/gro.h> +#include <net/gso.h> static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1386787eaf1a..0cc19cfbb673 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -706,20 +706,23 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) out: release_sock(sk); if (newsk && mem_cgroup_sockets_enabled) { - int amt; + int amt = 0; /* atomically get the memory usage, set and charge the * newsk->sk_memcg. */ lock_sock(newsk); - /* The socket has not been accepted yet, no need to look at - * newsk->sk_wmem_queued. - */ - amt = sk_mem_pages(newsk->sk_forward_alloc + - atomic_read(&newsk->sk_rmem_alloc)); mem_cgroup_sk_alloc(newsk); - if (newsk->sk_memcg && amt) + if (newsk->sk_memcg) { + /* The socket has not been accepted yet, no need + * to look at newsk->sk_wmem_queued. 
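Stepping back from inet_splice_eof() above: the new splice_eof hook gives splice(2) a way to flush a tail that sendmsg left corked while more data was expected. A sketch of the transport side, assuming a protocol opts in via its struct proto; the foo_* names are hypothetical and the real TCP/UDP implementations land elsewhere in this series:

/* Sketch: push out whatever remains queued once the splice source is
 * exhausted, so a corked tail does not linger.
 */
static void foo_splice_eof(struct socket *sock)
{
        struct sock *sk = sock->sk;

        lock_sock(sk);
        foo_push_pending(sk); /* hypothetical: transmit the queued tail */
        release_sock(sk);
}

static struct proto foo_prot = {
        .name       = "FOO",
        .splice_eof = foo_splice_eof,
        /* remaining ops elided */
};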
+ */ + amt = sk_mem_pages(newsk->sk_forward_alloc + + atomic_read(&newsk->sk_rmem_alloc)); + } + + if (amt) mem_cgroup_charge_skmem(newsk->sk_memcg, amt, GFP_KERNEL | __GFP_NOFAIL); @@ -792,7 +795,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, - RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, @@ -830,7 +833,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, fl4 = &newinet->cork.fl.u.ip4; flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, - RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e55a20264960..81a1cce1a7d1 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -189,10 +189,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info, } #if IS_ENABLED(CONFIG_IPV6) - if (tpi->proto == htons(ETH_P_IPV6) && - !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, - type, data_len)) - return 0; + if (tpi->proto == htons(ETH_P_IPV6) && + !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return 0; #endif if (t->parms.iph.daddr == 0 || diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 61892268e8a6..6e70839257f7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -73,6 +73,7 @@ #include <net/arp.h> #include <net/icmp.h> #include <net/checksum.h> +#include <net/gso.h> #include <net/inetpeer.h> #include <net/inet_ecn.h> #include <net/lwtunnel.h> @@ -946,17 +947,6 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk } EXPORT_SYMBOL(ip_generic_getfrag); -static inline __wsum -csum_page(struct page *page, int offset, int copy) -{ - char *kaddr; - __wsum csum; - kaddr = kmap(page); - csum = csum_partial(kaddr + offset, copy, 0); - kunmap(page); - return csum; -} - static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, @@ -1048,6 +1038,15 @@ static int __ip_append_data(struct sock *sk, skb_zcopy_set(skb, uarg, &extra_uref); } } + } else if ((flags & MSG_SPLICE_PAGES) && length) { + if (inet->hdrincl) + return -EPERM; + if (rt->dst.dev->features & NETIF_F_SG && + getfrag == ip_generic_getfrag) + /* We need an empty buffer to attach stuff to */ + paged = true; + else + flags &= ~MSG_SPLICE_PAGES; } cork->length += length; @@ -1207,6 +1206,15 @@ alloc_new_skb: err = -EFAULT; goto error; } + } else if (flags & MSG_SPLICE_PAGES) { + struct msghdr *msg = from; + + err = skb_splice_from_iter(skb, &msg->msg_iter, copy, + sk->sk_allocation); + if (err < 0) + goto error; + copy = err; + wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; @@ -1310,10 +1318,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, } /* - * ip_append_data() and ip_append_page() can make one large IP datagram - * from many pieces of data. Each pieces will be holded on the socket - * until ip_push_pending_frames() is called. Each piece can be a page - * or non-page data. + * ip_append_data() can make one large IP datagram from many pieces of + * data. 
Each piece will be held on the socket until + * ip_push_pending_frames() is called. Each piece can be a page or + * non-page data. * * Not only UDP, other transport protocols - e.g. raw sockets - can use * this interface potentially. @@ -1346,134 +1354,6 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4, from, length, transhdrlen, flags); } -ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, - int offset, size_t size, int flags) -{ - struct inet_sock *inet = inet_sk(sk); - struct sk_buff *skb; - struct rtable *rt; - struct ip_options *opt = NULL; - struct inet_cork *cork; - int hh_len; - int mtu; - int len; - int err; - unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize; - - if (inet->hdrincl) - return -EPERM; - - if (flags&MSG_PROBE) - return 0; - - if (skb_queue_empty(&sk->sk_write_queue)) - return -EINVAL; - - cork = &inet->cork.base; - rt = (struct rtable *)cork->dst; - if (cork->flags & IPCORK_OPT) - opt = cork->opt; - - if (!(rt->dst.dev->features & NETIF_F_SG)) - return -EOPNOTSUPP; - - hh_len = LL_RESERVED_SPACE(rt->dst.dev); - mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; - - fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; - - if (cork->length + size > maxnonfragsize - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, - mtu - (opt ? opt->optlen : 0)); - return -EMSGSIZE; - } - - skb = skb_peek_tail(&sk->sk_write_queue); - if (!skb) - return -EINVAL; - - cork->length += size; - - while (size > 0) { - /* Check if the remaining data fits into current packet. */ - len = mtu - skb->len; - if (len < size) - len = maxfraglen - skb->len; - - if (len <= 0) { - struct sk_buff *skb_prev; - int alloclen; - - skb_prev = skb; - fraggap = skb_prev->len - maxfraglen; - - alloclen = fragheaderlen + hh_len + fraggap + 15; - skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); - if (unlikely(!skb)) { - err = -ENOBUFS; - goto error; - } - - /* - * Fill in the control structures - */ - skb->ip_summed = CHECKSUM_NONE; - skb->csum = 0; - skb_reserve(skb, hh_len); - - /* - * Find where to start putting bytes. - */ - skb_put(skb, fragheaderlen + fraggap); - skb_reset_network_header(skb); - skb->transport_header = (skb->network_header + - fragheaderlen); - if (fraggap) { - skb->csum = skb_copy_and_csum_bits(skb_prev, - maxfraglen, - skb_transport_header(skb), - fraggap); - skb_prev->csum = csum_sub(skb_prev->csum, - skb->csum); - pskb_trim_unique(skb_prev, maxfraglen); - } - - /* - * Put the packet on the pending queue. 
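The ip_append_page() path being deleted here is superseded by MSG_SPLICE_PAGES. A sketch of the replacement pattern callers adopt, mirroring the tcp_sendpage_locked() conversion later in this diff; foo_sendmsg() stands in for a hypothetical transport's sendmsg:

/* Sketch: express one page as a single-entry bio_vec and push it
 * through the regular sendmsg path with MSG_SPLICE_PAGES set.
 */
static int foo_sendpage(struct sock *sk, struct page *page, int offset,
                        size_t size, int flags)
{
        struct bio_vec bvec;
        struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES };

        bvec_set_page(&bvec, page, size, offset);
        iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
        return foo_sendmsg(sk, &msg, size); /* hypothetical transport */
}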
- */ - __skb_queue_tail(&sk->sk_write_queue, skb); - continue; - } - - if (len > size) - len = size; - - if (skb_append_pagefrags(skb, page, offset, len)) { - err = -EMSGSIZE; - goto error; - } - - if (skb->ip_summed == CHECKSUM_NONE) { - __wsum csum; - csum = csum_page(page, offset, len); - skb->csum = csum_block_add(skb->csum, csum, skb->len); - } - - skb_len_add(skb, len); - refcount_add(len, &sk->sk_wmem_alloc); - offset += len; - size -= len; - } - return 0; - -error: - cork->length -= size; - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); - return err; -} - static void ip_cork_release(struct inet_cork *cork) { cork->flags &= ~IPCORK_OPT; @@ -1692,7 +1572,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len, u64 transmit_time) + unsigned int len, u64 transmit_time, u32 txhash) { struct ip_options_data replyopts; struct ipcm_cookie ipc; @@ -1755,6 +1635,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; nskb->mono_delivery_time = !!transmit_time; + if (txhash) + skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); } out: diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index e90bc0aa85c7..c56b6fe6f0d7 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -665,6 +665,9 @@ static struct packet_type bootp_packet_type __initdata = { .func = ic_bootp_recv, }; +/* DHCPACK can overwrite DNS if fallback was set upon first BOOTP reply */ +static int ic_nameservers_fallback __initdata; + /* * Initialize DHCP/BOOTP extension fields in the request. */ @@ -938,7 +941,8 @@ static void __init ic_do_bootp_ext(u8 *ext) if (servers > CONF_NAMESERVERS_MAX) servers = CONF_NAMESERVERS_MAX; for (i = 0; i < servers; i++) { - if (ic_nameservers[i] == NONE) + if (ic_nameservers[i] == NONE || + ic_nameservers_fallback) memcpy(&ic_nameservers[i], ext+1+4*i, 4); } break; @@ -1158,8 +1162,10 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str ic_addrservaddr = b->iph.saddr; if (ic_gateway == NONE && b->relay_ip) ic_gateway = b->relay_ip; - if (ic_nameservers[0] == NONE) + if (ic_nameservers[0] == NONE) { ic_nameservers[0] = ic_servaddr; + ic_nameservers_fallback = 1; + } ic_got_reply = IC_BOOTP; drop_unlock: diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index eec1f6df80d8..3f0c6d602fb7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1547,6 +1547,28 @@ out: return ret; } +/* Execute if this ioctl is a special mroute ioctl */ +int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + switch (cmd) { + /* These userspace buffers will be consumed by ipmr_ioctl() */ + case SIOCGETVIFCNT: { + struct sioc_vif_req buffer; + + return sock_ioctl_inout(sk, cmd, arg, &buffer, + sizeof(buffer)); + } + case SIOCGETSGCNT: { + struct sioc_sg_req buffer; + + return sock_ioctl_inout(sk, cmd, arg, &buffer, + sizeof(buffer)); + } + } + /* return code > 0 means that the ioctl was not executed */ + return 1; +} + /* Getsock opt support for the multicast routing system. */ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) @@ -1593,13 +1615,13 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, } /* The IP multicast ioctl support routines. 
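ipmr_sk_ioctl() above is the companion pattern for struct-based ioctls: a stack buffer handed to sock_ioctl_inout(), which copies the request in and the result back out. A sketch of reusing it for another protocol; SIOCGETFOO and struct foo_req are hypothetical:

/* Sketch: route a struct-carrying ioctl through sock_ioctl_inout() so
 * the core performs both copy_from_user() and copy_to_user().
 */
static int foo_sk_ioctl(struct sock *sk, unsigned int cmd,
                        void __user *arg)
{
        switch (cmd) {
        case SIOCGETFOO: {              /* hypothetical command */
                struct foo_req buffer;  /* hypothetical request struct */

                return sock_ioctl_inout(sk, cmd, arg, &buffer,
                                        sizeof(buffer));
        }
        }
        return 1; /* >0: not ours, the caller falls back to the default */
}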
*/ -int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +int ipmr_ioctl(struct sock *sk, int cmd, void *arg) { - struct sioc_sg_req sr; - struct sioc_vif_req vr; struct vif_device *vif; struct mfc_cache *c; struct net *net = sock_net(sk); + struct sioc_vif_req *vr; + struct sioc_sg_req *sr; struct mr_table *mrt; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); @@ -1608,40 +1630,33 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) switch (cmd) { case SIOCGETVIFCNT: - if (copy_from_user(&vr, arg, sizeof(vr))) - return -EFAULT; - if (vr.vifi >= mrt->maxvif) + vr = (struct sioc_vif_req *)arg; + if (vr->vifi >= mrt->maxvif) return -EINVAL; - vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); + vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif); rcu_read_lock(); - vif = &mrt->vif_table[vr.vifi]; - if (VIF_EXISTS(mrt, vr.vifi)) { - vr.icount = READ_ONCE(vif->pkt_in); - vr.ocount = READ_ONCE(vif->pkt_out); - vr.ibytes = READ_ONCE(vif->bytes_in); - vr.obytes = READ_ONCE(vif->bytes_out); + vif = &mrt->vif_table[vr->vifi]; + if (VIF_EXISTS(mrt, vr->vifi)) { + vr->icount = READ_ONCE(vif->pkt_in); + vr->ocount = READ_ONCE(vif->pkt_out); + vr->ibytes = READ_ONCE(vif->bytes_in); + vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); - if (copy_to_user(arg, &vr, sizeof(vr))) - return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT: - if (copy_from_user(&sr, arg, sizeof(sr))) - return -EFAULT; + sr = (struct sioc_sg_req *)arg; rcu_read_lock(); - c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); + c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = c->_c.mfc_un.res.pkt; + sr->bytecnt = c->_c.mfc_un.res.bytes; + sr->wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); - - if (copy_to_user(arg, &sr, sizeof(sr))) - return -EFAULT; return 0; } rcu_read_unlock(); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 5178a3f3cb53..25dd78cee179 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -49,13 +49,8 @@ #include <net/transp_v6.h> #endif -#define ping_portaddr_for_each_entry(__sk, node, list) \ - hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) -#define ping_portaddr_for_each_entry_rcu(__sk, node, list) \ - hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) - struct ping_table { - struct hlist_nulls_head hash[PING_HTABLE_SIZE]; + struct hlist_head hash[PING_HTABLE_SIZE]; spinlock_t lock; }; @@ -74,17 +69,16 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) } EXPORT_SYMBOL_GPL(ping_hash); -static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, - struct net *net, unsigned int num) +static inline struct hlist_head *ping_hashslot(struct ping_table *table, + struct net *net, unsigned int num) { return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; } int ping_get_port(struct sock *sk, unsigned short ident) { - struct hlist_nulls_node *node; - struct hlist_nulls_head *hlist; struct inet_sock *isk, *isk2; + struct hlist_head *hlist; struct sock *sk2 = NULL; isk = inet_sk(sk); @@ -98,7 +92,7 @@ int ping_get_port(struct sock *sk, unsigned short ident) result++; /* avoid zero */ hlist = ping_hashslot(&ping_table, sock_net(sk), result); - ping_portaddr_for_each_entry(sk2, node, hlist) { + sk_for_each(sk2, hlist) { isk2 = inet_sk(sk2); if (isk2->inet_num == result) @@ -115,7 +109,7 @@ next_port: goto 
fail; } else { hlist = ping_hashslot(&ping_table, sock_net(sk), ident); - ping_portaddr_for_each_entry(sk2, node, hlist) { + sk_for_each(sk2, hlist) { isk2 = inet_sk(sk2); /* BUG? Why is this reuse and not reuseaddr? ping.c @@ -133,9 +127,8 @@ next_port: isk->inet_num = ident; if (sk_unhashed(sk)) { pr_debug("was not hashed\n"); - sock_hold(sk); + sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); - hlist_nulls_add_head_rcu(&sk->sk_nulls_node, hlist); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } spin_unlock(&ping_table.lock); @@ -161,9 +154,7 @@ void ping_unhash(struct sock *sk) pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); spin_lock(&ping_table.lock); - if (sk_hashed(sk)) { - hlist_nulls_del_init_rcu(&sk->sk_nulls_node); - sock_put(sk); + if (sk_del_node_init_rcu(sk)) { isk->inet_num = 0; isk->inet_sport = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); @@ -175,10 +166,9 @@ EXPORT_SYMBOL_GPL(ping_unhash); /* Called under rcu_read_lock() */ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) { - struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); + struct hlist_head *hslot = ping_hashslot(&ping_table, net, ident); struct sock *sk = NULL; struct inet_sock *isk; - struct hlist_nulls_node *hnode; int dif, sdif; if (skb->protocol == htons(ETH_P_IP)) { @@ -197,7 +187,7 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) return NULL; } - ping_portaddr_for_each_entry_rcu(sk, hnode, hslot) { + sk_for_each_rcu(sk, hslot) { isk = inet_sk(sk); pr_debug("iterate\n"); @@ -715,7 +705,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; - u8 tos; + u8 tos, scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -779,11 +769,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr = ipc.opt->opt.faddr; } tos = get_rttos(&ipc, inet); - if (sock_flag(sk, SOCK_LOCALROUTE) || - (msg->msg_flags & MSG_DONTROUTE) || - (ipc.opt && ipc.opt->opt.is_strictroute)) { - tos |= RTO_ONLINK; - } + scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) @@ -793,10 +779,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) ipc.oif = inet->uc_index; - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, - RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, - sk->sk_uid); + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, + saddr, 0, 0, sk->sk_uid); fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; @@ -1045,15 +1030,14 @@ static struct sock *ping_get_first(struct seq_file *seq, int start) for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; ++state->bucket) { - struct hlist_nulls_node *node; - struct hlist_nulls_head *hslot; + struct hlist_head *hslot; hslot = &ping_table.hash[state->bucket]; - if (hlist_nulls_empty(hslot)) + if (hlist_empty(hslot)) continue; - sk_nulls_for_each(sk, node, hslot) { + sk_for_each(sk, hslot) { if (net_eq(sock_net(sk), net) && sk->sk_family == state->family) goto found; @@ -1070,7 +1054,7 @@ static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) struct net *net = seq_file_net(seq); do { - sk = sk_nulls_next(sk); + sk = sk_next(sk); 
} while (sk && (!net_eq(sock_net(sk), net))); if (!sk) @@ -1206,6 +1190,6 @@ void __init ping_init(void) int i; for (i = 0; i < PING_HTABLE_SIZE; i++) - INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i); + INIT_HLIST_HEAD(&ping_table.hash[i]); spin_lock_init(&ping_table.lock); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index eadf1c9ef7e4..7782ff5e6539 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -476,10 +476,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; + u8 tos, scope; int free = 0; __be32 daddr; __be32 saddr; - u8 tos; int err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; @@ -575,9 +575,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = ipc.opt->opt.faddr; } } - tos = get_rtconn_flags(&ipc, sk); - if (msg->msg_flags & MSG_DONTROUTE) - tos |= RTO_ONLINK; + tos = get_rttos(&ipc, inet); + scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) @@ -600,8 +599,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, - RT_SCOPE_UNIVERSE, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), @@ -858,29 +856,29 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, return do_raw_getsockopt(sk, level, optname, optval, optlen); } -static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int raw_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); - - return put_user(amount, (int __user *)arg); + *karg = sk_wmem_alloc_get(sk); + return 0; } case SIOCINQ: { struct sk_buff *skb; - int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) - amount = skb->len; + *karg = skb->len; + else + *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); + return 0; } default: #ifdef CONFIG_IP_MROUTE - return ipmr_ioctl(sk, cmd, (void __user *)arg); + return ipmr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 26fb97d1d4d9..dc478a0574cb 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -418,8 +418,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) * no easy way to do this. */ flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark, - RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, - inet_sk_flowi_flags(sk), + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), + IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? 
opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 88dfe51e68f3..2afb0870648b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -34,6 +34,7 @@ static int ip_ttl_min = 1; static int ip_ttl_max = 255; static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; +static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT; static unsigned long ip_ping_group_range_min[] = { 0, 0 }; static unsigned long ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static u32 u32_max_div_HZ = UINT_MAX / HZ; @@ -1470,6 +1471,24 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_plb_max_cong_thresh, }, + { + .procname = "tcp_syn_linear_timeouts", + .data = &init_net.ipv4.sysctl_tcp_syn_linear_timeouts, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_syn_linear_timeouts_max, + }, + { + .procname = "tcp_shrink_window", + .data = &init_net.ipv4.sysctl_tcp_shrink_window, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8d20d9221238..71b42eef9dbf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -599,7 +599,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } EXPORT_SYMBOL(tcp_poll); -int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +int tcp_ioctl(struct sock *sk, int cmd, int *karg) { struct tcp_sock *tp = tcp_sk(sk); int answ; @@ -641,7 +641,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -ENOIOCTLCMD; } - return put_user(answ, (int __user *)arg); + *karg = answ; + return 0; } EXPORT_SYMBOL(tcp_ioctl); @@ -858,12 +859,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_SYMBOL(tcp_splice_read); -struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, +struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) { struct sk_buff *skb; - skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp); + skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { bool mem_scheduled; @@ -957,7 +958,7 @@ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb) } -static int tcp_wmem_schedule(struct sock *sk, int copy) +int tcp_wmem_schedule(struct sock *sk, int copy) { int left; @@ -974,175 +975,24 @@ static int tcp_wmem_schedule(struct sock *sk, int copy) return min(copy, sk->sk_forward_alloc); } -static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, - struct page *page, int offset, size_t *size) -{ - struct sk_buff *skb = tcp_write_queue_tail(sk); - struct tcp_sock *tp = tcp_sk(sk); - bool can_coalesce; - int copy, i; - - if (!skb || (copy = size_goal - skb->len) <= 0 || - !tcp_skb_can_collapse_to(skb)) { -new_segment: - if (!sk_stream_memory_free(sk)) - return NULL; - - skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, - tcp_rtx_and_write_queues_empty(sk)); - if (!skb) - return NULL; - -#ifdef CONFIG_TLS_DEVICE - skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); -#endif - tcp_skb_entail(sk, skb); - copy = size_goal; - } - - if (copy > *size) - copy = *size; - - i = skb_shinfo(skb)->nr_frags; - can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= 
READ_ONCE(sysctl_max_skb_frags)) { - tcp_mark_push(tp, skb); - goto new_segment; - } - if (tcp_downgrade_zcopy_pure(sk, skb)) - return NULL; - - copy = tcp_wmem_schedule(sk, copy); - if (!copy) - return NULL; - - if (can_coalesce) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); - } else { - get_page(page); - skb_fill_page_desc_noacc(skb, i, page, offset, copy); - } - - if (!(flags & MSG_NO_SHARED_FRAGS)) - skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; - - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; - sk_wmem_queued_add(sk, copy); - sk_mem_charge(sk, copy); - WRITE_ONCE(tp->write_seq, tp->write_seq + copy); - TCP_SKB_CB(skb)->end_seq += copy; - tcp_skb_pcount_set(skb, 0); - - *size = copy; - return skb; -} - -ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct tcp_sock *tp = tcp_sk(sk); - int mss_now, size_goal; - int err; - ssize_t copied; - long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - - if (IS_ENABLED(CONFIG_DEBUG_VM) && - WARN_ONCE(!sendpage_ok(page), - "page must not be a Slab one and have page_count > 0")) - return -EINVAL; - - /* Wait for a connection to finish. One exception is TCP Fast Open - * (passive side) where data is allowed to be sent before a connection - * is fully established. - */ - if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && - !tcp_passive_fastopen(sk)) { - err = sk_stream_wait_connect(sk, &timeo); - if (err != 0) - goto out_err; - } - - sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - - mss_now = tcp_send_mss(sk, &size_goal, flags); - copied = 0; - - err = -EPIPE; - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto out_err; - - while (size > 0) { - struct sk_buff *skb; - size_t copy = size; - - skb = tcp_build_frag(sk, size_goal, flags, page, offset, ©); - if (!skb) - goto wait_for_space; - - if (!copied) - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; - - copied += copy; - offset += copy; - size -= copy; - if (!size) - goto out; - - if (skb->len < size_goal || (flags & MSG_OOB)) - continue; - - if (forced_push(tp)) { - tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); - } else if (skb == tcp_send_head(sk)) - tcp_push_one(sk, mss_now); - continue; - -wait_for_space: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - tcp_push(sk, flags & ~MSG_MORE, mss_now, - TCP_NAGLE_PUSH, size_goal); - - err = sk_stream_wait_memory(sk, &timeo); - if (err != 0) - goto do_error; - - mss_now = tcp_send_mss(sk, &size_goal, flags); - } - -out: - if (copied) { - tcp_tx_timestamp(sk, sk->sk_tsflags); - if (!(flags & MSG_SENDPAGE_NOTLAST)) - tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); - } - return copied; - -do_error: - tcp_remove_empty_skb(sk); - if (copied) - goto out; -out_err: - /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { - sk->sk_write_space(sk); - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); - } - return sk_stream_error(sk, flags, err); -} -EXPORT_SYMBOL_GPL(do_tcp_sendpages); - int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; + if (!(sk->sk_route_caps & NETIF_F_SG)) return sock_no_sendpage_locked(sk, page, offset, size, flags); tcp_rate_check_app_limited(sk); /* is sending application-limited? 
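do_tcp_sendpages() is removed above because sendpage can now be expressed as a sendmsg() carrying MSG_SPLICE_PAGES. The same few-line shim recurs in tcp_sendpage_locked(), tcp_bpf_push(), tcp_bpf_sendpage() and udp_sendpage() below; isolated here as a sketch:

        /* Sketch of the recurring shim: wrap the page in a one-element
         * bio_vec, aim the message iterator at it, and let sendmsg
         * splice (or, failing that, copy) the data.
         */
        static ssize_t sketch_sendpage_shim(struct sock *sk, struct page *page,
                                            int offset, size_t size, int flags)
        {
                struct bio_vec bvec;
                struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES };

                bvec_set_page(&bvec, page, size, offset);
                iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);

                if (flags & MSG_SENDPAGE_NOTLAST)
                        msg.msg_flags |= MSG_MORE;      /* more pages coming */

                return tcp_sendmsg_locked(sk, &msg, size);
        }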
*/ - return do_tcp_sendpages(sk, page, offset, size, flags); + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + + if (flags & MSG_SENDPAGE_NOTLAST) + msg.msg_flags |= MSG_MORE; + + return tcp_sendmsg_locked(sk, &msg, size); } EXPORT_SYMBOL_GPL(tcp_sendpage_locked); @@ -1223,28 +1073,31 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; - bool zc = false; + int zc = 0; long timeo; flags = msg->msg_flags; if ((flags & MSG_ZEROCOPY) && size) { - skb = tcp_write_queue_tail(sk); - if (msg->msg_ubuf) { uarg = msg->msg_ubuf; - net_zcopy_get(uarg); - zc = sk->sk_route_caps & NETIF_F_SG; + if (sk->sk_route_caps & NETIF_F_SG) + zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { + skb = tcp_write_queue_tail(sk); uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; } - zc = sk->sk_route_caps & NETIF_F_SG; - if (!zc) + if (sk->sk_route_caps & NETIF_F_SG) + zc = MSG_ZEROCOPY; + else uarg_to_msgzc(uarg)->zerocopy = 0; } + } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { + if (sk->sk_route_caps & NETIF_F_SG) + zc = MSG_SPLICE_PAGES; } if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && @@ -1307,7 +1160,7 @@ restart: goto do_error; while (msg_data_left(msg)) { - int copy = 0; + ssize_t copy = 0; skb = tcp_write_queue_tail(sk); if (skb) @@ -1326,7 +1179,7 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, + skb = tcp_stream_alloc_skb(sk, sk->sk_allocation, first_skb); if (!skb) goto wait_for_space; @@ -1348,7 +1201,7 @@ new_segment: if (copy > msg_data_left(msg)) copy = msg_data_left(msg); - if (!zc) { + if (zc == 0) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1393,7 +1246,7 @@ new_segment: page_ref_inc(pfrag->page); } pfrag->offset += copy; - } else { + } else if (zc == MSG_ZEROCOPY) { /* First append to a fragless skb builds initial * pure zerocopy skb */ @@ -1414,6 +1267,30 @@ new_segment: if (err < 0) goto do_error; copy = err; + } else if (zc == MSG_SPLICE_PAGES) { + /* Splice in data if we can; copy if we can't. 
*/ + if (tcp_downgrade_zcopy_pure(sk, skb)) + goto wait_for_space; + copy = tcp_wmem_schedule(sk, copy); + if (!copy) + goto wait_for_space; + + err = skb_splice_from_iter(skb, &msg->msg_iter, copy, + sk->sk_allocation); + if (err < 0) { + if (err == -EMSGSIZE) { + tcp_mark_push(tp, skb); + goto new_segment; + } + goto do_error; + } + copy = err; + + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; + + sk_wmem_queued_add(sk, copy); + sk_mem_charge(sk, copy); } if (!copied) @@ -1459,7 +1336,9 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: - net_zcopy_put(uarg); + /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ + if (uarg && !msg->msg_ubuf) + net_zcopy_put(uarg); return copied + copied_syn; do_error: @@ -1468,7 +1347,9 @@ do_error: if (copied + copied_syn) goto out; out_err: - net_zcopy_put_abort(uarg, true); + /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ + if (uarg && !msg->msg_ubuf) + net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { @@ -1491,6 +1372,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } EXPORT_SYMBOL(tcp_sendmsg); +void tcp_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); + int mss_now, size_goal; + + if (!tcp_write_queue_tail(sk)) + return; + + lock_sock(sk); + mss_now = tcp_send_mss(sk, &size_goal, 0); + tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); + release_sock(sk); +} +EXPORT_SYMBOL_GPL(tcp_splice_eof); + /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) @@ -1877,7 +1774,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb, } #ifdef CONFIG_MMU -static const struct vm_operations_struct tcp_vm_ops = { +const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, @@ -2176,6 +2073,34 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, } } +static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, + unsigned long address, + bool *mmap_locked) +{ + struct vm_area_struct *vma = NULL; + +#ifdef CONFIG_PER_VMA_LOCK + vma = lock_vma_under_rcu(mm, address); +#endif + if (vma) { + if (!vma_is_tcp(vma)) { + vma_end_read(vma); + return NULL; + } + *mmap_locked = false; + return vma; + } + + mmap_read_lock(mm); + vma = vma_lookup(mm, address); + if (!vma || !vma_is_tcp(vma)) { + mmap_read_unlock(mm); + return NULL; + } + *mmap_locked = true; + return vma; +} + #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc, @@ -2193,6 +2118,7 @@ static int tcp_zerocopy_receive(struct sock *sk, u32 seq = tp->copied_seq; u32 total_bytes_to_map; int inq = tcp_inq(sk); + bool mmap_locked; int ret; zc->copybuf_len = 0; @@ -2217,13 +2143,10 @@ static int tcp_zerocopy_receive(struct sock *sk, return 0; } - mmap_read_lock(current->mm); - - vma = vma_lookup(current->mm, address); - if (!vma || vma->vm_ops != &tcp_vm_ops) { - mmap_read_unlock(current->mm); + vma = find_tcp_vma(current->mm, address, &mmap_locked); + if (!vma) return -EINVAL; - } + vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); @@ -2297,7 +2220,10 @@ static int tcp_zerocopy_receive(struct sock *sk, zc, 
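The bool-to-int change above turns tcp_sendmsg_locked()'s zerocopy flag into a three-way mode selector, and the splice branch inherits the write-queue accounting that the removed tcp_build_frag() used to do. A condensed restatement (illustrative, not new logic):

        /* zc == 0:                copy from the iterator into page frags
         * zc == MSG_ZEROCOPY:     pin user pages via the uarg machinery
         * zc == MSG_SPLICE_PAGES: splice caller-supplied pages into skbs,
         *                         restarting on -EMSGSIZE with a fresh skb
         *
         * Both non-copy modes require NETIF_F_SG on the route; without
         * it they degrade to plain copying.
         */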
total_bytes_to_map); } out: - mmap_read_unlock(current->mm); + if (mmap_locked) + mmap_read_unlock(current->mm); + else + vma_end_read(vma); /* Try to copy straggler data. */ if (!ret) copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); @@ -4680,8 +4606,10 @@ int tcp_abort(struct sock *sk, int err) return 0; } - /* Don't race with userspace socket closes such as tcp_close. */ - lock_sock(sk); + /* BPF context ensures sock locking. */ + if (!has_current_bpf_ctx()) + /* Don't race with userspace socket closes such as tcp_close. */ + lock_sock(sk); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); @@ -4705,7 +4633,8 @@ int tcp_abort(struct sock *sk, int err) bh_unlock_sock(sk); local_bh_enable(); tcp_write_queue_purge(sk); - release_sock(sk); + if (!has_current_bpf_ctx()) + release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 5f93918c063c..5a84053ac62b 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -90,11 +90,13 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, { bool apply = apply_bytes; struct scatterlist *sge; + struct msghdr msghdr = { .msg_flags = flags | MSG_SPLICE_PAGES, }; struct page *page; int size, ret = 0; u32 off; while (1) { + struct bio_vec bvec; bool has_tx_ulp; sge = sk_msg_elem(msg, msg->sg.start); @@ -106,16 +108,18 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, tcp_rate_check_app_limited(sk); retry: has_tx_ulp = tls_sw_has_ctx_tx(sk); - if (has_tx_ulp) { - flags |= MSG_SENDPAGE_NOPOLICY; - ret = kernel_sendpage_locked(sk, - page, off, size, flags); - } else { - ret = do_tcp_sendpages(sk, page, off, size, flags); - } + if (has_tx_ulp) + msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY; + + if (flags & MSG_SENDPAGE_NOTLAST) + msghdr.msg_flags |= MSG_MORE; + bvec_set_page(&bvec, page, size, off); + iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size); + ret = tcp_sendmsg_locked(sk, &msghdr, size); if (ret <= 0) return ret; + if (apply) apply_bytes -= ret; msg->sg.size -= ret; @@ -481,7 +485,7 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) long timeo; int flags; - /* Don't let internal do_tcp_sendpages() flags through */ + /* Don't let internal sendpage flags through */ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); flags |= MSG_NO_SHARED_FRAGS; @@ -564,49 +568,18 @@ out_err: static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { - struct sk_msg tmp, *msg = NULL; - int err = 0, copied = 0; - struct sk_psock *psock; - bool enospc = false; + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = flags | MSG_SPLICE_PAGES, + }; - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_sendpage(sk, page, offset, size, flags); - - lock_sock(sk); - if (psock->cork) { - msg = psock->cork; - } else { - msg = &tmp; - sk_msg_init(msg); - } + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - /* Catch case where ring is full and sendpage is stalled. */ - if (unlikely(sk_msg_full(msg))) - goto out_err; - - sk_msg_page_add(msg, page, size, offset); - sk_mem_charge(sk, size); - copied = size; - if (sk_msg_full(msg)) - enospc = true; - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - if (psock->cork_bytes && !enospc) - goto out_err; - /* All cork bytes are accounted, rerun the prog. 
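tcp_abort() above adopts a conditional-locking convention for BPF: callers that arrive from a BPF program (the new socket-destroy path) already hold the socket lock, so taking lock_sock() again would deadlock. udp_abort() later in this diff does the same. The shape of the pattern, sketched:

        static void sketch_locked_op(struct sock *sk)
        {
                /* BPF callers reach here with the socket already locked. */
                if (!has_current_bpf_ctx())
                        lock_sock(sk);

                /* ... act on the socket (here: abort it) ... */

                if (!has_current_bpf_ctx())
                        release_sock(sk);
        }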
*/ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } + if (flags & MSG_SENDPAGE_NOTLAST) + msg.msg_flags |= MSG_MORE; - err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags); -out_err: - release_sock(sk); - sk_psock_put(sk, psock); - return copied ? copied : err; + return tcp_bpf_sendmsg(sk, &msg, size); } enum { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 06d2573685ca..9213804b034f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -692,6 +692,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) u64 transmit_time = 0; struct sock *ctl_sk; struct net *net; + u32 txhash = 0; /* Never send a reset in response to a reset. */ if (th->rst) @@ -829,6 +830,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); xfrm_sk_clone_policy(ctl_sk, sk); + txhash = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_txhash : sk->sk_txhash; } else { ctl_sk->sk_mark = 0; ctl_sk->sk_priority = 0; @@ -837,7 +840,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, - transmit_time); + transmit_time, txhash); xfrm_sk_free_policy(ctl_sk); sock_net_set(ctl_sk, &init_net); @@ -859,7 +862,7 @@ static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, - int reply_flags, u8 tos) + int reply_flags, u8 tos, u32 txhash) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -935,7 +938,7 @@ static void tcp_v4_send_ack(const struct sock *sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, - transmit_time); + transmit_time, txhash); sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); @@ -955,7 +958,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, - tw->tw_tos + tw->tw_tos, + tw->tw_txhash ); inet_twsk_put(tw); @@ -988,7 +992,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 0, tcp_md5_do_lookup(sk, l3index, addr, AF_INET), inet_rsk(req)->no_srccheck ? 
IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos); + ip_hdr(skb)->tos, tcp_rsk(req)->txhash); } /* @@ -2963,7 +2967,6 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; - bool slow; uid_t uid; int ret; @@ -2971,7 +2974,7 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) return 0; if (sk_fullsock(sk)) - slow = lock_sock_fast(sk); + lock_sock(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; @@ -2995,7 +2998,7 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) unlock: if (sk_fullsock(sk)) - unlock_sock_fast(sk, slow); + release_sock(sk); return ret; } @@ -3113,6 +3116,7 @@ struct proto tcp_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_eof = tcp_splice_eof, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, @@ -3276,6 +3280,9 @@ static int __net_init tcp_sk_init(struct net *net) else net->ipv4.tcp_congestion_control = &tcp_reno; + net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; + net->ipv4.sysctl_tcp_shrink_window = 0; + return 0; } @@ -3356,7 +3363,7 @@ static struct bpf_iter_reg tcp_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__tcp, sk_common), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dac0d62120e6..04fc328727e6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -303,6 +303,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; + tw->tw_txhash = sk->sk_txhash; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -311,7 +312,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); - tw->tw_txhash = sk->sk_txhash; tw->tw_ipv6only = sk->sk_ipv6only; } #endif diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 4851211aa60d..8311c38267b5 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -9,6 +9,7 @@ #include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> #include <net/gro.h> +#include <net/gso.h> #include <net/tcp.h> #include <net/protocol.h> @@ -295,7 +296,7 @@ out: return pp; } -int tcp_gro_complete(struct sk_buff *skb) +void tcp_gro_complete(struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); @@ -310,8 +311,6 @@ int tcp_gro_complete(struct sk_buff *skb) if (skb->encapsulation) skb->inner_transport_header = skb->transport_header; - - return 0; } EXPORT_SYMBOL(tcp_gro_complete); @@ -341,7 +340,8 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) if (NAPI_GRO_CB(skb)->is_atomic) skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; - return tcp_gro_complete(skb); + tcp_gro_complete(skb); + return 0; } static const struct net_offload tcpv4_offload = { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cfe128b81a01..2cb39b6dad02 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -260,8 +260,8 @@ static u16 tcp_select_window(struct sock *sk) u32 old_win = tp->rcv_wnd; u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); + struct net 
*net = sock_net(sk); - /* Never shrink the offered window */ if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else @@ -270,11 +270,14 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ - if (new_win == 0) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPWANTZEROWINDOWADV); - new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) { + /* Never shrink the offered window */ + if (new_win == 0) + NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV); + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + } } + tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; @@ -282,7 +285,7 @@ static u16 tcp_select_window(struct sock *sk) * scaled window. */ if (!tp->rx_opt.rcv_wscale && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) + READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -294,10 +297,9 @@ static u16 tcp_select_window(struct sock *sk) if (new_win == 0) { tp->pred_flags = 0; if (old_win) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTOZEROWINDOWADV); + NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV); } else if (old_win == 0) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV); } return new_win; @@ -1530,7 +1532,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; - int nsize, old_factor; + int old_factor; long limit; int nlen; u8 flags; @@ -1538,9 +1540,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (WARN_ON(len > skb->len)) return -EINVAL; - nsize = skb_headlen(skb) - len; - if (nsize < 0) - nsize = 0; + DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb. * We need some allowance to not penalize applications setting small @@ -1560,7 +1560,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; /* Get a new skb... force flag on. */ - buff = tcp_stream_alloc_skb(sk, nsize, gfp, true); + buff = tcp_stream_alloc_skb(sk, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); @@ -1568,7 +1568,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); - nlen = skb->len - len - nsize; + nlen = skb->len - len; buff->truesize += nlen; skb->truesize -= nlen; @@ -1626,13 +1626,7 @@ static int __pskb_trim_head(struct sk_buff *skb, int len) struct skb_shared_info *shinfo; int i, k, eat; - eat = min_t(int, len, skb_headlen(skb)); - if (eat) { - __skb_pull(skb, eat); - len -= eat; - if (!len) - return 0; - } + DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); eat = len; k = 0; shinfo = skb_shinfo(skb); @@ -1671,12 +1665,10 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) TCP_SKB_CB(skb)->seq += len; - if (delta_truesize) { - skb->truesize -= delta_truesize; - sk_wmem_queued_add(sk, -delta_truesize); - if (!skb_zcopy_pure(skb)) - sk_mem_uncharge(sk, delta_truesize); - } + skb->truesize -= delta_truesize; + sk_wmem_queued_add(sk, -delta_truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, delta_truesize); /* Any change of skb->len requires recalculation of tso factor. 
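The tcp_select_window() change above gates the long-standing never-shrink rule behind the new tcp_shrink_window sysctl. A condensed sketch of the resulting policy (illustrative; it mirrors the hunk rather than replacing it):

        /* Sketch: when shrinking is disabled (the default) or the window
         * is unscaled, a smaller computed window is clamped back up to
         * the currently advertised one; otherwise it may genuinely
         * shrink, kept a multiple of the scale factor by round_down()
         * in __tcp_select_window().
         */
        static u32 sketch_clamp_window(u32 cur_win, u32 new_win,
                                       u8 wscale, bool shrink_ok)
        {
                if (new_win < cur_win && (!shrink_ok || !wscale))
                        return ALIGN(cur_win, 1 << wscale); /* never shrink */
                return new_win;
        }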
*/ if (tcp_skb_pcount(skb) > 1) @@ -2126,11 +2118,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, u8 flags; /* All of a TSO frame must be composed of paged data. */ - if (skb->len != skb->data_len) - return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, - skb, len, mss_now, gfp); + DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); - buff = tcp_stream_alloc_skb(sk, 0, gfp, true); + buff = tcp_stream_alloc_skb(sk, gfp, true); if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); @@ -2319,6 +2309,57 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) return true; } +static int tcp_clone_payload(struct sock *sk, struct sk_buff *to, + int probe_size) +{ + skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags; + int i, todo, len = 0, nr_frags = 0; + const struct sk_buff *skb; + + if (!sk_wmem_schedule(sk, to->truesize + probe_size)) + return -ENOMEM; + + skb_queue_walk(&sk->sk_write_queue, skb) { + const skb_frag_t *fragfrom = skb_shinfo(skb)->frags; + + if (skb_headlen(skb)) + return -EINVAL; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) { + if (len >= probe_size) + goto commit; + todo = min_t(int, skb_frag_size(fragfrom), + probe_size - len); + len += todo; + if (lastfrag && + skb_frag_page(fragfrom) == skb_frag_page(lastfrag) && + skb_frag_off(fragfrom) == skb_frag_off(lastfrag) + + skb_frag_size(lastfrag)) { + skb_frag_size_add(lastfrag, todo); + continue; + } + if (unlikely(nr_frags == MAX_SKB_FRAGS)) + return -E2BIG; + skb_frag_page_copy(fragto, fragfrom); + skb_frag_off_copy(fragto, fragfrom); + skb_frag_size_set(fragto, todo); + nr_frags++; + lastfrag = fragto++; + } + } +commit: + WARN_ON_ONCE(len != probe_size); + for (i = 0; i < nr_frags; i++) + skb_frag_ref(to, i); + + skb_shinfo(to)->nr_frags = nr_frags; + to->truesize += probe_size; + to->len += probe_size; + to->data_len += probe_size; + __skb_header_release(to); + return 0; +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -2395,9 +2436,15 @@ static int tcp_mtu_probe(struct sock *sk) return -1; /* We're allowed to probe. Build it now. */ - nskb = tcp_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); + nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false); if (!nskb) return -1; + + /* build the payload, and be prepared to abort if this fails. */ + if (tcp_clone_payload(sk, nskb, probe_size)) { + consume_skb(nskb); + return -1; + } sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); @@ -2415,7 +2462,6 @@ static int tcp_mtu_probe(struct sock *sk) len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); - skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); if (skb->len <= copy) { /* We've eaten all the data from this skb. @@ -2431,12 +2477,8 @@ static int tcp_mtu_probe(struct sock *sk) } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); - if (!skb_shinfo(skb)->nr_frags) { - skb_pull(skb, copy); - } else { - __pskb_trim_head(skb, copy); - tcp_set_skb_tso_segs(skb, mss_now); - } + __pskb_trim_head(skb, copy); + tcp_set_skb_tso_segs(skb, mss_now); TCP_SKB_CB(skb)->seq += copy; } @@ -2947,6 +2989,7 @@ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); /* MSS for the peer's data. 
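tcp_mtu_probe() used to memcpy payload into the probe skb via skb_copy_bits(); tcp_clone_payload() above instead takes references on the existing page frags. The contiguity test it applies when merging adjacent source frags, pulled out as a sketch:

        /* Sketch: two source frags can collapse into one destination
         * frag only if they sit back-to-back in the same page.
         */
        static bool sketch_frags_contiguous(const skb_frag_t *prev,
                                            const skb_frag_t *next)
        {
                return skb_frag_page(next) == skb_frag_page(prev) &&
                       skb_frag_off(next) == skb_frag_off(prev) +
                                             skb_frag_size(prev);
        }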
Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct @@ -2968,6 +3011,15 @@ u32 __tcp_select_window(struct sock *sk) if (mss <= 0) return 0; } + + /* Only allow window shrink if the sysctl is enabled and we have + * a non-zero scaling factor in effect. + */ + if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale) + goto shrink_window_allowed; + + /* do not allow window to shrink */ + if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; @@ -3022,6 +3074,36 @@ u32 __tcp_select_window(struct sock *sk) } return window; + +shrink_window_allowed: + /* new window should always be an exact multiple of scaling factor */ + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + + if (free_space < (full_space >> 1)) { + icsk->icsk_ack.quick = 0; + + if (tcp_under_memory_pressure(sk)) + tcp_adjust_rcv_ssthresh(sk); + + /* if free space is too low, return a zero window */ + if (free_space < (allowed_space >> 4) || free_space < mss || + free_space < (1 << tp->rx_opt.rcv_wscale)) + return 0; + } + + if (free_space > tp->rcv_ssthresh) { + free_space = tp->rcv_ssthresh; + /* new window should always be an exact multiple of scaling factor + * + * For this case, we ALIGN "up" (increase free_space) because + * we know free_space is not zero here, it has been reduced from + * the memory-based limit, and rcv_ssthresh is not a hard limit + * (unlike sk_rcvbuf). + */ + free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale)); + } + + return free_space; } void tcp_skb_collapse_tstamp(struct sk_buff *skb, @@ -3746,8 +3828,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int space, err = 0; + struct page_frag *pfrag = sk_page_frag(sk); struct sk_buff *syn_data; + int space, err = 0; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) @@ -3766,25 +3849,31 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) space = min_t(size_t, space, fo->size); - /* limit to order-0 allocations */ - space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - - syn_data = tcp_stream_alloc_skb(sk, space, sk->sk_allocation, false); + if (space && + !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE), + pfrag, sk->sk_allocation)) + goto fallback; + syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false); if (!syn_data) goto fallback; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { - int copied = copy_from_iter(skb_put(syn_data, space), space, - &fo->data->msg_iter); - if (unlikely(!copied)) { + space = min_t(size_t, space, pfrag->size - pfrag->offset); + space = tcp_wmem_schedule(sk, space); + } + if (space) { + space = copy_page_from_iter(pfrag->page, pfrag->offset, + space, &fo->data->msg_iter); + if (unlikely(!space)) { tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } - if (copied != space) { - skb_trim(syn_data, copied); - space = copied; - } + skb_fill_page_desc(syn_data, 0, pfrag->page, + pfrag->offset, space); + page_ref_inc(pfrag->page); + pfrag->offset += space; + skb_len_add(syn_data, space); skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ @@ -3849,7 +3938,7 @@ int tcp_connect(struct sock *sk) return 0; } - buff = 
tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, true); + buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 39eb947fe392..470f581eedd4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -234,14 +234,19 @@ static int tcp_write_timeout(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); bool expired = false, do_reset; - int retry_until; + int retry_until, max_retransmits; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) __dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); - expired = icsk->icsk_retransmits >= retry_until; + + max_retransmits = retry_until; + if (sk->sk_state == TCP_SYN_SENT) + max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts); + + expired = icsk->icsk_retransmits >= max_retransmits; } else { if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) { /* Black hole detection */ @@ -587,8 +592,12 @@ out_reset_timer: icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); - } else { - /* Use normal (exponential) backoff */ + } else if (sk->sk_state != TCP_SYN_SENT || + icsk->icsk_backoff > + READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { + /* Use normal (exponential) backoff unless linear timeouts are + * activated. + */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9482def1f310..48fdcd3cad9c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -103,6 +103,7 @@ #include <net/ip_tunnels.h> #include <net/route.h> #include <net/checksum.h> +#include <net/gso.h> #include <net/xfrm.h> #include <trace/events/udp.h> #include <linux/static_key.h> @@ -1062,8 +1063,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int free = 0; int connected = 0; __be32 daddr, faddr, saddr; + u8 tos, scope; __be16 dport; - u8 tos; int err, is_udplite = IS_UDPLITE(sk); int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); @@ -1183,12 +1184,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) connected = 0; } tos = get_rttos(&ipc, inet); - if (sock_flag(sk, SOCK_LOCALROUTE) || - (msg->msg_flags & MSG_DONTROUTE) || - (ipc.opt && ipc.opt->opt.is_strictroute)) { - tos |= RTO_ONLINK; + scope = ip_sendmsg_scope(inet, &ipc, msg); + if (scope == RT_SCOPE_LINK) connected = 0; - } if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) @@ -1221,11 +1219,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, - RT_SCOPE_UNIVERSE, sk->sk_protocol, - flow_flags, - faddr, saddr, dport, inet->inet_sport, - sk->sk_uid); + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope, + sk->sk_protocol, flow_flags, faddr, saddr, + dport, inet->inet_sport, sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); @@ -1329,57 +1325,33 @@ do_confirm: } EXPORT_SYMBOL(udp_sendmsg); -int udp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +void udp_splice_eof(struct socket *sock) { - struct inet_sock *inet = inet_sk(sk); + 
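The tcp_timer.c hunks above implement the new tcp_syn_linear_timeouts sysctl: while icsk_backoff is still within the configured count, an active SYN is retransmitted at a fixed interval instead of doubling the RTO, and tcp_write_timeout() widens the retry budget by the same count so the linear phase does not consume tcp_syn_retries. A worked example, assuming the default of 4, a 1 s initial RTO, and the usual one-increment-per-timeout behaviour of icsk_backoff:

        /* SYN retransmission schedule (illustrative timings):
         *
         *   retransmit #:     1   2   3   4   5   6   7   8
         *   interval before:  1s  1s  1s  1s  1s  2s  4s  8s
         *
         * The gaps stay linear while backoff <= 4; exponential backoff
         * resumes afterwards, still capped at TCP_RTO_MAX.
         */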
struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); - int ret; - - if (flags & MSG_SENDPAGE_NOTLAST) - flags |= MSG_MORE; - - if (!up->pending) { - struct msghdr msg = { .msg_flags = flags|MSG_MORE }; - /* Call udp_sendmsg to specify destination address which - * sendpage interface can't pass. - * This will succeed only when the socket is connected. - */ - ret = udp_sendmsg(sk, &msg, 0); - if (ret < 0) - return ret; - } + if (!up->pending || READ_ONCE(up->corkflag)) + return; lock_sock(sk); + if (up->pending && !READ_ONCE(up->corkflag)) + udp_push_pending_frames(sk); + release_sock(sk); +} +EXPORT_SYMBOL_GPL(udp_splice_eof); - if (unlikely(!up->pending)) { - release_sock(sk); - - net_dbg_ratelimited("cork failed\n"); - return -EINVAL; - } +int udp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES }; - ret = ip_append_page(sk, &inet->cork.fl.u.ip4, - page, offset, size, flags); - if (ret == -EOPNOTSUPP) { - release_sock(sk); - return sock_no_sendpage(sk->sk_socket, page, offset, - size, flags); - } - if (ret < 0) { - udp_flush_pending_frames(sk); - goto out; - } + if (flags & MSG_SENDPAGE_NOTLAST) + msg.msg_flags |= MSG_MORE; - up->len += size; - if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE))) - ret = udp_push_pending_frames(sk); - if (!ret) - ret = size; -out: - release_sock(sk); - return ret; + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + return udp_sendmsg(sk, &msg, size); } #define UDP_SKB_IS_STATELESS 0x80000000 @@ -1720,21 +1692,19 @@ static int first_packet_length(struct sock *sk) * IOCTL requests applicable to the UDP protocol */ -int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) +int udp_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); - - return put_user(amount, (int __user *)arg); + *karg = sk_wmem_alloc_get(sk); + return 0; } case SIOCINQ: { - int amount = max_t(int, 0, first_packet_length(sk)); - - return put_user(amount, (int __user *)arg); + *karg = max_t(int, 0, first_packet_length(sk)); + return 0; } default: @@ -2927,7 +2897,8 @@ EXPORT_SYMBOL(udp_poll); int udp_abort(struct sock *sk, int err) { - lock_sock(sk); + if (!has_current_bpf_ctx()) + lock_sock(sk); /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing * with close() @@ -2940,7 +2911,8 @@ int udp_abort(struct sock *sk, int err) __udp_disconnect(sk, 0); out: - release_sock(sk); + if (!has_current_bpf_ctx()) + release_sock(sk); return 0; } @@ -2960,6 +2932,7 @@ struct proto udp_prot = { .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, + .splice_eof = udp_splice_eof, .sendpage = udp_sendpage, .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, @@ -2985,9 +2958,30 @@ EXPORT_SYMBOL(udp_prot); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS -static struct udp_table *udp_get_table_afinfo(struct udp_seq_afinfo *afinfo, - struct net *net) +static unsigned short seq_file_family(const struct seq_file *seq); +static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) { + unsigned short family = seq_file_family(seq); + + /* AF_UNSPEC is used as a match all */ + return ((family == AF_UNSPEC || family == sk->sk_family) && + net_eq(sock_net(sk), seq_file_net(seq))); +} + +#ifdef CONFIG_BPF_SYSCALL +static const struct seq_operations 
bpf_iter_udp_seq_ops; +#endif +static struct udp_table *udp_get_table_seq(struct seq_file *seq, + struct net *net) +{ + const struct udp_seq_afinfo *afinfo; + +#ifdef CONFIG_BPF_SYSCALL + if (seq->op == &bpf_iter_udp_seq_ops) + return net->ipv4.udp_table; +#endif + + afinfo = pde_data(file_inode(seq->file)); return afinfo->udp_table ? : net->ipv4.udp_table; } @@ -2995,16 +2989,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - struct udp_seq_afinfo *afinfo; struct udp_table *udptable; struct sock *sk; - if (state->bpf_seq_afinfo) - afinfo = state->bpf_seq_afinfo; - else - afinfo = pde_data(file_inode(seq->file)); - - udptable = udp_get_table_afinfo(afinfo, net); + udptable = udp_get_table_seq(seq, net); for (state->bucket = start; state->bucket <= udptable->mask; ++state->bucket) { @@ -3015,10 +3003,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) + if (seq_sk_match(seq, sk)) goto found; } spin_unlock_bh(&hslot->lock); @@ -3032,22 +3017,14 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - struct udp_seq_afinfo *afinfo; struct udp_table *udptable; - if (state->bpf_seq_afinfo) - afinfo = state->bpf_seq_afinfo; - else - afinfo = pde_data(file_inode(seq->file)); - do { sk = sk_next(sk); - } while (sk && (!net_eq(sock_net(sk), net) || - (afinfo->family != AF_UNSPEC && - sk->sk_family != afinfo->family))); + } while (sk && !seq_sk_match(seq, sk)); if (!sk) { - udptable = udp_get_table_afinfo(afinfo, net); + udptable = udp_get_table_seq(seq, net); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); @@ -3093,15 +3070,9 @@ EXPORT_SYMBOL(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; - struct udp_seq_afinfo *afinfo; struct udp_table *udptable; - if (state->bpf_seq_afinfo) - afinfo = state->bpf_seq_afinfo; - else - afinfo = pde_data(file_inode(seq->file)); - - udptable = udp_get_table_afinfo(afinfo, seq_file_net(seq)); + udptable = udp_get_table_seq(seq, seq_file_net(seq)); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); @@ -3154,6 +3125,143 @@ struct bpf_iter__udp { int bucket __aligned(8); }; +struct bpf_udp_iter_state { + struct udp_iter_state state; + unsigned int cur_sk; + unsigned int end_sk; + unsigned int max_sk; + int offset; + struct sock **batch; + bool st_bucket_done; +}; + +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, + unsigned int new_batch_sz); +static struct sock *bpf_iter_udp_batch(struct seq_file *seq) +{ + struct bpf_udp_iter_state *iter = seq->private; + struct udp_iter_state *state = &iter->state; + struct net *net = seq_file_net(seq); + struct udp_table *udptable; + unsigned int batch_sks = 0; + bool resized = false; + struct sock *sk; + + /* The current batch is done, so advance the bucket. */ + if (iter->st_bucket_done) { + state->bucket++; + iter->offset = 0; + } + + udptable = udp_get_table_seq(seq, net); + +again: + /* New batch for the next bucket. 
+ * Iterate over the hash table to find a bucket with sockets matching + * the iterator attributes, and return the first matching socket from + * the bucket. The remaining matched sockets from the bucket are batched + * before releasing the bucket lock. This allows BPF programs that are + * called in seq_show to acquire the bucket lock if needed. + */ + iter->cur_sk = 0; + iter->end_sk = 0; + iter->st_bucket_done = false; + batch_sks = 0; + + for (; state->bucket <= udptable->mask; state->bucket++) { + struct udp_hslot *hslot2 = &udptable->hash2[state->bucket]; + + if (hlist_empty(&hslot2->head)) { + iter->offset = 0; + continue; + } + + spin_lock_bh(&hslot2->lock); + udp_portaddr_for_each_entry(sk, &hslot2->head) { + if (seq_sk_match(seq, sk)) { + /* Resume from the last iterated socket at the + * offset in the bucket before iterator was stopped. + */ + if (iter->offset) { + --iter->offset; + continue; + } + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + batch_sks++; + } + } + spin_unlock_bh(&hslot2->lock); + + if (iter->end_sk) + break; + + /* Reset the current bucket's offset before moving to the next bucket. */ + iter->offset = 0; + } + + /* All done: no batch made. */ + if (!iter->end_sk) + return NULL; + + if (iter->end_sk == batch_sks) { + /* Batching is done for the current bucket; return the first + * socket to be iterated from the batch. + */ + iter->st_bucket_done = true; + goto done; + } + if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) { + resized = true; + /* After allocating a larger batch, retry one more time to grab + * the whole bucket. + */ + state->bucket--; + goto again; + } +done: + return iter->batch[0]; +} + +static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_udp_iter_state *iter = seq->private; + struct sock *sk; + + /* Whenever seq_next() is called, the iter->cur_sk is + * done with seq_show(), so unref the iter->cur_sk. + */ + if (iter->cur_sk < iter->end_sk) { + sock_put(iter->batch[iter->cur_sk++]); + ++iter->offset; + } + + /* After updating iter->cur_sk, check if there are more sockets + * available in the current bucket batch. + */ + if (iter->cur_sk < iter->end_sk) + sk = iter->batch[iter->cur_sk]; + else + /* Prepare a new batch. */ + sk = bpf_iter_udp_batch(seq); + + ++*pos; + return sk; +} + +static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* bpf iter does not support lseek, so it always + * continue from where it was stop()-ped. 
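The UDP BPF iterator above batches sockets per hash bucket so the bucket spinlock is never held while a BPF program runs (the program may itself need the socket lock). The ref-counting discipline, condensed to a sketch:

        /* Sketch: pin every matched socket under the bucket lock, then
         * drop the lock before seq_show().  iter->offset records how far
         * into the bucket a stopped iterator got, so resume skips only
         * the sockets already shown; an undersized batch array is grown
         * to 3/2 of the bucket population and the bucket re-walked once.
         */
        spin_lock_bh(&hslot2->lock);
        udp_portaddr_for_each_entry(sk, &hslot2->head) {
                if (seq_sk_match(seq, sk) && iter->end_sk < iter->max_sk) {
                        sock_hold(sk);
                        iter->batch[iter->end_sk++] = sk;
                }
        }
        spin_unlock_bh(&hslot2->lock);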
+ */ + if (*pos) + return bpf_iter_udp_batch(seq); + + return SEQ_START_TOKEN; +} + static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) { @@ -3174,18 +3282,37 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) struct bpf_prog *prog; struct sock *sk = v; uid_t uid; + int ret; if (v == SEQ_START_TOKEN) return 0; + lock_sock(sk); + + if (unlikely(sk_unhashed(sk))) { + ret = SEQ_SKIP; + goto unlock; + } + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); - return udp_prog_seq_show(prog, &meta, v, uid, state->bucket); + ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket); + +unlock: + release_sock(sk); + return ret; +} + +static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter) +{ + while (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); } static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) { + struct bpf_udp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; @@ -3196,17 +3323,35 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) (void)udp_prog_seq_show(prog, &meta, v, 0, 0); } - udp_seq_stop(seq, v); + if (iter->cur_sk < iter->end_sk) { + bpf_iter_udp_put_batch(iter); + iter->st_bucket_done = false; + } } static const struct seq_operations bpf_iter_udp_seq_ops = { - .start = udp_seq_start, - .next = udp_seq_next, + .start = bpf_iter_udp_seq_start, + .next = bpf_iter_udp_seq_next, .stop = bpf_iter_udp_seq_stop, .show = bpf_iter_udp_seq_show, }; #endif +static unsigned short seq_file_family(const struct seq_file *seq) +{ + const struct udp_seq_afinfo *afinfo; + +#ifdef CONFIG_BPF_SYSCALL + /* BPF iterator: bpf programs to filter sockets. 
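bpf_iter_udp_seq_show() above takes the full socket lock, matching the earlier bpf_iter_tcp_seq_show() change from lock_sock_fast() to lock_sock(), presumably so that kfuncs invoked from the iterator, such as the socket-destroy path guarded by has_current_bpf_ctx(), always see an owned lock. The show-side pattern, sketched:

        lock_sock(sk);
        if (unlikely(sk_unhashed(sk))) {
                /* Raced with close() after batching; nothing to show. */
                ret = SEQ_SKIP;
                goto unlock;
        }
        ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
unlock:
        release_sock(sk);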
*/ + if (seq->op == &bpf_iter_udp_seq_ops) + return AF_UNSPEC; +#endif + + /* Proc fs iterator */ + afinfo = pde_data(file_inode(seq->file)); + return afinfo->family; +} + const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, @@ -3415,38 +3560,55 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = { DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) -static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, + unsigned int new_batch_sz) { - struct udp_iter_state *st = priv_data; - struct udp_seq_afinfo *afinfo; - int ret; + struct sock **new_batch; - afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); - if (!afinfo) + new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch), + GFP_USER | __GFP_NOWARN); + if (!new_batch) return -ENOMEM; - afinfo->family = AF_UNSPEC; - afinfo->udp_table = NULL; - st->bpf_seq_afinfo = afinfo; + bpf_iter_udp_put_batch(iter); + kvfree(iter->batch); + iter->batch = new_batch; + iter->max_sk = new_batch_sz; + + return 0; +} + +#define INIT_BATCH_SZ 16 + +static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_udp_iter_state *iter = priv_data; + int ret; + ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) - kfree(afinfo); + return ret; + + ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ); + if (ret) + bpf_iter_fini_seq_net(priv_data); + return ret; } static void bpf_iter_fini_udp(void *priv_data) { - struct udp_iter_state *st = priv_data; + struct bpf_udp_iter_state *iter = priv_data; - kfree(st->bpf_seq_afinfo); bpf_iter_fini_seq_net(priv_data); + kvfree(iter->batch); } static const struct bpf_iter_seq_info udp_seq_info = { .seq_ops = &bpf_iter_udp_seq_ops, .init_seq_private = bpf_iter_init_udp, .fini_seq_private = bpf_iter_fini_udp, - .seq_priv_size = sizeof(struct udp_iter_state), + .seq_priv_size = sizeof(struct bpf_udp_iter_state), }; static struct bpf_iter_reg udp_reg_info = { @@ -3454,7 +3616,7 @@ static struct bpf_iter_reg udp_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__udp, udp_sk), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &udp_seq_info, }; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 1f01e15ca24f..75aa4de5b731 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -8,6 +8,7 @@ #include <linux/skbuff.h> #include <net/gro.h> +#include <net/gso.h> #include <net/udp.h> #include <net/protocol.h> #include <net/inet_common.h> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3797917237d0..5479da08ef40 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3633,8 +3633,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, idev->if_flags |= IF_READY; } - pr_info("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n", - dev->name); + pr_debug("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n", + dev->name); run_pending = 1; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2bbf13216a3d..b3451cf47d29 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -579,7 +579,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) prot = READ_ONCE(sk->sk_prot); if (!prot->ioctl) return -ENOIOCTLCMD; - return prot->ioctl(sk, cmd, arg); + return sk_ioctl(sk, cmd, (void __user *)arg); } /*NOTREACHED*/ return 0; @@ -695,6 +695,7 @@ 
const struct proto_ops inet6_stream_ops = { #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 772340268997..a189e08370a5 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -17,6 +17,7 @@ #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> +#include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 5fa0e37305d9..202fc3aaa83c 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -126,9 +126,6 @@ static bool ip6_parse_tlv(bool hopbyhop, max_count = -max_count; } - if (skb_transport_offset(skb) + len > skb_headlen(skb)) - goto bad; - off += 2; len -= 2; @@ -402,11 +399,7 @@ looped_back: skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); - - if (!pskb_pull(skb, offset)) { - kfree_skb(skb); - return -1; - } + skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); @@ -444,9 +437,9 @@ looped_back: kfree_skb(skb); return -1; } - } - hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); + hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); + } hdr->segments_left--; addr = hdr->segments + hdr->segments_left; @@ -458,8 +451,6 @@ looped_back: ipv6_hdr(skb)->daddr = *addr; - skb_dst_drop(skb); - ip6_route_input(skb); if (skb_dst(skb)->error) { @@ -519,11 +510,7 @@ looped_back: skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); - - if (!pskb_pull(skb, offset)) { - kfree_skb(skb); - return -1; - } + skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); @@ -545,11 +532,6 @@ looped_back: return 1; } - if (!pskb_may_pull(skb, sizeof(*hdr))) { - kfree_skb(skb); - return -1; - } - n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre); r = do_div(n, (16 - hdr->cmpri)); /* checks if calculation was without remainder and n fits into @@ -569,12 +551,6 @@ looped_back: return -1; } - if (!pskb_may_pull(skb, ipv6_rpl_srh_size(n, hdr->cmpri, - hdr->cmpre))) { - kfree_skb(skb); - return -1; - } - hdr->segments_left--; i = n - hdr->segments_left; @@ -588,8 +564,7 @@ looped_back: ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n); chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3)); - if ((ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) || - (ipv6_addr_type(&ohdr->rpl_segaddr[i]) & IPV6_ADDR_MULTICAST)) { + if (ipv6_addr_is_multicast(&ohdr->rpl_segaddr[i])) { kfree_skb(skb); kfree(buf); return -1; @@ -827,7 +802,6 @@ looped_back: *addr = ipv6_hdr(skb)->daddr; ipv6_hdr(skb)->daddr = daddr; - skb_dst_drop(skb); ip6_route_input(skb); if (skb_dst(skb)->error) { skb_push(skb, skb->data - skb_network_header(skb)); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 00dc2e3b0184..d6314287338d 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -16,6 +16,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/gro.h> +#include <net/gso.h> #include "ip6_offload.h" diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9554cf46ed88..1e8c90e97608 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -42,6 +42,7 @@ #include <net/sock.h> #include <net/snmp.h> +#include <net/gso.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> @@ -1589,6 +1590,15 @@ emsgsize: skb_zcopy_set(skb, 
uarg, &extra_uref); } } + } else if ((flags & MSG_SPLICE_PAGES) && length) { + if (inet_sk(sk)->hdrincl) + return -EPERM; + if (rt->dst.dev->features & NETIF_F_SG && + getfrag == ip_generic_getfrag) + /* We need an empty buffer to attach stuff to */ + paged = true; + else + flags &= ~MSG_SPLICE_PAGES; } /* @@ -1778,6 +1788,15 @@ alloc_new_skb: err = -EFAULT; goto error; } + } else if (flags & MSG_SPLICE_PAGES) { + struct msghdr *msg = from; + + err = skb_splice_from_iter(skb, &msg->msg_iter, copy, + sk->sk_allocation); + if (err < 0) + goto error; + copy = err; + wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 51cf37abd142..cc3d5ad17257 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1879,11 +1879,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, /* * The IP multicast ioctl support routines. */ - -int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) +int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { - struct sioc_sg_req6 sr; - struct sioc_mif_req6 vr; + struct sioc_sg_req6 *sr; + struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); @@ -1895,40 +1894,33 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) switch (cmd) { case SIOCGETMIFCNT_IN6: - if (copy_from_user(&vr, arg, sizeof(vr))) - return -EFAULT; - if (vr.mifi >= mrt->maxvif) + vr = (struct sioc_mif_req6 *)arg; + if (vr->mifi >= mrt->maxvif) return -EINVAL; - vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); + vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); rcu_read_lock(); - vif = &mrt->vif_table[vr.mifi]; - if (VIF_EXISTS(mrt, vr.mifi)) { - vr.icount = READ_ONCE(vif->pkt_in); - vr.ocount = READ_ONCE(vif->pkt_out); - vr.ibytes = READ_ONCE(vif->bytes_in); - vr.obytes = READ_ONCE(vif->bytes_out); + vif = &mrt->vif_table[vr->mifi]; + if (VIF_EXISTS(mrt, vr->mifi)) { + vr->icount = READ_ONCE(vif->pkt_in); + vr->ocount = READ_ONCE(vif->pkt_out); + vr->ibytes = READ_ONCE(vif->bytes_in); + vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); - - if (copy_to_user(arg, &vr, sizeof(vr))) - return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: - if (copy_from_user(&sr, arg, sizeof(sr))) - return -EFAULT; + sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); - c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, + &sr->grp.sin6_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = c->_c.mfc_un.res.pkt; + sr->bytecnt = c->_c.mfc_un.res.bytes; + sr->wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); - - if (copy_to_user(arg, &sr, sizeof(sr))) - return -EFAULT; return 0; } rcu_read_unlock(); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 44ee7a2e72ac..c9caeb5a43ed 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1118,29 +1118,29 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } -static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int rawv6_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); - - return put_user(amount, (int __user *)arg); + *karg = sk_wmem_alloc_get(sk); + return 0; } case SIOCINQ: { struct sk_buff *skb; - int amount = 0; 
spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) - amount = skb->len; + *karg = skb->len; + else + *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); + return 0; } default: #ifdef CONFIG_IPV6_MROUTE - return ip6mr_ioctl(sk, cmd, (void __user *)arg); + return ip6mr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 392aaa373b66..64e873f5895f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3360,6 +3360,7 @@ static int ip6_route_check_nh_onlink(struct net *net, static int ip6_route_check_nh(struct net *net, struct fib6_config *cfg, struct net_device **_dev, + netdevice_tracker *dev_tracker, struct inet6_dev **idev) { const struct in6_addr *gw_addr = &cfg->fc_gateway; @@ -3404,7 +3405,7 @@ static int ip6_route_check_nh(struct net *net, err = -EHOSTUNREACH; } else { *_dev = dev = res.nh->fib_nh_dev; - dev_hold(dev); + netdev_hold(dev, dev_tracker, GFP_ATOMIC); *idev = in6_dev_get(dev); } @@ -3412,7 +3413,9 @@ static int ip6_route_check_nh(struct net *net, } static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, - struct net_device **_dev, struct inet6_dev **idev, + struct net_device **_dev, + netdevice_tracker *dev_tracker, + struct inet6_dev **idev, struct netlink_ext_ack *extack) { const struct in6_addr *gw_addr = &cfg->fc_gateway; @@ -3453,7 +3456,8 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else - err = ip6_route_check_nh(net, cfg, _dev, idev); + err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, + idev); rcu_read_unlock(); @@ -3503,6 +3507,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { + netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; int addr_type; @@ -3520,7 +3525,8 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, err = -ENODEV; if (cfg->fc_ifindex) { - dev = dev_get_by_index(net, cfg->fc_ifindex); + dev = netdev_get_by_index(net, cfg->fc_ifindex, + dev_tracker, gfp_flags); if (!dev) goto out; idev = in6_dev_get(dev); @@ -3554,11 +3560,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, /* hold loopback dev/idev if we haven't done so. 
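The route.c hunks above thread a netdevice_tracker through fib6_nh_init(), so every device reference is owned by a named tracker and CONFIG_NET_DEV_REFCNT_TRACKER can attribute leaks to the exact hold site. The pairing, as a sketch:

        /* Sketch: the tracker passed to the hold must be the one handed
         * back on put; mismatched pairs are exactly what the tracker
         * infrastructure exists to catch.
         */
        netdevice_tracker tracker;
        struct net_device *dev;

        dev = netdev_get_by_index(net, ifindex, &tracker, GFP_KERNEL);
        if (!dev)
                return -ENODEV;
        /* ... use dev ... */
        netdev_put(dev, &tracker);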
*/ if (dev != net->loopback_dev) { if (dev) { - dev_put(dev); + netdev_put(dev, dev_tracker); in6_dev_put(idev); } dev = net->loopback_dev; - dev_hold(dev); + netdev_hold(dev, dev_tracker, gfp_flags); idev = in6_dev_get(dev); if (!idev) { err = -ENODEV; @@ -3569,7 +3575,8 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, } if (cfg->fc_flags & RTF_GATEWAY) { - err = ip6_validate_gw(net, cfg, &dev, &idev, extack); + err = ip6_validate_gw(net, cfg, &dev, dev_tracker, + &idev, extack); if (err) goto out; @@ -3610,8 +3617,6 @@ pcpu_alloc: } fib6_nh->fib_nh_dev = dev; - netdev_tracker_alloc(dev, &fib6_nh->fib_nh_dev_tracker, gfp_flags); - fib6_nh->fib_nh_oif = dev->ifindex; err = 0; out: @@ -3621,7 +3626,7 @@ out: if (err) { lwtstate_put(fib6_nh->fib_nh_lws); fib6_nh->fib_nh_lws = NULL; - dev_put(dev); + netdev_put(dev, dev_tracker); } return err; diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index d1876f192225..e186998bfbf7 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -29,13 +29,6 @@ static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i) return (void *)&hdr->rpl_segdata[i * IPV6_PFXTAIL_LEN(hdr->cmpri)]; } -size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri, - unsigned char cmpre) -{ - return sizeof(struct ipv6_rpl_sr_hdr) + (n * IPV6_PFXTAIL_LEN(cmpri)) + - IPV6_PFXTAIL_LEN(cmpre); -} - void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, const struct ipv6_rpl_sr_hdr *inhdr, const struct in6_addr *daddr, unsigned char n) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 34db881204d2..03b877ff4558 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -470,8 +470,6 @@ static int seg6_input_core(struct net *net, struct sock *sk, dst = dst_cache_get(&slwt->cache); preempt_enable(); - skb_dst_drop(skb); - if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); @@ -482,6 +480,7 @@ static int seg6_input_core(struct net *net, struct sock *sk, preempt_enable(); } } else { + skb_dst_drop(skb); skb_dst_set(skb, dst); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7132eb213a7a..c17c8ff94b79 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -93,12 +93,8 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, * This avoids a dereference and allow compiler optimizations. * It is a specialized version of inet6_sk_generic(). 
*/ -static struct ipv6_pinfo *tcp_inet6_sk(const struct sock *sk) -{ - unsigned int offset = sizeof(struct tcp6_sock) - sizeof(struct ipv6_pinfo); - - return (struct ipv6_pinfo *)(((u8 *)sk) + offset); -} +#define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \ + struct tcp6_sock, tcp)->inet6) static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { @@ -533,7 +529,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = tcp_inet6_sk(sk); + const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; @@ -2154,6 +2150,7 @@ struct proto tcpv6_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_eof = tcp_splice_eof, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 39db5a226855..bf0c957e4b5e 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -36,7 +36,8 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) &iph->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; - return tcp_gro_complete(skb); + tcp_gro_complete(skb); + return 0; } static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e5a337e6b970..317b01c9bc39 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1653,6 +1653,20 @@ do_confirm: } EXPORT_SYMBOL(udpv6_sendmsg); +static void udpv6_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct udp_sock *up = udp_sk(sk); + + if (!up->pending || READ_ONCE(up->corkflag)) + return; + + lock_sock(sk); + if (up->pending && !READ_ONCE(up->corkflag)) + udp_v6_push_pending_frames(sk); + release_sock(sk); +} + void udpv6_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); @@ -1764,6 +1778,7 @@ struct proto udpv6_prot = { .getsockopt = udpv6_getsockopt, .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, + .splice_eof = udpv6_splice_eof, .release_cb = ip6_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index c39c1e32f980..ad3b8726873e 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -14,6 +14,7 @@ #include <net/ip6_checksum.h> #include "ip6_offload.h" #include <net/gro.h> +#include <net/gso.h> static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, netdev_features_t features) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index cfe828bd7fc6..d0537c1c8cd7 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -581,12 +581,10 @@ static void kcm_report_tx_retry(struct kcm_sock *kcm) */ static int kcm_write_msgs(struct kcm_sock *kcm) { + unsigned int total_sent = 0; struct sock *sk = &kcm->sk; struct kcm_psock *psock; - struct sk_buff *skb, *head; - struct kcm_tx_msg *txm; - unsigned short fragidx, frag_offset; - unsigned int sent, total_sent = 0; + struct sk_buff *head; int ret = 0; kcm->tx_wait_more = false; @@ -600,72 +598,57 @@ static int kcm_write_msgs(struct kcm_sock *kcm) if (skb_queue_empty(&sk->sk_write_queue)) return 0; - kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0; - - } else if (skb_queue_empty(&sk->sk_write_queue)) { - return 0; + kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false; } - head = skb_peek(&sk->sk_write_queue); - txm = kcm_tx_msg(head); 
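[Annotation] The rewritten loop below replaces the removed per-fragment kernel_sendpage() calls with a single sock_sendmsg() over a bio_vec iterator and MSG_SPLICE_PAGES, the same technique used in the ip_output.c hunk above. The essential pattern, with KCM's retry/bookkeeping stripped out (a sketch, not the patch itself):

#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/uio.h>

/* Send an skb's page fragments by splicing rather than copying.
 * 'from_offset' is how much of the payload was already transmitted. */
static int example_splice_frags(struct socket *sock, struct sk_buff *skb,
				unsigned int from_offset)
{
	struct msghdr msg = {
		.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
	};
	unsigned int size = 0;
	int i;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		size += skb_frag_size(&skb_shinfo(skb)->frags[i]);

	/* skb_frag_t is a bio_vec, so the frag array can back the iterator */
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, skb_shinfo(skb)->frags,
		      skb_shinfo(skb)->nr_frags, size);
	iov_iter_advance(&msg.msg_iter, from_offset);

	/* the transport splices the pages in rather than copying them */
	return sock_sendmsg(sock, &msg);
}

On the skb-building side the counterpart is skb_splice_from_iter(), visible in the ip_output.c and kcm_sendmsg() hunks, which moves pages from a MSG_SPLICE_PAGES iterator into an skb without copying.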
+retry: + while ((head = skb_peek(&sk->sk_write_queue))) { + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, + }; + struct kcm_tx_msg *txm = kcm_tx_msg(head); + struct sk_buff *skb; + unsigned int msize; + int i; - if (txm->sent) { - /* Send of first skbuff in queue already in progress */ - if (WARN_ON(!psock)) { - ret = -EINVAL; - goto out; + if (!txm->started_tx) { + psock = reserve_psock(kcm); + if (!psock) + goto out; + skb = head; + txm->frag_offset = 0; + txm->sent = 0; + txm->started_tx = true; + } else { + if (WARN_ON(!psock)) { + ret = -EINVAL; + goto out; + } + skb = txm->frag_skb; } - sent = txm->sent; - frag_offset = txm->frag_offset; - fragidx = txm->fragidx; - skb = txm->frag_skb; - goto do_frag; - } - -try_again: - psock = reserve_psock(kcm); - if (!psock) - goto out; - - do { - skb = head; - txm = kcm_tx_msg(head); - sent = 0; - -do_frag_list: if (WARN_ON(!skb_shinfo(skb)->nr_frags)) { ret = -EINVAL; goto out; } - for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; - fragidx++) { - skb_frag_t *frag; + msize = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + msize += skb_shinfo(skb)->frags[i].bv_len; - frag_offset = 0; -do_frag: - frag = &skb_shinfo(skb)->frags[fragidx]; - if (WARN_ON(!skb_frag_size(frag))) { - ret = -EINVAL; - goto out; - } + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, + skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags, + msize); + iov_iter_advance(&msg.msg_iter, txm->frag_offset); - ret = kernel_sendpage(psock->sk->sk_socket, - skb_frag_page(frag), - skb_frag_off(frag) + frag_offset, - skb_frag_size(frag) - frag_offset, - MSG_DONTWAIT); + do { + ret = sock_sendmsg(psock->sk->sk_socket, &msg); if (ret <= 0) { if (ret == -EAGAIN) { /* Save state to try again when there's * write space on the socket */ - txm->sent = sent; - txm->frag_offset = frag_offset; - txm->fragidx = fragidx; txm->frag_skb = skb; - ret = 0; goto out; } @@ -678,45 +661,44 @@ do_frag: kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, true); unreserve_psock(kcm); + psock = NULL; - txm->sent = 0; + txm->started_tx = false; kcm_report_tx_retry(kcm); ret = 0; - - goto try_again; + goto retry; } - sent += ret; - frag_offset += ret; + txm->sent += ret; + txm->frag_offset += ret; KCM_STATS_ADD(psock->stats.tx_bytes, ret); - if (frag_offset < skb_frag_size(frag)) { - /* Not finished with this frag */ - goto do_frag; - } - } + } while (msg.msg_iter.count > 0); if (skb == head) { if (skb_has_frag_list(skb)) { - skb = skb_shinfo(skb)->frag_list; - goto do_frag_list; + txm->frag_skb = skb_shinfo(skb)->frag_list; + txm->frag_offset = 0; + continue; } } else if (skb->next) { - skb = skb->next; - goto do_frag_list; + txm->frag_skb = skb->next; + txm->frag_offset = 0; + continue; } /* Successfully sent the whole packet, account for it. */ + sk->sk_wmem_queued -= txm->sent; + total_sent += txm->sent; skb_dequeue(&sk->sk_write_queue); kfree_skb(head); - sk->sk_wmem_queued -= sent; - total_sent += sent; KCM_STATS_INCR(psock->stats.tx_msgs); - } while ((head = skb_peek(&sk->sk_write_queue))); + } out: if (!head) { /* Done with all queued messages. 
*/ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); - unreserve_psock(kcm); + if (psock) + unreserve_psock(kcm); } /* Check if write space is available */ @@ -761,149 +743,6 @@ static void kcm_push(struct kcm_sock *kcm) kcm_write_msgs(kcm); } -static ssize_t kcm_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags) - -{ - struct sock *sk = sock->sk; - struct kcm_sock *kcm = kcm_sk(sk); - struct sk_buff *skb = NULL, *head = NULL; - long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - bool eor; - int err = 0; - int i; - - if (flags & MSG_SENDPAGE_NOTLAST) - flags |= MSG_MORE; - - /* No MSG_EOR from splice, only look at MSG_MORE */ - eor = !(flags & MSG_MORE); - - lock_sock(sk); - - sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - - err = -EPIPE; - if (sk->sk_err) - goto out_error; - - if (kcm->seq_skb) { - /* Previously opened message */ - head = kcm->seq_skb; - skb = kcm_tx_msg(head)->last_skb; - i = skb_shinfo(skb)->nr_frags; - - if (skb_can_coalesce(skb, i, page, offset)) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); - skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; - goto coalesced; - } - - if (i >= MAX_SKB_FRAGS) { - struct sk_buff *tskb; - - tskb = alloc_skb(0, sk->sk_allocation); - while (!tskb) { - kcm_push(kcm); - err = sk_stream_wait_memory(sk, &timeo); - if (err) - goto out_error; - } - - if (head == skb) - skb_shinfo(head)->frag_list = tskb; - else - skb->next = tskb; - - skb = tskb; - skb->ip_summed = CHECKSUM_UNNECESSARY; - i = 0; - } - } else { - /* Call the sk_stream functions to manage the sndbuf mem. */ - if (!sk_stream_memory_free(sk)) { - kcm_push(kcm); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = sk_stream_wait_memory(sk, &timeo); - if (err) - goto out_error; - } - - head = alloc_skb(0, sk->sk_allocation); - while (!head) { - kcm_push(kcm); - err = sk_stream_wait_memory(sk, &timeo); - if (err) - goto out_error; - } - - skb = head; - i = 0; - } - - get_page(page); - skb_fill_page_desc_noacc(skb, i, page, offset, size); - skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; - -coalesced: - skb->len += size; - skb->data_len += size; - skb->truesize += size; - sk->sk_wmem_queued += size; - sk_mem_charge(sk, size); - - if (head != skb) { - head->len += size; - head->data_len += size; - head->truesize += size; - } - - if (eor) { - bool not_busy = skb_queue_empty(&sk->sk_write_queue); - - /* Message complete, queue it on send buffer */ - __skb_queue_tail(&sk->sk_write_queue, head); - kcm->seq_skb = NULL; - KCM_STATS_INCR(kcm->stats.tx_msgs); - - if (flags & MSG_BATCH) { - kcm->tx_wait_more = true; - } else if (kcm->tx_wait_more || not_busy) { - err = kcm_write_msgs(kcm); - if (err < 0) { - /* We got a hard error in write_msgs but have - * already queued this message. 
Report an error - * in the socket, but don't affect return value - * from sendmsg - */ - pr_warn("KCM: Hard failure on kcm_write_msgs\n"); - report_csk_error(&kcm->sk, -err); - } - } - } else { - /* Message not complete, save state */ - kcm->seq_skb = head; - kcm_tx_msg(head)->last_skb = skb; - } - - KCM_STATS_ADD(kcm->stats.tx_bytes, size); - - release_sock(sk); - return size; - -out_error: - kcm_push(kcm); - - err = sk_stream_error(sk, flags, err); - - /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) - sk->sk_write_space(sk); - - release_sock(sk); - return err; -} - static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -989,29 +828,52 @@ start: merge = false; } - copy = min_t(int, msg_data_left(msg), - pfrag->size - pfrag->offset); + if (msg->msg_flags & MSG_SPLICE_PAGES) { + copy = msg_data_left(msg); + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + err = skb_splice_from_iter(skb, &msg->msg_iter, copy, + sk->sk_allocation); + if (err < 0) { + if (err == -EMSGSIZE) + goto wait_for_memory; + goto out_error; + } - err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, - pfrag->page, - pfrag->offset, - copy); - if (err) - goto out_error; + copy = err; + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; + sk_wmem_queued_add(sk, copy); + sk_mem_charge(sk, copy); - /* Update the skb. */ - if (merge) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + if (head != skb) + head->truesize += copy; } else { - skb_fill_page_desc(skb, i, pfrag->page, - pfrag->offset, copy); - get_page(pfrag->page); + copy = min_t(int, msg_data_left(msg), + pfrag->size - pfrag->offset); + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; + + err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, + pfrag->page, + pfrag->offset, + copy); + if (err) + goto out_error; + + /* Update the skb. 
*/ + if (merge) { + skb_frag_size_add( + &skb_shinfo(skb)->frags[i - 1], copy); + } else { + skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, copy); + get_page(pfrag->page); + } + + pfrag->offset += copy; } - pfrag->offset += copy; copied += copy; if (head != skb) { head->len += copy; @@ -1088,6 +950,37 @@ out_error: return err; } +static void kcm_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + + if (skb_queue_empty_lockless(&sk->sk_write_queue)) + return; + + lock_sock(sk); + kcm_write_msgs(kcm); + release_sock(sk); +} + +static ssize_t kcm_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags) + +{ + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; + + if (flags & MSG_SENDPAGE_NOTLAST) + msg.msg_flags |= MSG_MORE; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + return kcm_sendmsg(sock, &msg, size); +} + static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { @@ -1875,6 +1768,7 @@ static const struct proto_ops kcm_dgram_ops = { .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, + .splice_eof = kcm_splice_eof, .sendpage = kcm_sendpage, }; @@ -1896,6 +1790,7 @@ static const struct proto_ops kcm_seqpacket_ops = { .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, + .splice_eof = kcm_splice_eof, .sendpage = kcm_sendpage, .splice_read = kcm_splice_read, }; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a88e070b431d..91ebf0a3f499 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -272,7 +272,7 @@ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); /* IOCTL helper for IP encap modules. */ -int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); +int l2tp_ioctl(struct sock *sk, int cmd, int *karg); /* Extract the tunnel structure from a socket's sk_user_data pointer, * validating the tunnel magic feather. diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 41a74fc84ca1..2b795c1064f5 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -562,19 +562,18 @@ out: return err ? err : copied; } -int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg) +int l2tp_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; - int amount; switch (cmd) { case SIOCOUTQ: - amount = sk_wmem_alloc_get(sk); + *karg = sk_wmem_alloc_get(sk); break; case SIOCINQ: spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); - amount = skb ? skb->len : 0; + *karg = skb ? 
skb->len : 0; spin_unlock_bh(&sk->sk_receive_queue.lock); break; @@ -582,7 +581,7 @@ int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -ENOIOCTLCMD; } - return put_user(amount, (int __user *)arg); + return 0; } EXPORT_SYMBOL_GPL(l2tp_ioctl); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f2d08dbccfb7..4aaead4895b7 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1101,18 +1101,20 @@ ieee80211_copy_rnr_beacon(u8 *pos, struct cfg80211_rnr_elems *dst, return offset; } -static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, - struct ieee80211_link_data *link, - struct cfg80211_beacon_data *params, - const struct ieee80211_csa_settings *csa, - const struct ieee80211_color_change_settings *cca) +static int +ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_data *link, + struct cfg80211_beacon_data *params, + const struct ieee80211_csa_settings *csa, + const struct ieee80211_color_change_settings *cca, + u64 *changed) { struct cfg80211_mbssid_elems *mbssid = NULL; struct cfg80211_rnr_elems *rnr = NULL; struct beacon_data *new, *old; int new_head_len, new_tail_len; int size, err; - u32 changed = BSS_CHANGED_BEACON; + u64 _changed = BSS_CHANGED_BEACON; struct ieee80211_bss_conf *link_conf = link->conf; old = sdata_dereference(link->u.ap.beacon, sdata); @@ -1219,7 +1221,7 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, return err; } if (err == 0) - changed |= BSS_CHANGED_AP_PROBE_RESP; + _changed |= BSS_CHANGED_AP_PROBE_RESP; if (params->ftm_responder != -1) { link_conf->ftm_responder = params->ftm_responder; @@ -1235,7 +1237,7 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, return err; } - changed |= BSS_CHANGED_FTM_RESPONDER; + _changed |= BSS_CHANGED_FTM_RESPONDER; } rcu_assign_pointer(link->u.ap.beacon, new); @@ -1244,7 +1246,8 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, if (old) kfree_rcu(old, rcu_head); - return changed; + *changed |= _changed; + return 0; } static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, @@ -1446,10 +1449,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) link_conf->beacon_tx_rate = params->beacon_rate; - err = ieee80211_assign_beacon(sdata, link, ¶ms->beacon, NULL, NULL); + err = ieee80211_assign_beacon(sdata, link, ¶ms->beacon, NULL, NULL, + &changed); if (err < 0) goto error; - changed |= err; if (params->fils_discovery.max_interval) { err = ieee80211_set_fils_discovery(sdata, @@ -1506,6 +1509,7 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, struct beacon_data *old; int err; struct ieee80211_bss_conf *link_conf; + u64 changed = 0; sdata_assert_lock(sdata); @@ -1525,17 +1529,18 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, if (!old) return -ENOENT; - err = ieee80211_assign_beacon(sdata, link, params, NULL, NULL); + err = ieee80211_assign_beacon(sdata, link, params, NULL, NULL, + &changed); if (err < 0) return err; if (params->he_bss_color_valid && params->he_bss_color.enabled != link_conf->he_bss_color.enabled) { link_conf->he_bss_color.enabled = params->he_bss_color.enabled; - err |= BSS_CHANGED_HE_BSS_COLOR; + changed |= BSS_CHANGED_HE_BSS_COLOR; } - ieee80211_link_info_change_notify(sdata, link, err); + ieee80211_link_info_change_notify(sdata, link, changed); return 0; } @@ -1718,7 +1723,7 @@ static void 
sta_apply_mesh_params(struct ieee80211_local *local, { #ifdef CONFIG_MAC80211_MESH struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed = 0; + u64 changed = 0; if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) { switch (params->plink_state) { @@ -2665,7 +2670,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct ieee80211_supported_band *sband; - u32 changed = 0; + u64 changed = 0; link = ieee80211_link_or_deflink(sdata, params->link_id, true); if (IS_ERR(link)) @@ -3585,7 +3590,7 @@ void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif, bool block_t sdata->deflink.csa_block_tx = block_tx; sdata_info(sdata, "channel switch failed, disconnecting\n"); - ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); + wiphy_work_queue(local->hw.wiphy, &ifmgd->csa_connection_drop_work); } EXPORT_SYMBOL(ieee80211_channel_switch_disconnect); @@ -3601,25 +3606,22 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, err = ieee80211_assign_beacon(sdata, &sdata->deflink, sdata->deflink.u.ap.next_beacon, - NULL, NULL); + NULL, NULL, changed); ieee80211_free_next_beacon(&sdata->deflink); if (err < 0) return err; - *changed |= err; break; case NL80211_IFTYPE_ADHOC: - err = ieee80211_ibss_finish_csa(sdata); + err = ieee80211_ibss_finish_csa(sdata, changed); if (err < 0) return err; - *changed |= err; break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: - err = ieee80211_mesh_finish_csa(sdata); + err = ieee80211_mesh_finish_csa(sdata, changed); if (err < 0) return err; - *changed |= err; break; #endif default: @@ -3730,7 +3732,7 @@ unlock: static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *params, - u32 *changed) + u64 *changed) { struct ieee80211_csa_settings csa = {}; int err; @@ -3777,12 +3779,11 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, err = ieee80211_assign_beacon(sdata, &sdata->deflink, ¶ms->beacon_csa, &csa, - NULL); + NULL, changed); if (err < 0) { ieee80211_free_next_beacon(&sdata->deflink); return err; } - *changed |= err; break; case NL80211_IFTYPE_ADHOC: @@ -3814,10 +3815,9 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, /* see comments in the NL80211_IFTYPE_AP block */ if (params->count > 1) { - err = ieee80211_ibss_csa_beacon(sdata, params); + err = ieee80211_ibss_csa_beacon(sdata, params, changed); if (err < 0) return err; - *changed |= err; } ieee80211_send_action_csa(sdata, params); @@ -3842,12 +3842,11 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, /* see comments in the NL80211_IFTYPE_AP block */ if (params->count > 1) { - err = ieee80211_mesh_csa_beacon(sdata, params); + err = ieee80211_mesh_csa_beacon(sdata, params, changed); if (err < 0) { ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; return err; } - *changed |= err; } if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT) @@ -3881,7 +3880,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_channel_switch ch_switch; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; - u32 changed = 0; + u64 changed = 0; int err; sdata_assert_lock(sdata); @@ -4614,7 +4613,7 @@ static int ieee80211_set_sar_specs(struct wiphy *wiphy, static int ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, - u32 *changed) + u64 *changed) { 
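[Annotation] These mac80211 hunks widen the "changed" bitmap from u32 to u64. Helpers that used to return flag bits through the (32-bit-safe) int return value now return 0/-errno and OR their flags into a u64 out-parameter, since returning them would truncate any flag above bit 31. Schematically (hypothetical helpers, following the convention of the surrounding hunks):

#include <net/mac80211.h>

/* old: flags smuggled through the return value, capped at 32 bits */
static u32 old_style_update(void)
{
	return BSS_CHANGED_BEACON;
}

/* new: status in the return value, flags accumulated in a u64 */
static int new_style_update(u64 *changed)
{
	*changed |= BSS_CHANGED_BEACON;
	return 0;
}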
switch (sdata->vif.type) { case NL80211_IFTYPE_AP: { @@ -4625,13 +4624,12 @@ ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, ret = ieee80211_assign_beacon(sdata, &sdata->deflink, sdata->deflink.u.ap.next_beacon, - NULL, NULL); + NULL, NULL, changed); ieee80211_free_next_beacon(&sdata->deflink); if (ret < 0) return ret; - *changed |= ret; break; } default: @@ -4645,7 +4643,7 @@ ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, static int ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_color_change_settings *params, - u32 *changed) + u64 *changed) { struct ieee80211_color_change_settings color_change = {}; int err; @@ -4668,12 +4666,11 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, err = ieee80211_assign_beacon(sdata, &sdata->deflink, ¶ms->beacon_color_change, - NULL, &color_change); + NULL, &color_change, changed); if (err < 0) { ieee80211_free_next_beacon(&sdata->deflink); return err; } - *changed |= err; break; default: return -EOPNOTSUPP; @@ -4684,7 +4681,7 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, static void ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata, - u8 color, int enable, u32 changed) + u8 color, int enable, u64 changed) { sdata->vif.bss_conf.he_bss_color.color = color; sdata->vif.bss_conf.he_bss_color.enabled = enable; @@ -4712,7 +4709,7 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata, static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; - u32 changed = 0; + u64 changed = 0; int err; sdata_assert_lock(sdata); @@ -4809,7 +4806,7 @@ ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; - u32 changed = 0; + u64 changed = 0; int err; sdata_assert_lock(sdata); diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 77c90ed8f5d7..168bf3edd4b4 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1205,8 +1205,8 @@ ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link) &link->csa_finalize_work); break; case NL80211_IFTYPE_STATION: - ieee80211_queue_work(&sdata->local->hw, - &link->u.mgd.chswitch_work); + wiphy_delayed_work_queue(sdata->local->hw.wiphy, + &link->u.mgd.chswitch_work, 0); break; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_AP_VLAN: @@ -1257,7 +1257,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) struct ieee80211_vif_chanctx_switch vif_chsw[1] = {}; struct ieee80211_chanctx *old_ctx, *new_ctx; const struct cfg80211_chan_def *chandef; - u32 changed = 0; + u64 changed = 0; int err; lockdep_assert_held(&local->mtx); @@ -1653,7 +1653,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) reserved_chanctx_list) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; - u32 changed = 0; + u64 changed = 0; if (!ieee80211_link_has_in_place_reservation(link)) continue; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 45d3e53c7383..c4505593ba7a 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH -* Copyright (C) 2018 - 2019, 2021 Intel Corporation +* Copyright (C) 2018 - 2019, 2021 - 2023 Intel 
Corporation */ #ifndef __MAC80211_DRIVER_OPS @@ -13,9 +13,11 @@ #include "trace.h" #define check_sdata_in_driver(sdata) ({ \ - !WARN_ONCE(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER), \ - "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", \ - sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); \ + WARN_ONCE(!sdata->local->reconfig_failure && \ + !(sdata->flags & IEEE80211_SDATA_IN_DRIVER), \ + "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", \ + sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); \ + !!(sdata->flags & IEEE80211_SDATA_IN_DRIVER); \ }) static inline struct ieee80211_sub_if_data * diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 5315ab750280..33729870ad8a 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright 2017 Intel Deutschland GmbH - * Copyright(c) 2020-2022 Intel Corporation + * Copyright(c) 2020-2023 Intel Corporation */ #include <linux/ieee80211.h> @@ -602,7 +602,8 @@ void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id, goto out; link->u.mgd.driver_smps_mode = smps_mode; - ieee80211_queue_work(&sdata->local->hw, &link->u.mgd.request_smps_work); + wiphy_work_queue(sdata->local->hw.wiphy, + &link->u.mgd.request_smps_work); out: rcu_read_unlock(); } diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 9dffc3079588..e1900077bc4b 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2022 Intel Corporation + * Copyright(c) 2018-2023 Intel Corporation */ #include <linux/delay.h> @@ -226,7 +226,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct cfg80211_bss *bss; - u32 bss_change; + u64 bss_change; struct cfg80211_chan_def chandef; struct ieee80211_channel *chan; struct beacon_data *presp; @@ -478,7 +478,8 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, } int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_csa_settings *csa_settings) + struct cfg80211_csa_settings *csa_settings, + u64 *changed) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct beacon_data *presp, *old_presp; @@ -520,10 +521,11 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, if (old_presp) kfree_rcu(old_presp, rcu_head); - return BSS_CHANGED_BEACON; + *changed |= BSS_CHANGED_BEACON; + return 0; } -int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) +int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct cfg80211_bss *cbss; @@ -552,14 +554,15 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) ifibss->chandef = sdata->deflink.csa_chandef; /* generate the beacon */ - return ieee80211_ibss_csa_beacon(sdata, NULL); + return ieee80211_ibss_csa_beacon(sdata, NULL, changed); } void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; - cancel_work_sync(&ifibss->csa_connection_drop_work); + wiphy_work_cancel(sdata->local->hw.wiphy, + &ifibss->csa_connection_drop_work); } static struct sta_info *ieee80211_ibss_finish_sta(struct sta_info *sta) @@ -728,7 +731,8 
@@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->mtx); } -static void ieee80211_csa_connection_drop_work(struct work_struct *work) +static void ieee80211_csa_connection_drop_work(struct wiphy *wiphy, + struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, @@ -741,7 +745,7 @@ static void ieee80211_csa_connection_drop_work(struct work_struct *work) skb_queue_purge(&sdata->skb_queue); /* trigger a scan to find another IBSS network to join */ - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); sdata_unlock(sdata); } @@ -894,8 +898,8 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, return true; disconnect: ibss_dbg(sdata, "Can't handle channel switch, disconnect\n"); - ieee80211_queue_work(&sdata->local->hw, - &ifibss->csa_connection_drop_work); + wiphy_work_queue(sdata->local->hw.wiphy, + &ifibss->csa_connection_drop_work); ieee80211_ibss_csa_mark_radar(sdata); @@ -1242,7 +1246,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, spin_lock(&ifibss->incomplete_lock); list_add(&sta->list, &ifibss->incomplete_stations); spin_unlock(&ifibss->incomplete_lock); - ieee80211_queue_work(&local->hw, &sdata->work); + wiphy_work_queue(local->hw.wiphy, &sdata->work); } static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) @@ -1721,7 +1725,7 @@ static void ieee80211_ibss_timer(struct timer_list *t) struct ieee80211_sub_if_data *sdata = from_timer(sdata, t, u.ibss.timer); - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) @@ -1731,8 +1735,8 @@ void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) timer_setup(&ifibss->timer, ieee80211_ibss_timer, 0); INIT_LIST_HEAD(&ifibss->incomplete_stations); spin_lock_init(&ifibss->incomplete_lock); - INIT_WORK(&ifibss->csa_connection_drop_work, - ieee80211_csa_connection_drop_work); + wiphy_work_init(&ifibss->csa_connection_drop_work, + ieee80211_csa_connection_drop_work); } /* scan finished notification */ @@ -1754,7 +1758,7 @@ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local) int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params) { - u32 changed = 0; + u64 changed = 0; u32 rate_flags; struct ieee80211_supported_band *sband; enum ieee80211_chanctx_mode chanmode; @@ -1856,7 +1860,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, sdata->deflink.needed_rx_chains = local->rx_chains; sdata->control_port_over_nl80211 = params->control_port_over_nl80211; - ieee80211_queue_work(&local->hw, &sdata->work); + wiphy_work_queue(local->hw.wiphy, &sdata->work); return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 4159fb65038b..8e90cb2ea4bb 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -466,8 +466,8 @@ struct ieee80211_if_managed { struct timer_list conn_mon_timer; struct timer_list bcn_mon_timer; struct work_struct monitor_work; - struct work_struct beacon_connection_loss_work; - struct work_struct csa_connection_drop_work; + struct wiphy_work beacon_connection_loss_work; + struct wiphy_work csa_connection_drop_work; unsigned long beacon_timeout; unsigned long probe_timeout; @@ -553,7 +553,7 @@ struct ieee80211_if_managed { struct ieee80211_if_ibss { struct 
timer_list timer; - struct work_struct csa_connection_drop_work; + struct wiphy_work csa_connection_drop_work; unsigned long last_scan_completed; @@ -918,10 +918,9 @@ struct ieee80211_link_data_managed { bool csa_waiting_bcn; bool csa_ignored_same_chan; - struct timer_list chswitch_timer; - struct work_struct chswitch_work; + struct wiphy_delayed_work chswitch_work; - struct work_struct request_smps_work; + struct wiphy_work request_smps_work; bool beacon_crc_valid; u32 beacon_crc; struct ewma_beacon_signal ave_beacon_signal; @@ -1061,7 +1060,7 @@ struct ieee80211_sub_if_data { /* used to reconfigure hardware SM PS */ struct work_struct recalc_smps; - struct work_struct work; + struct wiphy_work work; struct sk_buff_head skb_queue; struct sk_buff_head status_queue; @@ -1394,6 +1393,9 @@ struct ieee80211_local { /* device is during a HW reconfig */ bool in_reconfig; + /* reconfiguration failed ... suppress some warnings etc. */ + bool reconfig_failure; + /* wowlan is enabled -- don't reconfig on resume */ bool wowlan; @@ -1827,7 +1829,7 @@ void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, u64 changed); void ieee80211_configure_filter(struct ieee80211_local *local); -u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); +u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local); int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, @@ -1887,8 +1889,10 @@ void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_csa_settings *csa_settings); -int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata); + struct cfg80211_csa_settings *csa_settings, + u64 *changed); +int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, + u64 *changed); void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata); /* OCB code */ @@ -1905,8 +1909,10 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_csa_settings *csa_settings); -int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata); + struct cfg80211_csa_settings *csa_settings, + u64 *changed); +int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata, + u64 *changed); /* scan/BSS handling */ void ieee80211_scan_work(struct work_struct *work); @@ -2269,8 +2275,6 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, * (or re-association) response frame if this is given * @from_ap: frame is received from an AP (currently used only * for EHT capabilities parsing) - * @scratch_len: if non zero, specifies the requested length of the scratch - * buffer; otherwise, 'len' is used. 
*/ struct ieee80211_elems_parse_params { const u8 *start; @@ -2281,7 +2285,6 @@ struct ieee80211_elems_parse_params { struct cfg80211_bss *bss; int link_id; bool from_ap; - size_t scratch_len; }; struct ieee802_11_elems * diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index bd2c48870add..5b67b44e3f89 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -8,7 +8,7 @@ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/slab.h> #include <linux/kernel.h> @@ -43,7 +43,7 @@ * by either the RTNL, the iflist_mtx or RCU. */ -static void ieee80211_iface_work(struct work_struct *work); +static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work); bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) { @@ -614,7 +614,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do RCU_INIT_POINTER(local->p2p_sdata, NULL); fallthrough; default: - cancel_work_sync(&sdata->work); + wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->work); /* * When we get here, the interface is marked down. * Free the remaining keys, if there are any @@ -1173,7 +1173,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) skb_queue_head_init(&sdata->skb_queue); skb_queue_head_init(&sdata->status_queue); - INIT_WORK(&sdata->work, ieee80211_iface_work); + wiphy_work_init(&sdata->work, ieee80211_iface_work); return 0; } @@ -1221,7 +1221,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct net_device *dev = wdev->netdev; struct ieee80211_local *local = sdata->local; - u32 changed = 0; + u64 changed = 0; int res; u32 hw_reconf_flags = 0; @@ -1281,6 +1281,9 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) } if (local->open_count == 0) { + /* here we can consider everything in good order (again) */ + local->reconfig_failure = false; + res = drv_start(local); if (res) goto err_del_bss; @@ -1622,7 +1625,7 @@ static void ieee80211_iface_process_status(struct ieee80211_sub_if_data *sdata, } } -static void ieee80211_iface_work(struct work_struct *work) +static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, work); @@ -1734,7 +1737,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, skb_queue_head_init(&sdata->skb_queue); skb_queue_head_init(&sdata->status_queue); - INIT_WORK(&sdata->work, ieee80211_iface_work); + wiphy_work_init(&sdata->work, ieee80211_iface_work); INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work); INIT_WORK(&sdata->activate_links_work, ieee80211_activate_links_work); @@ -2255,7 +2258,6 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata, *tmp; LIST_HEAD(unreg_list); - LIST_HEAD(wdev_list); ASSERT_RTNL(); @@ -2278,23 +2280,18 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) ieee80211_txq_teardown_flows(local); mutex_lock(&local->iflist_mtx); - list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { - list_del(&sdata->list); - - if (sdata->dev) - unregister_netdevice_queue(sdata->dev, &unreg_list); - else - list_add(&sdata->list, &wdev_list); - } + list_splice_init(&local->interfaces, 
&unreg_list); mutex_unlock(&local->iflist_mtx); - unregister_netdevice_many(&unreg_list); - wiphy_lock(local->hw.wiphy); - list_for_each_entry_safe(sdata, tmp, &wdev_list, list) { + list_for_each_entry_safe(sdata, tmp, &unreg_list, list) { + bool netdev = sdata->dev; + list_del(&sdata->list); cfg80211_unregister_wdev(&sdata->wdev); - kfree(sdata); + + if (!netdev) + kfree(sdata); } wiphy_unlock(local->hw.wiphy); } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 55cdfaef0f5d..24315d7b3126 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation */ #include <net/mac80211.h> @@ -291,7 +291,7 @@ void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata, drv_link_info_changed(local, sdata, link->conf, link->link_id, changed); } -u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata) +u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata) { sdata->vif.bss_conf.use_cts_prot = false; sdata->vif.bss_conf.use_short_preamble = false; @@ -364,7 +364,8 @@ static void ieee80211_restart_work(struct work_struct *work) * The exception is ieee80211_chswitch_done. * Then we can have a race... */ - cancel_work_sync(&sdata->u.mgd.csa_connection_drop_work); + wiphy_work_cancel(local->hw.wiphy, + &sdata->u.mgd.csa_connection_drop_work); if (sdata->vif.bss_conf.csa_active) { sdata_lock(sdata); ieee80211_sta_connection_lost(sdata, diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index f72333201903..af8c5fc2db14 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2022 Intel Corporation + * Copyright (C) 2018 - 2023 Intel Corporation * Authors: Luis Carlos Cobo <luisca@cozybit.com> * Javier Cardona <javier@cozybit.com> */ @@ -45,7 +45,7 @@ static void ieee80211_mesh_housekeeping_timer(struct timer_list *t) set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags); - ieee80211_queue_work(&local->hw, &sdata->work); + wiphy_work_queue(local->hw.wiphy, &sdata->work); } /** @@ -133,10 +133,10 @@ bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie) * * Returns: beacon changed flag if the beacon content changed. 
*/ -u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata) +u64 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata) { bool free_plinks; - u32 changed = 0; + u64 changed = 0; /* In case mesh_plink_free_count > 0 and mesh_plinktbl_capacity == 0, * the mesh interface might be able to establish plinks with peers that @@ -162,7 +162,7 @@ u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata) void mesh_sta_cleanup(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed = mesh_plink_deactivate(sta); + u64 changed = mesh_plink_deactivate(sta); if (changed) ieee80211_mbss_info_change_notify(sdata, changed); @@ -703,7 +703,7 @@ static void ieee80211_mesh_path_timer(struct timer_list *t) struct ieee80211_sub_if_data *sdata = from_timer(sdata, t, u.mesh.mesh_path_timer); - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } static void ieee80211_mesh_path_root_timer(struct timer_list *t) @@ -714,7 +714,7 @@ static void ieee80211_mesh_path_root_timer(struct timer_list *t) set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags); - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh) @@ -923,7 +923,7 @@ unsigned int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - u32 changed; + u64 changed; if (ifmsh->mshcfg.plink_timeout > 0) ieee80211_sta_expire(sdata, ifmsh->mshcfg.plink_timeout * HZ); @@ -1164,7 +1164,7 @@ ieee80211_mesh_rebuild_beacon(struct ieee80211_sub_if_data *sdata) } void ieee80211_mbss_info_change_notify(struct ieee80211_sub_if_data *sdata, - u32 changed) + u64 changed) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; unsigned long bits = changed; @@ -1177,14 +1177,14 @@ void ieee80211_mbss_info_change_notify(struct ieee80211_sub_if_data *sdata, for_each_set_bit(bit, &bits, sizeof(changed) * BITS_PER_BYTE) set_bit(bit, &ifmsh->mbss_changed); set_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags); - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_local *local = sdata->local; - u32 changed = BSS_CHANGED_BEACON | + u64 changed = BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_HT | BSS_CHANGED_BASIC_RATES | @@ -1202,7 +1202,7 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) ifmsh->sync_offset_clockdrift_max = 0; set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags); ieee80211_mesh_root_setup(ifmsh); - ieee80211_queue_work(&local->hw, &sdata->work); + wiphy_work_queue(local->hw.wiphy, &sdata->work); sdata->vif.bss_conf.ht_operation_mode = ifmsh->mshcfg.ht_opmode; sdata->vif.bss_conf.enable_beacon = true; @@ -1525,12 +1525,11 @@ free: kfree(elems); } -int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata) +int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_csa_settings *tmp_csa_settings; int ret = 0; - int changed = 0; /* Reset the TTL value and Initiator flag */ ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; @@ -1545,15 +1544,16 @@ int ieee80211_mesh_finish_csa(struct 
ieee80211_sub_if_data *sdata) if (ret) return -EINVAL; - changed |= BSS_CHANGED_BEACON; + *changed |= BSS_CHANGED_BEACON; mcsa_dbg(sdata, "complete switching to center freq %d MHz", sdata->vif.bss_conf.chandef.chan->center_freq); - return changed; + return 0; } int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_csa_settings *csa_settings) + struct cfg80211_csa_settings *csa_settings, + u64 *changed) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_csa_settings *tmp_csa_settings; @@ -1579,7 +1579,8 @@ int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, return ret; } - return BSS_CHANGED_BEACON; + *changed |= BSS_CHANGED_BEACON; + return 0; } static int mesh_fwd_csa_frame(struct ieee80211_sub_if_data *sdata, @@ -1720,7 +1721,8 @@ out: static void mesh_bss_info_changed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - u32 bit, changed = 0; + u32 bit; + u64 changed = 0; for_each_set_bit(bit, &ifmsh->mbss_changed, sizeof(changed) * BITS_PER_BYTE) { diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 022f41292a05..6c94222a9df5 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2008, 2009 open80211s Ltd. + * Copyright (C) 2023 Intel Corporation * Authors: Luis Carlos Cobo <luisca@cozybit.com> * Javier Cardona <javier@cozybit.com> */ @@ -252,11 +253,11 @@ void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh); const struct ieee80211_mesh_sync_ops *ieee80211_mesh_sync_ops_get(u8 method); /* wrapper for ieee80211_bss_info_change_notify() */ void ieee80211_mbss_info_change_notify(struct ieee80211_sub_if_data *sdata, - u32 changed); + u64 changed); /* mesh power save */ -u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata); -u32 ieee80211_mps_set_sta_local_pm(struct sta_info *sta, +u64 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata); +u64 ieee80211_mps_set_sta_local_pm(struct sta_info *sta, enum nl80211_mesh_power_mode pm); void ieee80211_mps_set_frame_flags(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, @@ -303,12 +304,12 @@ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, u8 *hw_addr, struct ieee802_11_elems *ie, struct ieee80211_rx_status *rx_status); bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie); -u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata); +u64 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata); void mesh_plink_timer(struct timer_list *t); void mesh_plink_broken(struct sta_info *sta); -u32 mesh_plink_deactivate(struct sta_info *sta); -u32 mesh_plink_open(struct sta_info *sta); -u32 mesh_plink_block(struct sta_info *sta); +u64 mesh_plink_deactivate(struct sta_info *sta); +u64 mesh_plink_open(struct sta_info *sta); +u64 mesh_plink_block(struct sta_info *sta); void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status); @@ -349,14 +350,14 @@ void mesh_path_refresh(struct ieee80211_sub_if_data *sdata, #ifdef CONFIG_MAC80211_MESH static inline -u32 mesh_plink_inc_estab_count(struct ieee80211_sub_if_data *sdata) +u64 mesh_plink_inc_estab_count(struct ieee80211_sub_if_data *sdata) { atomic_inc(&sdata->u.mesh.estab_plinks); return mesh_accept_plinks_update(sdata) | BSS_CHANGED_BEACON; } static inline -u32 mesh_plink_dec_estab_count(struct ieee80211_sub_if_data *sdata) +u64 
mesh_plink_dec_estab_count(struct ieee80211_sub_if_data *sdata) { atomic_dec(&sdata->u.mesh.estab_plinks); return mesh_accept_plinks_update(sdata) | BSS_CHANGED_BEACON; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 5217e1d97dd6..51369072984e 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2022 Intel Corporation + * Copyright (C) 2019, 2021-2023 Intel Corporation * Author: Luis Carlos Cobo <luisca@cozybit.com> */ @@ -1026,14 +1026,14 @@ static void mesh_queue_preq(struct mesh_path *mpath, u8 flags) spin_unlock_bh(&ifmsh->mesh_preq_queue_lock); if (time_after(jiffies, ifmsh->last_preq + min_preq_int_jiff(sdata))) - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); else if (time_before(jiffies, ifmsh->last_preq)) { /* avoid long wait if did not send preqs for a long time * and jiffies wrapped around */ ifmsh->last_preq = jiffies - min_preq_int_jiff(sdata) - 1; - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } else mod_timer(&ifmsh->mesh_path_timer, ifmsh->last_preq + min_preq_int_jiff(sdata)); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 8f168bc4e4b8..f3d5bb0a59f1 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2022 Intel Corporation + * Copyright (C) 2019, 2021-2023 Intel Corporation * Author: Luis Carlos Cobo <luisca@cozybit.com> */ #include <linux/gfp.h> @@ -90,12 +90,13 @@ static inline void mesh_plink_fsm_restart(struct sta_info *sta) * * Returns BSS_CHANGED_ERP_SLOT or 0 for no change. */ -static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) +static u64 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct sta_info *sta; - u32 erp_rates = 0, changed = 0; + u32 erp_rates = 0; + u64 changed = 0; int i; bool short_slot = false; @@ -153,7 +154,7 @@ out: * is selected if all peers in our 20/40MHz MBSS support HT and at least one * HT20 peer is present. Otherwise no-protection mode is selected. 
*/ -static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) +static u64 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; @@ -365,10 +366,10 @@ free: * * Locking: the caller must hold sta->mesh->plink_lock */ -static u32 __mesh_plink_deactivate(struct sta_info *sta) +static u64 __mesh_plink_deactivate(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed = 0; + u64 changed = 0; lockdep_assert_held(&sta->mesh->plink_lock); @@ -390,10 +391,10 @@ static u32 __mesh_plink_deactivate(struct sta_info *sta) * * All mesh paths with this peer as next hop will be flushed */ -u32 mesh_plink_deactivate(struct sta_info *sta) +u64 mesh_plink_deactivate(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed; + u64 changed; spin_lock_bh(&sta->mesh->plink_lock); changed = __mesh_plink_deactivate(sta); @@ -622,7 +623,7 @@ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, struct ieee80211_rx_status *rx_status) { struct sta_info *sta; - u32 changed = 0; + u64 changed = 0; sta = mesh_sta_info_get(sdata, hw_addr, elems, rx_status); if (!sta) @@ -775,10 +776,10 @@ static u16 mesh_get_new_llid(struct ieee80211_sub_if_data *sdata) return llid; } -u32 mesh_plink_open(struct sta_info *sta) +u64 mesh_plink_open(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed; + u64 changed; if (!test_sta_flag(sta, WLAN_STA_AUTH)) return 0; @@ -805,9 +806,9 @@ u32 mesh_plink_open(struct sta_info *sta) return changed; } -u32 mesh_plink_block(struct sta_info *sta) +u64 mesh_plink_block(struct sta_info *sta) { - u32 changed; + u64 changed; spin_lock_bh(&sta->mesh->plink_lock); changed = __mesh_plink_deactivate(sta); @@ -831,11 +832,11 @@ static void mesh_plink_close(struct ieee80211_sub_if_data *sdata, mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout); } -static u32 mesh_plink_establish(struct ieee80211_sub_if_data *sdata, +static u64 mesh_plink_establish(struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg; - u32 changed = 0; + u64 changed = 0; del_timer(&sta->mesh->plink_timer); sta->mesh->plink_state = NL80211_PLINK_ESTAB; @@ -857,12 +858,12 @@ static u32 mesh_plink_establish(struct ieee80211_sub_if_data *sdata, * * Return: changed MBSS flags */ -static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, +static u64 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum plink_event event) { struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg; enum ieee80211_self_protected_actioncode action = 0; - u32 changed = 0; + u64 changed = 0; bool flush = false; mpl_dbg(sdata, "peer %pM in state %s got event %s\n", sta->sta.addr, @@ -1117,7 +1118,7 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, struct sta_info *sta; enum plink_event event; enum ieee80211_self_protected_actioncode ftype; - u32 changed = 0; + u64 changed = 0; u8 ie_len = elems->peering_len; u16 plid, llid = 0; diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 3fbd0b9ff913..35eacca43e49 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -3,6 +3,7 @@ * Copyright 2012-2013, Marco Porsch <marco.porsch@s2005.tu-chemnitz.de> * Copyright 2012-2013, cozybit Inc. 
* Copyright (C) 2021 Intel Corporation + * Copyright (C) 2023 Intel Corporation */ #include "mesh.h" @@ -77,14 +78,14 @@ static void mps_qos_null_tx(struct sta_info *sta) * sets the non-peer power mode and triggers the driver PS (re-)configuration * Return BSS_CHANGED_BEACON if a beacon update is necessary. */ -u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata) +u64 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct sta_info *sta; bool peering = false; int light_sleep_cnt = 0; int deep_sleep_cnt = 0; - u32 changed = 0; + u64 changed = 0; enum nl80211_mesh_power_mode nonpeer_pm; rcu_read_lock(); @@ -148,7 +149,7 @@ u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata) * @pm: the power mode to set * Return BSS_CHANGED_BEACON if a beacon update is in order. */ -u32 ieee80211_mps_set_sta_local_pm(struct sta_info *sta, +u64 ieee80211_mps_set_sta_local_pm(struct sta_info *sta, enum nl80211_mesh_power_mode pm) { struct ieee80211_sub_if_data *sdata = sta->sdata; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5a4303130ef2..5793f0e7f955 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1680,10 +1680,12 @@ void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, } /* spectrum management related things */ -static void ieee80211_chswitch_work(struct work_struct *work) +static void ieee80211_chswitch_work(struct wiphy *wiphy, + struct wiphy_work *work) { struct ieee80211_link_data *link = - container_of(work, struct ieee80211_link_data, u.mgd.chswitch_work); + container_of(work, struct ieee80211_link_data, + u.mgd.chswitch_work.work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -1723,8 +1725,8 @@ static void ieee80211_chswitch_work(struct work_struct *work) sdata_info(sdata, "failed to use reserved channel context, disconnecting (err=%d)\n", ret); - ieee80211_queue_work(&sdata->local->hw, - &ifmgd->csa_connection_drop_work); + wiphy_work_queue(sdata->local->hw.wiphy, + &ifmgd->csa_connection_drop_work); goto out; } @@ -1735,8 +1737,8 @@ static void ieee80211_chswitch_work(struct work_struct *work) &link->csa_chandef)) { sdata_info(sdata, "failed to finalize channel switch, disconnecting\n"); - ieee80211_queue_work(&sdata->local->hw, - &ifmgd->csa_connection_drop_work); + wiphy_work_queue(sdata->local->hw.wiphy, + &ifmgd->csa_connection_drop_work); goto out; } @@ -1780,8 +1782,8 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link) if (ret) { sdata_info(sdata, "driver post channel switch failed, disconnecting\n"); - ieee80211_queue_work(&local->hw, - &ifmgd->csa_connection_drop_work); + wiphy_work_queue(sdata->local->hw.wiphy, + &ifmgd->csa_connection_drop_work); return; } @@ -1800,24 +1802,16 @@ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success) if (!success) { sdata_info(sdata, "driver channel switch failed, disconnecting\n"); - ieee80211_queue_work(&sdata->local->hw, - &ifmgd->csa_connection_drop_work); + wiphy_work_queue(sdata->local->hw.wiphy, + &ifmgd->csa_connection_drop_work); } else { - ieee80211_queue_work(&sdata->local->hw, - &sdata->deflink.u.mgd.chswitch_work); + wiphy_delayed_work_queue(sdata->local->hw.wiphy, + &sdata->deflink.u.mgd.chswitch_work, + 0); } } EXPORT_SYMBOL(ieee80211_chswitch_done); -static void ieee80211_chswitch_timer(struct timer_list *t) -{ - struct 
ieee80211_link_data *link = - from_timer(link, t, u.mgd.chswitch_timer); - - ieee80211_queue_work(&link->sdata->local->hw, - &link->u.mgd.chswitch_work); -} - static void ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link) { @@ -1861,6 +1855,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, struct ieee80211_csa_ie csa_ie; struct ieee80211_channel_switch ch_switch; struct ieee80211_bss *bss; + unsigned long timeout; int res; sdata_assert_lock(sdata); @@ -2004,12 +1999,11 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, } /* channel switch handled in software */ - if (csa_ie.count <= 1) - ieee80211_queue_work(&local->hw, &link->u.mgd.chswitch_work); - else - mod_timer(&link->u.mgd.chswitch_timer, - TU_TO_EXP_TIME((csa_ie.count - 1) * - cbss->beacon_interval)); + timeout = TU_TO_JIFFIES((max_t(int, csa_ie.count, 1) - 1) * + cbss->beacon_interval); + wiphy_delayed_work_queue(local->hw.wiphy, + &link->u.mgd.chswitch_work, + timeout); return; lock_and_drop_connection: mutex_lock(&local->mtx); @@ -2025,7 +2019,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, link->conf->csa_active = true; link->csa_block_tx = csa_ie.mode; - ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); + wiphy_work_queue(sdata->local->hw.wiphy, + &ifmgd->csa_connection_drop_work); mutex_unlock(&local->chanctx_mtx); mutex_unlock(&local->mtx); } @@ -2116,7 +2111,7 @@ static void ieee80211_find_cisco_dtpc(struct ieee80211_sub_if_data *sdata, *pwr_level = (__s8)cisco_dtpc_ie[4]; } -static u32 ieee80211_handle_pwr_constr(struct ieee80211_link_data *link, +static u64 ieee80211_handle_pwr_constr(struct ieee80211_link_data *link, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, const u8 *country_ie, u8 country_ie_len, @@ -2706,12 +2701,12 @@ static void ieee80211_stop_poll(struct ieee80211_sub_if_data *sdata) mutex_unlock(&sdata->local->mtx); } -static u32 ieee80211_handle_bss_capability(struct ieee80211_link_data *link, +static u64 ieee80211_handle_bss_capability(struct ieee80211_link_data *link, u16 capab, bool erp_valid, u8 erp) { struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_supported_band *sband; - u32 changed = 0; + u64 changed = 0; bool use_protection; bool use_short_preamble; bool use_short_slot; @@ -2757,7 +2752,7 @@ static u64 ieee80211_link_set_associated(struct ieee80211_link_data *link, struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_bss *bss = (void *)cbss->priv; - u32 changed = BSS_CHANGED_QOS; + u64 changed = BSS_CHANGED_QOS; /* not really used in MLO */ sdata->u.mgd.beacon_timeout = @@ -2895,7 +2890,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; unsigned int link_id; - u32 changed = 0; + u64 changed = 0; struct ieee80211_prep_tx_info info = { .subtype = stype, }; @@ -3031,7 +3026,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, del_timer_sync(&sdata->u.mgd.conn_mon_timer); del_timer_sync(&sdata->u.mgd.bcn_mon_timer); del_timer_sync(&sdata->u.mgd.timer); - del_timer_sync(&sdata->deflink.u.mgd.chswitch_timer); sdata->vif.bss_conf.dtim_period = 0; sdata->vif.bss_conf.beacon_rate = NULL; @@ -3162,7 +3156,7 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.probe_send_count = 0; else sdata->u.mgd.nullfunc_failed = true; - 
ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata, @@ -3423,7 +3417,8 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) sdata_unlock(sdata); } -static void ieee80211_beacon_connection_loss_work(struct work_struct *work) +static void ieee80211_beacon_connection_loss_work(struct wiphy *wiphy, + struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, @@ -3448,7 +3443,8 @@ static void ieee80211_beacon_connection_loss_work(struct work_struct *work) } } -static void ieee80211_csa_connection_drop_work(struct work_struct *work) +static void ieee80211_csa_connection_drop_work(struct wiphy *wiphy, + struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, @@ -3465,7 +3461,7 @@ void ieee80211_beacon_loss(struct ieee80211_vif *vif) trace_api_beacon_loss(sdata); sdata->u.mgd.connection_loss = false; - ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work); + wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_beacon_loss); @@ -3477,7 +3473,7 @@ void ieee80211_connection_loss(struct ieee80211_vif *vif) trace_api_connection_loss(sdata); sdata->u.mgd.connection_loss = true; - ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work); + wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_connection_loss); @@ -3493,7 +3489,7 @@ void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect) sdata->u.mgd.driver_disconnect = true; sdata->u.mgd.reconnect = reconnect; - ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work); + wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_disconnect); @@ -3909,8 +3905,8 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, *have_higher_than_11mbit = true; /* - * Skip HT, VHT, HE and SAE H2E only BSS membership selectors - * since they're not rates. + * Skip HT, VHT, HE, EHT and SAE H2E only BSS membership + * selectors since they're not rates. 
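The hunk just above adds the EHT selector to the values skipped while parsing Supported Rates. BSS membership selectors reuse the encoding of a basic rate — the element carries 0x80 | selector — which is why each comparison OR's the high bit in. A compact restatement of the test follows; the numeric selector values are an assumption quoted from common 802.11 usage, so verify them against the spec before reuse:

#include <stdbool.h>
#include <stdint.h>

/* assumed selector values, for illustration only */
enum {
	SEL_HT_PHY  = 127,
	SEL_VHT_PHY = 126,
	SEL_SAE_H2E = 123,
	SEL_HE_PHY  = 122,
	SEL_EHT_PHY = 121,
};

static bool is_membership_selector(uint8_t rate_entry)
{
	/* selectors borrow the basic-rate encoding: MSB set */
	switch (rate_entry) {
	case 0x80 | SEL_HT_PHY:
	case 0x80 | SEL_VHT_PHY:
	case 0x80 | SEL_SAE_H2E:
	case 0x80 | SEL_HE_PHY:
	case 0x80 | SEL_EHT_PHY:
		return true;
	default:
		return false;
	}
}
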
* * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly @@ -3919,6 +3915,7 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) || supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY) || supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY) || + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_EHT_PHY) || supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_SAE_H2E)) continue; @@ -3965,7 +3962,7 @@ static bool ieee80211_twt_req_supported(struct ieee80211_sub_if_data *sdata, IEEE80211_HE_MAC_CAP0_TWT_REQ); } -static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, +static u64 ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_link_data *link, struct link_sta_info *link_sta, @@ -4844,6 +4841,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, IEEE80211_CONN_DISABLE_EHT)) && he_oper) { const struct cfg80211_bss_ies *cbss_ies; + const struct element *eht_ml_elem; const u8 *eht_oper_ie; cbss_ies = rcu_dereference(cbss->ies); @@ -4854,6 +4852,19 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, eht_oper = (void *)(eht_oper_ie + 3); else eht_oper = NULL; + + eht_ml_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, + cbss_ies->data, cbss_ies->len); + + /* data + 1 / datalen - 1 since it's an extended element */ + if (eht_ml_elem && + ieee80211_mle_size_ok(eht_ml_elem->data + 1, + eht_ml_elem->datalen - 1)) { + sdata->vif.cfg.eml_cap = + ieee80211_mle_get_eml_cap(eht_ml_elem->data + 1); + sdata->vif.cfg.eml_med_sync_delay = + ieee80211_mle_get_eml_med_sync_delay(eht_ml_elem->data + 1); + } } /* Allow VHT if at least one channel on the sband supports 80 MHz */ @@ -6060,7 +6071,7 @@ static void ieee80211_sta_timer(struct timer_list *t) struct ieee80211_sub_if_data *sdata = from_timer(sdata, t, u.mgd.timer); - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, @@ -6204,7 +6215,7 @@ void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.status_acked = acked; sdata->u.mgd.status_received = true; - ieee80211_queue_work(&local->hw, &sdata->work); + wiphy_work_queue(local->hw.wiphy, &sdata->work); } void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) @@ -6367,8 +6378,8 @@ static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) return; sdata->u.mgd.connection_loss = false; - ieee80211_queue_work(&sdata->local->hw, - &sdata->u.mgd.beacon_connection_loss_work); + wiphy_work_queue(sdata->local->hw.wiphy, + &sdata->u.mgd.beacon_connection_loss_work); } static void ieee80211_sta_conn_mon_timer(struct timer_list *t) @@ -6524,7 +6535,8 @@ void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata) sdata_unlock(sdata); } -static void ieee80211_request_smps_mgd_work(struct work_struct *work) +static void ieee80211_request_smps_mgd_work(struct wiphy *wiphy, + struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, @@ -6542,10 +6554,10 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; INIT_WORK(&ifmgd->monitor_work, ieee80211_sta_monitor_work); - INIT_WORK(&ifmgd->beacon_connection_loss_work, - ieee80211_beacon_connection_loss_work); 
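Nearly every queueing change in this file follows one recipe: plain workqueue items become wiphy works, which cfg80211 runs with the wiphy mutex held. As the surrounding hunks show, INIT_WORK() becomes wiphy_work_init(), ieee80211_queue_work() becomes wiphy_work_queue(), cancel_work_sync() becomes wiphy_work_cancel(), and the old chswitch timer+work pair collapses into a single wiphy_delayed_work. Only the handler signature changes; a small sketch with a hypothetical `struct foo`:

#include <net/cfg80211.h>

struct foo {
	struct wiphy_work work;
	int state;
};

/* runs with the wiphy mutex already held - no extra locking needed */
static void foo_work(struct wiphy *wiphy, struct wiphy_work *work)
{
	struct foo *foo = container_of(work, struct foo, work);

	foo->state++;
}

static void foo_setup(struct wiphy *wiphy, struct foo *foo)
{
	wiphy_work_init(&foo->work, foo_work);
	wiphy_work_queue(wiphy, &foo->work);	/* schedule it */
	wiphy_work_cancel(wiphy, &foo->work);	/* synchronous cancel */
}
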
- INIT_WORK(&ifmgd->csa_connection_drop_work, - ieee80211_csa_connection_drop_work); + wiphy_work_init(&ifmgd->beacon_connection_loss_work, + ieee80211_beacon_connection_loss_work); + wiphy_work_init(&ifmgd->csa_connection_drop_work, + ieee80211_csa_connection_drop_work); INIT_DELAYED_WORK(&ifmgd->tdls_peer_del_work, ieee80211_tdls_peer_del_work); timer_setup(&ifmgd->timer, ieee80211_sta_timer, 0); @@ -6574,15 +6586,15 @@ void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) link->u.mgd.conn_flags = 0; link->conf->bssid = link->u.mgd.bssid; - INIT_WORK(&link->u.mgd.request_smps_work, - ieee80211_request_smps_mgd_work); + wiphy_work_init(&link->u.mgd.request_smps_work, + ieee80211_request_smps_mgd_work); if (local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) link->u.mgd.req_smps = IEEE80211_SMPS_AUTOMATIC; else link->u.mgd.req_smps = IEEE80211_SMPS_OFF; - INIT_WORK(&link->u.mgd.chswitch_work, ieee80211_chswitch_work); - timer_setup(&link->u.mgd.chswitch_timer, ieee80211_chswitch_timer, 0); + wiphy_delayed_work_init(&link->u.mgd.chswitch_work, + ieee80211_chswitch_work); if (sdata->u.mgd.assoc_data) ether_addr_copy(link->conf->addr, @@ -7538,8 +7550,10 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, void ieee80211_mgd_stop_link(struct ieee80211_link_data *link) { - cancel_work_sync(&link->u.mgd.request_smps_work); - cancel_work_sync(&link->u.mgd.chswitch_work); + wiphy_work_cancel(link->sdata->local->hw.wiphy, + &link->u.mgd.request_smps_work); + wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy, + &link->u.mgd.chswitch_work); } void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) @@ -7552,8 +7566,10 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) * cancelled when disconnecting. */ cancel_work_sync(&ifmgd->monitor_work); - cancel_work_sync(&ifmgd->beacon_connection_loss_work); - cancel_work_sync(&ifmgd->csa_connection_drop_work); + wiphy_work_cancel(sdata->local->hw.wiphy, + &ifmgd->beacon_connection_loss_work); + wiphy_work_cancel(sdata->local->hw.wiphy, + &ifmgd->csa_connection_drop_work); cancel_delayed_work_sync(&ifmgd->tdls_peer_del_work); sdata_lock(sdata); diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index a57dcbe99a0d..b44896e14522 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -4,7 +4,7 @@ * * Copyright: (c) 2014 Czech Technical University in Prague * (c) 2014 Volkswagen Group Research - * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2022 - 2023 Intel Corporation * Author: Rostislav Lisovy <rostislav.lisovy@fel.cvut.cz> * Funded by: Volkswagen Group Research */ @@ -81,7 +81,7 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, spin_lock(&ifocb->incomplete_lock); list_add(&sta->list, &ifocb->incomplete_stations); spin_unlock(&ifocb->incomplete_lock); - ieee80211_queue_work(&local->hw, &sdata->work); + wiphy_work_queue(local->hw.wiphy, &sdata->work); } static struct sta_info *ieee80211_ocb_finish_sta(struct sta_info *sta) @@ -157,7 +157,7 @@ static void ieee80211_ocb_housekeeping_timer(struct timer_list *t) set_bit(OCB_WORK_HOUSEKEEPING, &ifocb->wrkq_flags); - ieee80211_queue_work(&local->hw, &sdata->work); + wiphy_work_queue(local->hw.wiphy, &sdata->work); } void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata) @@ -175,7 +175,7 @@ int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; - u32 changed = BSS_CHANGED_OCB | BSS_CHANGED_BSSID; + u64 changed = 
BSS_CHANGED_OCB | BSS_CHANGED_BSSID; int err; if (ifocb->joined == true) @@ -197,7 +197,7 @@ int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, ifocb->joined = true; set_bit(OCB_WORK_HOUSEKEEPING, &ifocb->wrkq_flags); - ieee80211_queue_work(&local->hw, &sdata->work); + wiphy_work_queue(local->hw.wiphy, &sdata->work); netif_carrier_on(sdata->dev); return 0; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index fc6e130364da..e2a973309bf7 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -229,7 +229,7 @@ static void __ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, } skb_queue_tail(&sdata->skb_queue, skb); - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); if (sta) sta->deflink.rx_stats.packets++; } diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 32fa8aca7005..ea5383136fff 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -502,7 +502,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) */ list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } if (was_scanning) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 1400512e0dde..731b832b257c 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/module.h> @@ -1274,7 +1274,117 @@ static int __must_check __sta_info_destroy_part1(struct sta_info *sta) return 0; } -static void __sta_info_destroy_part2(struct sta_info *sta) +static int _sta_info_move_state(struct sta_info *sta, + enum ieee80211_sta_state new_state, + bool recalc) +{ + might_sleep(); + + if (sta->sta_state == new_state) + return 0; + + /* check allowed transitions first */ + + switch (new_state) { + case IEEE80211_STA_NONE: + if (sta->sta_state != IEEE80211_STA_AUTH) + return -EINVAL; + break; + case IEEE80211_STA_AUTH: + if (sta->sta_state != IEEE80211_STA_NONE && + sta->sta_state != IEEE80211_STA_ASSOC) + return -EINVAL; + break; + case IEEE80211_STA_ASSOC: + if (sta->sta_state != IEEE80211_STA_AUTH && + sta->sta_state != IEEE80211_STA_AUTHORIZED) + return -EINVAL; + break; + case IEEE80211_STA_AUTHORIZED: + if (sta->sta_state != IEEE80211_STA_ASSOC) + return -EINVAL; + break; + default: + WARN(1, "invalid state %d", new_state); + return -EINVAL; + } + + sta_dbg(sta->sdata, "moving STA %pM to state %d\n", + sta->sta.addr, new_state); + + /* notify the driver before the actual changes so it can + * fail the transition + */ + if (test_sta_flag(sta, WLAN_STA_INSERTED)) { + int err = drv_sta_state(sta->local, sta->sdata, sta, + sta->sta_state, new_state); + if (err) + return err; + } + + /* reflect the change in all state variables */ + + switch (new_state) { + case IEEE80211_STA_NONE: + if (sta->sta_state == IEEE80211_STA_AUTH) + clear_bit(WLAN_STA_AUTH, &sta->_flags); + break; + case IEEE80211_STA_AUTH: + if (sta->sta_state == IEEE80211_STA_NONE) { + set_bit(WLAN_STA_AUTH, &sta->_flags); + } else if (sta->sta_state == IEEE80211_STA_ASSOC) { + clear_bit(WLAN_STA_ASSOC, &sta->_flags); + if (recalc) { + ieee80211_recalc_min_chandef(sta->sdata, -1); + if 
(!sta->sta.support_p2p_ps) + ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); + } + } + break; + case IEEE80211_STA_ASSOC: + if (sta->sta_state == IEEE80211_STA_AUTH) { + set_bit(WLAN_STA_ASSOC, &sta->_flags); + sta->assoc_at = ktime_get_boottime_ns(); + if (recalc) { + ieee80211_recalc_min_chandef(sta->sdata, -1); + if (!sta->sta.support_p2p_ps) + ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); + } + } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { + ieee80211_vif_dec_num_mcast(sta->sdata); + clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); + ieee80211_clear_fast_xmit(sta); + ieee80211_clear_fast_rx(sta); + } + break; + case IEEE80211_STA_AUTHORIZED: + if (sta->sta_state == IEEE80211_STA_ASSOC) { + ieee80211_vif_inc_num_mcast(sta->sdata); + set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); + ieee80211_check_fast_xmit(sta); + ieee80211_check_fast_rx(sta); + } + if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || + sta->sdata->vif.type == NL80211_IFTYPE_AP) + cfg80211_send_layer2_update(sta->sdata->dev, + sta->sta.addr); + break; + default: + break; + } + + sta->sta_state = new_state; + + return 0; +} + +int sta_info_move_state(struct sta_info *sta, + enum ieee80211_sta_state new_state) +{ + return _sta_info_move_state(sta, new_state, true); +} + +static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; @@ -1290,7 +1400,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta) lockdep_assert_held(&local->sta_mtx); if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { - ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); + ret = _sta_info_move_state(sta, IEEE80211_STA_ASSOC, recalc); WARN_ON_ONCE(ret); } @@ -1318,7 +1428,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta) local->sta_generation++; while (sta->sta_state > IEEE80211_STA_NONE) { - ret = sta_info_move_state(sta, sta->sta_state - 1); + ret = _sta_info_move_state(sta, sta->sta_state - 1, recalc); if (ret) { WARN_ON_ONCE(1); break; @@ -1355,7 +1465,7 @@ int __must_check __sta_info_destroy(struct sta_info *sta) synchronize_net(); - __sta_info_destroy_part2(sta); + __sta_info_destroy_part2(sta, true); return 0; } @@ -1462,9 +1572,18 @@ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans) } if (!list_empty(&free_list)) { + bool support_p2p_ps = true; + synchronize_net(); - list_for_each_entry_safe(sta, tmp, &free_list, free_list) - __sta_info_destroy_part2(sta); + list_for_each_entry_safe(sta, tmp, &free_list, free_list) { + if (!sta->sta.support_p2p_ps) + support_p2p_ps = false; + __sta_info_destroy_part2(sta, false); + } + + ieee80211_recalc_min_chandef(sdata, -1); + if (!support_p2p_ps) + ieee80211_recalc_p2p_go_ps_allowed(sdata); } mutex_unlock(&local->sta_mtx); @@ -2252,106 +2371,6 @@ void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, } } -int sta_info_move_state(struct sta_info *sta, - enum ieee80211_sta_state new_state) -{ - might_sleep(); - - if (sta->sta_state == new_state) - return 0; - - /* check allowed transitions first */ - - switch (new_state) { - case IEEE80211_STA_NONE: - if (sta->sta_state != IEEE80211_STA_AUTH) - return -EINVAL; - break; - case IEEE80211_STA_AUTH: - if (sta->sta_state != IEEE80211_STA_NONE && - sta->sta_state != IEEE80211_STA_ASSOC) - return -EINVAL; - break; - case IEEE80211_STA_ASSOC: - if (sta->sta_state != IEEE80211_STA_AUTH && - sta->sta_state != IEEE80211_STA_AUTHORIZED) - return -EINVAL; - break; - case IEEE80211_STA_AUTHORIZED: - 
if (sta->sta_state != IEEE80211_STA_ASSOC) - return -EINVAL; - break; - default: - WARN(1, "invalid state %d", new_state); - return -EINVAL; - } - - sta_dbg(sta->sdata, "moving STA %pM to state %d\n", - sta->sta.addr, new_state); - - /* - * notify the driver before the actual changes so it can - * fail the transition - */ - if (test_sta_flag(sta, WLAN_STA_INSERTED)) { - int err = drv_sta_state(sta->local, sta->sdata, sta, - sta->sta_state, new_state); - if (err) - return err; - } - - /* reflect the change in all state variables */ - - switch (new_state) { - case IEEE80211_STA_NONE: - if (sta->sta_state == IEEE80211_STA_AUTH) - clear_bit(WLAN_STA_AUTH, &sta->_flags); - break; - case IEEE80211_STA_AUTH: - if (sta->sta_state == IEEE80211_STA_NONE) { - set_bit(WLAN_STA_AUTH, &sta->_flags); - } else if (sta->sta_state == IEEE80211_STA_ASSOC) { - clear_bit(WLAN_STA_ASSOC, &sta->_flags); - ieee80211_recalc_min_chandef(sta->sdata, -1); - if (!sta->sta.support_p2p_ps) - ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); - } - break; - case IEEE80211_STA_ASSOC: - if (sta->sta_state == IEEE80211_STA_AUTH) { - set_bit(WLAN_STA_ASSOC, &sta->_flags); - sta->assoc_at = ktime_get_boottime_ns(); - ieee80211_recalc_min_chandef(sta->sdata, -1); - if (!sta->sta.support_p2p_ps) - ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); - } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { - ieee80211_vif_dec_num_mcast(sta->sdata); - clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); - ieee80211_clear_fast_xmit(sta); - ieee80211_clear_fast_rx(sta); - } - break; - case IEEE80211_STA_AUTHORIZED: - if (sta->sta_state == IEEE80211_STA_ASSOC) { - ieee80211_vif_inc_num_mcast(sta->sdata); - set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); - ieee80211_check_fast_xmit(sta); - ieee80211_check_fast_rx(sta); - } - if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sta->sdata->vif.type == NL80211_IFTYPE_AP) - cfg80211_send_layer2_update(sta->sdata->dev, - sta->sta.addr); - break; - default: - break; - } - - sta->sta_state = new_state; - - return 0; -} - static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { @@ -2913,6 +2932,8 @@ int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id) if (!test_sta_flag(sta, WLAN_STA_INSERTED)) goto hash; + ieee80211_recalc_min_chandef(sdata, link_id); + /* Ensure the values are updated for the driver, * redone by sta_remove_link on failure. 
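The sta_info.c rework above is not just code motion: sta_info_move_state() becomes a thin wrapper around _sta_info_move_state(), which takes an extra `recalc` flag. During a bulk flush, __sta_info_destroy_part2() is called with recalc=false so that ieee80211_recalc_min_chandef() and ieee80211_recalc_p2p_go_ps_allowed() run once after the loop instead of once per station. The shape of the optimization, as a self-contained sketch with hypothetical names:

#include <stddef.h>

struct item { struct item *next; };
struct parent { struct item *items; int stat; };

static void recalc_expensive_state(struct parent *p)
{
	p->stat++;	/* stands in for the O(n) recalculation */
}

static void destroy_one(struct parent *p, struct item *it, int recalc)
{
	(void)it;	/* per-item teardown elided */
	if (recalc)
		recalc_expensive_state(p);	/* old behavior: per item */
}

static void flush_all(struct parent *p)
{
	struct item *it;

	for (it = p->items; it; it = it->next)
		destroy_one(p, it, 0);	/* new behavior: skip inside loop */

	recalc_expensive_state(p);	/* single pass afterwards */
}
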
*/ diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 2b13a52ce96c..44d83da60aee 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2021-2022 Intel Corporation + * Copyright 2021-2023 Intel Corporation */ #include <linux/export.h> @@ -747,8 +747,8 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, if (qskb) { skb_queue_tail(&sdata->status_queue, qskb); - ieee80211_queue_work(&local->hw, - &sdata->work); + wiphy_work_queue(local->hw.wiphy, + &sdata->work); } } } else { diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index b255f3b5bf01..52c47674a554 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1431,8 +1431,8 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, } if (ret == 0) - ieee80211_queue_work(&sdata->local->hw, - &sdata->deflink.u.mgd.request_smps_work); + wiphy_work_queue(sdata->local->hw.wiphy, + &sdata->deflink.u.mgd.request_smps_work); mutex_unlock(&local->mtx); sdata_unlock(sdata); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 13b522dab0a3..7e2d68e0d79f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -26,6 +26,7 @@ #include <net/codel_impl.h> #include <asm/unaligned.h> #include <net/fq_impl.h> +#include <net/gso.h> #include "ieee80211_i.h" #include "driver-ops.h" @@ -581,25 +582,9 @@ ieee80211_select_link_key(struct ieee80211_tx_data *tx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); - enum { - USE_NONE, - USE_MGMT_KEY, - USE_MCAST_KEY, - } which_key = USE_NONE; struct ieee80211_link_data *link; unsigned int link_id; - if (ieee80211_is_group_privacy_action(tx->skb)) - which_key = USE_MCAST_KEY; - else if (ieee80211_is_mgmt(hdr->frame_control) && - is_multicast_ether_addr(hdr->addr1) && - ieee80211_is_robust_mgmt_frame(tx->skb)) - which_key = USE_MGMT_KEY; - else if (is_multicast_ether_addr(hdr->addr1)) - which_key = USE_MCAST_KEY; - else - return NULL; - link_id = u32_get_bits(info->control.flags, IEEE80211_TX_CTRL_MLO_LINK); if (link_id == IEEE80211_LINK_UNSPECIFIED) { link = &tx->sdata->deflink; @@ -609,14 +594,14 @@ ieee80211_select_link_key(struct ieee80211_tx_data *tx) return NULL; } - switch (which_key) { - case USE_NONE: - break; - case USE_MGMT_KEY: + if (ieee80211_is_group_privacy_action(tx->skb)) + return rcu_dereference(link->default_multicast_key); + else if (ieee80211_is_mgmt(hdr->frame_control) && + is_multicast_ether_addr(hdr->addr1) && + ieee80211_is_robust_mgmt_frame(tx->skb)) return rcu_dereference(link->default_mgmt_key); - case USE_MCAST_KEY: + else if (is_multicast_ether_addr(hdr->addr1)) return rcu_dereference(link->default_multicast_key); - } return NULL; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 3bd07a0a782f..e07be65806b7 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1604,7 +1604,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) const struct element *non_inherit = NULL; u8 *nontransmitted_profile; int nontransmitted_profile_len = 0; - size_t scratch_len = params->scratch_len ?: 3 * params->len; + size_t scratch_len = 3 * params->len; elems = kzalloc(sizeof(*elems) + scratch_len, GFP_ATOMIC); if (!elems) @@ -2373,6 +2373,7 @@ static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) 
local->resuming = false; local->suspended = false; local->in_reconfig = false; + local->reconfig_failure = true; ieee80211_flush_completed_scan(local, true); @@ -2475,6 +2476,35 @@ static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) return 0; } +static void ieee80211_reconfig_ap_links(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u64 changed) +{ + int link_id; + + for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { + struct ieee80211_link_data *link; + + if (!(sdata->vif.active_links & BIT(link_id))) + continue; + + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link) + continue; + + if (rcu_access_pointer(link->u.ap.beacon)) + drv_start_ap(local, sdata, link->conf); + + if (!link->conf->enable_beacon) + continue; + + changed |= BSS_CHANGED_BEACON | + BSS_CHANGED_BEACON_ENABLED; + + ieee80211_link_info_change_notify(sdata, link, changed); + } +} + int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; @@ -2624,21 +2654,55 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* Finally also reconfigure all the BSS information */ list_for_each_entry(sdata, &local->interfaces, list) { + /* common change flags for all interface types - link only */ + u64 changed = BSS_CHANGED_ERP_CTS_PROT | + BSS_CHANGED_ERP_PREAMBLE | + BSS_CHANGED_ERP_SLOT | + BSS_CHANGED_HT | + BSS_CHANGED_BASIC_RATES | + BSS_CHANGED_BEACON_INT | + BSS_CHANGED_BSSID | + BSS_CHANGED_CQM | + BSS_CHANGED_QOS | + BSS_CHANGED_TXPOWER | + BSS_CHANGED_MCAST_RATE; + struct ieee80211_link_data *link = NULL; unsigned int link_id; - u32 changed; + u32 active_links = 0; if (!ieee80211_sdata_running(sdata)) continue; sdata_lock(sdata); + if (sdata->vif.valid_links) { + struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS] = { + [0] = &sdata->vif.bss_conf, + }; + + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + /* start with a single active link */ + active_links = sdata->vif.active_links; + link_id = ffs(active_links) - 1; + sdata->vif.active_links = BIT(link_id); + } + + drv_change_vif_links(local, sdata, 0, + sdata->vif.active_links, + old); + } + for (link_id = 0; link_id < ARRAY_SIZE(sdata->vif.link_conf); link_id++) { - struct ieee80211_link_data *link; + if (sdata->vif.valid_links && + !(sdata->vif.active_links & BIT(link_id))) + continue; link = sdata_dereference(sdata->link[link_id], sdata); - if (link) - ieee80211_assign_chanctx(local, sdata, link); + if (!link) + continue; + + ieee80211_assign_chanctx(local, sdata, link); } switch (sdata->vif.type) { @@ -2658,42 +2722,42 @@ int ieee80211_reconfig(struct ieee80211_local *local) &sdata->deflink.tx_conf[i]); break; } - sdata_unlock(sdata); - - /* common change flags for all interface types */ - changed = BSS_CHANGED_ERP_CTS_PROT | - BSS_CHANGED_ERP_PREAMBLE | - BSS_CHANGED_ERP_SLOT | - BSS_CHANGED_HT | - BSS_CHANGED_BASIC_RATES | - BSS_CHANGED_BEACON_INT | - BSS_CHANGED_BSSID | - BSS_CHANGED_CQM | - BSS_CHANGED_QOS | - BSS_CHANGED_IDLE | - BSS_CHANGED_TXPOWER | - BSS_CHANGED_MCAST_RATE; if (sdata->vif.bss_conf.mu_mimo_owner) changed |= BSS_CHANGED_MU_GROUPS; + if (!sdata->vif.valid_links) + changed |= BSS_CHANGED_IDLE; + switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: - changed |= BSS_CHANGED_ASSOC | - BSS_CHANGED_ARP_FILTER | - BSS_CHANGED_PS; - - /* Re-send beacon info report to the driver */ - if (sdata->deflink.u.mgd.have_beacon) - changed |= BSS_CHANGED_BEACON_INFO; - - if (sdata->vif.bss_conf.max_idle_period || - 
sdata->vif.bss_conf.protected_keep_alive) - changed |= BSS_CHANGED_KEEP_ALIVE; - - sdata_lock(sdata); - ieee80211_bss_info_change_notify(sdata, changed); - sdata_unlock(sdata); + if (!sdata->vif.valid_links) { + changed |= BSS_CHANGED_ASSOC | + BSS_CHANGED_ARP_FILTER | + BSS_CHANGED_PS; + + /* Re-send beacon info report to the driver */ + if (sdata->deflink.u.mgd.have_beacon) + changed |= BSS_CHANGED_BEACON_INFO; + + if (sdata->vif.bss_conf.max_idle_period || + sdata->vif.bss_conf.protected_keep_alive) + changed |= BSS_CHANGED_KEEP_ALIVE; + + if (sdata->vif.bss_conf.eht_puncturing) + changed |= BSS_CHANGED_EHT_PUNCTURING; + + ieee80211_bss_info_change_notify(sdata, + changed); + } else if (!WARN_ON(!link)) { + ieee80211_link_info_change_notify(sdata, link, + changed); + changed = BSS_CHANGED_ASSOC | + BSS_CHANGED_IDLE | + BSS_CHANGED_PS | + BSS_CHANGED_ARP_FILTER; + ieee80211_vif_cfg_change_notify(sdata, changed); + } break; case NL80211_IFTYPE_OCB: changed |= BSS_CHANGED_OCB; @@ -2703,7 +2767,13 @@ int ieee80211_reconfig(struct ieee80211_local *local) changed |= BSS_CHANGED_IBSS; fallthrough; case NL80211_IFTYPE_AP: - changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS; + changed |= BSS_CHANGED_P2P_PS; + + if (sdata->vif.valid_links) + ieee80211_vif_cfg_change_notify(sdata, + BSS_CHANGED_SSID); + else + changed |= BSS_CHANGED_SSID; if (sdata->vif.bss_conf.ftm_responder == 1 && wiphy_ext_feature_isset(sdata->local->hw.wiphy, @@ -2713,6 +2783,13 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (sdata->vif.type == NL80211_IFTYPE_AP) { changed |= BSS_CHANGED_AP_PROBE_RESP; + if (sdata->vif.valid_links) { + ieee80211_reconfig_ap_links(local, + sdata, + changed); + break; + } + if (rcu_access_pointer(sdata->deflink.u.ap.beacon)) drv_start_ap(local, sdata, sdata->deflink.conf); @@ -2728,6 +2805,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) case NL80211_IFTYPE_NAN: res = ieee80211_reconfig_nan(sdata); if (res < 0) { + sdata_unlock(sdata); ieee80211_handle_reconfig_failure(local); return res; } @@ -2745,6 +2823,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) WARN_ON(1); break; } + sdata_unlock(sdata); + + if (active_links) + ieee80211_set_active_links(&sdata->vif, active_links); } ieee80211_recalc_ps(local); @@ -2860,7 +2942,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* Requeue all works */ list_for_each_entry(sdata, &local->interfaces, list) - ieee80211_queue_work(&local->hw, &sdata->work); + wiphy_work_queue(local->hw.wiphy, &sdata->work); } ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, diff --git a/net/mctp/route.c b/net/mctp/route.c index f51a05ec7162..ab62fe447038 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -1249,9 +1249,6 @@ static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, mtu = nla_get_u32(tbx[RTAX_MTU]); } - if (rtm->rtm_type != RTN_UNICAST) - return -EINVAL; - rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu, rtm->rtm_type); return rc; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index dc5165d3eec4..bf6e81d56263 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -12,6 +12,7 @@ #include <linux/nospec.h> #include <linux/vmalloc.h> #include <linux/percpu.h> +#include <net/gso.h> #include <net/ip.h> #include <net/dst.h> #include <net/sock.h> diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 1482259de9b5..533d082f0701 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -14,6 +14,7 @@ #include <linux/netdev_features.h> #include <linux/netdevice.h> 
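The one-line +#include <net/gso.h> additions in this region (af_mpls.c just above, mpls_gso.c just below, and mac80211/tx.c earlier) track a tree-wide header split in this cycle: GSO helpers such as skb_gso_segment() moved out of <linux/netdevice.h> into the new <net/gso.h>, so every user must now include it explicitly. A minimal caller, assuming only that skb_gso_segment() keeps its long-standing signature:

#include <linux/err.h>
#include <linux/netdev_features.h>
#include <linux/skbuff.h>
#include <net/gso.h>

/* segment a GSO skb into MTU-sized skbs linked via ->next */
static struct sk_buff *segment_for_xmit(struct sk_buff *skb,
					netdev_features_t features)
{
	struct sk_buff *segs;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR_OR_NULL(segs))
		return NULL;	/* caller still owns and frees @skb */

	return segs;
}
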
#include <linux/skbuff.h> +#include <net/gso.h> #include <net/mpls.h> static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 0dac2863c6e1..a0990c365a2e 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -34,7 +34,11 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW), SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA), SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), + SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX), + SNMP_MIB_ITEM("AddAddrTxDrop", MPTCP_MIB_ADDADDRTXDROP), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), + SNMP_MIB_ITEM("EchoAddTx", MPTCP_MIB_ECHOADDTX), + SNMP_MIB_ITEM("EchoAddTxDrop", MPTCP_MIB_ECHOADDTXDROP), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), @@ -44,6 +48,8 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), + SNMP_MIB_ITEM("RmAddrTx", MPTCP_MIB_RMADDRTX), + SNMP_MIB_ITEM("RmAddrTxDrop", MPTCP_MIB_RMADDRTXDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 2be3596374f4..cae71d947252 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -27,7 +27,15 @@ enum linux_mptcp_mib_field { MPTCP_MIB_NODSSWINDOW, /* Segments not in MPTCP windows */ MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */ MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */ + MPTCP_MIB_ADDADDRTX, /* Sent ADD_ADDR with echo-flag=0 */ + MPTCP_MIB_ADDADDRTXDROP, /* ADD_ADDR with echo-flag=0 not sent due to + * resource exhaustion + */ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */ + MPTCP_MIB_ECHOADDTX, /* Sent ADD_ADDR with echo-flag=1 */ + MPTCP_MIB_ECHOADDTXDROP, /* ADD_ADDR with echo-flag=1 not sent due + * to resource exhaustion + */ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */ MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */ @@ -37,6 +45,8 @@ enum linux_mptcp_mib_field { MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */ MPTCP_MIB_RMADDR, /* Received RM_ADDR */ MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */ + MPTCP_MIB_RMADDRTX, /* Sent RM_ADDR */ + MPTCP_MIB_RMADDRTXDROP, /* RM_ADDR not sent due to resource exhaustion */ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ @@ -63,6 +73,14 @@ struct mptcp_mib { unsigned long mibs[LINUX_MIB_MPTCP_MAX]; }; +static inline void MPTCP_ADD_STATS(struct net *net, + enum linux_mptcp_mib_field field, + int val) +{ + if (likely(net->mib.mptcp_statistics)) + SNMP_ADD_STATS(net->mib.mptcp_statistics, field, val); +} + static inline void MPTCP_INC_STATS(struct net *net, enum linux_mptcp_mib_field field) { diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 19a01b6566f1..c254accb14de 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -687,9 +687,12 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * } opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX);
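/*
 * The TX-side MIBs introduced here mirror the existing RX counters:
 * ADDADDRTX/ECHOADDTX count ADD_ADDR options actually written with
 * echo-flag 0/1, while the *TXDROP variants (bumped in
 * mptcp_pm_announce_addr() and mptcp_pm_remove_addr(), below, where a
 * pr_warn() used to be) count announcements dropped because an earlier
 * signal is still pending.  RMADDRTX is incremented by rm_list.nr via
 * the new MPTCP_ADD_STATS() helper, since one RM_ADDR option can carry
 * several address ids.
 */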
opts->ahmac = add_addr_generate_hmac(msk->local_key, msk->remote_key, &opts->addr); + } else { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); } pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); @@ -723,7 +726,7 @@ static bool mptcp_established_options_rm_addr(struct sock *sk, for (i = 0; i < opts->rm_list.nr; i++) pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]); - + MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr); return true; } @@ -1023,6 +1026,12 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) return cur_seq; } +static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) +{ + msk->bytes_acked += new_snd_una - msk->snd_una; + msk->snd_una = new_snd_una; +} + static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) @@ -1054,7 +1063,7 @@ static void ack_update_msk(struct mptcp_sock *msk, __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { - msk->snd_una = new_snd_una; + __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } mptcp_data_unlock(sk); @@ -1116,6 +1125,12 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mptcp_data_lock(subflow->conn); if (sk_stream_memory_free(sk)) __mptcp_check_push(subflow->conn, sk); + + /* on fallback we just need to ignore the msk-level snd_una, as + * this is really plain TCP + */ + __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); + __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return true; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 76612bca275a..7dbbad1e4f55 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -26,7 +26,8 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, if (add_addr & (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) { - pr_warn("addr_signal error, add_addr=%d, echo=%d", add_addr, echo); + MPTCP_INC_STATS(sock_net((struct sock *)msk), + echo ? 
MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP); return -EINVAL; } @@ -48,7 +49,8 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); if (rm_addr) { - pr_warn("addr_signal error, rm_addr=%d", rm_addr); + MPTCP_ADD_STATS(sock_net((struct sock *)msk), + MPTCP_MIB_RMADDRTXDROP, rm_list->nr); return -EINVAL; } @@ -413,7 +415,46 @@ out_unlock: int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { - return mptcp_pm_nl_get_local_id(msk, skc); + struct mptcp_addr_info skc_local; + struct mptcp_addr_info msk_local; + + if (WARN_ON_ONCE(!msk)) + return -1; + + /* The 0 ID mapping is defined by the first subflow, copied into the msk + * addr + */ + mptcp_local_address((struct sock_common *)msk, &msk_local); + mptcp_local_address((struct sock_common *)skc, &skc_local); + if (mptcp_addresses_equal(&msk_local, &skc_local, false)) + return 0; + + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_get_local_id(msk, &skc_local); + return mptcp_pm_nl_get_local_id(msk, &skc_local); +} + +int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, + u8 *flags, int *ifindex) +{ + *flags = 0; + *ifindex = 0; + + if (!id) + return 0; + + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, id, flags, ifindex); + return mptcp_pm_nl_get_flags_and_ifindex_by_id(msk, id, flags, ifindex); +} + +int mptcp_pm_set_flags(struct net *net, struct nlattr *token, + struct mptcp_pm_addr_entry *loc, + struct mptcp_pm_addr_entry *rem, u8 bkup) +{ + if (token) + return mptcp_userspace_pm_set_flags(net, token, loc, rem, bkup); + return mptcp_pm_nl_set_flags(net, loc, bkup); } void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 1224dfca5bf3..5692daf57a4d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -25,9 +25,9 @@ static int pm_nl_pernet_id; struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; + u8 retrans_times; struct timer_list add_timer; struct mptcp_sock *sock; - u8 retrans_times; }; struct pm_nl_pernet { @@ -86,8 +86,7 @@ bool mptcp_addresses_equal(const struct mptcp_addr_info *a, return a->port == b->port; } -static void local_address(const struct sock_common *skc, - struct mptcp_addr_info *addr) +void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = htons(skc->skc_num); @@ -122,7 +121,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list, list_for_each_entry(subflow, list, node) { skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); - local_address(skc, &cur); + mptcp_local_address(skc, &cur); if (mptcp_addresses_equal(&cur, saddr, saddr->port)) return true; } @@ -263,7 +262,7 @@ bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) struct mptcp_addr_info saddr; bool ret = false; - local_address((struct sock_common *)sk, &saddr); + mptcp_local_address((struct sock_common *)sk, &saddr); spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { @@ -342,7 +341,7 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, } bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_pm_addr_entry *entry) + const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; @@ -350,7 +349,7 @@ bool 
mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, lockdep_assert_held(&msk->pm.lock); - add_entry = mptcp_lookup_anno_list_by_saddr(msk, &entry->addr); + add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (add_entry) { if (mptcp_pm_is_kernel(msk)) @@ -367,7 +366,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, list_add(&add_entry->list, &msk->pm.anno_list); - add_entry->addr = entry->addr; + add_entry->addr = *addr; add_entry->sock = msk; add_entry->retrans_times = 0; @@ -541,7 +540,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct mptcp_addr_info mpc_addr; bool backup = false; - local_address((struct sock_common *)msk->first, &mpc_addr); + mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); rcu_read_lock(); entry = __lookup_addr(pernet, &mpc_addr, false); if (entry) { @@ -577,7 +576,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) return; if (local) { - if (mptcp_pm_alloc_anno_list(msk, local)) { + if (mptcp_pm_alloc_anno_list(msk, &local->addr)) { __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &local->addr, false); @@ -752,7 +751,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_addr_info local, remote; - local_address((struct sock_common *)ssk, &local); + mptcp_local_address((struct sock_common *)ssk, &local); if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; @@ -1057,33 +1056,17 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, return 0; } -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info skc_local; - struct mptcp_addr_info msk_local; struct pm_nl_pernet *pernet; int ret = -1; - if (WARN_ON_ONCE(!msk)) - return -1; - - /* The 0 ID mapping is defined by the first subflow, copied into the msk - * addr - */ - local_address((struct sock_common *)msk, &msk_local); - local_address((struct sock_common *)skc, &skc_local); - if (mptcp_addresses_equal(&msk_local, &skc_local, false)) - return 0; - - if (mptcp_pm_is_userspace(msk)) - return mptcp_userspace_pm_get_local_id(msk, &skc_local); - pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, &skc_local, entry->addr.port)) { + if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) { ret = entry->addr.id; break; } @@ -1097,7 +1080,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) if (!entry) return -ENOMEM; - entry->addr = skc_local; + entry->addr = *skc; entry->addr.id = 0; entry->addr.port = 0; entry->ifindex = 0; @@ -1374,31 +1357,20 @@ out_free: return ret; } -int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, - u8 *flags, int *ifindex) +int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, + u8 *flags, int *ifindex) { struct mptcp_pm_addr_entry *entry; struct sock *sk = (struct sock *)msk; struct net *net = sock_net(sk); - *flags = 0; - *ifindex = 0; - - if (id) { - if (mptcp_pm_is_userspace(msk)) - return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, - id, - flags, - ifindex); - - rcu_read_lock(); - entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id); - if (entry) { - *flags = 
entry->flags; - *ifindex = entry->ifindex; - } - rcu_read_unlock(); + rcu_read_lock(); + entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id); + if (entry) { + *flags = entry->flags; + *ifindex = entry->ifindex; } + rcu_read_unlock(); return 0; } @@ -1492,7 +1464,7 @@ static int mptcp_nl_remove_id_zero_address(struct net *net, if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; - local_address((struct sock_common *)msk, &msk_local); + mptcp_local_address((struct sock_common *)msk, &msk_local); if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) goto next; @@ -1911,18 +1883,50 @@ next: return ret; } +int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); + u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | + MPTCP_PM_ADDR_FLAG_FULLMESH; + struct mptcp_pm_addr_entry *entry; + u8 lookup_by_id = 0; + + if (addr->addr.family == AF_UNSPEC) { + lookup_by_id = 1; + if (!addr->addr.id) + return -EOPNOTSUPP; + } + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr(pernet, &addr->addr, lookup_by_id); + if (!entry) { + spin_unlock_bh(&pernet->lock); + return -EINVAL; + } + if ((addr->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && + (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + spin_unlock_bh(&pernet->lock); + return -EINVAL; + } + + changed = (addr->flags ^ entry->flags) & mask; + entry->flags = (entry->flags & ~mask) | (addr->flags & mask); + *addr = *entry; + spin_unlock_bh(&pernet->lock); + + mptcp_nl_set_flags(net, &addr->addr, bkup, changed); + return 0; +} + static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) { - struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry; struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, }; + struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | - MPTCP_PM_ADDR_FLAG_FULLMESH; struct net *net = sock_net(skb->sk); - u8 bkup = 0, lookup_by_id = 0; + u8 bkup = 0; int ret; ret = mptcp_pm_parse_entry(attr, info, false, &addr); @@ -1937,34 +1941,8 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; - if (addr.addr.family == AF_UNSPEC) { - lookup_by_id = 1; - if (!addr.addr.id) - return -EOPNOTSUPP; - } - - if (token) - return mptcp_userspace_pm_set_flags(net, token, &addr, &remote, bkup); - spin_lock_bh(&pernet->lock); - entry = __lookup_addr(pernet, &addr.addr, lookup_by_id); - if (!entry) { - spin_unlock_bh(&pernet->lock); - return -EINVAL; - } - if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && - (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - spin_unlock_bh(&pernet->lock); - return -EINVAL; - } - - changed = (addr.flags ^ entry->flags) & mask; - entry->flags = (entry->flags & ~mask) | (addr.flags & mask); - addr = *entry; - spin_unlock_bh(&pernet->lock); - - mptcp_nl_set_flags(net, &addr.addr, bkup, changed); - return 0; + return mptcp_pm_set_flags(net, token, &addr, &remote, bkup); } static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index b06aa58dfcf2..b5a8aa4c1ebd 100644 --- a/net/mptcp/pm_userspace.c +++ 
b/net/mptcp/pm_userspace.c @@ -111,9 +111,6 @@ int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, { struct mptcp_pm_addr_entry *entry, *match = NULL; - *flags = 0; - *ifindex = 0; - spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { if (id == entry->addr.id) { @@ -196,7 +193,7 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) lock_sock((struct sock *)msk); spin_lock_bh(&msk->pm.lock); - if (mptcp_pm_alloc_anno_list(msk, &addr_val)) { + if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); mptcp_pm_nl_addr_send_ack(msk); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a6c7f2d24909..bd023debedc8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -96,6 +96,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; + subflow->subflow_id = msk->subflow_id++; /* This is the first subflow, always with id 0 */ subflow->local_id_valid = 1; @@ -377,6 +378,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ + msk->bytes_received += copy_len; WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) @@ -757,6 +759,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } + msk->bytes_received += end_seq - msk->ack_seq; msk->ack_seq = end_seq; moved = true; } @@ -842,6 +845,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) if (sk->sk_socket && !ssk->sk_socket) mptcp_sock_graft(ssk, sk->sk_socket); + mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_subflow_joined(msk, ssk); return true; @@ -958,12 +962,6 @@ static void __mptcp_clean_una(struct sock *sk) struct mptcp_data_frag *dtmp, *dfrag; u64 snd_una; - /* on fallback we just need to ignore snd_una, as this is really - * plain TCP - */ - if (__mptcp_check_fallback(msk)) - msk->snd_una = READ_ONCE(msk->snd_nxt); - snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) @@ -1491,8 +1489,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, * that has been handed to the subflow for transmission * and skip update in case it was old dfrag. 
*/ - if (likely(after64(snd_nxt_new, msk->snd_nxt))) + if (likely(after64(snd_nxt_new, msk->snd_nxt))) { + msk->bytes_sent += snd_nxt_new - msk->snd_nxt; msk->snd_nxt = snd_nxt_new; + } } void mptcp_check_and_set_pending(struct sock *sk) @@ -2549,6 +2549,7 @@ static void __mptcp_retrans(struct sock *sk) } if (copied) { dfrag->already_sent = max(dfrag->already_sent, info.sent); + msk->bytes_retrans += copied; tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); WRITE_ONCE(msk->allow_infinite_fallback, false); @@ -2607,6 +2608,7 @@ static void mptcp_do_fastclose(struct sock *sk) struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); + inet_sk_state_store(sk, TCP_CLOSE); mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, MPTCP_CF_FASTCLOSE); @@ -2640,10 +2642,9 @@ static void mptcp_worker(struct work_struct *work) * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD)) { - if (mptcp_should_close(sk)) { - inet_sk_state_store(sk, TCP_CLOSE); + if (mptcp_should_close(sk)) mptcp_do_fastclose(sk); - } + if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); goto unlock; @@ -2682,6 +2683,7 @@ static int __mptcp_init_sock(struct sock *sk) WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; + msk->subflow_id = 1; mptcp_pm_data_init(msk); @@ -2878,7 +2880,6 @@ static void __mptcp_destroy_sock(struct sock *sk) void __mptcp_unaccepted_force_close(struct sock *sk) { sock_set_flag(sk, SOCK_DEAD); - inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); __mptcp_destroy_sock(sk); } @@ -2934,7 +2935,6 @@ bool __mptcp_close(struct sock *sk, long timeout) /* If the msk has read data, or the caller explicitly ask it, * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose */ - inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); timeout = 0; } else if (mptcp_close_state(sk)) { @@ -3069,6 +3069,10 @@ static int mptcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); + msk->bytes_acked = 0; + msk->bytes_received = 0; + msk->bytes_sent = 0; + msk->bytes_retrans = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); @@ -3119,6 +3123,9 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; + /* passive msk is created after the first/MPC subflow */ + msk->subflow_id = 2; + sock_reset_flag(nsk, SOCK_RCU_FREE); security_inet_csk_clone(nsk, req); @@ -3538,11 +3545,10 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) return (int)delta; } -static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) { struct mptcp_sock *msk = mptcp_sk(sk); bool slow; - int answ; switch (cmd) { case SIOCINQ: @@ -3551,24 +3557,24 @@ static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) lock_sock(sk); __mptcp_move_skbs(msk); - answ = mptcp_inq_hint(sk); + *karg = mptcp_inq_hint(sk); release_sock(sk); break; case SIOCOUTQ: slow = lock_sock_fast(sk); - answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); + *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); unlock_sock_fast(sk, slow); break; case SIOCOUTQNSD: slow = lock_sock_fast(sk); - answ = mptcp_ioctl_outq(msk, msk->snd_nxt); + *karg = 
mptcp_ioctl_outq(msk, msk->snd_nxt); unlock_sock_fast(sk, slow); break; default: return -ENOIOCTLCMD; } - return put_user(answ, (int __user *)arg); + return 0; } static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, @@ -3726,6 +3732,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct socket *ssock; + struct sock *newsk; int err; pr_debug("msk=%p", msk); @@ -3737,17 +3744,20 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssock) return -EINVAL; - err = ssock->ops->accept(sock, newsock, flags, kern); - if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { - struct mptcp_sock *msk = mptcp_sk(newsock->sk); + newsk = mptcp_accept(sock->sk, flags, &err, kern); + if (!newsk) + return err; + + lock_sock(newsk); + + __inet_accept(sock, newsock, newsk); + if (!mptcp_is_tcpsk(newsock->sk)) { + struct mptcp_sock *msk = mptcp_sk(newsk); struct mptcp_subflow_context *subflow; - struct sock *newsk = newsock->sk; set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); msk->in_accept_queue = 0; - lock_sock(newsk); - /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ @@ -3768,11 +3778,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (unlikely(list_empty(&msk->conn_list))) inet_sk_state_store(newsk, TCP_CLOSE); } - - release_sock(newsk); } + release_sock(newsk); - return err; + return 0; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d3783a7056e1..37fbe22e2433 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -261,10 +261,13 @@ struct mptcp_sock { u64 local_key; u64 remote_key; u64 write_seq; + u64 bytes_sent; u64 snd_nxt; + u64 bytes_received; u64 ack_seq; atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; + u64 bytes_retrans; int rmem_fwd_alloc; struct sock *last_snd; int snd_burst; @@ -273,6 +276,7 @@ struct mptcp_sock { * recovery related fields are under data_lock * protection */ + u64 bytes_acked; u64 snd_una; u64 wnd_end; unsigned long timer_ival; @@ -318,7 +322,8 @@ struct mptcp_sock { u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; - u32 setsockopt_seq; + u32 subflow_id; + u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; struct mptcp_sock *dl_next; }; @@ -498,6 +503,8 @@ struct mptcp_subflow_context { u8 reset_reason:4; u8 stale_count; + u32 subflow_id; + long delegated_status; unsigned long fail_tout; @@ -636,6 +643,7 @@ void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); +void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, @@ -806,7 +814,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *rem, u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_pm_addr_entry *entry); + const struct mptcp_addr_info *addr); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * @@ -818,9 +826,15 @@ mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int 
*ifindex); +int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, + u8 *flags, int *ifindex); int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex); +int mptcp_pm_set_flags(struct net *net, struct nlattr *token, + struct mptcp_pm_addr_entry *loc, + struct mptcp_pm_addr_entry *rem, u8 bkup); +int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup); int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, struct mptcp_pm_addr_entry *loc, struct mptcp_pm_addr_entry *rem, u8 bkup); @@ -913,13 +927,13 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d4258869ac48..63f7a09335c5 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -14,7 +14,8 @@ #include <net/mptcp.h> #include "protocol.h" -#define MIN_INFO_OPTLEN_SIZE 16 +#define MIN_INFO_OPTLEN_SIZE 16 +#define MIN_FULL_INFO_OPTLEN_SIZE 40 static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { @@ -355,6 +356,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_BROADCAST: case SO_BSDCOMPAT: case SO_PASSCRED: + case SO_PASSPIDFD: case SO_PASSSEC: case SO_RXQ_OVFL: case SO_WIFI_STATUS: @@ -888,7 +890,9 @@ out: void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { + struct sock *sk = (struct sock *)msk; u32 flags = 0; + bool slow; memset(info, 0, sizeof(*info)); @@ -897,6 +901,9 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); + if (inet_sk_state_load(sk) == TCP_LISTEN) + return; + /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { info->mptcpi_subflows_max = @@ -914,11 +921,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (READ_ONCE(msk->can_ack)) flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; info->mptcpi_flags = flags; - info->mptcpi_token = READ_ONCE(msk->token); - info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = READ_ONCE(msk->snd_una); - info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); - info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); + mptcp_data_lock(sk); + info->mptcpi_snd_una = msk->snd_una; + info->mptcpi_rcv_nxt = msk->ack_seq; + info->mptcpi_bytes_acked = msk->bytes_acked; + mptcp_data_unlock(sk); + + slow = lock_sock_fast(sk); + info->mptcpi_csum_enabled = msk->csum_enabled; + info->mptcpi_token = msk->token; + 
info->mptcpi_write_seq = msk->write_seq; + info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits; + info->mptcpi_bytes_sent = msk->bytes_sent; + info->mptcpi_bytes_received = msk->bytes_received; + info->mptcpi_bytes_retrans = msk->bytes_retrans; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); @@ -965,7 +982,8 @@ static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd, } static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd, - char __user *optval, int __user *optlen) + char __user *optval, + int __user *optlen) { int len, copylen; @@ -1146,6 +1164,125 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o return 0; } +static int mptcp_get_full_info(struct mptcp_full_info *mfi, + char __user *optval, + int __user *optlen) +{ + int len; + + BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) != + MIN_FULL_INFO_OPTLEN_SIZE); + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < MIN_FULL_INFO_OPTLEN_SIZE) + return -EINVAL; + + memset(mfi, 0, sizeof(*mfi)); + if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE)) + return -EFAULT; + + if (mfi->size_tcpinfo_kernel || + mfi->size_sfinfo_kernel || + mfi->num_subflows) + return -EINVAL; + + if (mfi->size_sfinfo_user > INT_MAX || + mfi->size_tcpinfo_user > INT_MAX) + return -EINVAL; + + return len - MIN_FULL_INFO_OPTLEN_SIZE; +} + +static int mptcp_put_full_info(struct mptcp_full_info *mfi, + char __user *optval, + u32 copylen, + int __user *optlen) +{ + copylen += MIN_FULL_INFO_OPTLEN_SIZE; + if (put_user(copylen, optlen)) + return -EFAULT; + + if (copy_to_user(optval, mfi, copylen)) + return -EFAULT; + return 0; +} + +static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval, + int __user *optlen) +{ + unsigned int sfcount = 0, copylen = 0; + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + void __user *tcpinfoptr, *sfinfoptr; + struct mptcp_full_info mfi; + int len; + + len = mptcp_get_full_info(&mfi, optval, optlen); + if (len < 0) + return len; + + /* don't bother filling the mptcp info if there is not enough + * user-space-provided storage + */ + if (len > 0) { + mptcp_diag_fill_info(msk, &mfi.mptcp_info); + copylen += min_t(unsigned int, len, sizeof(struct mptcp_info)); + } + + mfi.size_tcpinfo_kernel = sizeof(struct tcp_info); + mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user, + sizeof(struct tcp_info)); + sfinfoptr = u64_to_user_ptr(mfi.subflow_info); + mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info); + mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user, + sizeof(struct mptcp_subflow_info)); + tcpinfoptr = u64_to_user_ptr(mfi.tcp_info); + + lock_sock(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct mptcp_subflow_info sfinfo; + struct tcp_info tcp_info; + + if (sfcount++ >= mfi.size_arrays_user) + continue; + + /* fetch addr/tcp_info only if the user space buffers + * are wide enough + */ + memset(&sfinfo, 0, sizeof(sfinfo)); + sfinfo.id = subflow->subflow_id; + if (mfi.size_sfinfo_user > + offsetof(struct mptcp_subflow_info, addrs)) + mptcp_get_sub_addrs(ssk, &sfinfo.addrs); + if (copy_to_user(sfinfoptr, &sfinfo, mfi.size_sfinfo_user)) + goto fail_release; + + if (mfi.size_tcpinfo_user) { + tcp_get_info(ssk, &tcp_info); + if (copy_to_user(tcpinfoptr, &tcp_info, + mfi.size_tcpinfo_user)) + goto fail_release; + } + + tcpinfoptr += mfi.size_tcpinfo_user; + sfinfoptr += mfi.size_sfinfo_user; + 
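/* Illustration (editor's sketch, not part of the patch): the new
 * bytes_sent/bytes_received/bytes_retrans/bytes_acked counters surface
 * through the existing MPTCP_INFO getsockopt. This assumes v6.5-era uapi
 * headers; older struct mptcp_info lacks the mptcpi_bytes_* members.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/mptcp.h>

#ifndef SOL_MPTCP
#define SOL_MPTCP 284
#endif

static void print_mptcp_byte_counters(int mptcp_fd)
{
	struct mptcp_info mi;
	socklen_t len = sizeof(mi);

	memset(&mi, 0, sizeof(mi));
	if (getsockopt(mptcp_fd, SOL_MPTCP, MPTCP_INFO, &mi, &len) == 0)
		printf("sent=%llu received=%llu retrans=%llu\n",
		       (unsigned long long)mi.mptcpi_bytes_sent,
		       (unsigned long long)mi.mptcpi_bytes_received,
		       (unsigned long long)mi.mptcpi_bytes_retrans);
}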
} + release_sock(sk); + + mfi.num_subflows = sfcount; + if (mptcp_put_full_info(&mfi, optval, copylen, optlen)) + return -EFAULT; + + return 0; + +fail_release: + release_sock(sk); + return -EFAULT; +} + static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, int __user *optlen, int val) { @@ -1219,6 +1356,8 @@ static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, switch (optname) { case MPTCP_INFO: return mptcp_getsockopt_info(msk, optval, optlen); + case MPTCP_FULL_INFO: + return mptcp_getsockopt_full_info(msk, optval, optlen); case MPTCP_TCPINFO: return mptcp_getsockopt_tcpinfo(msk, optval, optlen); case MPTCP_SUBFLOW_ADDRS: diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d9c8b21c6076..285e8ff74277 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -819,6 +819,7 @@ create_child: if (!ctx->conn) goto fallback; + ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); mptcp_pm_new_connection(owner, child, 1); @@ -1574,6 +1575,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); + subflow->subflow_id = msk->subflow_id++; mptcp_info2sockaddr(remote, &addr, ssk->sk_family); sock_hold(ssk); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 6447a09932f5..069c2659074b 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -611,14 +611,14 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) return 0; } -/* Response handler for Mellanox command Get Mac Address */ -static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr) +/* Response handler for Get Mac Address command */ +static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id) { struct ncsi_dev_priv *ndp = nr->ndp; struct net_device *ndev = ndp->ndev.dev; - const struct net_device_ops *ops = ndev->netdev_ops; struct ncsi_rsp_oem_pkt *rsp; struct sockaddr saddr; + u32 mac_addr_off = 0; int ret = 0; /* Get the response header */ @@ -626,11 +626,25 @@ static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr) saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN); + if (mfr_id == NCSI_OEM_MFR_BCM_ID) + mac_addr_off = BCM_MAC_ADDR_OFFSET; + else if (mfr_id == NCSI_OEM_MFR_MLX_ID) + mac_addr_off = MLX_MAC_ADDR_OFFSET; + else if (mfr_id == NCSI_OEM_MFR_INTEL_ID) + mac_addr_off = INTEL_MAC_ADDR_OFFSET; + + memcpy(saddr.sa_data, &rsp->data[mac_addr_off], ETH_ALEN); + if (mfr_id == NCSI_OEM_MFR_BCM_ID || mfr_id == NCSI_OEM_MFR_INTEL_ID) + eth_addr_inc((u8 *)saddr.sa_data); + if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) + return -ENXIO; + /* Set the flag for GMA command which should only be called once */ ndp->gma_flag = 1; - ret = ops->ndo_set_mac_address(ndev, &saddr); + rtnl_lock(); + ret = dev_set_mac_address(ndev, &saddr, NULL); + rtnl_unlock(); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); @@ -649,41 +663,10 @@ static int ncsi_rsp_handler_oem_mlx(struct ncsi_request *nr) if (mlx->cmd == NCSI_OEM_MLX_CMD_GMA && mlx->param == NCSI_OEM_MLX_CMD_GMA_PARAM) - return ncsi_rsp_handler_oem_mlx_gma(nr); + return ncsi_rsp_handler_oem_gma(nr, NCSI_OEM_MFR_MLX_ID); return 0; } -/* Response handler for Broadcom command Get Mac Address */ -static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) -{ - struct ncsi_dev_priv *ndp = nr->ndp; - struct net_device *ndev = 
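/* Illustration (editor's sketch, not part of the patch): userspace side of
 * the MPTCP_FULL_INFO getsockopt added above. Field names follow the uapi
 * struct the kernel code dereferences (the size_*_kernel and num_subflows
 * fields must be zero on input; a 40-byte control block precedes the
 * embedded struct mptcp_info, matching the BUILD_BUG_ON). Assumes v6.5+
 * headers.
 */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/tcp.h>		/* struct tcp_info */
#include <linux/mptcp.h>	/* struct mptcp_full_info, MPTCP_FULL_INFO */

#ifndef SOL_MPTCP
#define SOL_MPTCP 284
#endif

static int query_first_subflow_tcp_info(int mptcp_fd, struct tcp_info *ti)
{
	struct mptcp_full_info mfi;
	socklen_t len = sizeof(mfi);

	memset(&mfi, 0, sizeof(mfi));
	mfi.size_arrays_user = 1;		/* report at most one subflow */
	mfi.size_tcpinfo_user = sizeof(*ti);	/* kernel clamps to its own size */
	mfi.tcp_info = (uintptr_t)ti;
	/* subflow_info left at 0: per-subflow address block not requested */

	if (getsockopt(mptcp_fd, SOL_MPTCP, MPTCP_FULL_INFO, &mfi, &len) < 0)
		return -1;
	return (int)mfi.num_subflows;		/* total count, may exceed 1 */
}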
ndp->ndev.dev; - const struct net_device_ops *ops = ndev->netdev_ops; - struct ncsi_rsp_oem_pkt *rsp; - struct sockaddr saddr; - int ret = 0; - - /* Get the response header */ - rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); - - saddr.sa_family = ndev->type; - ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN); - /* Increase mac address by 1 for BMC's address */ - eth_addr_inc((u8 *)saddr.sa_data); - if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) - return -ENXIO; - - /* Set the flag for GMA command which should only be called once */ - ndp->gma_flag = 1; - - ret = ops->ndo_set_mac_address(ndev, &saddr); - if (ret < 0) - netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); - - return ret; -} - /* Response handler for Broadcom card */ static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr) { @@ -695,42 +678,10 @@ static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr) bcm = (struct ncsi_rsp_oem_bcm_pkt *)(rsp->data); if (bcm->type == NCSI_OEM_BCM_CMD_GMA) - return ncsi_rsp_handler_oem_bcm_gma(nr); + return ncsi_rsp_handler_oem_gma(nr, NCSI_OEM_MFR_BCM_ID); return 0; } -/* Response handler for Intel command Get Mac Address */ -static int ncsi_rsp_handler_oem_intel_gma(struct ncsi_request *nr) -{ - struct ncsi_dev_priv *ndp = nr->ndp; - struct net_device *ndev = ndp->ndev.dev; - const struct net_device_ops *ops = ndev->netdev_ops; - struct ncsi_rsp_oem_pkt *rsp; - struct sockaddr saddr; - int ret = 0; - - /* Get the response header */ - rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); - - saddr.sa_family = ndev->type; - ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - memcpy(saddr.sa_data, &rsp->data[INTEL_MAC_ADDR_OFFSET], ETH_ALEN); - /* Increase mac address by 1 for BMC's address */ - eth_addr_inc((u8 *)saddr.sa_data); - if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) - return -ENXIO; - - /* Set the flag for GMA command which should only be called once */ - ndp->gma_flag = 1; - - ret = ops->ndo_set_mac_address(ndev, &saddr); - if (ret < 0) - netdev_warn(ndev, - "NCSI: 'Writing mac address to device failed\n"); - - return ret; -} - /* Response handler for Intel card */ static int ncsi_rsp_handler_oem_intel(struct ncsi_request *nr) { @@ -742,7 +693,7 @@ static int ncsi_rsp_handler_oem_intel(struct ncsi_request *nr) intel = (struct ncsi_rsp_oem_intel_pkt *)(rsp->data); if (intel->cmd == NCSI_OEM_INTEL_CMD_GMA) - return ncsi_rsp_handler_oem_intel_gma(nr); + return ncsi_rsp_handler_oem_gma(nr, NCSI_OEM_MFR_INTEL_ID); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index a80b960223e1..9193e109e6b3 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -139,7 +139,7 @@ retry: if (PTR_ERR(rt) == -EINVAL && *saddr && rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { *saddr = 0; - flowi4_update_output(&fl4, 0, 0, daddr, 0); + flowi4_update_output(&fl4, 0, daddr, 0); goto retry; } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); @@ -147,7 +147,7 @@ retry: } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); *saddr = fl4.saddr; - flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); + flowi4_update_output(&fl4, 0, daddr, fl4.saddr); loop = true; goto retry; } diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 728eeb0aea87..ad6f0ca40cd2 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c 
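/* Illustration (editor's sketch, not part of the patch): the consolidated
 * GMA handler above increments the BMC-provided MAC by one for Broadcom and
 * Intel (the NIC address follows the BMC's) and rejects invalid results.
 * A portable equivalent of eth_addr_inc() plus the validity check:
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

static void mac_addr_inc(uint8_t mac[6])
{
	int i;

	/* treat the address as a 48-bit integer: increment with carry */
	for (i = 5; i >= 0; i--)
		if (++mac[i] != 0)
			break;
}

static bool mac_addr_valid(const uint8_t mac[6])
{
	static const uint8_t zero[6];

	/* mirror is_valid_ether_addr(): no multicast bit, not all-zero */
	return !(mac[0] & 0x01) && memcmp(mac, zero, sizeof(zero)) != 0;
}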
@@ -296,6 +296,7 @@ void nf_conntrack_gre_init_net(struct net *net) /* protocol helper struct */ const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = { .l4proto = IPPROTO_GRE, + .allow_clash = true, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index b0ef48b21dcb..1d34d700bd09 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -125,9 +125,6 @@ static int flow_offload_fill_route(struct flow_offload *flow, break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: - if (!dst_hold_safe(route->tuple[dir].dst)) - return -1; - flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; @@ -148,27 +145,12 @@ static void nft_flow_dst_release(struct flow_offload *flow, dst_release(flow->tuplehash[dir].tuple.dst_cache); } -int flow_offload_route_init(struct flow_offload *flow, +void flow_offload_route_init(struct flow_offload *flow, const struct nf_flow_route *route) { - int err; - - err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); - if (err < 0) - return err; - - err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); - if (err < 0) - goto err_route_reply; - + flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); flow->type = NF_FLOW_OFFLOAD_ROUTE; - - return 0; - -err_route_reply: - nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); - - return err; } EXPORT_SYMBOL_GPL(flow_offload_route_init); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 3bbaf9c7ea46..e45fade76409 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -8,6 +8,7 @@ #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_ether.h> +#include <net/gso.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -163,38 +164,43 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, } } -static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple, u32 *hdrsize, - u32 offset) +struct nf_flowtable_ctx { + const struct net_device *in; + u32 offset; + u32 hdrsize; +}; + +static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, + struct flow_offload_tuple *tuple) { struct flow_ports *ports; unsigned int thoff; struct iphdr *iph; u8 ipproto; - if (!pskb_may_pull(skb, sizeof(*iph) + offset)) + if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset)) return -1; - iph = (struct iphdr *)(skb_network_header(skb) + offset); + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); thoff = (iph->ihl * 4); if (ip_is_fragment(iph) || unlikely(ip_has_options(thoff))) return -1; - thoff += offset; + thoff += ctx->offset; ipproto = iph->protocol; switch (ipproto) { case IPPROTO_TCP: - *hdrsize = sizeof(struct tcphdr); + ctx->hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: - *hdrsize = sizeof(struct udphdr); + ctx->hdrsize = sizeof(struct udphdr); break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: - *hdrsize = sizeof(struct gre_base_hdr); + ctx->hdrsize = sizeof(struct gre_base_hdr); break; #endif default: @@ -204,7 +210,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, if (iph->ttl <= 1) return -1; - if (!pskb_may_pull(skb, thoff + *hdrsize)) + if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; 
switch (ipproto) { @@ -224,13 +230,13 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, } } - iph = (struct iphdr *)(skb_network_header(skb) + offset); + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v4.s_addr = iph->saddr; tuple->dst_v4.s_addr = iph->daddr; tuple->l3proto = AF_INET; tuple->l4proto = ipproto; - tuple->iifidx = dev->ifindex; + tuple->iifidx = ctx->in->ifindex; nf_flow_tuple_encap(skb, tuple); return 0; @@ -336,58 +342,56 @@ static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, return NF_STOLEN; } -unsigned int -nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static struct flow_offload_tuple_rhash * +nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, struct sk_buff *skb) { - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; struct flow_offload_tuple tuple = {}; - enum flow_offload_tuple_dir dir; - struct flow_offload *flow; - struct net_device *outdev; - u32 hdrsize, offset = 0; - unsigned int thoff, mtu; - struct rtable *rt; - struct iphdr *iph; - __be32 nexthop; - int ret; if (skb->protocol != htons(ETH_P_IP) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &offset)) - return NF_ACCEPT; + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + return NULL; - if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize, offset) < 0) - return NF_ACCEPT; + if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) + return NULL; - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; + return flow_offload_lookup(flow_table, &tuple); +} + +static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct flow_offload_tuple_rhash *tuplehash, + struct sk_buff *skb) +{ + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + unsigned int thoff, mtu; + struct iphdr *iph; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - mtu = flow->tuplehash[dir].tuple.mtu + offset; + mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) - return NF_ACCEPT; + return 0; - iph = (struct iphdr *)(skb_network_header(skb) + offset); - thoff = (iph->ihl * 4) + offset; + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); + thoff = (iph->ihl * 4) + ctx->offset; if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) - return NF_ACCEPT; + return 0; if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); - return NF_ACCEPT; + return 0; } - if (skb_try_make_writable(skb, thoff + hdrsize)) - return NF_DROP; + if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + return -1; flow_offload_refresh(flow_table, flow, false); nf_flow_encap_pop(skb, tuplehash); - thoff -= offset; + thoff -= ctx->offset; iph = ip_hdr(skb); nf_flow_nat_ip(flow, skb, thoff, dir, iph); @@ -398,6 +402,35 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + return 1; +} + +unsigned int +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + enum flow_offload_tuple_dir dir; + struct nf_flowtable_ctx ctx = { + .in = state->in, + }; + struct flow_offload *flow; + struct net_device 
*outdev; + struct rtable *rt; + __be32 nexthop; + int ret; + + tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); + if (!tuplehash) + return NF_ACCEPT; + + ret = nf_flow_offload_forward(&ctx, flow_table, tuplehash, skb); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { rt = (struct rtable *)tuplehash->tuple.dst_cache; memset(skb->cb, 0, sizeof(struct inet_skb_parm)); @@ -406,6 +439,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return nf_flow_xmit_xfrm(skb, state, &rt->dst); } + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = (struct rtable *)tuplehash->tuple.dst_cache; @@ -535,32 +571,31 @@ static void nf_flow_nat_ipv6(const struct flow_offload *flow, } } -static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple, u32 *hdrsize, - u32 offset) +static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, + struct flow_offload_tuple *tuple) { struct flow_ports *ports; struct ipv6hdr *ip6h; unsigned int thoff; u8 nexthdr; - thoff = sizeof(*ip6h) + offset; + thoff = sizeof(*ip6h) + ctx->offset; if (!pskb_may_pull(skb, thoff)) return -1; - ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); nexthdr = ip6h->nexthdr; switch (nexthdr) { case IPPROTO_TCP: - *hdrsize = sizeof(struct tcphdr); + ctx->hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: - *hdrsize = sizeof(struct udphdr); + ctx->hdrsize = sizeof(struct udphdr); break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: - *hdrsize = sizeof(struct gre_base_hdr); + ctx->hdrsize = sizeof(struct gre_base_hdr); break; #endif default: @@ -570,7 +605,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, if (ip6h->hop_limit <= 1) return -1; - if (!pskb_may_pull(skb, thoff + *hdrsize)) + if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; switch (nexthdr) { @@ -590,65 +625,47 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, } } - ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v6 = ip6h->saddr; tuple->dst_v6 = ip6h->daddr; tuple->l3proto = AF_INET6; tuple->l4proto = nexthdr; - tuple->iifidx = dev->ifindex; + tuple->iifidx = ctx->in->ifindex; nf_flow_tuple_encap(skb, tuple); return 0; } -unsigned int -nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct flow_offload_tuple_rhash *tuplehash, + struct sk_buff *skb) { - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; - struct flow_offload_tuple tuple = {}; enum flow_offload_tuple_dir dir; - const struct in6_addr *nexthop; struct flow_offload *flow; - struct net_device *outdev; unsigned int thoff, mtu; - u32 hdrsize, offset = 0; struct ipv6hdr *ip6h; - struct rt6_info *rt; - int ret; - - if (skb->protocol != htons(ETH_P_IPV6) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &offset)) - return NF_ACCEPT; - - if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize, offset) < 0) - return NF_ACCEPT; - - tuplehash = flow_offload_lookup(flow_table, 
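/* Illustration (editor's sketch, not part of the patch): after this refactor
 * both address families share one hook shape: a lookup helper, then a
 * forward helper whose tri-state result maps onto a netfilter verdict. The
 * mapping in isolation (enum values stand in for the real NF_* constants):
 */
enum fastpath_verdict { FP_ACCEPT, FP_DROP, FP_XMIT };

static enum fastpath_verdict map_forward_result(int ret)
{
	if (ret < 0)
		return FP_DROP;		/* e.g. skb_try_make_writable() failed */
	if (ret == 0)
		return FP_ACCEPT;	/* hand the packet back to the stack */
	return FP_XMIT;			/* forward via the cached dst */
}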
&tuple); - if (tuplehash == NULL) - return NF_ACCEPT; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - mtu = flow->tuplehash[dir].tuple.mtu + offset; + mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) - return NF_ACCEPT; + return 0; - ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); - thoff = sizeof(*ip6h) + offset; + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); + thoff = sizeof(*ip6h) + ctx->offset; if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) - return NF_ACCEPT; + return 0; if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); - return NF_ACCEPT; + return 0; } - if (skb_try_make_writable(skb, thoff + hdrsize)) - return NF_DROP; + if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + return -1; flow_offload_refresh(flow_table, flow, false); @@ -663,6 +680,52 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + return 1; +} + +static struct flow_offload_tuple_rhash * +nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct sk_buff *skb) +{ + struct flow_offload_tuple tuple = {}; + + if (skb->protocol != htons(ETH_P_IPV6) && + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset)) + return NULL; + + if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0) + return NULL; + + return flow_offload_lookup(flow_table, &tuple); +} + +unsigned int +nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + enum flow_offload_tuple_dir dir; + struct nf_flowtable_ctx ctx = { + .in = state->in, + }; + const struct in6_addr *nexthop; + struct flow_offload *flow; + struct net_device *outdev; + struct rt6_info *rt; + int ret; + + tuplehash = nf_flow_offload_ipv6_lookup(&ctx, flow_table, skb); + if (tuplehash == NULL) + return NF_ACCEPT; + + ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { rt = (struct rt6_info *)tuplehash->tuple.dst_cache; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); @@ -671,6 +734,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, return nf_flow_xmit_xfrm(skb, state, &rt->dst); } + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = (struct rt6_info *)tuplehash->tuple.dst_cache; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4c7937fd803f..d543787fc851 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6754,10 +6754,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_element_clash; } - if (!(flags & NFT_SET_ELEM_CATCHALL) && set->size && - !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { - err = -ENFILE; - goto err_set_full; + if (!(flags & NFT_SET_ELEM_CATCHALL)) { + unsigned int max = set->size ? 
set->size + set->ndeact : UINT_MAX; + + if (!atomic_add_unless(&set->nelems, 1, max)) { + err = -ENFILE; + goto err_set_full; + } } nft_trans_elem(trans) = elem; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index e311462f6d98..556bc902af00 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -30,6 +30,7 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> #include <linux/cgroup-defs.h> +#include <net/gso.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a54a7f772cec..671474e59817 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -10,6 +10,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/dccp.h> #include <linux/sctp.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> @@ -406,6 +407,82 @@ err: regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_dccp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int thoff, dataoff, optoff, optlen, i; + u32 *dest = &regs->data[priv->dreg]; + const struct dccp_hdr *dh; + struct dccp_hdr _dh; + + if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff) + goto err; + + thoff = nft_thoff(pkt); + + dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh); + if (!dh) + goto err; + + dataoff = dh->dccph_doff * sizeof(u32); + optoff = __dccp_hdr_len(dh); + if (dataoff <= optoff) + goto err; + + optlen = dataoff - optoff; + + for (i = 0; i < optlen; ) { + /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are 1B in + * the length; the remaining options are at least 2B long. In + * all cases, the first byte contains the option type. In + * multi-byte options, the second byte contains the option + * length, which must be at least two: 1 for the type plus 1 for + * the length plus 0-253 for any following option data. We + * aren't interested in the option data, only the type and the + * length, so we don't need to read more than two bytes at a + * time.
+ */ + unsigned int buflen = optlen - i; + u8 buf[2], *bufp; + u8 type, len; + + if (buflen > sizeof(buf)) + buflen = sizeof(buf); + + bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen, + &buf); + if (!bufp) + goto err; + + type = bufp[0]; + + if (type == priv->type) { + *dest = 1; + return; + } + + if (type <= DCCPO_MAX_RESERVED) { + i++; + continue; + } + + if (buflen < 2) + goto err; + + len = bufp[1]; + + if (len < 2) + goto err; + + i += len; + } + +err: + *dest = 0; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -557,6 +634,22 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, return 0; } +static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + if (!(priv->flags & NFT_EXTHDR_F_PRESENT)) + return -EOPNOTSUPP; + + return 0; +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -686,6 +779,15 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = { .reduce = nft_exthdr_reduce, }; +static const struct nft_expr_ops nft_exthdr_dccp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_dccp_eval, + .init = nft_exthdr_dccp_init, + .dump = nft_exthdr_dump, + .reduce = nft_exthdr_reduce, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -720,6 +822,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_sctp_ops; break; + case NFT_EXTHDR_OP_DCCP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_dccp_ops; + break; } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index e860d8fe0e5e..5ef9146e74ad 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -250,9 +250,14 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, break; } + if (!dst_hold_safe(this_dst)) + return -ENOENT; + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); - if (!other_dst) + if (!other_dst) { + dst_release(this_dst); return -ENOENT; + } nft_default_forward_path(route, this_dst, dir); nft_default_forward_path(route, other_dst, !dir); @@ -349,8 +354,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (!flow) goto err_flow_alloc; - if (flow_offload_route_init(flow, &route) < 0) - goto err_flow_add; + flow_offload_route_init(flow, &route); if (tcph) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; @@ -361,12 +365,12 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (ret < 0) goto err_flow_add; - dst_release(route.tuple[!dir].dst); return; err_flow_add: flow_offload_free(flow); err_flow_alloc: + dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 03ef4fdaa460..29ac48cdd6db 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -19,6 +19,7 @@ struct nft_lookup { struct nft_set *set; u8 sreg; u8 dreg; + bool dreg_set; bool invert; struct nft_set_binding binding; }; @@ -75,7 +76,7 @@ void 
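/* Illustration (editor's sketch, not part of the patch): the DCCP option
 * walk from nft_exthdr_dccp_eval() above, restated over a plain byte buffer.
 * Same rules as the kernel comment: types 0-31 are single-byte options, all
 * others carry a length byte counting type+length, so it must be >= 2.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define DCCPO_MAX_RESERVED 31

static bool dccp_option_present(const uint8_t *opts, size_t optlen,
				uint8_t want)
{
	size_t i = 0;

	while (i < optlen) {
		uint8_t type = opts[i];

		if (type == want)
			return true;

		if (type <= DCCPO_MAX_RESERVED) {	/* 1-byte option */
			i++;
			continue;
		}

		if (i + 2 > optlen)	/* length byte missing: malformed */
			return false;
		if (opts[i + 1] < 2)	/* length must cover type + length */
			return false;
		i += opts[i + 1];
	}
	return false;
}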
nft_lookup_eval(const struct nft_expr *expr, } if (ext) { - if (set->flags & NFT_SET_MAP) + if (priv->dreg_set) nft_data_copy(&regs->data[priv->dreg], nft_set_ext_data(ext), set->dlen); @@ -122,11 +123,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (flags & ~NFT_LOOKUP_F_INV) return -EINVAL; - if (flags & NFT_LOOKUP_F_INV) { - if (set->flags & NFT_SET_MAP) - return -EINVAL; + if (flags & NFT_LOOKUP_F_INV) priv->invert = true; - } } if (tb[NFTA_LOOKUP_DREG] != NULL) { @@ -140,8 +138,17 @@ static int nft_lookup_init(const struct nft_ctx *ctx, set->dlen); if (err < 0) return err; - } else if (set->flags & NFT_SET_MAP) - return -EINVAL; + priv->dreg_set = true; + } else if (set->flags & NFT_SET_MAP) { + /* Map given, but user asks for lookup only (i.e. to + * ignore value associated with key). + * + * This makes no sense for anonymous maps since they are + * scoped to the rule, but for named sets this can be useful. + */ + if (set->flags & NFT_SET_ANONYMOUS) + return -EINVAL; + } priv->binding.flags = set->flags & NFT_SET_MAP; @@ -188,7 +195,7 @@ static int nft_lookup_dump(struct sk_buff *skb, goto nla_put_failure; if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg)) goto nla_put_failure; - if (priv->set->flags & NFT_SET_MAP) + if (priv->dreg_set) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) goto nla_put_failure; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 0452ee586c1c..db526cb7a485 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1274,8 +1274,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) struct nft_pipapo_match *new; int i; - new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, - GFP_KERNEL); + new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); @@ -2084,8 +2083,7 @@ static int nft_pipapo_init(const struct nft_set *set, if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; - m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count, - GFP_KERNEL); + m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL); if (!m) return -ENOMEM; diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h index 9f80972ae39b..7eaa35fdd9bd 100644 --- a/net/netlabel/netlabel_domainhash.h +++ b/net/netlabel/netlabel_domainhash.h @@ -57,8 +57,8 @@ struct netlbl_domaddr6_map { struct netlbl_dom_map { char *domain; - u16 family; struct netlbl_dommap_def def; + u16 family; u32 valid; struct list_head list; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3a1e0fd5bf14..cbd9aa7ee24a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2360,7 +2360,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK); if (control->start) { + cb->extack = control->extack; ret = control->start(cb); + cb->extack = NULL; if (ret) goto error_put; } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 04c4036bf406..a157247a1e45 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -912,6 +912,7 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, .start = genl_start, .dump = genl_lock_dumpit, .done = genl_lock_done, + .extack = extack, }; genl_unlock(); @@ -924,6 +925,7 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, .start = genl_start, .dump = ops->dumpit, .done =
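/* Illustration (editor's sketch, not part of the patch): the pipapo and OVS
 * meter hunks replace open-coded "sizeof(*p) + n * sizeof(elem)" with
 * struct_size(), which checks the multiply-add for overflow instead of
 * letting it wrap. A userspace rendition of the same guard:
 */
#include <stdint.h>
#include <stdlib.h>

struct flex_table {
	size_t count;
	uint64_t slots[];	/* flexible array member */
};

static struct flex_table *flex_table_alloc(size_t n)
{
	struct flex_table *t;

	/* struct_size() does this overflow check for the caller */
	if (n > (SIZE_MAX - sizeof(*t)) / sizeof(t->slots[0]))
		return NULL;

	t = calloc(1, sizeof(*t) + n * sizeof(t->slots[0]));
	if (t)
		t->count = n;
	return t;
}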
genl_parallel_done, + .extack = extack, }; err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 41e3a20c8935..cdb001de0692 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -390,7 +390,8 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) const u8 *service_name_tlv = NULL; const u8 *miux_tlv = NULL; const u8 *rw_tlv = NULL; - u8 service_name_tlv_length, miux_tlv_length, rw_tlv_length, rw; + u8 service_name_tlv_length = 0; + u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index 0f23e5e8e03e..f4a38bd6a7e0 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <net/gso.h> #include <net/nsh.h> #include <net/tun_proto.h> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a8cf9a88758e..cab1e02b63e0 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -17,6 +17,7 @@ #include <linux/if_vlan.h> #include <net/dst.h> +#include <net/gso.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_fib.h> @@ -1072,8 +1073,16 @@ static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, struct ovs_action_hash *hash_act = nla_data(attr); u32 hash = 0; - /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */ - hash = skb_get_hash(skb); + if (hash_act->hash_alg == OVS_HASH_ALG_L4) { + /* OVS_HASH_ALG_L4 hashing type. */ + hash = skb_get_hash(skb); + } else if (hash_act->hash_alg == OVS_HASH_ALG_SYM_L4) { + /* OVS_HASH_ALG_SYM_L4 hashing type. NOTE: this doesn't + * extend past an encapsulated header. + */ + hash = __skb_get_hash_symmetric(skb); + } + hash = jhash_1word(hash, hash_act->hash_basis); if (!hash) hash = 0x1; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 58f530f60172..a6d2a0b1aa21 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -35,6 +35,7 @@ #include <linux/rculist.h> #include <linux/dmi.h> #include <net/genetlink.h> +#include <net/gso.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/pkt_cls.h> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ead5418c126e..41116361433d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -3221,6 +3221,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, switch (act_hash->hash_alg) { case OVS_HASH_ALG_L4: + fallthrough; + case OVS_HASH_ALG_SYM_L4: break; default: return -EINVAL; diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index f2698d2316df..c4ebf810e4b1 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -69,9 +69,7 @@ static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) { struct dp_meter_instance *ti; - ti = kvzalloc(sizeof(*ti) + - sizeof(struct dp_meter *) * size, - GFP_KERNEL); + ti = kvzalloc(struct_size(ti, dp_meters, size), GFP_KERNEL); if (!ti) return NULL; diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index ff5f49ab236e..3aa50dc7535b 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -28,24 +28,21 @@ static void pn_sock_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int pn_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; - int answ; switch (cmd) { case SIOCINQ:
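/* Illustration (editor's sketch, not part of the patch): OVS_HASH_ALG_SYM_L4
 * requests a direction-independent hash, i.e. hash(A->B) == hash(B->A). The
 * kernel gets this from __skb_get_hash_symmetric(); the property itself can
 * be had by canonically ordering the endpoints before hashing, as in this
 * simplified sketch (the mixing step is arbitrary, not the kernel's):
 */
#include <stdint.h>

static uint32_t mix32(uint32_t h, uint32_t v)
{
	h ^= v;
	h *= 0x9e3779b1u;	/* golden-ratio multiply, illustrative only */
	return h ^ (h >> 16);
}

static uint32_t sym_l4_hash(uint32_t saddr, uint32_t daddr,
			    uint16_t sport, uint16_t dport)
{
	/* order each pair so both packet directions see the same inputs */
	uint32_t lo_addr = saddr < daddr ? saddr : daddr;
	uint32_t hi_addr = saddr < daddr ? daddr : saddr;
	uint16_t lo_port = sport < dport ? sport : dport;
	uint16_t hi_port = sport < dport ? dport : sport;
	uint32_t h;

	h = mix32(0, lo_addr);
	h = mix32(h, hi_addr);
	return mix32(h, ((uint32_t)lo_port << 16) | hi_port);
}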
lock_sock(sk); skb = skb_peek(&sk->sk_receive_queue); - answ = skb ? skb->len : 0; + *karg = skb ? skb->len : 0; release_sock(sk); - return put_user(answ, (int __user *)arg); + return 0; case SIOCPNADDRESOURCE: case SIOCPNDELRESOURCE: { - u32 res; - if (get_user(res, (u32 __user *)arg)) - return -EFAULT; + u32 res = *karg; if (res >= 256) return -EINVAL; if (cmd == SIOCPNADDRESOURCE) diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 83ea13a50690..faba31f2eff2 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -917,10 +917,9 @@ static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len) return 0; } -static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int pep_ioctl(struct sock *sk, int cmd, int *karg) { struct pep_sock *pn = pep_sk(sk); - int answ; int ret = -ENOIOCTLCMD; switch (cmd) { @@ -933,13 +932,13 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) lock_sock(sk); if (sock_flag(sk, SOCK_URGINLINE) && !skb_queue_empty(&pn->ctrlreq_queue)) - answ = skb_peek(&pn->ctrlreq_queue)->len; + *karg = skb_peek(&pn->ctrlreq_queue)->len; else if (!skb_queue_empty(&sk->sk_receive_queue)) - answ = skb_peek(&sk->sk_receive_queue)->len; + *karg = skb_peek(&sk->sk_receive_queue)->len; else - answ = 0; + *karg = 0; release_sock(sk); - ret = put_user(answ, (int __user *)arg); + ret = 0; break; case SIOCPNENABLEPIPE: diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 71e2caf6ab85..967f9b4dc026 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -387,7 +387,7 @@ static int pn_socket_ioctl(struct socket *sock, unsigned int cmd, return put_user(handle, (__u16 __user *)arg); } - return sk->sk_prot->ioctl(sk, cmd, arg); + return sk_ioctl(sk, cmd, (void __user *)arg); } static int pn_socket_listen(struct socket *sock, int backlog) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index c819b812a899..b562fc2bb5b1 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -245,14 +245,12 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, nparms->tcfp_flags = parm->flags; nparms->tcfp_nkeys = parm->nkeys; - nparms->tcfp_keys = kmalloc(ksize, GFP_KERNEL); + nparms->tcfp_keys = kmemdup(parm->keys, ksize, GFP_KERNEL); if (!nparms->tcfp_keys) { ret = -ENOMEM; goto put_chain; } - memcpy(nparms->tcfp_keys, parm->keys, ksize); - for (i = 0; i < nparms->tcfp_nkeys; ++i) { u32 offmask = nparms->tcfp_keys[i].offmask; u32 cur = nparms->tcfp_keys[i].off; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 2e9dce03d1ec..f3121c5a85e9 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <net/act_api.h> +#include <net/gso.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_police.h> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 815c3e416bc5..56065cc5a661 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -11,6 +11,7 @@ #include <linux/rhashtable.h> #include <linux/workqueue.h> #include <linux/refcount.h> +#include <linux/bitfield.h> #include <linux/if_ether.h> #include <linux/in6.h> @@ -71,6 +72,7 @@ struct fl_flow_key { struct flow_dissector_key_num_of_vlans num_of_vlans; struct flow_dissector_key_pppoe pppoe; struct flow_dissector_key_l2tpv3 l2tpv3; + struct flow_dissector_key_cfm cfm; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -120,6 +122,7 @@ struct cls_fl_filter { u32 handle; u32 flags; u32 in_hw_count; + u8 needs_tc_skb_ext:1; struct rcu_work rwork; struct net_device *hw_dev; /* Flower classifier is unlocked, which means that its reference counter @@ -415,6 +418,8 @@ static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp) static void __fl_destroy_filter(struct cls_fl_filter *f) { + if (f->needs_tc_skb_ext) + tc_skb_ext_tc_disable(); tcf_exts_destroy(&f->exts); tcf_exts_put_net(&f->exts); kfree(f); @@ -615,7 +620,8 @@ static void *fl_get(struct tcf_proto *tp, u32 handle) } static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { - [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC }, + [TCA_FLOWER_UNSPEC] = { .strict_start_type = + TCA_FLOWER_L2_MISS }, [TCA_FLOWER_CLASSID] = { .type = NLA_U32 }, [TCA_FLOWER_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, @@ -720,7 +726,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 }, - + [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1), + [TCA_FLOWER_KEY_CFM] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -769,6 +776,12 @@ mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, }; +static const struct nla_policy cfm_opt_policy[TCA_FLOWER_KEY_CFM_OPT_MAX] = { + [TCA_FLOWER_KEY_CFM_MD_LEVEL] = NLA_POLICY_MAX(NLA_U8, + FLOW_DIS_CFM_MDL_MAX), + [TCA_FLOWER_KEY_CFM_OPCODE] = { .type = NLA_U8 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -1656,6 +1669,53 @@ static bool is_vlan_key(struct nlattr *tb, __be16 *ethertype, return false; } +static void fl_set_key_cfm_md_level(struct nlattr **tb, + struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + u8 level; + + if (!tb[TCA_FLOWER_KEY_CFM_MD_LEVEL]) + return; + + level = nla_get_u8(tb[TCA_FLOWER_KEY_CFM_MD_LEVEL]); + key->cfm.mdl_ver = FIELD_PREP(FLOW_DIS_CFM_MDL_MASK, level); + mask->cfm.mdl_ver = FLOW_DIS_CFM_MDL_MASK; +} + +static void fl_set_key_cfm_opcode(struct nlattr **tb, + struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + fl_set_key_val(tb, &key->cfm.opcode, TCA_FLOWER_KEY_CFM_OPCODE, + &mask->cfm.opcode, TCA_FLOWER_UNSPEC, + sizeof(key->cfm.opcode)); +} + +static int fl_set_key_cfm(struct nlattr **tb, + struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *nla_cfm_opt[TCA_FLOWER_KEY_CFM_OPT_MAX]; + int err; + + if (!tb[TCA_FLOWER_KEY_CFM]) + return 0; + + err = nla_parse_nested(nla_cfm_opt, TCA_FLOWER_KEY_CFM_OPT_MAX, + tb[TCA_FLOWER_KEY_CFM], cfm_opt_policy, extack); + if (err < 0) + return err; + + fl_set_key_cfm_opcode(nla_cfm_opt, key, mask, extack); + fl_set_key_cfm_md_level(nla_cfm_opt, key, mask, extack); + + return 0; +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1671,6 +1731,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, mask->meta.ingress_ifindex = 0xffffffff; } + fl_set_key_val(tb, &key->meta.l2_miss, TCA_FLOWER_L2_MISS, + &mask->meta.l2_miss, TCA_FLOWER_UNSPEC, + sizeof(key->meta.l2_miss)); + fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, 
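/* Illustration (editor's sketch, not part of the patch): the CFM matching
 * code packs the maintenance-domain level with FIELD_PREP() and unpacks it
 * with FIELD_GET(). Assuming FLOW_DIS_CFM_MDL_MASK covers the top three bits
 * of the first CFM header byte (the MD level per 802.1ag), the two helpers
 * reduce to a shift and mask:
 */
#include <stdint.h>

#define CFM_MDL_MASK	0xe0u	/* assumed mask: bits 7-5 of mdl_ver */

static uint8_t cfm_mdl_pack(uint8_t level)	/* ~ FIELD_PREP(mask, level) */
{
	return (uint8_t)((level << 5) & CFM_MDL_MASK);
}

static uint8_t cfm_mdl_unpack(uint8_t mdl_ver)	/* ~ FIELD_GET(mask, byte) */
{
	return (uint8_t)((mdl_ver & CFM_MDL_MASK) >> 5);
}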
sizeof(key->eth.dst)); @@ -1806,6 +1870,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, TCA_FLOWER_KEY_L2TPV3_SID, &mask->l2tpv3.session_id, TCA_FLOWER_UNSPEC, sizeof(key->l2tpv3.session_id)); + } else if (key->basic.n_proto == htons(ETH_P_CFM)) { + ret = fl_set_key_cfm(tb, key, mask, extack); + if (ret) + return ret; } if (key->basic.ip_proto == IPPROTO_TCP || @@ -1988,6 +2056,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_PPPOE, pppoe); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_L2TPV3, l2tpv3); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_CFM, cfm); skb_flow_dissector_init(dissector, keys, cnt); } @@ -2088,6 +2158,11 @@ errout_cleanup: return ret; } +static bool fl_needs_tc_skb_ext(const struct fl_flow_key *mask) +{ + return mask->meta.l2_miss; +} + static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, @@ -2124,6 +2199,14 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, return -EINVAL; } + /* Enable tc skb extension if filter matches on data extracted from + * this extension. + */ + if (fl_needs_tc_skb_ext(&mask->key)) { + f->needs_tc_skb_ext = 1; + tc_skb_ext_tc_enable(); + } + return 0; } @@ -3008,6 +3091,43 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_cfm(struct sk_buff *skb, + struct flow_dissector_key_cfm *key, + struct flow_dissector_key_cfm *mask) +{ + struct nlattr *opts; + int err; + u8 mdl; + + if (!memchr_inv(mask, 0, sizeof(*mask))) + return 0; + + opts = nla_nest_start(skb, TCA_FLOWER_KEY_CFM); + if (!opts) + return -EMSGSIZE; + + if (FIELD_GET(FLOW_DIS_CFM_MDL_MASK, mask->mdl_ver)) { + mdl = FIELD_GET(FLOW_DIS_CFM_MDL_MASK, key->mdl_ver); + err = nla_put_u8(skb, TCA_FLOWER_KEY_CFM_MD_LEVEL, mdl); + if (err) + goto err_cfm_opts; + } + + if (mask->opcode) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_CFM_OPCODE, key->opcode); + if (err) + goto err_cfm_opts; + } + + nla_nest_end(skb, opts); + + return 0; + +err_cfm_opts: + nla_nest_cancel(skb, opts); + return err; +} + static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, struct flow_dissector_key_enc_opts *enc_opts) { @@ -3077,6 +3197,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, goto nla_put_failure; } + if (fl_dump_key_val(skb, &key->meta.l2_miss, + TCA_FLOWER_L2_MISS, &mask->meta.l2_miss, + TCA_FLOWER_UNSPEC, sizeof(key->meta.l2_miss))) + goto nla_put_failure; + if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)) || @@ -3290,6 +3415,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, sizeof(key->hash.hash))) goto nla_put_failure; + if (fl_dump_key_cfm(skb, &key->cfm, &mask->cfm)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 891e007d5c0b..9cff99558694 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -65,6 +65,7 @@ #include <linux/reciprocal_div.h> #include <net/netlink.h> #include <linux/if_vlan.h> +#include <net/gso.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tcp.h> diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 8aef7dd9fb88..325c29041c7d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1814,10 +1814,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter"); goto 
failure; } - if (hopt->prio) { - NL_SET_ERR_MSG(extack, "HTB offload doesn't support the prio parameter"); - goto failure; - } } /* Keeping backward compatible with rate_table based iproute2 tc */ @@ -1913,6 +1909,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, TC_HTB_CLASSID_ROOT, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), + .prio = hopt->prio, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -1933,6 +1930,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, TC_H_MIN(parent->common.classid), .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), + .prio = hopt->prio, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -2018,6 +2016,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .classid = cl->common.classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), + .prio = hopt->prio, .extack = extack, }; err = htb_offload(dev, &offload_opt); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e79be1b3e74d..33c0dbe35956 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -21,6 +21,7 @@ #include <linux/reciprocal_div.h> #include <linux/rbtree.h> +#include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index cf0e61ed9225..717ae51d94a0 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -20,6 +20,7 @@ #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/time.h> +#include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -27,6 +28,8 @@ #include <net/sock.h> #include <net/tcp.h> +#define TAPRIO_STAT_NOT_SET (~0ULL) + #include "sch_mqprio_lib.h" static LIST_HEAD(taprio_list); @@ -1527,7 +1530,7 @@ static int taprio_enable_offload(struct net_device *dev, "Not enough memory for enabling offload mode"); return -ENOMEM; } - offload->enable = 1; + offload->cmd = TAPRIO_CMD_REPLACE; offload->extack = extack; mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); offload->mqprio.extack = extack; @@ -1575,7 +1578,7 @@ static int taprio_disable_offload(struct net_device *dev, "Not enough memory to disable offload mode"); return -ENOMEM; } - offload->enable = 0; + offload->cmd = TAPRIO_CMD_DESTROY; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { @@ -2292,6 +2295,72 @@ nla_put_failure: return -EMSGSIZE; } +static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) +{ + if (val == TAPRIO_STAT_NOT_SET) + return 0; + if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD)) + return -EMSGSIZE; + return 0; +} + +static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d, + struct tc_taprio_qopt_offload *offload, + struct tc_taprio_qopt_stats *stats) +{ + struct net_device *dev = qdisc_dev(sch); + const struct net_device_ops *ops; + struct sk_buff *skb = d->skb; + struct nlattr *xstats; + int err; + + ops = qdisc_dev(sch)->netdev_ops; + + /* FIXME I could use qdisc_offload_dump_helper(), but that messes + * with sch->flags depending on whether the device reports taprio + * stats, and I'm not sure whether that's a good idea, considering + * that stats are optional to the offload itself + */ + if (!ops->ndo_setup_tc) + return 0; + + memset(stats, 0xff, sizeof(*stats)); + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); + if (err == -EOPNOTSUPP) + 
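/* Illustration (editor's sketch, not part of the patch): taprio pre-fills
 * the stats block with 0xff so every u64 starts out as TAPRIO_STAT_NOT_SET;
 * drivers overwrite only the counters they support and taprio_put_stat()
 * skips the untouched sentinels. The pattern in isolation:
 */
#include <stdint.h>
#include <string.h>

#define STAT_NOT_SET (~0ULL)

struct offload_stats {
	uint64_t window_drops;
	uint64_t tx_overruns;
};

static void offload_stats_init(struct offload_stats *s)
{
	memset(s, 0xff, sizeof(*s));	/* every counter reads as "not set" */
}

static int offload_stats_report(const struct offload_stats *s)
{
	int reported = 0;

	/* only counters the driver actually filled in are emitted */
	if (s->window_drops != STAT_NOT_SET)
		reported++;
	if (s->tx_overruns != STAT_NOT_SET)
		reported++;
	return reported;
}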
return 0; + if (err) + return err; + + xstats = nla_nest_start(skb, TCA_STATS_APP); + if (!xstats) + goto err; + + if (taprio_put_stat(skb, stats->window_drops, + TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) || + taprio_put_stat(skb, stats->tx_overruns, + TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) + goto err_cancel; + + nla_nest_end(skb, xstats); + + return 0; + +err_cancel: + nla_nest_cancel(skb, xstats); +err: + return -EMSGSIZE; +} + +static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct tc_taprio_qopt_offload offload = { + .cmd = TAPRIO_CMD_STATS, + }; + + return taprio_dump_xstats(sch, d, &offload, &offload.stats); +} + static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); @@ -2391,12 +2460,20 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, __acquires(d->lock) { struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct tc_taprio_qopt_offload offload = { + .cmd = TAPRIO_CMD_QUEUE_STATS, + .queue_stats = { + .queue = cl - 1, + }, + }; + struct Qdisc *child; - sch = rtnl_dereference(dev_queue->qdisc_sleeping); - if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 || - qdisc_qstats_copy(d, sch) < 0) + child = rtnl_dereference(dev_queue->qdisc_sleeping); + if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || + qdisc_qstats_copy(d, child) < 0) return -1; - return 0; + + return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats); } static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) @@ -2443,6 +2520,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, .dump = taprio_dump, + .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 277ad11f4d61..17d2d00ddb18 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -13,6 +13,7 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> +#include <net/gso.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> diff --git a/net/sctp/offload.c b/net/sctp/offload.c index eb874e3c399a..502095173d88 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -22,6 +22,7 @@ #include <net/sctp/sctp.h> #include <net/sctp/checksum.h> #include <net/protocol.h> +#include <net/gso.h> static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index c365df24ad33..664d1f2e9121 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -500,9 +500,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, continue; fl4->fl4_sport = laddr->a.v4.sin_port; - flowi4_update_output(fl4, - asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS_TOS(asoc->base.sk, tos), + flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index cda8c2874691..6554a357fe33 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4895,7 +4895,7 @@ out: } /* The SCTP ioctl handler. 
*/ -static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int sctp_ioctl(struct sock *sk, int cmd, int *karg) { int rc = -ENOTCONN; @@ -4911,7 +4911,7 @@ static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) switch (cmd) { case SIOCINQ: { struct sk_buff *skb; - unsigned int amount = 0; + *karg = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { @@ -4919,9 +4919,9 @@ static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) * We will only return the amount of this packet since * that is all that will be read. */ - amount = skb->len; + *karg = skb->len; } - rc = put_user(amount, (int __user *)arg); + rc = 0; break; } default: @@ -8281,6 +8281,22 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, return retval; } +static bool sctp_bpf_bypass_getsockopt(int level, int optname) +{ + if (level == SOL_SCTP) { + switch (optname) { + case SCTP_SOCKOPT_PEELOFF: + case SCTP_SOCKOPT_PEELOFF_FLAGS: + case SCTP_SOCKOPT_CONNECTX3: + return true; + default: + return false; + } + } + + return false; +} + static int sctp_hash(struct sock *sk) { /* STUB */ @@ -9650,6 +9666,7 @@ struct proto sctp_prot = { .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, + .bpf_bypass_getsockopt = sctp_bpf_bypass_getsockopt, .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, @@ -9705,6 +9722,7 @@ struct proto sctpv6_prot = { .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, + .bpf_bypass_getsockopt = sctp_bpf_bypass_getsockopt, .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index e843760e9aaa..54afbe4fb087 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -148,18 +148,19 @@ static void sctp_sched_free_sched(struct sctp_stream *stream) int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { - struct sctp_sched_ops *n = sctp_sched_ops[sched]; struct sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; + struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; - if (old == n) - return ret; - if (sched > SCTP_SS_MAX) return -EINVAL; + n = sctp_sched_ops[sched]; + if (old == n) + return ret; + if (old) sctp_sched_free_sched(&asoc->stream); diff --git a/net/socket.c b/net/socket.c index b7e01d0fe082..b778fc03c6e0 100644 --- a/net/socket.c +++ b/net/socket.c @@ -57,6 +57,7 @@ #include <linux/mm.h> #include <linux/socket.h> #include <linux/file.h> +#include <linux/splice.h> #include <linux/net.h> #include <linux/interrupt.h> #include <linux/thread_info.h> @@ -126,11 +127,10 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #endif static int sock_fasync(int fd, struct file *filp, int on); -static ssize_t sock_sendpage(struct file *file, struct page *page, - int offset, size_t size, loff_t *ppos, int more); static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +static void sock_splice_eof(struct file *file); #ifdef CONFIG_PROC_FS static void sock_show_fdinfo(struct seq_file *m, struct file *f) @@ -162,9 +162,9 @@ static const struct file_operations socket_file_ops = { .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, - .sendpage = sock_sendpage, - .splice_write = generic_splice_sendpage, + .splice_write = splice_to_socket, .splice_read = sock_splice_read, + .splice_eof = 
sock_splice_eof, .show_fdinfo = sock_show_fdinfo, }; @@ -1066,26 +1066,6 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL(kernel_recvmsg); -static ssize_t sock_sendpage(struct file *file, struct page *page, - int offset, size_t size, loff_t *ppos, int more) -{ - struct socket *sock; - int flags; - int ret; - - sock = file->private_data; - - flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; - /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ - flags |= more; - - ret = kernel_sendpage(sock, page, offset, size, flags); - - if (trace_sock_send_length_enabled()) - call_trace_sock_send_length(sock->sk, ret, 0); - return ret; -} - static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -1098,6 +1078,14 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, return sock->ops->splice_read(sock, ppos, pipe, len, flags); } +static void sock_splice_eof(struct file *file) +{ + struct socket *sock = file->private_data; + + if (sock->ops->splice_eof) + sock->ops->splice_eof(sock); +} + static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; @@ -2138,6 +2126,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } + flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; @@ -2483,6 +2472,7 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, msg_sys->msg_control = ctl_buf; msg_sys->msg_control_is_user = false; } + flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg_sys->msg_flags = flags; if (sock->file->f_flags & O_NONBLOCK) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f77cebe2c071..9d9f522e3ae1 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1203,13 +1203,14 @@ err_noclose: static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, int flags) { - return kernel_sendpage(sock, virt_to_page(vec->iov_base), - offset_in_page(vec->iov_base), - vec->iov_len, flags); + struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | flags, }; + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 1, vec->iov_len); + return sock_sendmsg(sock, &msg); } /* - * kernel_sendpage() is used exclusively to reduce the number of + * MSG_SPLICE_PAGES is used exclusively to reduce the number of * copy operations in this path. Therefore the caller must ensure * that the pages backing @xdr are unchanging. 
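[Editor's sketch] The svc_tcp_send_kvec()/svc_tcp_sendmsg() hunks above replace the old per-page kernel_sendpage() loop with a single sock_sendmsg() call over a bvec iterator carrying MSG_SPLICE_PAGES; as the retained comment notes, the pages must stay unchanged until transmission completes because the TCP stack now references them instead of copying. A condensed kernel-style sketch of that shape — send_xdr_pages is a made-up wrapper, not the function the patch adds, and the real code also sends the head and tail kvecs:

#include <linux/bvec.h>
#include <linux/net.h>
#include <linux/uio.h>

/* One sendmsg call replaces the old per-bvec kernel_sendpage() loop. */
static int send_xdr_pages(struct socket *sock, struct bio_vec *bvec,
			  unsigned int nr_pages, size_t page_len,
			  unsigned int *sentp)
{
	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES };
	int ret;

	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, nr_pages, page_len);
	ret = sock_sendmsg(sock, &msg);
	if (ret < 0)
		return ret;
	*sentp += ret;
	return 0;
}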
* @@ -1249,28 +1250,13 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr, if (ret != head->iov_len) goto out; - if (xdr->page_len) { - unsigned int offset, len, remaining; - struct bio_vec *bvec; - - bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT); - offset = offset_in_page(xdr->page_base); - remaining = xdr->page_len; - while (remaining > 0) { - len = min(remaining, bvec->bv_len - offset); - ret = kernel_sendpage(sock, bvec->bv_page, - bvec->bv_offset + offset, - len, 0); - if (ret < 0) - return ret; - *sentp += ret; - if (ret != len) - goto out; - remaining -= len; - offset = 0; - bvec++; - } - } + msg.msg_flags = MSG_SPLICE_PAGES; + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec, + xdr_buf_pagecount(xdr), xdr->page_len); + ret = sock_sendmsg(sock, &msg); + if (ret < 0) + return ret; + *sentp += ret; if (tail->iov_len) { ret = svc_tcp_send_kvec(sock, tail, 0); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index cdcd2731860b..2cde375477e3 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -176,7 +176,7 @@ static int bearer_name_validate(const char *name, */ struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; u32 i; @@ -211,11 +211,10 @@ int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id) void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) { - struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference(tn->bearer_list[bearer_id]); + b = bearer_get(net, bearer_id); if (b) tipc_disc_add_dest(b->disc); rcu_read_unlock(); @@ -223,11 +222,10 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) { - struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference(tn->bearer_list[bearer_id]); + b = bearer_get(net, bearer_id); if (b) tipc_disc_remove_dest(b->disc); rcu_read_unlock(); @@ -431,7 +429,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, dev = dev_get_by_name(net, dev_name); if (!dev) return -ENODEV; - if (tipc_mtu_bad(dev, 0)) { + if (tipc_mtu_bad(dev)) { dev_put(dev); return -EINVAL; } @@ -534,7 +532,7 @@ int tipc_bearer_mtu(struct net *net, u32 bearer_id) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference(tipc_net(net)->bearer_list[bearer_id]); + b = bearer_get(net, bearer_id); if (b) mtu = b->mtu; rcu_read_unlock(); @@ -708,7 +706,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, test_and_set_bit_lock(0, &b->up); break; case NETDEV_CHANGEMTU: - if (tipc_mtu_bad(dev, 0)) { + if (tipc_mtu_bad(dev)) { bearer_disable(net, b); break; } @@ -745,7 +743,7 @@ void tipc_bearer_cleanup(void) void tipc_bearer_stop(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; u32 i; @@ -881,7 +879,7 @@ int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb) struct tipc_bearer *bearer; struct tipc_nl_msg msg; struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); if (i == MAX_BEARERS) return 0; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index bd0cc5c287ef..1ee60649bd17 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -257,9 +257,9 @@ static inline void 
tipc_loopback_trace(struct net *net, } /* check if device MTU is too low for tipc headers */ -static inline bool tipc_mtu_bad(struct net_device *dev, unsigned int reserve) +static inline bool tipc_mtu_bad(struct net_device *dev) { - if (dev->mtu >= TIPC_MIN_BEARER_MTU + reserve) + if (dev->mtu >= TIPC_MIN_BEARER_MTU) return false; netdev_warn(dev, "MTU too low for tipc bearer\n"); return true; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 0a85244fd618..926232557e77 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -739,10 +739,6 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, udp_conf.use_udp_checksums = false; ub->ifindex = dev->ifindex; b->encap_hlen = sizeof(struct iphdr) + sizeof(struct udphdr); - if (tipc_mtu_bad(dev, b->encap_hlen)) { - err = -EINVAL; - goto err; - } b->mtu = b->media->mtu; #if IS_ENABLED(CONFIG_IPV6) } else if (local.proto == htons(ETH_P_IPV6)) { diff --git a/net/tls/tls.h b/net/tls/tls.h index 0672acab2773..d002c3af1966 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -97,6 +97,7 @@ void tls_update_rx_zc_capable(struct tls_context *tls_ctx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +void tls_sw_splice_eof(struct socket *sock); int tls_sw_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tls_sw_sendpage(struct sock *sk, struct page *page, @@ -115,6 +116,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, size_t len, unsigned int flags); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +void tls_device_splice_eof(struct socket *sock); int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tls_tx_records(struct sock *sk, int flags); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bf69c9d6d06c..b82770f68807 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -268,9 +268,8 @@ static void tls_append_frag(struct tls_record_info *record, skb_frag_size_add(frag, size); } else { ++frag; - __skb_frag_set_page(frag, pfrag->page); - skb_frag_off_set(frag, pfrag->offset); - skb_frag_size_set(frag, size); + skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, + size); ++record->num_frags; get_page(pfrag->page); } @@ -357,9 +356,8 @@ static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, return -ENOMEM; frag = &record->frags[0]; - __skb_frag_set_page(frag, pfrag->page); - skb_frag_off_set(frag, pfrag->offset); - skb_frag_size_set(frag, prepend_size); + skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, + prepend_size); get_page(pfrag->page); pfrag->offset += prepend_size; @@ -424,16 +422,10 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) return 0; } -union tls_iter_offset { - struct iov_iter *msg_iter; - int offset; -}; - static int tls_push_data(struct sock *sk, - union tls_iter_offset iter_offset, + struct iov_iter *iter, size_t size, int flags, - unsigned char record_type, - struct page *zc_page) + unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; @@ -449,7 +441,8 @@ static int tls_push_data(struct sock *sk, long timeo; if (flags & - ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) + ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST | + 
MSG_SPLICE_PAGES)) return -EOPNOTSUPP; if (unlikely(sk->sk_err)) @@ -501,21 +494,35 @@ handle_error: record = ctx->open_record; copy = min_t(size_t, size, max_open_record_len - record->len); - if (copy && zc_page) { + if (copy && (flags & MSG_SPLICE_PAGES)) { struct page_frag zc_pfrag; + struct page **pages = &zc_pfrag.page; + size_t off; + + rc = iov_iter_extract_pages(iter, &pages, + copy, 1, 0, &off); + if (rc <= 0) { + if (rc == 0) + rc = -EIO; + goto handle_error; + } + copy = rc; - zc_pfrag.page = zc_page; - zc_pfrag.offset = iter_offset.offset; + if (WARN_ON_ONCE(!sendpage_ok(zc_pfrag.page))) { + iov_iter_revert(iter, copy); + rc = -EIO; + goto handle_error; + } + + zc_pfrag.offset = off; zc_pfrag.size = copy; tls_append_frag(record, &zc_pfrag, copy); - - iter_offset.offset += copy; } else if (copy) { copy = min_t(size_t, copy, pfrag->size - pfrag->offset); rc = tls_device_copy_data(page_address(pfrag->page) + pfrag->offset, copy, - iter_offset.msg_iter); + iter); if (rc) goto handle_error; tls_append_frag(record, pfrag, copy); @@ -570,9 +577,11 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; struct tls_context *tls_ctx = tls_get_ctx(sk); - union tls_iter_offset iter; int rc; + if (!tls_ctx->zerocopy_sendfile) + msg->msg_flags &= ~MSG_SPLICE_PAGES; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); @@ -582,8 +591,8 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto out; } - iter.msg_iter = &msg->msg_iter; - rc = tls_push_data(sk, iter, size, msg->msg_flags, record_type, NULL); + rc = tls_push_data(sk, &msg->msg_iter, size, msg->msg_flags, + record_type); out: release_sock(sk); @@ -591,47 +600,42 @@ out: return rc; } -int tls_device_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) +void tls_device_splice_eof(struct socket *sock) { + struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); - union tls_iter_offset iter_offset; - struct iov_iter msg_iter; - char *kaddr; - struct kvec iov; - int rc; + struct iov_iter iter = {}; - if (flags & MSG_SENDPAGE_NOTLAST) - flags |= MSG_MORE; + if (!tls_is_partially_sent_record(tls_ctx)) + return; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); - if (flags & MSG_OOB) { - rc = -EOPNOTSUPP; - goto out; - } - - if (tls_ctx->zerocopy_sendfile) { - iter_offset.offset = offset; - rc = tls_push_data(sk, iter_offset, size, - flags, TLS_RECORD_TYPE_DATA, page); - goto out; + if (tls_is_partially_sent_record(tls_ctx)) { + iov_iter_bvec(&iter, ITER_SOURCE, NULL, 0, 0); + tls_push_data(sk, &iter, 0, 0, TLS_RECORD_TYPE_DATA); } - kaddr = kmap(page); - iov.iov_base = kaddr + offset; - iov.iov_len = size; - iov_iter_kvec(&msg_iter, ITER_SOURCE, &iov, 1, size); - iter_offset.msg_iter = &msg_iter; - rc = tls_push_data(sk, iter_offset, size, flags, TLS_RECORD_TYPE_DATA, - NULL); - kunmap(page); - -out: release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); - return rc; +} + +int tls_device_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; + + if (flags & MSG_SENDPAGE_NOTLAST) + msg.msg_flags |= MSG_MORE; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + return tls_device_sendmsg(sk, &msg, size); } struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, @@ -696,12 
+700,10 @@ EXPORT_SYMBOL(tls_get_record); static int tls_device_push_pending_record(struct sock *sk, int flags) { - union tls_iter_offset iter; - struct iov_iter msg_iter; + struct iov_iter iter; - iov_iter_kvec(&msg_iter, ITER_SOURCE, NULL, 0, 0); - iter.msg_iter = &msg_iter; - return tls_push_data(sk, iter, 0, flags, TLS_RECORD_TYPE_DATA, NULL); + iov_iter_kvec(&iter, ITER_SOURCE, NULL, 0, 0); + return tls_push_data(sk, &iter, 0, flags, TLS_RECORD_TYPE_DATA); } void tls_device_write_space(struct sock *sk, struct tls_context *ctx) @@ -1217,7 +1219,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); - /* following this assignment tls_is_sk_tx_device_offloaded + /* following this assignment tls_is_skb_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. */ @@ -1370,7 +1372,7 @@ static int tls_device_down(struct net_device *netdev) list_for_each_entry_safe(ctx, tmp, &list, list) { /* Stop offloaded TX and switch to the fallback. - * tls_is_sk_tx_device_offloaded will return false. + * tls_is_skb_tx_device_offloaded will return false. */ WRITE_ONCE(ctx->sk->sk_validate_xmit_skb, tls_validate_xmit_skb_sw); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index f2e7302a4d96..7b9c83dd7de2 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -125,7 +125,10 @@ int tls_push_sg(struct sock *sk, u16 first_offset, int flags) { - int sendpage_flags = flags | MSG_SENDPAGE_NOTLAST; + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = MSG_SENDPAGE_NOTLAST | MSG_SPLICE_PAGES | flags, + }; int ret = 0; struct page *p; size_t size; @@ -134,16 +137,19 @@ int tls_push_sg(struct sock *sk, size = sg->length - offset; offset += sg->offset; - ctx->in_tcp_sendpages = true; + ctx->splicing_pages = true; while (1) { if (sg_is_last(sg)) - sendpage_flags = flags; + msg.msg_flags = flags; /* is sending application-limited? */ tcp_rate_check_app_limited(sk); p = sg_page(sg); retry: - ret = do_tcp_sendpages(sk, p, offset, size, sendpage_flags); + bvec_set_page(&bvec, p, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + + ret = tcp_sendmsg_locked(sk, &msg, size); if (ret != size) { if (ret > 0) { @@ -155,7 +161,7 @@ retry: offset -= sg->offset; ctx->partially_sent_offset = offset; ctx->partially_sent_record = (void *)sg; - ctx->in_tcp_sendpages = false; + ctx->splicing_pages = false; return ret; } @@ -169,7 +175,7 @@ retry: size = sg->length; } - ctx->in_tcp_sendpages = false; + ctx->splicing_pages = false; return 0; } @@ -247,11 +253,11 @@ static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); - /* If in_tcp_sendpages call lower protocol write space handler + /* If splicing_pages call lower protocol write space handler * to ensure we wake up any waiting operations there. For example - * if do_tcp_sendpages where to call sk_wait_event. + * if splicing pages where to call sk_wait_event. 
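[Editor's sketch] tls_push_sg() above shows the conversion pattern this series applies wherever do_tcp_sendpages() used to be called: wrap one page fragment in a bio_vec, point msg_iter at it, and go through the regular locked sendmsg path with MSG_SPLICE_PAGES set. In isolation — send_one_frag is an illustrative name:

#include <linux/bvec.h>
#include <linux/socket.h>
#include <linux/uio.h>
#include <net/tcp.h>

/* Send a single page fragment zero-copy; caller holds the socket lock. */
static int send_one_frag(struct sock *sk, struct page *page,
			 size_t offset, size_t size, int flags)
{
	struct bio_vec bvec;
	struct msghdr msg = {
		.msg_flags = MSG_SPLICE_PAGES | flags,
	};

	bvec_set_page(&bvec, page, size, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
	return tcp_sendmsg_locked(sk, &msg, size);
}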
*/ - if (ctx->in_tcp_sendpages) { + if (ctx->splicing_pages) { ctx->sk_write_space(sk); return; } @@ -352,6 +358,39 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) tls_ctx_free(sk, ctx); } +static __poll_t tls_sk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + struct tls_sw_context_rx *ctx; + struct tls_context *tls_ctx; + struct sock *sk = sock->sk; + struct sk_psock *psock; + __poll_t mask = 0; + u8 shutdown; + int state; + + mask = tcp_poll(file, sock, wait); + + state = inet_sk_state_load(sk); + shutdown = READ_ONCE(sk->sk_shutdown); + if (unlikely(state != TCP_ESTABLISHED || shutdown & RCV_SHUTDOWN)) + return mask; + + tls_ctx = tls_get_ctx(sk); + ctx = tls_sw_ctx_rx(tls_ctx); + psock = sk_psock_get(sk); + + if (skb_queue_empty_lockless(&ctx->rx_list) && + !tls_strp_msg_ready(ctx) && + sk_psock_queue_empty(psock)) + mask &= ~(EPOLLIN | EPOLLRDNORM); + + if (psock) + sk_psock_put(sk, psock); + + return mask; +} + static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, int __user *optlen, int tx) { @@ -918,13 +957,16 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG] ops[TLS_BASE][TLS_BASE] = *base; ops[TLS_SW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; + ops[TLS_SW ][TLS_BASE].splice_eof = tls_sw_splice_eof; ops[TLS_SW ][TLS_BASE].sendpage_locked = tls_sw_sendpage_locked; ops[TLS_BASE][TLS_SW ] = ops[TLS_BASE][TLS_BASE]; ops[TLS_BASE][TLS_SW ].splice_read = tls_sw_splice_read; + ops[TLS_BASE][TLS_SW ].poll = tls_sk_poll; ops[TLS_SW ][TLS_SW ] = ops[TLS_SW ][TLS_BASE]; ops[TLS_SW ][TLS_SW ].splice_read = tls_sw_splice_read; + ops[TLS_SW ][TLS_SW ].poll = tls_sk_poll; #ifdef CONFIG_TLS_DEVICE ops[TLS_HW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; @@ -986,6 +1028,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg; + prot[TLS_SW][TLS_BASE].splice_eof = tls_sw_splice_eof; prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage; prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; @@ -1001,10 +1044,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], #ifdef CONFIG_TLS_DEVICE prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg; + prot[TLS_HW][TLS_BASE].splice_eof = tls_device_splice_eof; prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage; prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; + prot[TLS_HW][TLS_SW].splice_eof = tls_device_splice_eof; prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1a53c8f481e9..319f61590d2c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -931,7 +931,37 @@ static int tls_sw_push_pending_record(struct sock *sk, int flags) &copied, flags); } -int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +static int tls_sw_sendmsg_splice(struct sock *sk, struct msghdr *msg, + struct sk_msg *msg_pl, size_t try_to_copy, + ssize_t *copied) +{ + struct page *page = NULL, **pages = &page; + + do { + ssize_t part; + size_t off; + + part = iov_iter_extract_pages(&msg->msg_iter, &pages, + try_to_copy, 1, 0, &off); + if (part <= 0) + return part ?: -EIO; + + if (WARN_ON_ONCE(!sendpage_ok(page))) { + iov_iter_revert(&msg->msg_iter, part); + return -EIO; + } + + sk_msg_page_add(msg_pl, page, part, off); + 
sk_mem_charge(sk, part); + *copied += part; + try_to_copy -= part; + } while (try_to_copy && !sk_msg_full(msg_pl)); + + return 0; +} + +static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, + size_t size) { long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -954,15 +984,6 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int ret = 0; int pending; - if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_CMSG_COMPAT)) - return -EOPNOTSUPP; - - ret = mutex_lock_interruptible(&tls_ctx->tx_lock); - if (ret) - return ret; - lock_sock(sk); - if (unlikely(msg->msg_controllen)) { ret = tls_process_cmsg(sk, msg, &record_type); if (ret) { @@ -1020,6 +1041,17 @@ alloc_encrypted: full_record = true; } + if (try_to_copy && (msg->msg_flags & MSG_SPLICE_PAGES)) { + ret = tls_sw_sendmsg_splice(sk, msg, msg_pl, + try_to_copy, &copied); + if (ret < 0) + goto send_end; + tls_ctx->pending_open_record_frags = true; + if (full_record || eor || sk_msg_full(msg_pl)) + goto copied; + continue; + } + if (!is_kvec && (full_record || eor) && !async_capable) { u32 first = msg_pl->sg.end; @@ -1084,6 +1116,7 @@ fallback_to_reg_send: */ tls_ctx->pending_open_record_frags = true; copied += try_to_copy; +copied: if (full_record || eor) { ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, record_type, &copied, @@ -1151,157 +1184,136 @@ trim_sgl: send_end: ret = sk_stream_error(sk, msg->msg_flags, ret); + return copied > 0 ? copied : ret; +} + +int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + int ret; + + if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_CMSG_COMPAT | MSG_SPLICE_PAGES | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) + return -EOPNOTSUPP; + ret = mutex_lock_interruptible(&tls_ctx->tx_lock); + if (ret) + return ret; + lock_sock(sk); + ret = tls_sw_sendmsg_locked(sk, msg, size); release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); - return copied > 0 ? copied : ret; + return ret; } -static int tls_sw_do_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) +/* + * Handle unexpected EOF during splice without SPLICE_F_MORE set. + */ +void tls_sw_splice_eof(struct socket *sock) { - long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct tls_prot_info *prot = &tls_ctx->prot_info; - unsigned char record_type = TLS_RECORD_TYPE_DATA; - struct sk_msg *msg_pl; struct tls_rec *rec; - int num_async = 0; + struct sk_msg *msg_pl; ssize_t copied = 0; - bool full_record; - int record_room; + bool retrying = false; int ret = 0; - bool eor; - - eor = !(flags & MSG_SENDPAGE_NOTLAST); - sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - - /* Call the sk_stream functions to manage the sndbuf mem. 
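[Editor's sketch] The new tls_sw_sendmsg_splice() helper above is the receiving end of MSG_SPLICE_PAGES: pages are extracted from the caller's iterator by reference rather than copied, and sendpage_ok() rejects pages (e.g. slab-backed ones) that must not enter the zero-copy path. The same logic restated in isolation with comments — splice_pages_into_msg is an illustrative name:

static int splice_pages_into_msg(struct sock *sk, struct msghdr *msg,
				 struct sk_msg *msg_pl, size_t want,
				 ssize_t *copied)
{
	struct page *page = NULL, **pages = &page;

	do {
		ssize_t part;
		size_t off;

		/* Take page references out of the iterator: no data copy. */
		part = iov_iter_extract_pages(&msg->msg_iter, &pages,
					      want, 1, 0, &off);
		if (part <= 0)
			return part ?: -EIO;

		/* Refuse pages that can't be sent zero-copy (e.g. slab). */
		if (!sendpage_ok(page)) {
			iov_iter_revert(&msg->msg_iter, part);
			return -EIO;
		}

		sk_msg_page_add(msg_pl, page, part, off);
		sk_mem_charge(sk, part);
		*copied += part;
		want -= part;
	} while (want && !sk_msg_full(msg_pl));

	return 0;
}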
*/ - while (size > 0) { - size_t copy, required_size; - - if (sk->sk_err) { - ret = -sk->sk_err; - goto sendpage_end; - } + int pending; - if (ctx->open_rec) - rec = ctx->open_rec; - else - rec = ctx->open_rec = tls_get_rec(sk); - if (!rec) { - ret = -ENOMEM; - goto sendpage_end; - } + if (!ctx->open_rec) + return; - msg_pl = &rec->msg_plaintext; + mutex_lock(&tls_ctx->tx_lock); + lock_sock(sk); - full_record = false; - record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; - copy = size; - if (copy >= record_room) { - copy = record_room; - full_record = true; - } +retry: + rec = ctx->open_rec; + if (!rec) + goto unlock; - required_size = msg_pl->sg.size + copy + prot->overhead_size; + msg_pl = &rec->msg_plaintext; - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; -alloc_payload: - ret = tls_alloc_encrypted_msg(sk, required_size); - if (ret) { - if (ret != -ENOSPC) - goto wait_for_memory; + /* Check the BPF advisor and perform transmission. */ + ret = bpf_exec_tx_verdict(msg_pl, sk, false, TLS_RECORD_TYPE_DATA, + &copied, 0); + switch (ret) { + case 0: + case -EAGAIN: + if (retrying) + goto unlock; + retrying = true; + goto retry; + case -EINPROGRESS: + break; + default: + goto unlock; + } - /* Adjust copy according to the amount that was - * actually allocated. The difference is due - * to max sg elements limit - */ - copy -= required_size - msg_pl->sg.size; - full_record = true; - } + /* Wait for pending encryptions to get completed */ + spin_lock_bh(&ctx->encrypt_compl_lock); + ctx->async_notify = true; - sk_msg_page_add(msg_pl, page, copy, offset); - sk_mem_charge(sk, copy); + pending = atomic_read(&ctx->encrypt_pending); + spin_unlock_bh(&ctx->encrypt_compl_lock); + if (pending) + crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + else + reinit_completion(&ctx->async_wait.completion); - offset += copy; - size -= copy; - copied += copy; + /* There can be no concurrent accesses, since we have no pending + * encrypt operations + */ + WRITE_ONCE(ctx->async_notify, false); - tls_ctx->pending_open_record_frags = true; - if (full_record || eor || sk_msg_full(msg_pl)) { - ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, - record_type, &copied, flags); - if (ret) { - if (ret == -EINPROGRESS) - num_async++; - else if (ret == -ENOMEM) - goto wait_for_memory; - else if (ret != -EAGAIN) { - if (ret == -ENOSPC) - ret = 0; - goto sendpage_end; - } - } - } - continue; -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - ret = sk_stream_wait_memory(sk, &timeo); - if (ret) { - if (ctx->open_rec) - tls_trim_both_msgs(sk, msg_pl->sg.size); - goto sendpage_end; - } + if (ctx->async_wait.err) + goto unlock; - if (ctx->open_rec) - goto alloc_payload; + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, 0); } - if (num_async) { - /* Transmit if any encryptions have completed */ - if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { - cancel_delayed_work(&ctx->tx_work.work); - tls_tx_records(sk, flags); - } - } -sendpage_end: - ret = sk_stream_error(sk, flags, ret); - return copied > 0 ? 
copied : ret; +unlock: + release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); } int tls_sw_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY | MSG_NO_SHARED_FRAGS)) return -EOPNOTSUPP; + if (flags & MSG_SENDPAGE_NOTLAST) + msg.msg_flags |= MSG_MORE; - return tls_sw_do_sendpage(sk, page, offset, size, flags); + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + return tls_sw_sendmsg_locked(sk, &msg, size); } int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - int ret; + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) return -EOPNOTSUPP; + if (flags & MSG_SENDPAGE_NOTLAST) + msg.msg_flags |= MSG_MORE; - ret = mutex_lock_interruptible(&tls_ctx->tx_lock); - if (ret) - return ret; - lock_sock(sk); - ret = tls_sw_do_sendpage(sk, page, offset, size, flags); - release_sock(sk); - mutex_unlock(&tls_ctx->tx_lock); - return ret; + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + return tls_sw_sendmsg(sk, &msg, size); } static int diff --git a/net/unix/Kconfig b/net/unix/Kconfig index b7f811216820..28b232f281ab 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -4,7 +4,7 @@ # config UNIX - tristate "Unix domain sockets" + bool "Unix domain sockets" help If you say Y here, you will include support for Unix domain sockets; sockets are the standard Unix mechanism for establishing and @@ -14,10 +14,6 @@ config UNIX an embedded system or something similar, you therefore definitely want to say Y here. - To compile this driver as a module, choose M here: the module will be - called unix. Note that several important services won't work - correctly if you say M here and then neglect to load the module. - Say Y unless you know what you are doing. 
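[Editor's sketch] The tls_sw_sendpage()/tls_sw_sendpage_locked() bodies above reduce to a compatibility shim, and the same shape recurs for tls_device earlier and AF_UNIX later in this diff: translate MSG_SENDPAGE_NOTLAST into MSG_MORE, wrap the page in a bvec, and delegate everything to the sendmsg path. Generic form — sendpage_shim is an illustrative name, and the delegate would be the protocol's own ->sendmsg():

/* Legacy ->sendpage() entry point expressed as a sendmsg wrapper. */
static int sendpage_shim(struct sock *sk, struct page *page, int offset,
			 size_t size, int flags)
{
	struct bio_vec bvec;
	struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES };

	/* Legacy callers signal "more data follows" via NOTLAST. */
	if (flags & MSG_SENDPAGE_NOTLAST)
		msg.msg_flags |= MSG_MORE;

	bvec_set_page(&bvec, page, size, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
	return tls_sw_sendmsg(sk, &msg, size);	/* or any ->sendmsg() */
}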
config UNIX_SCM diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e7728b57a8c7..73c61a010b01 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -921,11 +921,26 @@ static void unix_unhash(struct sock *sk) */ } +static bool unix_bpf_bypass_getsockopt(int level, int optname) +{ + if (level == SOL_SOCKET) { + switch (optname) { + case SO_PEERPIDFD: + return true; + default: + return false; + } + } + + return false; +} + struct proto unix_dgram_proto = { .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, + .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_dgram_bpf_update_proto, #endif @@ -937,6 +952,7 @@ struct proto unix_stream_proto = { .obj_size = sizeof(struct unix_sock), .close = unix_close, .unhash = unix_unhash, + .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, #endif @@ -1361,7 +1377,8 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - if (test_bit(SOCK_PASSCRED, &sock->flags) && + if ((test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) && !unix_sk(sk)->addr) { err = unix_autobind(sk); if (err) @@ -1469,7 +1486,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + if ((test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); if (err) goto out; @@ -1670,6 +1688,8 @@ static void unix_sock_inherit_flags(const struct socket *old, { if (test_bit(SOCK_PASSCRED, &old->flags)) set_bit(SOCK_PASSCRED, &new->flags); + if (test_bit(SOCK_PASSPIDFD, &old->flags)) + set_bit(SOCK_PASSPIDFD, &new->flags); if (test_bit(SOCK_PASSSEC, &old->flags)) set_bit(SOCK_PASSSEC, &new->flags); } @@ -1819,8 +1839,10 @@ static bool unix_passcred_enabled(const struct socket *sock, const struct sock *other) { return test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags) || !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags); + test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || + test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); } /* @@ -1839,24 +1861,6 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, } } -static int maybe_init_creds(struct scm_cookie *scm, - struct socket *socket, - const struct sock *other) -{ - int err; - struct msghdr msg = { .msg_controllen = 0 }; - - err = scm_send(socket, &msg, scm, false); - if (err) - return err; - - if (unix_passcred_enabled(socket, other)) { - scm->pid = get_pid(task_tgid(current)); - current_uid_gid(&scm->creds.uid, &scm->creds.gid); - } - return err; -} - static bool unix_skb_scm_eq(struct sk_buff *skb, struct scm_cookie *scm) { @@ -1922,7 +1926,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + if ((test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); if (err) goto out; @@ -2200,19 +2205,25 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, while (sent < len) { size = len - sent; - /* Keep two messages in the pipe so it schedules better */ - size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); + if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { + skb = 
sock_alloc_send_pskb(sk, 0, 0, + msg->msg_flags & MSG_DONTWAIT, + &err, 0); + } else { + /* Keep two messages in the pipe so it schedules better */ + size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); - /* allow fallback to order-0 allocations */ - size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); + /* allow fallback to order-0 allocations */ + size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); - data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); + data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); - data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); + data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); - skb = sock_alloc_send_pskb(sk, size - data_len, data_len, - msg->msg_flags & MSG_DONTWAIT, &err, - get_order(UNIX_SKB_FRAGS_SZ)); + skb = sock_alloc_send_pskb(sk, size - data_len, data_len, + msg->msg_flags & MSG_DONTWAIT, &err, + get_order(UNIX_SKB_FRAGS_SZ)); + } if (!skb) goto out_err; @@ -2224,13 +2235,24 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, } fds_sent = true; - skb_put(skb, size - data_len); - skb->data_len = data_len; - skb->len = size; - err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); - if (err) { - kfree_skb(skb); - goto out_err; + if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { + err = skb_splice_from_iter(skb, &msg->msg_iter, size, + sk->sk_allocation); + if (err < 0) { + kfree_skb(skb); + goto out_err; + } + size = err; + refcount_add(size, &sk->sk_wmem_alloc); + } else { + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); + if (err) { + kfree_skb(skb); + goto out_err; + } } unix_state_lock(other); @@ -2275,117 +2297,15 @@ out_err: static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, int offset, size_t size, int flags) { - int err; - bool send_sigpipe = false; - bool init_scm = true; - struct scm_cookie scm; - struct sock *other, *sk = socket->sk; - struct sk_buff *skb, *newskb = NULL, *tail = NULL; + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES }; - if (flags & MSG_OOB) - return -EOPNOTSUPP; + if (flags & MSG_SENDPAGE_NOTLAST) + msg.msg_flags |= MSG_MORE; - other = unix_peer(sk); - if (!other || sk->sk_state != TCP_ESTABLISHED) - return -ENOTCONN; - - if (false) { -alloc_skb: - unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->iolock); - newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, - &err, 0); - if (!newskb) - goto err; - } - - /* we must acquire iolock as we modify already present - * skbs in the sk_receive_queue and mess with skb->len - */ - err = mutex_lock_interruptible(&unix_sk(other)->iolock); - if (err) { - err = flags & MSG_DONTWAIT ? 
-EAGAIN : -ERESTARTSYS; - goto err; - } - - if (sk->sk_shutdown & SEND_SHUTDOWN) { - err = -EPIPE; - send_sigpipe = true; - goto err_unlock; - } - - unix_state_lock(other); - - if (sock_flag(other, SOCK_DEAD) || - other->sk_shutdown & RCV_SHUTDOWN) { - err = -EPIPE; - send_sigpipe = true; - goto err_state_unlock; - } - - if (init_scm) { - err = maybe_init_creds(&scm, socket, other); - if (err) - goto err_state_unlock; - init_scm = false; - } - - skb = skb_peek_tail(&other->sk_receive_queue); - if (tail && tail == skb) { - skb = newskb; - } else if (!skb || !unix_skb_scm_eq(skb, &scm)) { - if (newskb) { - skb = newskb; - } else { - tail = skb; - goto alloc_skb; - } - } else if (newskb) { - /* this is fast path, we don't necessarily need to - * call to kfree_skb even though with newskb == NULL - * this - does no harm - */ - consume_skb(newskb); - newskb = NULL; - } - - if (skb_append_pagefrags(skb, page, offset, size)) { - tail = skb; - goto alloc_skb; - } - - skb->len += size; - skb->data_len += size; - skb->truesize += size; - refcount_add(size, &sk->sk_wmem_alloc); - - if (newskb) { - err = unix_scm_to_skb(&scm, skb, false); - if (err) - goto err_state_unlock; - spin_lock(&other->sk_receive_queue.lock); - __skb_queue_tail(&other->sk_receive_queue, newskb); - spin_unlock(&other->sk_receive_queue.lock); - } - - unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->iolock); - - other->sk_data_ready(other); - scm_destroy(&scm); - return size; - -err_state_unlock: - unix_state_unlock(other); -err_unlock: - mutex_unlock(&unix_sk(other)->iolock); -err: - kfree_skb(newskb); - if (send_sigpipe && !(flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - if (!init_scm) - scm_destroy(&scm); - return err; + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + return unix_stream_sendmsg(socket, &msg, size); } static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, @@ -2821,7 +2741,8 @@ unlock: /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; - } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { + } else if (test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); diff --git a/net/wireless/core.c b/net/wireless/core.c index b3ec9eaec36b..fc449bea39a1 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -129,6 +129,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, int result; ASSERT_RTNL(); + lockdep_assert_wiphy(&rdev->wiphy); /* Ignore nop renames */ if (strcmp(newname, wiphy_name(&rdev->wiphy)) == 0) @@ -195,6 +196,8 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, continue; nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); } + + wiphy_lock(&rdev->wiphy); nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); wiphy_net_set(&rdev->wiphy, net); @@ -203,6 +206,8 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, WARN_ON(err); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); + wiphy_unlock(&rdev->wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; @@ -360,7 +365,8 @@ static void cfg80211_destroy_iface_wk(struct work_struct *work) rtnl_unlock(); } -static void cfg80211_sched_scan_stop_wk(struct work_struct *work) +static void cfg80211_sched_scan_stop_wk(struct wiphy *wiphy, + struct wiphy_work *work) { struct 
cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *req, *tmp; @@ -368,12 +374,10 @@ static void cfg80211_sched_scan_stop_wk(struct work_struct *work) rdev = container_of(work, struct cfg80211_registered_device, sched_scan_stop_wk); - wiphy_lock(&rdev->wiphy); list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->nl_owner_dead) cfg80211_stop_sched_scan_req(rdev, req, false); } - wiphy_unlock(&rdev->wiphy); } static void cfg80211_propagate_radar_detect_wk(struct work_struct *work) @@ -408,6 +412,34 @@ static void cfg80211_propagate_cac_done_wk(struct work_struct *work) rtnl_unlock(); } +static void cfg80211_wiphy_work(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev; + struct wiphy_work *wk; + + rdev = container_of(work, struct cfg80211_registered_device, wiphy_work); + + wiphy_lock(&rdev->wiphy); + if (rdev->suspended) + goto out; + + spin_lock_irq(&rdev->wiphy_work_lock); + wk = list_first_entry_or_null(&rdev->wiphy_work_list, + struct wiphy_work, entry); + if (wk) { + list_del_init(&wk->entry); + if (!list_empty(&rdev->wiphy_work_list)) + schedule_work(work); + spin_unlock_irq(&rdev->wiphy_work_lock); + + wk->func(&rdev->wiphy, wk); + } else { + spin_unlock_irq(&rdev->wiphy_work_lock); + } +out: + wiphy_unlock(&rdev->wiphy); +} + /* exported functions */ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, @@ -495,7 +527,7 @@ use_default_name: spin_lock_init(&rdev->bss_lock); INIT_LIST_HEAD(&rdev->bss_list); INIT_LIST_HEAD(&rdev->sched_scan_req_list); - INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); + wiphy_work_init(&rdev->scan_done_wk, __cfg80211_scan_done); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT @@ -508,7 +540,7 @@ use_default_name: device_enable_async_suspend(&rdev->wiphy.dev); INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk); - INIT_WORK(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk); + wiphy_work_init(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk); INIT_WORK(&rdev->sched_scan_res_wk, cfg80211_sched_scan_results_wk); INIT_WORK(&rdev->propagate_radar_detect_wk, cfg80211_propagate_radar_detect_wk); @@ -533,6 +565,9 @@ use_default_name: return NULL; } + INIT_WORK(&rdev->wiphy_work, cfg80211_wiphy_work); + INIT_LIST_HEAD(&rdev->wiphy_work_list); + spin_lock_init(&rdev->wiphy_work_lock); INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work); INIT_WORK(&rdev->conn_work, cfg80211_conn_work); INIT_WORK(&rdev->event_work, cfg80211_event_work); @@ -941,8 +976,10 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; rtnl_lock(); + wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); if (res) { + wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return res; } @@ -956,6 +993,7 @@ int wiphy_register(struct wiphy *wiphy) cfg80211_debugfs_rdev_add(rdev); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); + wiphy_unlock(&rdev->wiphy); /* set up regulatory info */ wiphy_regulatory_register(wiphy); @@ -1027,6 +1065,31 @@ void wiphy_rfkill_start_polling(struct wiphy *wiphy) } EXPORT_SYMBOL(wiphy_rfkill_start_polling); +void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev) +{ + unsigned int runaway_limit = 100; + unsigned long flags; + + lockdep_assert_held(&rdev->wiphy.mtx); + + spin_lock_irqsave(&rdev->wiphy_work_lock, flags); + while (!list_empty(&rdev->wiphy_work_list)) { + struct wiphy_work *wk; + + wk = 
list_first_entry(&rdev->wiphy_work_list, + struct wiphy_work, entry); + list_del_init(&wk->entry); + spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); + + wk->func(&rdev->wiphy, wk); + + spin_lock_irqsave(&rdev->wiphy_work_lock, flags); + if (WARN_ON(--runaway_limit == 0)) + INIT_LIST_HEAD(&rdev->wiphy_work_list); + } + spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); +} + void wiphy_unregister(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -1065,25 +1128,29 @@ void wiphy_unregister(struct wiphy *wiphy) cfg80211_rdev_list_generation++; device_del(&rdev->wiphy.dev); +#ifdef CONFIG_PM + if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) + rdev_set_wakeup(rdev, false); +#endif + + /* surely nothing is reachable now, clean up work */ + cfg80211_process_wiphy_works(rdev); wiphy_unlock(&rdev->wiphy); rtnl_unlock(); - flush_work(&rdev->scan_done_wk); + /* this has nothing to do now but make sure it's gone */ + cancel_work_sync(&rdev->wiphy_work); + cancel_work_sync(&rdev->conn_work); flush_work(&rdev->event_work); cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); cancel_delayed_work_sync(&rdev->background_cac_done_wk); flush_work(&rdev->destroy_work); - flush_work(&rdev->sched_scan_stop_wk); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); flush_work(&rdev->mgmt_registrations_update_wk); flush_work(&rdev->background_cac_abort_wk); -#ifdef CONFIG_PM - if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) - rdev_set_wakeup(rdev, false); -#endif cfg80211_rdev_free_wowlan(rdev); cfg80211_rdev_free_coalesce(rdev); } @@ -1145,8 +1212,6 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, ASSERT_RTNL(); lockdep_assert_held(&rdev->wiphy.mtx); - flush_work(&wdev->pmsr_free_wk); - nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); wdev->registered = false; @@ -1178,10 +1243,6 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, kfree_sensitive(wdev->wext.keys); wdev->wext.keys = NULL; #endif - /* only initialized if we have a netdev */ - if (wdev->netdev) - flush_work(&wdev->disconnect_wk); - cfg80211_cqm_config_free(wdev); /* @@ -1455,6 +1516,9 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, cfg80211_leave(rdev, wdev); cfg80211_remove_links(wdev); wiphy_unlock(&rdev->wiphy); + /* since we just did cfg80211_leave() nothing to do there */ + cancel_work_sync(&wdev->disconnect_wk); + cancel_work_sync(&wdev->pmsr_free_wk); break; case NETDEV_DOWN: wiphy_lock(&rdev->wiphy); @@ -1564,6 +1628,66 @@ static struct pernet_operations cfg80211_pernet_ops = { .exit = cfg80211_pernet_exit, }; +void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + unsigned long flags; + + spin_lock_irqsave(&rdev->wiphy_work_lock, flags); + if (list_empty(&work->entry)) + list_add_tail(&work->entry, &rdev->wiphy_work_list); + spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); + + schedule_work(&rdev->wiphy_work); +} +EXPORT_SYMBOL_GPL(wiphy_work_queue); + +void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + unsigned long flags; + + lockdep_assert_held(&wiphy->mtx); + + spin_lock_irqsave(&rdev->wiphy_work_lock, flags); + if (!list_empty(&work->entry)) + list_del_init(&work->entry); + spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); +} +EXPORT_SYMBOL_GPL(wiphy_work_cancel); + +void 
wiphy_delayed_work_timer(struct timer_list *t) +{ + struct wiphy_delayed_work *dwork = from_timer(dwork, t, timer); + + wiphy_work_queue(dwork->wiphy, &dwork->work); +} +EXPORT_SYMBOL(wiphy_delayed_work_timer); + +void wiphy_delayed_work_queue(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork, + unsigned long delay) +{ + if (!delay) { + wiphy_work_queue(wiphy, &dwork->work); + return; + } + + dwork->wiphy = wiphy; + mod_timer(&dwork->timer, jiffies + delay); +} +EXPORT_SYMBOL_GPL(wiphy_delayed_work_queue); + +void wiphy_delayed_work_cancel(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork) +{ + lockdep_assert_held(&wiphy->mtx); + + del_timer_sync(&dwork->timer); + wiphy_work_cancel(wiphy, &dwork->work); +} +EXPORT_SYMBOL_GPL(wiphy_delayed_work_cancel); + static int __init cfg80211_init(void) { int err; diff --git a/net/wireless/core.h b/net/wireless/core.h index 7c61752f6d83..291c6d83d56f 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -75,7 +75,7 @@ struct cfg80211_registered_device { struct sk_buff *scan_msg; struct list_head sched_scan_req_list; time64_t suspend_at; - struct work_struct scan_done_wk; + struct wiphy_work scan_done_wk; struct genl_info *cur_cmd_info; @@ -95,7 +95,7 @@ struct cfg80211_registered_device { struct cfg80211_coalesce *coalesce; struct work_struct destroy_work; - struct work_struct sched_scan_stop_wk; + struct wiphy_work sched_scan_stop_wk; struct work_struct sched_scan_res_wk; struct cfg80211_chan_def radar_chandef; @@ -108,6 +108,12 @@ struct cfg80211_registered_device { /* lock for all wdev lists */ spinlock_t mgmt_registrations_lock; + struct work_struct wiphy_work; + struct list_head wiphy_work_list; + /* protects the list above */ + spinlock_t wiphy_work_lock; + bool suspended; + /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wiphy wiphy __aligned(NETDEV_ALIGN); @@ -435,7 +441,7 @@ bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr); -void __cfg80211_scan_done(struct work_struct *wk); +void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk); void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message); void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, @@ -453,6 +459,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, struct vif_params *params); void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); +void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev); void cfg80211_process_wdev_events(struct wireless_dev *wdev); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 087d60c0f6e4..772671b9bc42 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3081,6 +3081,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) if (state->filter_wiphy != -1 && state->filter_wiphy != rdev->wiphy_idx) continue; + wiphy_lock(&rdev->wiphy); /* attempt to fit multiple wiphy data chunks into the skb */ do { ret = nl80211_send_wiphy(rdev, NL80211_CMD_NEW_WIPHY, @@ -3107,6 +3108,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) cb->min_dump_alloc < 4096) { cb->min_dump_alloc = 
4096; state->split_start = 0; + wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return 1; } @@ -3114,6 +3116,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) break; } } while (state->split_start > 0); + wiphy_unlock(&rdev->wiphy); break; } rtnl_unlock(); @@ -19774,7 +19777,8 @@ static int nl80211_netlink_notify(struct notifier_block * nb, list) { if (sched_scan_req->owner_nlportid == notify->portid) { sched_scan_req->nl_owner_dead = true; - schedule_work(&rdev->sched_scan_stop_wk); + wiphy_work_queue(&rdev->wiphy, + &rdev->sched_scan_stop_wk); } } diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 2bc647720cda..77000a264855 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2021, 2023 Intel Corporation */ #include <net/cfg80211.h> #include "core.h" @@ -623,9 +623,11 @@ void cfg80211_pmsr_free_wk(struct work_struct *work) struct wireless_dev *wdev = container_of(work, struct wireless_dev, pmsr_free_wk); + wiphy_lock(wdev->wiphy); wdev_lock(wdev); cfg80211_pmsr_process_abort(wdev); wdev_unlock(wdev); + wiphy_unlock(wdev->wiphy); } void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index c501db7bbdb3..ce2104dc05c6 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1004,16 +1004,9 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, nl80211_send_scan_msg(rdev, msg); } -void __cfg80211_scan_done(struct work_struct *wk) +void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk) { - struct cfg80211_registered_device *rdev; - - rdev = container_of(wk, struct cfg80211_registered_device, - scan_done_wk); - - wiphy_lock(&rdev->wiphy); - ___cfg80211_scan_done(rdev, true); - wiphy_unlock(&rdev->wiphy); + ___cfg80211_scan_done(wiphy_to_rdev(wiphy), true); } void cfg80211_scan_done(struct cfg80211_scan_request *request, @@ -1039,7 +1032,8 @@ void cfg80211_scan_done(struct cfg80211_scan_request *request, } request->notified = true; - queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk); + wiphy_work_queue(request->wiphy, + &wiphy_to_rdev(request->wiphy)->scan_done_wk); } EXPORT_SYMBOL(cfg80211_scan_done); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 7bdeb8eea92d..247369004aaa 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2009, 2020, 2022 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020, 2022-2023 Intel Corporation. All rights reserved. 
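[Editor's sketch] The cfg80211 hunks above convert ad-hoc work_structs to the new wiphy_work machinery, whose handlers are invoked by cfg80211_wiphy_work() with the wiphy mutex already held — which is why __cfg80211_scan_done() loses its explicit wiphy_lock()/wiphy_unlock() pair. A hypothetical consumer-side sketch; my_state, scan_done_wk and finish_scan are invented for illustration:

#include <net/cfg80211.h>

struct my_state {
	struct wiphy_work scan_done_wk;		/* hypothetical field */
};

static void finish_scan(struct my_state *st);	/* hypothetical helper */

static void my_scan_done(struct wiphy *wiphy, struct wiphy_work *work)
{
	struct my_state *st = container_of(work, struct my_state,
					   scan_done_wk);

	/* wiphy->mtx is already held here; no wiphy_lock() needed. */
	finish_scan(st);
}

/* setup:    wiphy_work_init(&st->scan_done_wk, my_scan_done);
 * queueing: wiphy_work_queue(wiphy, &st->scan_done_wk);
 * teardown: wiphy_work_cancel(wiphy, &st->scan_done_wk);  (mtx held)
 */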
* Copyright 2017 Intel Deutschland GmbH */ @@ -1569,6 +1569,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) container_of(work, struct wireless_dev, disconnect_wk); struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + wiphy_lock(wdev->wiphy); wdev_lock(wdev); if (wdev->conn_owner_nlportid) { @@ -1607,4 +1608,5 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) } wdev_unlock(wdev); + wiphy_unlock(wdev->wiphy); } diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 268f670835e9..c629bac3f298 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -5,7 +5,7 @@ * * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz> * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2021, 2023 Intel Corporation */ #include <linux/device.h> @@ -105,14 +105,18 @@ static int wiphy_suspend(struct device *dev) cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); } + cfg80211_process_wiphy_works(rdev); if (rdev->ops->suspend) ret = rdev_suspend(rdev, rdev->wiphy.wowlan_config); if (ret == 1) { /* Driver refuse to configure wowlan */ cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); + cfg80211_process_wiphy_works(rdev); ret = rdev_suspend(rdev, NULL); } + if (ret == 0) + rdev->suspended = true; } wiphy_unlock(&rdev->wiphy); rtnl_unlock(); @@ -132,6 +136,8 @@ static int wiphy_resume(struct device *dev) wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); + rdev->suspended = false; + schedule_work(&rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index f231207ca210..f3eaa3388694 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -3,7 +3,7 @@ * cfg80211 wext compat for managed mode. 
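[Editor's sketch] The sysfs suspend/resume hunks above add a handshake around the wiphy work list: suspend drains it synchronously and then sets rdev->suspended so that cfg80211_wiphy_work() bails out early, while resume clears the flag and reschedules the worker to pick up anything queued in between. A condensed paraphrase of those two paths — my_suspend/my_resume are illustrative wrappers, and struct cfg80211_registered_device is internal to net/wireless:

static int my_suspend(struct cfg80211_registered_device *rdev)
{
	wiphy_lock(&rdev->wiphy);
	cfg80211_process_wiphy_works(rdev);	/* run out pending items */
	rdev->suspended = true;			/* gate the async worker */
	wiphy_unlock(&rdev->wiphy);
	return 0;
}

static void my_resume(struct cfg80211_registered_device *rdev)
{
	wiphy_lock(&rdev->wiphy);
	rdev->suspended = false;
	schedule_work(&rdev->wiphy_work);	/* drain whatever queued */
	wiphy_unlock(&rdev->wiphy);
}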
* * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2009, 2020-2022 Intel Corporation + * Copyright (C) 2009, 2020-2023 Intel Corporation */ #include <linux/export.h> @@ -338,6 +338,7 @@ int cfg80211_wext_siwgenie(struct net_device *dev, if (!ie_len) ie = NULL; + wiphy_lock(wdev->wiphy); wdev_lock(wdev); /* no change */ @@ -370,6 +371,7 @@ int cfg80211_wext_siwgenie(struct net_device *dev, err = 0; out: wdev_unlock(wdev); + wiphy_unlock(wdev->wiphy); return err; } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index b2df1e0f8153..26f6d304451e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -350,7 +350,7 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { struct xsk_dma_map *dma_map; - if (pool->dma_pages_cnt == 0) + if (!pool->dma_pages) return; dma_map = xp_find_dma_map(pool); @@ -364,6 +364,7 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) __xp_dma_unmap(dma_map, attrs); kvfree(pool->dma_pages); + pool->dma_pages = NULL; pool->dma_pages_cnt = 0; pool->dev = NULL; } @@ -503,7 +504,7 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) if (pool->unaligned) { xskb = pool->free_heads[--pool->free_heads_cnt]; xp_init_xskb_addr(xskb, pool, addr); - if (pool->dma_pages_cnt) + if (pool->dma_pages) xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); } else { xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; @@ -569,7 +570,7 @@ static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd if (pool->unaligned) { xskb = pool->free_heads[--pool->free_heads_cnt]; xp_init_xskb_addr(xskb, pool, addr); - if (pool->dma_pages_cnt) + if (pool->dma_pages) xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); } else { xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 872b80188e83..3504925babdb 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -205,14 +205,16 @@ static int espintcp_sendskb_locked(struct sock *sk, struct espintcp_msg *emsg, static int espintcp_sendskmsg_locked(struct sock *sk, struct espintcp_msg *emsg, int flags) { + struct msghdr msghdr = { .msg_flags = flags | MSG_SPLICE_PAGES, }; struct sk_msg *skmsg = &emsg->skmsg; struct scatterlist *sg; int done = 0; int ret; - flags |= MSG_SENDPAGE_NOTLAST; + msghdr.msg_flags |= MSG_SENDPAGE_NOTLAST; sg = &skmsg->sg.data[skmsg->sg.start]; do { + struct bio_vec bvec; size_t size = sg->length - emsg->offset; int offset = sg->offset + emsg->offset; struct page *p; @@ -220,11 +222,13 @@ static int espintcp_sendskmsg_locked(struct sock *sk, emsg->offset = 0; if (sg_is_last(sg)) - flags &= ~MSG_SENDPAGE_NOTLAST; + msghdr.msg_flags &= ~MSG_SENDPAGE_NOTLAST; p = sg_page(sg); retry: - ret = do_tcp_sendpages(sk, p, offset, size, flags); + bvec_set_page(&bvec, p, size, offset); + iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size); + ret = tcp_sendmsg_locked(sk, &msghdr, size); if (ret < 0) { emsg->offset = offset - sg->offset; skmsg->sg.start += done; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 408f5e55744e..533697e2488f 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> +#include <net/gso.h> #include <net/xfrm.h> #include <linux/notifier.h> diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 35279c220bd7..a3319965470a 100644 --- a/net/xfrm/xfrm_interface_core.c +++ 
b/net/xfrm/xfrm_interface_core.c @@ -33,6 +33,7 @@ #include <linux/uaccess.h> #include <linux/atomic.h> +#include <net/gso.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ipv6.h> diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 80143360bf09..9c0fa0e1786a 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -74,14 +74,11 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) if (!page) return -ENOMEM; - __skb_frag_set_page(frag, page); - len = PAGE_SIZE; if (dlen < len) len = dlen; - skb_frag_off_set(frag, 0); - skb_frag_size_set(frag, len); + skb_frag_fill_page_desc(frag, page, 0, len); memcpy(skb_frag_address(frag), scratch, len); skb->truesize += len; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 369e5de8558f..662c83beb345 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> +#include <net/gso.h> #include <net/icmp.h> #include <net/inet_ecn.h> #include <net/xfrm.h> |
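[Editor's sketch] Finally, the xfrm_ipcomp hunk above (like the tls_device hunks earlier) replaces the three-call skb fragment setup with the new skb_frag_fill_page_desc() helper, which fills page, offset and size in one shot. Side by side — fill_frag is an illustrative wrapper:

#include <linux/skbuff.h>

static void fill_frag(skb_frag_t *frag, struct page *page,
		      unsigned int off, unsigned int len)
{
	/* Previously three calls, now removed from the tree:
	 *   __skb_frag_set_page(frag, page);
	 *   skb_frag_off_set(frag, off);
	 *   skb_frag_size_set(frag, len);
	 */
	skb_frag_fill_page_desc(frag, page, off, len);
}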