diff options
Diffstat (limited to 'net')
251 files changed, 6152 insertions, 1952 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index fb3d3262dc1a..4cdf8416869d 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -638,7 +638,8 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) case GET_VLAN_REALDEV_NAME_CMD: err = 0; - vlan_dev_get_realdev_name(dev, args.u.device2); + vlan_dev_get_realdev_name(dev, args.u.device2, + sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index fa3ad3d4d58c..1a705a4ef7fa 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -108,7 +108,8 @@ static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev) netdev_features_t ret; ret = real_dev->hw_enc_features & - (NETIF_F_CSUM_MASK | NETIF_F_ALL_TSO | NETIF_F_GSO_ENCAP_ALL); + (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE | + NETIF_F_GSO_ENCAP_ALL); if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM; @@ -129,7 +130,8 @@ void vlan_dev_set_ingress_priority(const struct net_device *dev, int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); -void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); +void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, + size_t size); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4db3f0621959..a0367b37512d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -239,9 +239,9 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) return 0; } -void vlan_dev_get_realdev_name(const struct net_device *dev, char *result) +void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size) { - strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23); + strscpy_pad(result, vlan_dev_priv(dev)->real_dev->name, size); } bool vlan_dev_inherit_address(struct net_device *dev, @@ -360,7 +360,7 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct ifreq ifrr; int err = -EOPNOTSUPP; - strncpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ); + strscpy_pad(ifrr.ifr_name, real_dev->name, IFNAMSIZ); ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 93f2f8654882..2bbd7dce0f1d 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -99,7 +99,7 @@ static unsigned int rest_of_page(void *data) * @client: client instance * * This reclaims a channel by freeing its resources and - * reseting its inuse flag. + * resetting its inuse flag. * */ @@ -463,7 +463,7 @@ req_retry_pinned: * For example TREAD have 11. * 11 is the read/write header = PDU Header(7) + IO Size (4). * Arrange in such a way that server places header in the - * alloced memory and payload onto the user buffer. + * allocated memory and payload onto the user buffer. */ in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len); @@ -760,7 +760,7 @@ static struct p9_trans_module p9_virtio_trans = { .cancelled = p9_virtio_cancelled, /* * We leave one entry for input and one entry for response - * headers. We also skip one more entry to accomodate, address + * headers. We also skip one more entry to accommodate, address * that are not at page boundary, that can result in an extra * page in zero copy. */ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index ebda397fa95a..8ade5a4ceaf5 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -707,7 +707,7 @@ static int atif_ioctl(int cmd, void __user *arg) /* * Phase 1 is fine on LocalTalk but we don't do - * EtherTalk phase 1. Anyone wanting to add it go ahead. + * EtherTalk phase 1. Anyone wanting to add it, go ahead. */ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) return -EPROTONOSUPPORT; @@ -828,7 +828,7 @@ static int atif_ioctl(int cmd, void __user *arg) nr = (struct atalk_netrange *)&(atif->nets); /* * Phase 1 is fine on Localtalk but we don't do - * Ethertalk phase 1. Anyone wanting to add it go ahead. + * Ethertalk phase 1. Anyone wanting to add it, go ahead. */ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) return -EPROTONOSUPPORT; @@ -2018,7 +2018,7 @@ module_init(atalk_init); * by the network device layer. * * Ergo, before the AppleTalk module can be removed, all AppleTalk - * sockets be closed from user space. + * sockets should be closed from user space. */ static void __exit atalk_exit(void) { diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index aa1b57161f3b..0fdbdfd19474 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -11,7 +11,7 @@ #define to_atm_dev(cldev) container_of(cldev, struct atm_dev, class_dev) -static ssize_t show_type(struct device *cdev, +static ssize_t type_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -19,7 +19,7 @@ static ssize_t show_type(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%s\n", adev->type); } -static ssize_t show_address(struct device *cdev, +static ssize_t address_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -27,7 +27,7 @@ static ssize_t show_address(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi); } -static ssize_t show_atmaddress(struct device *cdev, +static ssize_t atmaddress_show(struct device *cdev, struct device_attribute *attr, char *buf) { unsigned long flags; @@ -50,7 +50,7 @@ static ssize_t show_atmaddress(struct device *cdev, return count; } -static ssize_t show_atmindex(struct device *cdev, +static ssize_t atmindex_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -58,7 +58,7 @@ static ssize_t show_atmindex(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%d\n", adev->number); } -static ssize_t show_carrier(struct device *cdev, +static ssize_t carrier_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -67,7 +67,7 @@ static ssize_t show_carrier(struct device *cdev, adev->signal == ATM_PHY_SIG_LOST ? 0 : 1); } -static ssize_t show_link_rate(struct device *cdev, +static ssize_t link_rate_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -90,12 +90,12 @@ static ssize_t show_link_rate(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate); } -static DEVICE_ATTR(address, 0444, show_address, NULL); -static DEVICE_ATTR(atmaddress, 0444, show_atmaddress, NULL); -static DEVICE_ATTR(atmindex, 0444, show_atmindex, NULL); -static DEVICE_ATTR(carrier, 0444, show_carrier, NULL); -static DEVICE_ATTR(type, 0444, show_type, NULL); -static DEVICE_ATTR(link_rate, 0444, show_link_rate, NULL); +static DEVICE_ATTR_RO(address); +static DEVICE_ATTR_RO(atmaddress); +static DEVICE_ATTR_RO(atmindex); +static DEVICE_ATTR_RO(carrier); +static DEVICE_ATTR_RO(type); +static DEVICE_ATTR_RO(link_rate); static struct device_attribute *atm_attrs[] = { &dev_attr_atmaddress, diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 3e17a5ecaa94..dd2a8dabed84 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -93,8 +93,8 @@ struct br2684_dev { * This lock should be held for writing any time the list of devices or * their attached vcc's could be altered. It should be held for reading * any time these are being queried. Note that we sometimes need to - * do read-locking under interrupt context, so write locking must block - * the current CPU's interrupts + * do read-locking under interrupting context, so write locking must block + * the current CPU's interrupts. */ static DEFINE_RWLOCK(devs_lock); diff --git a/net/atm/resources.c b/net/atm/resources.c index 53236986dfe0..2b2d33eeaf20 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -52,10 +52,8 @@ static struct atm_dev *__alloc_atm_dev(const char *type) static struct atm_dev *__atm_dev_lookup(int number) { struct atm_dev *dev; - struct list_head *p; - list_for_each(p, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); + list_for_each_entry(dev, &atm_devs, dev_list) { if (dev->number == number) { atm_dev_hold(dev); return dev; @@ -215,8 +213,7 @@ int atm_getnames(void __user *buf, int __user *iobuf_len) return -ENOMEM; } tmp_p = tmp_buf; - list_for_each(p, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); + list_for_each_entry(dev, &atm_devs, dev_list) { *tmp_p++ = dev->number; } mutex_unlock(&atm_dev_mutex); diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index fc8be49010b9..12022378f892 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1851,6 +1851,8 @@ batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || @@ -2080,6 +2082,8 @@ batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, @@ -2461,6 +2465,8 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, router->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + router->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index e1ca2b8c3152..b98aea958e3d 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -146,6 +146,8 @@ batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, @@ -298,6 +300,8 @@ batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || @@ -739,6 +743,12 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, goto out; } + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + router->if_incoming->net_dev->ifindex)) { + genlmsg_cancel(msg, hdr); + goto out; + } + if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down)) { genlmsg_cancel(msg, hdr); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 7dc133cfc363..63d42dcc9324 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -395,7 +395,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, break; case BATADV_CLAIM_TYPE_ANNOUNCE: /* announcement frame - * set HW SRC to the special mac containg the crc + * set HW SRC to the special mac containing the crc */ ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, @@ -1040,7 +1040,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, /* lets see if this originator is in our mesh */ orig_node = batadv_orig_hash_find(bat_priv, backbone_addr); - /* dont accept claims from gateways which are not in + /* don't accept claims from gateways which are not in * the same mesh or group. */ if (!orig_node) diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 5c22955bb9d5..8673a265995f 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -52,7 +52,6 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); -int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); #ifdef CONFIG_BATMAN_ADV_DAT bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 4a6a25d551a8..55d97e18aa4a 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -9,7 +9,6 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> -#include <linux/errno.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_arp.h> @@ -403,7 +402,7 @@ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, goto out; } - /* >1 neighbors -> (re)brodcast */ + /* >1 neighbors -> (re)broadcast */ if (rcu_dereference(hlist_next_rcu(first))) goto out; @@ -678,43 +677,16 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) } /** - * batadv_master_del_slave() - remove hard_iface from the current master iface - * @slave: the interface enslaved in another master - * @master: the master from which slave has to be removed - * - * Invoke ndo_del_slave on master passing slave as argument. In this way the - * slave is free'd and the master can correctly change its internal state. - * - * Return: 0 on success, a negative value representing the error otherwise - */ -static int batadv_master_del_slave(struct batadv_hard_iface *slave, - struct net_device *master) -{ - int ret; - - if (!master) - return 0; - - ret = -EBUSY; - if (master->netdev_ops->ndo_del_slave) - ret = master->netdev_ops->ndo_del_slave(master, slave->net_dev); - - return ret; -} - -/** * batadv_hardif_enable_interface() - Enslave hard interface to soft interface * @hard_iface: hard interface to add to soft interface - * @net: the applicable net namespace - * @iface_name: name of the soft interface + * @soft_iface: netdev struct of the mesh interface * * Return: 0 on success or negative error number in case of failure */ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - struct net *net, const char *iface_name) + struct net_device *soft_iface) { struct batadv_priv *bat_priv; - struct net_device *soft_iface, *master; __be16 ethertype = htons(ETH_P_BATMAN); int max_header_len = batadv_max_header_len(); int ret; @@ -724,35 +696,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, kref_get(&hard_iface->refcount); - soft_iface = dev_get_by_name(net, iface_name); - - if (!soft_iface) { - soft_iface = batadv_softif_create(net, iface_name); - - if (!soft_iface) { - ret = -ENOMEM; - goto err; - } - - /* dev_get_by_name() increases the reference counter for us */ - dev_hold(soft_iface); - } - - if (!batadv_softif_is_valid(soft_iface)) { - pr_err("Can't create batman mesh interface %s: already exists as regular interface\n", - soft_iface->name); - ret = -EINVAL; - goto err_dev; - } - - /* check if the interface is enslaved in another virtual one and - * in that case unlink it first - */ - master = netdev_master_upper_dev_get(hard_iface->net_dev); - ret = batadv_master_del_slave(hard_iface, master); - if (ret) - goto err_dev; - + dev_hold(soft_iface); hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); @@ -810,7 +754,6 @@ err_upper: err_dev: hard_iface->soft_iface = NULL; dev_put(soft_iface); -err: batadv_hardif_put(hard_iface); return ret; } diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 83d11b46a9d8..8cb2a1f10080 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -16,7 +16,6 @@ #include <linux/rcupdate.h> #include <linux/stddef.h> #include <linux/types.h> -#include <net/net_namespace.h> /** * enum batadv_hard_if_state - State of a hard interface @@ -75,7 +74,7 @@ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface); struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - struct net *net, const char *iface_name); + struct net_device *soft_iface); void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 46696759f194..fb251c385a1b 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -18,7 +18,7 @@ #include <linux/stddef.h> #include <linux/types.h> -/* callback to a compare function. should compare 2 element datas for their +/* callback to a compare function. should compare 2 element data for their * keys * * Return: true if same and false if not same diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 8f0102b71656..014235fd4681 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2021.1" +#define BATADV_SOURCE_VERSION "2021.2" #endif /* B.A.T.M.A.N. parameters */ @@ -88,7 +88,6 @@ /* number of packets to send for broadcasts on different interface types */ #define BATADV_NUM_BCASTS_DEFAULT 1 #define BATADV_NUM_BCASTS_WIRELESS 3 -#define BATADV_NUM_BCASTS_MAX 3 /* length of the single packet used by the TP meter */ #define BATADV_TP_PACKET_LEN ETH_DATA_LEN diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 1d63c8cbbfe7..923e2197c2db 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -193,53 +193,22 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ -#if IS_ENABLED(CONFIG_IPV6) static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, struct net_device *bridge) { - struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); struct net_device *dev = bat_priv->soft_iface; - struct br_ip_list *br_ip_entry, *tmp; - u8 flags = BATADV_MCAST_WANT_NO_RTR6; - int ret; + u8 flags = BATADV_NO_FLAGS; if (!bridge) return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; - /* TODO: ask the bridge if a multicast router is present (the bridge - * is capable of performing proper RFC4286 multicast router - * discovery) instead of searching for a ff02::2 listener here - */ - ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); - if (ret < 0) - return BATADV_NO_FLAGS; - - list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { - /* the bridge snooping does not maintain IPv4 link-local - * addresses - therefore we won't find any IPv4 multicast router - * address here, only IPv6 ones - */ - if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) && - ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.dst.ip6)) - flags &= ~BATADV_MCAST_WANT_NO_RTR6; - - list_del(&br_ip_entry->list); - kfree(br_ip_entry); - } + if (!br_multicast_has_router_adjacent(dev, ETH_P_IP)) + flags |= BATADV_MCAST_WANT_NO_RTR4; + if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6)) + flags |= BATADV_MCAST_WANT_NO_RTR6; return flags; } -#else -static inline u8 -batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, - struct net_device *bridge) -{ - if (bridge) - return BATADV_NO_FLAGS; - else - return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; -} -#endif /** * batadv_mcast_mla_rtr_flags_get() - get multicast router flags diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index f317d206b411..b6cc746e01a6 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -814,6 +814,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, bat_priv->soft_iface->ifindex)) goto nla_put_failure; + if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, + bat_priv->soft_iface->name)) + goto nla_put_failure; + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, @@ -1045,6 +1049,10 @@ static int batadv_netlink_vlan_fill(struct sk_buff *msg, bat_priv->soft_iface->ifindex)) goto nla_put_failure; + if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, + bat_priv->soft_iface->name)) + goto nla_put_failure; + if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK)) goto nla_put_failure; diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 40f5cffde6a3..bb9e93e3d98c 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1182,9 +1182,9 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, struct batadv_bcast_packet *bcast_packet; struct ethhdr *ethhdr; int hdr_size = sizeof(*bcast_packet); - int ret = NET_RX_DROP; s32 seq_diff; u32 seqno; + int ret; /* drop packet if it has not necessary minimum size */ if (unlikely(!pskb_may_pull(skb, hdr_size))) @@ -1210,7 +1210,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, if (batadv_is_my_mac(bat_priv, bcast_packet->orig)) goto free_skb; - if (bcast_packet->ttl < 2) + if (bcast_packet->ttl-- < 2) goto free_skb; orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig); @@ -1249,7 +1249,9 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet)); /* rebroadcast packet */ - batadv_add_bcast_packet_to_list(bat_priv, skb, 1, false); + ret = batadv_forw_bcast_packet(bat_priv, skb, 0, false); + if (ret == NETDEV_TX_BUSY) + goto free_skb; /* don't hand the broadcast up if it is from an originator * from the same backbone. @@ -1275,6 +1277,7 @@ spin_unlock: spin_unlock_bh(&orig_node->bcast_seqno_lock); free_skb: kfree_skb(skb); + ret = NET_RX_DROP; out: if (orig_node) batadv_orig_node_put(orig_node); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 157abe92d827..0b9dd29d3b6a 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -737,57 +737,48 @@ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv, } /** - * batadv_add_bcast_packet_to_list() - queue broadcast packet for multiple sends + * batadv_forw_bcast_packet_to_list() - queue broadcast packet for transmissions * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet + * @if_in: the interface where the packet was received on + * @if_out: the outgoing interface to queue on * - * add a broadcast packet to the queue and setup timers. broadcast packets + * Adds a broadcast packet to the queue and sets up timers. Broadcast packets * are sent multiple times to increase probability for being received. * - * The skb is not consumed, so the caller should make sure that the - * skb is freed. - * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ -int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, - const struct sk_buff *skb, - unsigned long delay, - bool own_packet) +static int batadv_forw_bcast_packet_to_list(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet, + struct batadv_hard_iface *if_in, + struct batadv_hard_iface *if_out) { - struct batadv_hard_iface *primary_if; struct batadv_forw_packet *forw_packet; - struct batadv_bcast_packet *bcast_packet; + unsigned long send_time = jiffies; struct sk_buff *newskb; - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto err; - newskb = skb_copy(skb, GFP_ATOMIC); - if (!newskb) { - batadv_hardif_put(primary_if); + if (!newskb) goto err; - } - forw_packet = batadv_forw_packet_alloc(primary_if, NULL, + forw_packet = batadv_forw_packet_alloc(if_in, if_out, &bat_priv->bcast_queue_left, bat_priv, newskb); - batadv_hardif_put(primary_if); if (!forw_packet) goto err_packet_free; - /* as we have a copy now, it is safe to decrease the TTL */ - bcast_packet = (struct batadv_bcast_packet *)newskb->data; - bcast_packet->ttl--; - forw_packet->own = own_packet; INIT_DELAYED_WORK(&forw_packet->delayed_work, batadv_send_outstanding_bcast_packet); - batadv_forw_packet_bcast_queue(bat_priv, forw_packet, jiffies + delay); + send_time += delay ? delay : msecs_to_jiffies(5); + + batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return NETDEV_TX_OK; err_packet_free: @@ -797,9 +788,219 @@ err: } /** + * batadv_forw_bcast_packet_if() - forward and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * @if_in: the interface where the packet was received on + * @if_out: the outgoing interface to forward to + * + * Transmits a broadcast packet on the specified interface either immediately + * or if a delay is given after that. Furthermore, queues additional + * retransmissions if this interface is a wireless one. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. + */ +static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet, + struct batadv_hard_iface *if_in, + struct batadv_hard_iface *if_out) +{ + unsigned int num_bcasts = if_out->num_bcasts; + struct sk_buff *newskb; + int ret = NETDEV_TX_OK; + + if (!delay) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) + return NETDEV_TX_BUSY; + + batadv_send_broadcast_skb(newskb, if_out); + num_bcasts--; + } + + /* delayed broadcast or rebroadcasts? */ + if (num_bcasts >= 1) { + BATADV_SKB_CB(skb)->num_bcasts = num_bcasts; + + ret = batadv_forw_bcast_packet_to_list(bat_priv, skb, delay, + own_packet, if_in, + if_out); + } + + return ret; +} + +/** + * batadv_send_no_broadcast() - check whether (re)broadcast is necessary + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to check + * @own_packet: true if it is a self-generated broadcast packet + * @if_out: the outgoing interface checked and considered for (re)broadcast + * + * Return: False if a packet needs to be (re)broadcasted on the given interface, + * true otherwise. + */ +static bool batadv_send_no_broadcast(struct batadv_priv *bat_priv, + struct sk_buff *skb, bool own_packet, + struct batadv_hard_iface *if_out) +{ + struct batadv_hardif_neigh_node *neigh_node = NULL; + struct batadv_bcast_packet *bcast_packet; + u8 *orig_neigh; + u8 *neigh_addr; + char *type; + int ret; + + if (!own_packet) { + neigh_addr = eth_hdr(skb)->h_source; + neigh_node = batadv_hardif_neigh_get(if_out, + neigh_addr); + } + + bcast_packet = (struct batadv_bcast_packet *)skb->data; + orig_neigh = neigh_node ? neigh_node->orig : NULL; + + ret = batadv_hardif_no_broadcast(if_out, bcast_packet->orig, + orig_neigh); + + if (neigh_node) + batadv_hardif_neigh_put(neigh_node); + + /* ok, may broadcast */ + if (!ret) + return false; + + /* no broadcast */ + switch (ret) { + case BATADV_HARDIF_BCAST_NORECIPIENT: + type = "no neighbor"; + break; + case BATADV_HARDIF_BCAST_DUPFWD: + type = "single neighbor is source"; + break; + case BATADV_HARDIF_BCAST_DUPORIG: + type = "single neighbor is originator"; + break; + default: + type = "unknown"; + } + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "BCAST packet from orig %pM on %s suppressed: %s\n", + bcast_packet->orig, + if_out->net_dev->name, type); + + return true; +} + +/** + * __batadv_forw_bcast_packet() - forward and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * + * Transmits a broadcast packet either immediately or if a delay is given + * after that. Furthermore, queues additional retransmissions on wireless + * interfaces. + * + * This call clones the given skb, hence the caller needs to take into + * account that the data segment of the given skb might not be + * modifiable anymore. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. + */ +static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet) +{ + struct batadv_hard_iface *hard_iface; + struct batadv_hard_iface *primary_if; + int ret = NETDEV_TX_OK; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + return NETDEV_TX_BUSY; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + + if (batadv_send_no_broadcast(bat_priv, skb, own_packet, + hard_iface)) { + batadv_hardif_put(hard_iface); + continue; + } + + ret = batadv_forw_bcast_packet_if(bat_priv, skb, delay, + own_packet, primary_if, + hard_iface); + batadv_hardif_put(hard_iface); + + if (ret == NETDEV_TX_BUSY) + break; + } + rcu_read_unlock(); + + batadv_hardif_put(primary_if); + return ret; +} + +/** + * batadv_forw_bcast_packet() - forward and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * + * Transmits a broadcast packet either immediately or if a delay is given + * after that. Furthermore, queues additional retransmissions on wireless + * interfaces. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. + */ +int batadv_forw_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet) +{ + return __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); +} + +/** + * batadv_send_bcast_packet() - send and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * + * Transmits a broadcast packet either immediately or if a delay is given + * after that. Furthermore, queues additional retransmissions on wireless + * interfaces. + * + * Consumes the provided skb. + */ +void batadv_send_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet) +{ + __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); + consume_skb(skb); +} + +/** * batadv_forw_packet_bcasts_left() - check if a retransmission is necessary * @forw_packet: the forwarding packet to check - * @hard_iface: the interface to check on * * Checks whether a given packet has any (re)transmissions left on the provided * interface. @@ -811,28 +1012,20 @@ err: * Return: True if (re)transmissions are left, false otherwise. */ static bool -batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet, - struct batadv_hard_iface *hard_iface) +batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet) { - unsigned int max; - - if (hard_iface) - max = hard_iface->num_bcasts; - else - max = BATADV_NUM_BCASTS_MAX; - - return BATADV_SKB_CB(forw_packet->skb)->num_bcasts < max; + return BATADV_SKB_CB(forw_packet->skb)->num_bcasts; } /** - * batadv_forw_packet_bcasts_inc() - increment retransmission counter of a + * batadv_forw_packet_bcasts_dec() - decrement retransmission counter of a * packet - * @forw_packet: the packet to increase the counter for + * @forw_packet: the packet to decrease the counter for */ static void -batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet) +batadv_forw_packet_bcasts_dec(struct batadv_forw_packet *forw_packet) { - BATADV_SKB_CB(forw_packet->skb)->num_bcasts++; + BATADV_SKB_CB(forw_packet->skb)->num_bcasts--; } /** @@ -843,30 +1036,30 @@ batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet) */ bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet) { - return BATADV_SKB_CB(forw_packet->skb)->num_bcasts > 0; + unsigned char num_bcasts = BATADV_SKB_CB(forw_packet->skb)->num_bcasts; + + return num_bcasts != forw_packet->if_outgoing->num_bcasts; } +/** + * batadv_send_outstanding_bcast_packet() - transmit a queued broadcast packet + * @work: work queue item + * + * Transmits a queued broadcast packet and if necessary reschedules it. + */ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) { - struct batadv_hard_iface *hard_iface; - struct batadv_hardif_neigh_node *neigh_node; - struct delayed_work *delayed_work; + unsigned long send_time = jiffies + msecs_to_jiffies(5); struct batadv_forw_packet *forw_packet; - struct batadv_bcast_packet *bcast_packet; - struct sk_buff *skb1; - struct net_device *soft_iface; + struct delayed_work *delayed_work; struct batadv_priv *bat_priv; - unsigned long send_time = jiffies + msecs_to_jiffies(5); + struct sk_buff *skb1; bool dropped = false; - u8 *neigh_addr; - u8 *orig_neigh; - int ret = 0; delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); - soft_iface = forw_packet->if_incoming->soft_iface; - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; @@ -878,76 +1071,15 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) goto out; } - bcast_packet = (struct batadv_bcast_packet *)forw_packet->skb->data; - - /* rebroadcast packet */ - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != soft_iface) - continue; - - if (!batadv_forw_packet_bcasts_left(forw_packet, hard_iface)) - continue; - - if (forw_packet->own) { - neigh_node = NULL; - } else { - neigh_addr = eth_hdr(forw_packet->skb)->h_source; - neigh_node = batadv_hardif_neigh_get(hard_iface, - neigh_addr); - } - - orig_neigh = neigh_node ? neigh_node->orig : NULL; - - ret = batadv_hardif_no_broadcast(hard_iface, bcast_packet->orig, - orig_neigh); - - if (ret) { - char *type; - - switch (ret) { - case BATADV_HARDIF_BCAST_NORECIPIENT: - type = "no neighbor"; - break; - case BATADV_HARDIF_BCAST_DUPFWD: - type = "single neighbor is source"; - break; - case BATADV_HARDIF_BCAST_DUPORIG: - type = "single neighbor is originator"; - break; - default: - type = "unknown"; - } - - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "BCAST packet from orig %pM on %s suppressed: %s\n", - bcast_packet->orig, - hard_iface->net_dev->name, type); - - if (neigh_node) - batadv_hardif_neigh_put(neigh_node); - - continue; - } - - if (neigh_node) - batadv_hardif_neigh_put(neigh_node); - - if (!kref_get_unless_zero(&hard_iface->refcount)) - continue; - - /* send a copy of the saved skb */ - skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); - if (skb1) - batadv_send_broadcast_skb(skb1, hard_iface); - - batadv_hardif_put(hard_iface); - } - rcu_read_unlock(); + /* send a copy of the saved skb */ + skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); + if (!skb1) + goto out; - batadv_forw_packet_bcasts_inc(forw_packet); + batadv_send_broadcast_skb(skb1, forw_packet->if_outgoing); + batadv_forw_packet_bcasts_dec(forw_packet); - /* if we still have some more bcasts to send */ - if (batadv_forw_packet_bcasts_left(forw_packet, NULL)) { + if (batadv_forw_packet_bcasts_left(forw_packet)) { batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return; diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 2b0daf8b2bc4..08af251b765c 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -39,10 +39,14 @@ int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface); int batadv_send_unicast_skb(struct sk_buff *skb, struct batadv_neigh_node *neigh_node); -int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, - const struct sk_buff *skb, - unsigned long delay, - bool own_packet); +int batadv_forw_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet); +void batadv_send_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet); void batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 6b8181bc3122..ae368a42a4ad 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -26,7 +26,6 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> -#include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -37,6 +36,7 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> +#include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -191,7 +191,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; - unsigned long brd_delay = 1; + unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; @@ -330,7 +330,7 @@ send: bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; - bcast_packet->ttl = BATADV_TTL; + bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; @@ -346,13 +346,7 @@ send: seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); - batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay, true); - - /* a copy is stored in the bcast list, therefore removing - * the original skb. - */ - consume_skb(skb); - + batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ @@ -848,14 +842,13 @@ static int batadv_softif_slave_add(struct net_device *dev, struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; - struct net *net = dev_net(dev); int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; - ret = batadv_hardif_enable_interface(hard_iface, net, dev->name); + ret = batadv_hardif_enable_interface(hard_iface, dev); out: if (hard_iface) @@ -1093,38 +1086,6 @@ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, } /** - * batadv_softif_create() - Create and register soft interface - * @net: the applicable net namespace - * @name: name of the new soft interface - * - * Return: newly allocated soft_interface, NULL on errors - */ -struct net_device *batadv_softif_create(struct net *net, const char *name) -{ - struct net_device *soft_iface; - int ret; - - soft_iface = alloc_netdev(sizeof(struct batadv_priv), name, - NET_NAME_UNKNOWN, batadv_softif_init_early); - if (!soft_iface) - return NULL; - - dev_net_set(soft_iface, net); - - soft_iface->rtnl_link_ops = &batadv_link_ops; - - ret = register_netdevice(soft_iface); - if (ret < 0) { - pr_err("Unable to register the batman interface '%s': %i\n", - name, ret); - free_netdev(soft_iface); - return NULL; - } - - return soft_iface; -} - -/** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink * @soft_iface: the to-be-removed batman-adv interface diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 38b0ad182584..67a2ddd6832f 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -12,14 +12,12 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> -#include <net/net_namespace.h> #include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); -struct net_device *batadv_softif_create(struct net *net, const char *name); bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 88ec08978ff4..0ceb72d32208 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -758,7 +758,7 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) conn->state = BT_CLOSED; /* If the status indicates successful cancellation of - * the attempt (i.e. Unkown Connection Id) there's no point of + * the attempt (i.e. Unknown Connection Id) there's no point of * notifying failure since we'll go back to keep trying to * connect. The only exception is explicit connect requests * where a timeout + cancel does indicate an actual failure. diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 7d71d104fdfd..25484bb0773d 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -648,7 +648,7 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) */ /* If the controller supports Extended Scanner Filter - * Policies, enable the correspondig event. + * Policies, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY) events[1] |= 0x04; /* LE Direct Advertising @@ -1454,7 +1454,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) } /* Check for valid public address or a configured static - * random adddress, but let the HCI setup proceed to + * random address, but let the HCI setup proceed to * be able to determine if there is a public address * or not. * @@ -3549,7 +3549,7 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev) if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; - /* If trying to estabilish one time connection to disabled + /* If trying to establish one time connection to disabled * device, leave the params, but mark them as just once. */ if (params->explicit_connect) { @@ -4284,7 +4284,7 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; } -/* Send HCI command and wait for command commplete event */ +/* Send HCI command and wait for command complete event */ struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 016b2999f219..ea06b010ccad 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6032,7 +6032,7 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, return true; } - /* Check if request ended in Command Status - no way to retreive + /* Check if request ended in Command Status - no way to retrieve * any extra parameters in this case. */ if (hdr->evt == HCI_EV_CMD_STATUS) diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index eed0dd066e12..e8d53af7c6a6 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1130,7 +1130,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * then there has been already an ioctl issued against - * an unbound socket and with that triggerd an open + * an unbound socket and with that triggered an open * notification. Send a close notification first to * allow the state transition to bounded. */ @@ -1326,9 +1326,9 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been - * assigned, this socket will transtion from + * assigned, this socket will transition from * a raw socket into a control socket. To - * allow for a clean transtion, send the + * allow for a clean transition, send the * close notification first. */ skb = create_monitor_ctrl_close(sk); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f9be7f9084d6..f290d0c54d32 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3341,7 +3341,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, } /* The name is stored in the scan response data and so - * no need to udpate the advertising data here. + * no need to update the advertising data here. */ if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING)) __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 7dd51da73845..4d93c6c32a71 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -40,7 +40,7 @@ ((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data) /* Low-level debug macros to be used for stuff that we don't want - * accidentially in dmesg, i.e. the values of the various crypto keys + * accidentally in dmesg, i.e. the values of the various crypto keys * and the inputs & outputs of crypto functions. */ #ifdef DEBUG @@ -560,7 +560,7 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) return err; /* This is unlikely, but we need to check that - * we didn't accidentially generate a debug key. + * we didn't accidentally generate a debug key. */ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; @@ -1902,7 +1902,7 @@ static u8 sc_send_public_key(struct smp_chan *smp) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that - * we didn't accidentially generate a debug key. + * we didn't accidentally generate a debug key. */ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a5d72c48fb66..aa47af349ba8 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -409,7 +409,7 @@ static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) return ERR_PTR(-ENOMEM); if (data_in) { - err = bpf_check_uarg_tail_zero(data_in, max_size, size); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size); if (err) { kfree(data); return ERR_PTR(err); @@ -918,3 +918,46 @@ out: kfree(user_ctx); return ret; } + +int bpf_prog_test_run_syscall(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); + __u32 ctx_size_in = kattr->test.ctx_size_in; + void *ctx = NULL; + u32 retval; + int err = 0; + + /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */ + if (kattr->test.data_in || kattr->test.data_out || + kattr->test.ctx_out || kattr->test.duration || + kattr->test.repeat || kattr->test.flags) + return -EINVAL; + + if (ctx_size_in < prog->aux->max_ctx_offset || + ctx_size_in > U16_MAX) + return -EINVAL; + + if (ctx_size_in) { + ctx = kzalloc(ctx_size_in, GFP_USER); + if (!ctx) + return -ENOMEM; + if (copy_from_user(ctx, ctx_in, ctx_size_in)) { + err = -EFAULT; + goto out; + } + } + retval = bpf_prog_run_pin_on_cpu(prog, ctx); + + if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) { + err = -EFAULT; + goto out; + } + if (ctx_size_in) + if (copy_to_user(ctx_in, ctx, ctx_size_in)) + err = -EFAULT; +out: + kfree(ctx); + return err; +} diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6e9b049ae521..07856362538f 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -276,7 +276,8 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, bool allow_mode_include = true; struct hlist_node *rp; - rp = rcu_dereference(hlist_first_rcu(&br->router_list)); + rp = br_multicast_get_first_rport_node(br, skb); + if (mdst) { p = rcu_dereference(mdst->ports); if (br_multicast_should_handle_mode(br, mdst->addr.proto) && @@ -290,7 +291,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct net_bridge_port *port, *lport, *rport; lport = p ? p->key.port : NULL; - rport = hlist_entry_safe(rp, struct net_bridge_port, rlist); + rport = br_multicast_rport_from_node_skb(rp, skb); if ((unsigned long)lport > (unsigned long)rport) { port = lport; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8875e953ac53..1f506309efa8 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -132,7 +132,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(br, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || - br_multicast_is_router(br)) { + br_multicast_is_router(br, skb)) { local_rcv = true; br->dev->stats.multicast++; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 95fa4af0e8dd..3f839a8cc9fb 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -16,31 +16,76 @@ #include "br_private.h" +static bool br_rports_have_mc_router(struct net_bridge *br) +{ +#if IS_ENABLED(CONFIG_IPV6) + return !hlist_empty(&br->ip4_mc_router_list) || + !hlist_empty(&br->ip6_mc_router_list); +#else + return !hlist_empty(&br->ip4_mc_router_list); +#endif +} + +static bool +br_ip4_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) +{ + *timer = br_timer_value(&port->ip4_mc_router_timer); + return !hlist_unhashed(&port->ip4_rlist); +} + +static bool +br_ip6_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) +{ +#if IS_ENABLED(CONFIG_IPV6) + *timer = br_timer_value(&port->ip6_mc_router_timer); + return !hlist_unhashed(&port->ip6_rlist); +#else + *timer = 0; + return false; +#endif +} + static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - struct net_bridge_port *p; + bool have_ip4_mc_rtr, have_ip6_mc_rtr; + unsigned long ip4_timer, ip6_timer; struct nlattr *nest, *port_nest; + struct net_bridge_port *p; - if (!br->multicast_router || hlist_empty(&br->router_list)) + if (!br->multicast_router) + return 0; + + if (!br_rports_have_mc_router(br)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; - hlist_for_each_entry_rcu(p, &br->router_list, rlist) { - if (!p) + list_for_each_entry_rcu(p, &br->port_list, list) { + have_ip4_mc_rtr = br_ip4_rports_get_timer(p, &ip4_timer); + have_ip6_mc_rtr = br_ip6_rports_get_timer(p, &ip6_timer); + + if (!have_ip4_mc_rtr && !have_ip6_mc_rtr) continue; + port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto fail; + if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, - br_timer_value(&p->multicast_router_timer)) || + max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, - p->multicast_router)) { + p->multicast_router) || + (have_ip4_mc_rtr && + nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, + ip4_timer)) || + (have_ip6_mc_rtr && + nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, + ip6_timer))) { nla_nest_cancel(skb, port_nest); goto fail; } diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index cd2b1e424e54..f7012b7d7ce4 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -627,8 +627,7 @@ int br_mrp_set_ring_state(struct net_bridge *br, if (!mrp) return -EINVAL; - if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED && - state->ring_state != BR_MRP_RING_STATE_CLOSED) + if (mrp->ring_state != state->ring_state) mrp->ring_transitions++; mrp->ring_state = state->ring_state; @@ -715,8 +714,7 @@ int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state) if (!mrp) return -EINVAL; - if (mrp->in_state == BR_MRP_IN_STATE_CLOSED && - state->in_state != BR_MRP_IN_STATE_CLOSED) + if (mrp->in_state != state->in_state) mrp->in_transitions++; mrp->in_state = state->in_state; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 226bb05c3b42..53c3a9d80d9c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -51,8 +51,8 @@ static const struct rhashtable_params br_sg_port_rht_params = { static void br_multicast_start_querier(struct net_bridge *br, struct bridge_mcast_own_query *query); -static void br_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port); +static void br_ip4_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port); static void br_ip4_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, __be32 group, @@ -60,7 +60,10 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, const unsigned char *src); static void br_multicast_port_group_rexmit(struct timer_list *t); -static void __del_port_router(struct net_bridge_port *p); +static void +br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted); +static void br_ip6_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port); #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, @@ -1354,23 +1357,64 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, } #endif -static void br_multicast_router_expired(struct timer_list *t) +static bool br_multicast_rport_del(struct hlist_node *rlist) +{ + if (hlist_unhashed(rlist)) + return false; + + hlist_del_init_rcu(rlist); + return true; +} + +static bool br_ip4_multicast_rport_del(struct net_bridge_port *p) +{ + return br_multicast_rport_del(&p->ip4_rlist); +} + +static bool br_ip6_multicast_rport_del(struct net_bridge_port *p) +{ +#if IS_ENABLED(CONFIG_IPV6) + return br_multicast_rport_del(&p->ip6_rlist); +#else + return false; +#endif +} + +static void br_multicast_router_expired(struct net_bridge_port *port, + struct timer_list *t, + struct hlist_node *rlist) { - struct net_bridge_port *port = - from_timer(port, t, multicast_router_timer); struct net_bridge *br = port->br; + bool del; spin_lock(&br->multicast_lock); if (port->multicast_router == MDB_RTR_TYPE_DISABLED || port->multicast_router == MDB_RTR_TYPE_PERM || - timer_pending(&port->multicast_router_timer)) + timer_pending(t)) goto out; - __del_port_router(port); + del = br_multicast_rport_del(rlist); + br_multicast_rport_del_notify(port, del); out: spin_unlock(&br->multicast_lock); } +static void br_ip4_multicast_router_expired(struct timer_list *t) +{ + struct net_bridge_port *port = from_timer(port, t, ip4_mc_router_timer); + + br_multicast_router_expired(port, t, &port->ip4_rlist); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_router_expired(struct timer_list *t) +{ + struct net_bridge_port *port = from_timer(port, t, ip6_mc_router_timer); + + br_multicast_router_expired(port, t, &port->ip6_rlist); +} +#endif + static void br_mc_router_state_change(struct net_bridge *p, bool is_mc_router) { @@ -1384,14 +1428,14 @@ static void br_mc_router_state_change(struct net_bridge *p, switchdev_port_attr_set(p->dev, &attr, NULL); } -static void br_multicast_local_router_expired(struct timer_list *t) +static void br_multicast_local_router_expired(struct net_bridge *br, + struct timer_list *timer) { - struct net_bridge *br = from_timer(br, t, multicast_router_timer); - spin_lock(&br->multicast_lock); if (br->multicast_router == MDB_RTR_TYPE_DISABLED || br->multicast_router == MDB_RTR_TYPE_PERM || - timer_pending(&br->multicast_router_timer)) + br_ip4_multicast_is_router(br) || + br_ip6_multicast_is_router(br)) goto out; br_mc_router_state_change(br, false); @@ -1399,6 +1443,22 @@ out: spin_unlock(&br->multicast_lock); } +static void br_ip4_multicast_local_router_expired(struct timer_list *t) +{ + struct net_bridge *br = from_timer(br, t, ip4_mc_router_timer); + + br_multicast_local_router_expired(br, t); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_local_router_expired(struct timer_list *t) +{ + struct net_bridge *br = from_timer(br, t, ip6_mc_router_timer); + + br_multicast_local_router_expired(br, t); +} +#endif + static void br_multicast_querier_expired(struct net_bridge *br, struct bridge_mcast_own_query *query) { @@ -1613,11 +1673,13 @@ int br_multicast_add_port(struct net_bridge_port *port) port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; - timer_setup(&port->multicast_router_timer, - br_multicast_router_expired, 0); + timer_setup(&port->ip4_mc_router_timer, + br_ip4_multicast_router_expired, 0); timer_setup(&port->ip4_own_query.timer, br_ip4_multicast_port_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) + timer_setup(&port->ip6_mc_router_timer, + br_ip6_multicast_router_expired, 0); timer_setup(&port->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif @@ -1649,7 +1711,10 @@ void br_multicast_del_port(struct net_bridge_port *port) hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_gc(&deleted_head); - del_timer_sync(&port->multicast_router_timer); + del_timer_sync(&port->ip4_mc_router_timer); +#if IS_ENABLED(CONFIG_IPV6) + del_timer_sync(&port->ip6_mc_router_timer); +#endif free_percpu(port->mcast_stats); } @@ -1673,9 +1738,10 @@ static void __br_multicast_enable_port(struct net_bridge_port *port) #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&port->ip6_own_query); #endif - if (port->multicast_router == MDB_RTR_TYPE_PERM && - hlist_unhashed(&port->rlist)) - br_multicast_add_router(br, port); + if (port->multicast_router == MDB_RTR_TYPE_PERM) { + br_ip4_multicast_add_router(br, port); + br_ip6_multicast_add_router(br, port); + } } void br_multicast_enable_port(struct net_bridge_port *port) @@ -1692,19 +1758,22 @@ void br_multicast_disable_port(struct net_bridge_port *port) struct net_bridge *br = port->br; struct net_bridge_port_group *pg; struct hlist_node *n; + bool del = false; spin_lock(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) br_multicast_find_del_pg(br, pg); - __del_port_router(port); - - del_timer(&port->multicast_router_timer); + del |= br_ip4_multicast_rport_del(port); + del_timer(&port->ip4_mc_router_timer); del_timer(&port->ip4_own_query.timer); + del |= br_ip6_multicast_rport_del(port); #if IS_ENABLED(CONFIG_IPV6) + del_timer(&port->ip6_mc_router_timer); del_timer(&port->ip6_own_query.timer); #endif + br_multicast_rport_del_notify(port, del); spin_unlock(&br->multicast_lock); } @@ -2615,22 +2684,6 @@ update: } #endif -static bool br_multicast_select_querier(struct net_bridge *br, - struct net_bridge_port *port, - struct br_ip *saddr) -{ - switch (saddr->proto) { - case htons(ETH_P_IP): - return br_ip4_multicast_select_querier(br, port, saddr->src.ip4); -#if IS_ENABLED(CONFIG_IPV6) - case htons(ETH_P_IPV6): - return br_ip6_multicast_select_querier(br, port, &saddr->src.ip6); -#endif - } - - return false; -} - static void br_multicast_update_query_timer(struct net_bridge *br, struct bridge_mcast_other_query *query, @@ -2655,45 +2708,122 @@ static void br_port_mc_router_state_change(struct net_bridge_port *p, switchdev_port_attr_set(p->dev, &attr, NULL); } -/* - * Add port to router_list +static struct net_bridge_port * +br_multicast_rport_from_node(struct net_bridge *br, + struct hlist_head *mc_router_list, + struct hlist_node *rlist) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (mc_router_list == &br->ip6_mc_router_list) + return hlist_entry(rlist, struct net_bridge_port, ip6_rlist); +#endif + return hlist_entry(rlist, struct net_bridge_port, ip4_rlist); +} + +static struct hlist_node * +br_multicast_get_rport_slot(struct net_bridge *br, + struct net_bridge_port *port, + struct hlist_head *mc_router_list) + +{ + struct hlist_node *slot = NULL; + struct net_bridge_port *p; + struct hlist_node *rlist; + + hlist_for_each(rlist, mc_router_list) { + p = br_multicast_rport_from_node(br, mc_router_list, rlist); + + if ((unsigned long)port >= (unsigned long)p) + break; + + slot = rlist; + } + + return slot; +} + +static bool br_multicast_no_router_otherpf(struct net_bridge_port *port, + struct hlist_node *rnode) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (rnode != &port->ip6_rlist) + return hlist_unhashed(&port->ip6_rlist); + else + return hlist_unhashed(&port->ip4_rlist); +#else + return true; +#endif +} + +/* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port) + struct net_bridge_port *port, + struct hlist_node *rlist, + struct hlist_head *mc_router_list) { - struct net_bridge_port *p; - struct hlist_node *slot = NULL; + struct hlist_node *slot; - if (!hlist_unhashed(&port->rlist)) + if (!hlist_unhashed(rlist)) return; - hlist_for_each_entry(p, &br->router_list, rlist) { - if ((unsigned long) port >= (unsigned long) p) - break; - slot = &p->rlist; - } + slot = br_multicast_get_rport_slot(br, port, mc_router_list); if (slot) - hlist_add_behind_rcu(&port->rlist, slot); + hlist_add_behind_rcu(rlist, slot); else - hlist_add_head_rcu(&port->rlist, &br->router_list); - br_rtr_notify(br->dev, port, RTM_NEWMDB); - br_port_mc_router_state_change(port, true); + hlist_add_head_rcu(rlist, mc_router_list); + + /* For backwards compatibility for now, only notify if we + * switched from no IPv4/IPv6 multicast router to a new + * IPv4 or IPv6 multicast router. + */ + if (br_multicast_no_router_otherpf(port, rlist)) { + br_rtr_notify(br->dev, port, RTM_NEWMDB); + br_port_mc_router_state_change(port, true); + } +} + +/* Add port to router_list + * list is maintained ordered by pointer value + * and locked by br->multicast_lock and RCU + */ +static void br_ip4_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port) +{ + br_multicast_add_router(br, port, &port->ip4_rlist, + &br->ip4_mc_router_list); +} + +/* Add port to router_list + * list is maintained ordered by pointer value + * and locked by br->multicast_lock and RCU + */ +static void br_ip6_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port) +{ +#if IS_ENABLED(CONFIG_IPV6) + br_multicast_add_router(br, port, &port->ip6_rlist, + &br->ip6_mc_router_list); +#endif } static void br_multicast_mark_router(struct net_bridge *br, - struct net_bridge_port *port) + struct net_bridge_port *port, + struct timer_list *timer, + struct hlist_node *rlist, + struct hlist_head *mc_router_list) { unsigned long now = jiffies; if (!port) { if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { - if (!timer_pending(&br->multicast_router_timer)) + if (!br_ip4_multicast_is_router(br) && + !br_ip6_multicast_is_router(br)) br_mc_router_state_change(br, true); - mod_timer(&br->multicast_router_timer, - now + br->multicast_querier_interval); + mod_timer(timer, now + br->multicast_querier_interval); } return; } @@ -2702,24 +2832,71 @@ static void br_multicast_mark_router(struct net_bridge *br, port->multicast_router == MDB_RTR_TYPE_PERM) return; - br_multicast_add_router(br, port); + br_multicast_add_router(br, port, rlist, mc_router_list); + mod_timer(timer, now + br->multicast_querier_interval); +} + +static void br_ip4_multicast_mark_router(struct net_bridge *br, + struct net_bridge_port *port) +{ + struct timer_list *timer = &br->ip4_mc_router_timer; + struct hlist_node *rlist = NULL; + + if (port) { + timer = &port->ip4_mc_router_timer; + rlist = &port->ip4_rlist; + } - mod_timer(&port->multicast_router_timer, - now + br->multicast_querier_interval); + br_multicast_mark_router(br, port, timer, rlist, + &br->ip4_mc_router_list); } -static void br_multicast_query_received(struct net_bridge *br, - struct net_bridge_port *port, - struct bridge_mcast_other_query *query, - struct br_ip *saddr, - unsigned long max_delay) +static void br_ip6_multicast_mark_router(struct net_bridge *br, + struct net_bridge_port *port) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct timer_list *timer = &br->ip6_mc_router_timer; + struct hlist_node *rlist = NULL; + + if (port) { + timer = &port->ip6_mc_router_timer; + rlist = &port->ip6_rlist; + } + + br_multicast_mark_router(br, port, timer, rlist, + &br->ip6_mc_router_list); +#endif +} + +static void +br_ip4_multicast_query_received(struct net_bridge *br, + struct net_bridge_port *port, + struct bridge_mcast_other_query *query, + struct br_ip *saddr, + unsigned long max_delay) +{ + if (!br_ip4_multicast_select_querier(br, port, saddr->src.ip4)) + return; + + br_multicast_update_query_timer(br, query, max_delay); + br_ip4_multicast_mark_router(br, port); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void +br_ip6_multicast_query_received(struct net_bridge *br, + struct net_bridge_port *port, + struct bridge_mcast_other_query *query, + struct br_ip *saddr, + unsigned long max_delay) { - if (!br_multicast_select_querier(br, port, saddr)) + if (!br_ip6_multicast_select_querier(br, port, &saddr->src.ip6)) return; br_multicast_update_query_timer(br, query, max_delay); - br_multicast_mark_router(br, port); + br_ip6_multicast_mark_router(br, port); } +#endif static void br_ip4_multicast_query(struct net_bridge *br, struct net_bridge_port *port, @@ -2768,8 +2945,8 @@ static void br_ip4_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IP); saddr.src.ip4 = iph->saddr; - br_multicast_query_received(br, port, &br->ip4_other_query, - &saddr, max_delay); + br_ip4_multicast_query_received(br, port, &br->ip4_other_query, + &saddr, max_delay); goto out; } @@ -2856,8 +3033,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IPV6); saddr.src.ip6 = ipv6_hdr(skb)->saddr; - br_multicast_query_received(br, port, &br->ip6_other_query, - &saddr, max_delay); + br_ip6_multicast_query_received(br, port, &br->ip6_other_query, + &saddr, max_delay); goto out; } else if (!group) { goto out; @@ -3087,7 +3264,7 @@ static void br_multicast_pim(struct net_bridge *br, pim_hdr_type(pimhdr) != PIM_TYPE_HELLO) return; - br_multicast_mark_router(br, port); + br_ip4_multicast_mark_router(br, port); } static int br_ip4_multicast_mrd_rcv(struct net_bridge *br, @@ -3098,7 +3275,7 @@ static int br_ip4_multicast_mrd_rcv(struct net_bridge *br, igmp_hdr(skb)->type != IGMP_MRDISC_ADV) return -ENOMSG; - br_multicast_mark_router(br, port); + br_ip4_multicast_mark_router(br, port); return 0; } @@ -3166,7 +3343,7 @@ static void br_ip6_multicast_mrd_rcv(struct net_bridge *br, if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) return; - br_multicast_mark_router(br, port); + br_ip6_multicast_mark_router(br, port); } static int br_multicast_ipv6_rcv(struct net_bridge *br, @@ -3316,13 +3493,15 @@ void br_multicast_init(struct net_bridge *br) br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); spin_lock_init(&br->multicast_lock); - timer_setup(&br->multicast_router_timer, - br_multicast_local_router_expired, 0); + timer_setup(&br->ip4_mc_router_timer, + br_ip4_multicast_local_router_expired, 0); timer_setup(&br->ip4_other_query.timer, br_ip4_multicast_querier_expired, 0); timer_setup(&br->ip4_own_query.timer, br_ip4_multicast_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) + timer_setup(&br->ip6_mc_router_timer, + br_ip6_multicast_local_router_expired, 0); timer_setup(&br->ip6_other_query.timer, br_ip6_multicast_querier_expired, 0); timer_setup(&br->ip6_own_query.timer, @@ -3416,10 +3595,11 @@ void br_multicast_open(struct net_bridge *br) void br_multicast_stop(struct net_bridge *br) { - del_timer_sync(&br->multicast_router_timer); + del_timer_sync(&br->ip4_mc_router_timer); del_timer_sync(&br->ip4_other_query.timer); del_timer_sync(&br->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) + del_timer_sync(&br->ip6_mc_router_timer); del_timer_sync(&br->ip6_other_query.timer); del_timer_sync(&br->ip6_own_query.timer); #endif @@ -3453,7 +3633,10 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); - del_timer(&br->multicast_router_timer); + del_timer(&br->ip4_mc_router_timer); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&br->ip6_mc_router_timer); +#endif br->multicast_router = val; err = 0; break; @@ -3470,11 +3653,22 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) return err; } -static void __del_port_router(struct net_bridge_port *p) +static void +br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted) { - if (hlist_unhashed(&p->rlist)) + if (!deleted) + return; + + /* For backwards compatibility for now, only notify if there is + * no multicast router anymore for both IPv4 and IPv6. + */ + if (!hlist_unhashed(&p->ip4_rlist)) return; - hlist_del_init_rcu(&p->rlist); +#if IS_ENABLED(CONFIG_IPV6) + if (!hlist_unhashed(&p->ip6_rlist)) + return; +#endif + br_rtr_notify(p->br->dev, p, RTM_DELMDB); br_port_mc_router_state_change(p, false); @@ -3488,34 +3682,52 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) struct net_bridge *br = p->br; unsigned long now = jiffies; int err = -EINVAL; + bool del = false; spin_lock(&br->multicast_lock); if (p->multicast_router == val) { /* Refresh the temp router port timer */ - if (p->multicast_router == MDB_RTR_TYPE_TEMP) - mod_timer(&p->multicast_router_timer, + if (p->multicast_router == MDB_RTR_TYPE_TEMP) { + mod_timer(&p->ip4_mc_router_timer, now + br->multicast_querier_interval); +#if IS_ENABLED(CONFIG_IPV6) + mod_timer(&p->ip6_mc_router_timer, + now + br->multicast_querier_interval); +#endif + } err = 0; goto unlock; } switch (val) { case MDB_RTR_TYPE_DISABLED: p->multicast_router = MDB_RTR_TYPE_DISABLED; - __del_port_router(p); - del_timer(&p->multicast_router_timer); + del |= br_ip4_multicast_rport_del(p); + del_timer(&p->ip4_mc_router_timer); + del |= br_ip6_multicast_rport_del(p); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&p->ip6_mc_router_timer); +#endif + br_multicast_rport_del_notify(p, del); break; case MDB_RTR_TYPE_TEMP_QUERY: p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - __del_port_router(p); + del |= br_ip4_multicast_rport_del(p); + del |= br_ip6_multicast_rport_del(p); + br_multicast_rport_del_notify(p, del); break; case MDB_RTR_TYPE_PERM: p->multicast_router = MDB_RTR_TYPE_PERM; - del_timer(&p->multicast_router_timer); - br_multicast_add_router(br, p); + del_timer(&p->ip4_mc_router_timer); + br_ip4_multicast_add_router(br, p); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&p->ip6_mc_router_timer); +#endif + br_ip6_multicast_add_router(br, p); break; case MDB_RTR_TYPE_TEMP: p->multicast_router = MDB_RTR_TYPE_TEMP; - br_multicast_mark_router(br, p); + br_ip4_multicast_mark_router(br, p); + br_ip6_multicast_mark_router(br, p); break; default: goto unlock; @@ -3621,7 +3833,7 @@ bool br_multicast_router(const struct net_device *dev) bool is_router; spin_lock_bh(&br->multicast_lock); - is_router = br_multicast_is_router(br); + is_router = br_multicast_is_router(br, NULL); spin_unlock_bh(&br->multicast_lock); return is_router; } @@ -3842,6 +4054,61 @@ unlock: } EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); +/** + * br_multicast_has_router_adjacent - Checks for a router behind a bridge port + * @dev: The bridge port adjacent to which to check for a multicast router + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a multicast router is behind one of the other ports of this + * bridge. Otherwise returns false. + */ +bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) +{ + struct net_bridge_port *port, *p; + bool ret = false; + + rcu_read_lock(); + port = br_port_get_check_rcu(dev); + if (!port) + goto unlock; + + switch (proto) { + case ETH_P_IP: + hlist_for_each_entry_rcu(p, &port->br->ip4_mc_router_list, + ip4_rlist) { + if (p == port) + continue; + + ret = true; + goto unlock; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case ETH_P_IPV6: + hlist_for_each_entry_rcu(p, &port->br->ip6_mc_router_list, + ip6_rlist) { + if (p == port) + continue; + + ret = true; + goto unlock; + } + break; +#endif + default: + /* when compiled without IPv6 support, be conservative and + * always assume presence of an IPv6 multicast router + */ + ret = true; + } + +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_router_adjacent); + static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, const struct sk_buff *skb, u8 type, u8 dir) { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e4e6e991313e..8642e56059fb 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1644,7 +1644,6 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) p = br_port_get_rtnl(dev); if (!p) return 0; - br = p->br; vg = nbp_vlan_group(p); break; default: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e013d33f1c7c..a684d0cfc58c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -307,16 +307,18 @@ struct net_bridge_port { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_own_query ip4_own_query; + struct timer_list ip4_mc_router_timer; + struct hlist_node ip4_rlist; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_own_query ip6_own_query; + struct timer_list ip6_mc_router_timer; + struct hlist_node ip6_rlist; #endif /* IS_ENABLED(CONFIG_IPV6) */ u32 multicast_eht_hosts_limit; u32 multicast_eht_hosts_cnt; unsigned char multicast_router; struct bridge_mcast_stats __percpu *mcast_stats; - struct timer_list multicast_router_timer; struct hlist_head mglist; - struct hlist_node rlist; #endif #ifdef CONFIG_SYSFS @@ -449,14 +451,16 @@ struct net_bridge { struct hlist_head mcast_gc_list; struct hlist_head mdb_list; - struct hlist_head router_list; - struct timer_list multicast_router_timer; + struct hlist_head ip4_mc_router_list; + struct timer_list ip4_mc_router_timer; struct bridge_mcast_other_query ip4_other_query; struct bridge_mcast_own_query ip4_own_query; struct bridge_mcast_querier ip4_querier; struct bridge_mcast_stats __percpu *mcast_stats; #if IS_ENABLED(CONFIG_IPV6) + struct hlist_head ip6_mc_router_list; + struct timer_list ip6_mc_router_timer; struct bridge_mcast_other_query ip6_other_query; struct bridge_mcast_own_query ip6_own_query; struct bridge_mcast_querier ip6_querier; @@ -864,11 +868,58 @@ static inline bool br_group_is_l2(const struct br_ip *group) #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) -static inline bool br_multicast_is_router(struct net_bridge *br) +static inline struct hlist_node * +br_multicast_get_first_rport_node(struct net_bridge *b, struct sk_buff *skb) { +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + return rcu_dereference(hlist_first_rcu(&b->ip6_mc_router_list)); +#endif + return rcu_dereference(hlist_first_rcu(&b->ip4_mc_router_list)); +} + +static inline struct net_bridge_port * +br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) { +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + return hlist_entry_safe(rp, struct net_bridge_port, ip6_rlist); +#endif + return hlist_entry_safe(rp, struct net_bridge_port, ip4_rlist); +} + +static inline bool br_ip4_multicast_is_router(struct net_bridge *br) +{ + return timer_pending(&br->ip4_mc_router_timer); +} + +static inline bool br_ip6_multicast_is_router(struct net_bridge *br) { - return br->multicast_router == 2 || - (br->multicast_router == 1 && - timer_pending(&br->multicast_router_timer)); +#if IS_ENABLED(CONFIG_IPV6) + return timer_pending(&br->ip6_mc_router_timer); +#else + return false; +#endif +} + +static inline bool +br_multicast_is_router(struct net_bridge *br, struct sk_buff *skb) +{ + switch (br->multicast_router) { + case MDB_RTR_TYPE_PERM: + return true; + case MDB_RTR_TYPE_TEMP_QUERY: + if (skb) { + if (skb->protocol == htons(ETH_P_IP)) + return br_ip4_multicast_is_router(br); + else if (skb->protocol == htons(ETH_P_IPV6)) + return br_ip6_multicast_is_router(br); + } else { + return br_ip4_multicast_is_router(br) || + br_ip6_multicast_is_router(br); + } + fallthrough; + default: + return false; + } } static inline bool @@ -1017,7 +1068,8 @@ static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, { } -static inline bool br_multicast_is_router(struct net_bridge *br) +static inline bool br_multicast_is_router(struct net_bridge *br, + struct sk_buff *skb) { return false; } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index da3256a3eed0..8789a57af543 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -113,9 +113,7 @@ static void __vlan_add_list(struct net_bridge_vlan *v) headp = &vg->vlan_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct net_bridge_vlan, vlist); - if (v->vid < vent->vid) - continue; - else + if (v->vid >= vent->vid) break; } list_add_rcu(&v->vlist, hpos); diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index cac30e676ac9..23267c8db7c4 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -480,7 +480,7 @@ got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; - goto out_err; + goto out; } phy_layer->id = phyid; diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index fadc7c8a3107..37b67194c0df 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -76,8 +76,6 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) u8 buf; priv = container_of(layr, struct chnl_net, chnl); - if (!priv) - return -EINVAL; skb = (struct sk_buff *) cfpkt_tonative(pkt); diff --git a/net/can/isotp.c b/net/can/isotp.c index be6183f8ca11..bd49299319a1 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -225,8 +225,8 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) can_send_ret = can_send(nskb, 1); if (can_send_ret) - pr_notice_once("can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(can_send_ret)); dev_put(dev); @@ -801,10 +801,12 @@ isotp_tx_burst: can_skb_set_owner(skb, sk); can_send_ret = can_send(skb, 1); - if (can_send_ret) - pr_notice_once("can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); - + if (can_send_ret) { + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(can_send_ret)); + if (can_send_ret == -ENOBUFS) + pr_notice_once("can-isotp: tx queue is full, increasing txqueuelen may prevent this error\n"); + } if (so->tx.idx >= so->tx.len) { /* we are done */ so->tx.state = ISOTP_IDLE; @@ -950,8 +952,8 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = can_send(skb, 1); dev_put(dev); if (err) { - pr_notice_once("can-isotp: %s: can_send_ret %d\n", - __func__, err); + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(err)); return err; } @@ -1482,7 +1484,7 @@ static __init int isotp_module_init(void) err = can_proto_register(&isotp_can_proto); if (err < 0) - pr_err("can: registration of isotp protocol failed\n"); + pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err)); else register_netdevice_notifier(&canisotp_notifier); diff --git a/net/can/proc.c b/net/can/proc.c index d1fe49e6f16d..b3099f0a3cb8 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -99,8 +99,6 @@ static void can_init_stats(struct net *net) static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, unsigned long count) { - unsigned long rate; - if (oldjif == newjif) return 0; @@ -111,9 +109,7 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, return 99999999; } - rate = (count * HZ) / (newjif - oldjif); - - return rate; + return (count * HZ) / (newjif - oldjif); } void can_stat_update(struct timer_list *t) diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 792fcb974dc3..9c60feeb1bcb 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -87,7 +87,7 @@ struct ceph_x_authorize_reply { /* - * encyption bundle + * encryption bundle */ #define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 195ceb8afb06..013cbdb6cfe2 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1508,7 +1508,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, return get_generic_reply(con, hdr, skip); /* - * Older OSDs don't set reply tid even if the orignal + * Older OSDs don't set reply tid even if the original * request had a non-zero tid. Work around this weirdness * by allocating a new message. */ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c959320c4775..75b738083523 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1309,7 +1309,7 @@ static int get_osdmap_client_data_v(void **p, void *end, return -EINVAL; } - /* old osdmap enconding */ + /* old osdmap encoding */ struct_v = 0; } @@ -3010,7 +3010,7 @@ static bool is_valid_crush_name(const char *name) * parent, returns 0. * * Does a linear search, as there are no parent pointers of any - * kind. Note that the result is ambigous for items that occur + * kind. Note that the result is ambiguous for items that occur * multiple times in the map. */ static int get_immediate_parent(struct crush_map *c, int id, diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index cc3712ad8716..f564f82e91d9 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -524,8 +524,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) nr_maps++; } - diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps, - GFP_KERNEL); + diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); diff --git a/net/core/dev.c b/net/core/dev.c index ef8cf7619baf..50531a2d0b20 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6520,11 +6520,18 @@ EXPORT_SYMBOL(napi_schedule_prep); * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * - * Variant of __napi_schedule() assuming hard irqs are masked + * Variant of __napi_schedule() assuming hard irqs are masked. + * + * On PREEMPT_RT enabled kernels this maps to __napi_schedule() + * because the interrupt disabled assumption might not be true + * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { - ____napi_schedule(this_cpu_ptr(&softnet_data), n); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + else + __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); diff --git a/net/core/devlink.c b/net/core/devlink.c index 051432ea4f69..566ddd147633 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -190,6 +190,80 @@ static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, return devlink_port_get_from_attrs(devlink, info->attrs); } +static inline bool +devlink_rate_is_leaf(struct devlink_rate *devlink_rate) +{ + return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; +} + +static inline bool +devlink_rate_is_node(struct devlink_rate *devlink_rate) +{ + return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; +} + +static struct devlink_rate * +devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + struct devlink_rate *devlink_rate; + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); + if (IS_ERR(devlink_port)) + return ERR_CAST(devlink_port); + devlink_rate = devlink_port->devlink_rate; + return devlink_rate ?: ERR_PTR(-ENODEV); +} + +static struct devlink_rate * +devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) +{ + static struct devlink_rate *devlink_rate; + + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + if (devlink_rate_is_node(devlink_rate) && + !strcmp(node_name, devlink_rate->name)) + return devlink_rate; + } + return ERR_PTR(-ENODEV); +} + +static struct devlink_rate * +devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) +{ + const char *rate_node_name; + size_t len; + + if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) + return ERR_PTR(-EINVAL); + rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); + len = strlen(rate_node_name); + /* Name cannot be empty or decimal number */ + if (!len || strspn(rate_node_name, "0123456789") == len) + return ERR_PTR(-EINVAL); + + return devlink_rate_node_get_by_name(devlink, rate_node_name); +} + +static struct devlink_rate * +devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + return devlink_rate_node_get_from_attrs(devlink, info->attrs); +} + +static struct devlink_rate * +devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + struct nlattr **attrs = info->attrs; + + if (attrs[DEVLINK_ATTR_PORT_INDEX]) + return devlink_rate_leaf_get_from_info(devlink, info); + else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) + return devlink_rate_node_get_from_info(devlink, info); + else + return ERR_PTR(-EINVAL); +} + struct devlink_sb { struct list_head list; unsigned int index; @@ -408,12 +482,14 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) #define DEVLINK_NL_FLAG_NEED_PORT BIT(0) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) +#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) +#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) /* The per devlink instance lock is taken by default in the pre-doit * operation, yet several commands do not require this. The global * devlink lock is taken and protects from disruption by user-calls. */ -#define DEVLINK_NL_FLAG_NO_LOCK BIT(2) +#define DEVLINK_NL_FLAG_NO_LOCK BIT(4) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -442,6 +518,24 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, devlink_port = devlink_port_get_from_info(devlink, info); if (!IS_ERR(devlink_port)) info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { + struct devlink_rate *devlink_rate; + + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) { + err = PTR_ERR(devlink_rate); + goto unlock; + } + info->user_ptr[1] = devlink_rate; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { + struct devlink_rate *rate_node; + + rate_node = devlink_rate_node_get_from_info(devlink, info); + if (IS_ERR(rate_node)) { + err = PTR_ERR(rate_node); + goto unlock; + } + info->user_ptr[1] = rate_node; } return 0; @@ -748,6 +842,56 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops * return 0; } +static int devlink_nl_rate_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_rate *devlink_rate, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) + goto nla_put_failure; + + if (devlink_rate_is_leaf(devlink_rate)) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + devlink_rate->devlink_port->index)) + goto nla_put_failure; + } else if (devlink_rate_is_node(devlink_rate)) { + if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, + devlink_rate->name)) + goto nla_put_failure; + } + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE, + devlink_rate->tx_share, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX, + devlink_rate->tx_max, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (devlink_rate->parent) + if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, + devlink_rate->parent->name)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + static bool devlink_port_fn_state_valid(enum devlink_port_fn_state state) { @@ -919,6 +1063,111 @@ static void devlink_port_notify(struct devlink_port *devlink_port, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_rate_notify(struct devlink_rate *devlink_rate, + enum devlink_command cmd) +{ + struct devlink *devlink = devlink_rate->devlink; + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && + cmd != DEVLINK_CMD_RATE_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_rate_fill(msg, devlink, devlink_rate, + cmd, 0, 0, 0, NULL); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_rate *devlink_rate; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err = 0; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; + u32 id = NETLINK_CB(cb->skb).portid; + + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_rate_fill(msg, devlink, + devlink_rate, + cmd, id, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, NULL); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_rate *devlink_rate = info->user_ptr[1]; + struct devlink *devlink = devlink_rate->devlink; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_rate_fill(msg, devlink, devlink_rate, + DEVLINK_CMD_RATE_NEW, + info->snd_portid, info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static bool +devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, + struct devlink_rate *parent) +{ + while (parent) { + if (parent == devlink_rate) + return true; + parent = parent->parent; + } + return false; +} + static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; @@ -1339,6 +1588,255 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, return devlink->ops->port_del(devlink, port_index, extack); } +static int +devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, + struct genl_info *info, + struct nlattr *nla_parent) +{ + struct devlink *devlink = devlink_rate->devlink; + const char *parent_name = nla_data(nla_parent); + const struct devlink_ops *ops = devlink->ops; + size_t len = strlen(parent_name); + struct devlink_rate *parent; + int err = -EOPNOTSUPP; + + parent = devlink_rate->parent; + if (parent && len) { + NL_SET_ERR_MSG_MOD(info->extack, "Rate object already has parent."); + return -EBUSY; + } else if (parent && !len) { + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_parent_set(devlink_rate, NULL, + devlink_rate->priv, NULL, + info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_parent_set(devlink_rate, NULL, + devlink_rate->priv, NULL, + info->extack); + if (err) + return err; + + refcount_dec(&parent->refcnt); + devlink_rate->parent = NULL; + } else if (!parent && len) { + parent = devlink_rate_node_get_by_name(devlink, parent_name); + if (IS_ERR(parent)) + return -ENODEV; + + if (parent == devlink_rate) { + NL_SET_ERR_MSG_MOD(info->extack, "Parent to self is not allowed"); + return -EINVAL; + } + + if (devlink_rate_is_node(devlink_rate) && + devlink_rate_is_parent_node(devlink_rate, parent->parent)) { + NL_SET_ERR_MSG_MOD(info->extack, "Node is already a parent of parent node."); + return -EEXIST; + } + + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_parent_set(devlink_rate, parent, + devlink_rate->priv, parent->priv, + info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_parent_set(devlink_rate, parent, + devlink_rate->priv, parent->priv, + info->extack); + if (err) + return err; + + refcount_inc(&parent->refcnt); + devlink_rate->parent = parent; + } + + return 0; +} + +static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, + const struct devlink_ops *ops, + struct genl_info *info) +{ + struct nlattr *nla_parent, **attrs = info->attrs; + int err = -EOPNOTSUPP; + u64 rate; + + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { + rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + if (err) + return err; + devlink_rate->tx_share = rate; + } + + if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { + rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + if (err) + return err; + devlink_rate->tx_max = rate; + } + + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; + if (nla_parent) { + err = devlink_nl_rate_parent_node_set(devlink_rate, info, + nla_parent); + if (err) + return err; + } + + return 0; +} + +static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, + struct genl_info *info, + enum devlink_rate_type type) +{ + struct nlattr **attrs = info->attrs; + + if (type == DEVLINK_RATE_TYPE_LEAF) { + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the leafs"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && + !ops->rate_leaf_parent_set) { + NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs"); + return false; + } + } else if (type == DEVLINK_RATE_TYPE_NODE) { + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && + !ops->rate_node_parent_set) { + NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes"); + return false; + } + } else { + WARN(1, "Unknown type of rate object"); + return false; + } + + return true; +} + +static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_rate *devlink_rate = info->user_ptr[1]; + struct devlink *devlink = devlink_rate->devlink; + const struct devlink_ops *ops = devlink->ops; + int err; + + if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) + return -EOPNOTSUPP; + + err = devlink_nl_rate_set(devlink_rate, ops, info); + + if (!err) + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); + return err; +} + +static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *rate_node; + const struct devlink_ops *ops; + int err; + + ops = devlink->ops; + if (!ops || !ops->rate_node_new || !ops->rate_node_del) { + NL_SET_ERR_MSG_MOD(info->extack, "Rate nodes aren't supported"); + return -EOPNOTSUPP; + } + + if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) + return -EOPNOTSUPP; + + rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); + if (!IS_ERR(rate_node)) + return -EEXIST; + else if (rate_node == ERR_PTR(-EINVAL)) + return -EINVAL; + + rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); + if (!rate_node) + return -ENOMEM; + + rate_node->devlink = devlink; + rate_node->type = DEVLINK_RATE_TYPE_NODE; + rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); + if (!rate_node->name) { + err = -ENOMEM; + goto err_strdup; + } + + err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); + if (err) + goto err_node_new; + + err = devlink_nl_rate_set(rate_node, ops, info); + if (err) + goto err_rate_set; + + refcount_set(&rate_node->refcnt, 1); + list_add(&rate_node->list, &devlink->rate_list); + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + return 0; + +err_rate_set: + ops->rate_node_del(rate_node, rate_node->priv, info->extack); +err_node_new: + kfree(rate_node->name); +err_strdup: + kfree(rate_node); + return err; +} + +static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_rate *rate_node = info->user_ptr[1]; + struct devlink *devlink = rate_node->devlink; + const struct devlink_ops *ops = devlink->ops; + int err; + + if (refcount_read(&rate_node->refcnt) > 1) { + NL_SET_ERR_MSG_MOD(info->extack, "Node has children. Cannot delete node."); + return -EBUSY; + } + + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); + err = ops->rate_node_del(rate_node, rate_node->priv, info->extack); + if (rate_node->parent) + refcount_dec(&rate_node->parent->refcnt); + list_del(&rate_node->list); + kfree(rate_node->name); + kfree(rate_node); + return err; +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -2207,6 +2705,30 @@ static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } +static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) +{ + struct devlink_rate *devlink_rate; + u16 old_mode; + int err; + + if (!devlink->ops->eswitch_mode_get) + return -EOPNOTSUPP; + err = devlink->ops->eswitch_mode_get(devlink, &old_mode); + if (err) + return err; + + if (old_mode == mode) + return 0; + + list_for_each_entry(devlink_rate, &devlink->rate_list, list) + if (devlink_rate_is_node(devlink_rate)) { + NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists."); + return -EBUSY; + } + return 0; +} + static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info) { @@ -2221,6 +2743,9 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, if (!ops->eswitch_mode_set) return -EOPNOTSUPP; mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + err = devlink_rate_nodes_check(devlink, mode, info->extack); + if (err) + return err; err = ops->eswitch_mode_set(devlink, mode, info->extack); if (err) return err; @@ -6994,8 +7519,9 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, } } -static int devlink_trap_stats_put(struct sk_buff *msg, - struct devlink_stats __percpu *trap_stats) +static int +devlink_trap_group_stats_put(struct sk_buff *msg, + struct devlink_stats __percpu *trap_stats) { struct devlink_stats stats; struct nlattr *attr; @@ -7023,6 +7549,50 @@ nla_put_failure: return -EMSGSIZE; } +static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_item *trap_item) +{ + struct devlink_stats stats; + struct nlattr *attr; + u64 drops = 0; + int err; + + if (devlink->ops->trap_drop_counter_get) { + err = devlink->ops->trap_drop_counter_get(devlink, + trap_item->trap, + &drops); + if (err) + return err; + } + + devlink_trap_stats_read(trap_item->stats, &stats); + + attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (devlink->ops->trap_drop_counter_get && + nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, + stats.rx_packets, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, + stats.rx_bytes, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd, u32 portid, u32 seq, @@ -7060,7 +7630,7 @@ static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; - err = devlink_trap_stats_put(msg, trap_item->stats); + err = devlink_trap_stats_put(msg, devlink, trap_item); if (err) goto nla_put_failure; @@ -7277,7 +7847,7 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, group_item->policer_item->policer->id)) goto nla_put_failure; - err = devlink_trap_stats_put(msg, group_item->stats); + err = devlink_trap_group_stats_put(msg, group_item->stats); if (err) goto nla_put_failure; @@ -7801,6 +8371,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7827,6 +8402,30 @@ static const struct genl_small_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { + .cmd = DEVLINK_CMD_RATE_GET, + .doit = devlink_nl_cmd_rate_get_doit, + .dumpit = devlink_nl_cmd_rate_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_RATE_SET, + .doit = devlink_nl_cmd_rate_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, + }, + { + .cmd = DEVLINK_CMD_RATE_NEW, + .doit = devlink_nl_cmd_rate_new_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_RATE_DEL, + .doit = devlink_nl_cmd_rate_del_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE, + }, + { .cmd = DEVLINK_CMD_PORT_SPLIT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_split_doit, @@ -8201,6 +8800,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); __devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); + INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); @@ -8303,6 +8903,7 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); + WARN_ON(!list_empty(&devlink->rate_list)); WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); @@ -8619,6 +9220,108 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); +/** + * devlink_rate_leaf_create - create devlink rate leaf + * + * @devlink_port: devlink port object to create rate object on + * @priv: driver private data + * + * Create devlink rate object of type leaf on provided @devlink_port. + * Throws call trace if @devlink_port already has a devlink rate object. + * + * Context: Takes and release devlink->lock <mutex>. + * + * Return: -ENOMEM if failed to allocate rate object, 0 otherwise. + */ +int +devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv) +{ + struct devlink *devlink = devlink_port->devlink; + struct devlink_rate *devlink_rate; + + devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); + if (!devlink_rate) + return -ENOMEM; + + mutex_lock(&devlink->lock); + WARN_ON(devlink_port->devlink_rate); + devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; + devlink_rate->devlink = devlink; + devlink_rate->devlink_port = devlink_port; + devlink_rate->priv = priv; + list_add_tail(&devlink_rate->list, &devlink->rate_list); + devlink_port->devlink_rate = devlink_rate; + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); + mutex_unlock(&devlink->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_rate_leaf_create); + +/** + * devlink_rate_leaf_destroy - destroy devlink rate leaf + * + * @devlink_port: devlink port linked to the rate object + * + * Context: Takes and release devlink->lock <mutex>. + */ +void devlink_rate_leaf_destroy(struct devlink_port *devlink_port) +{ + struct devlink_rate *devlink_rate = devlink_port->devlink_rate; + struct devlink *devlink = devlink_port->devlink; + + if (!devlink_rate) + return; + + mutex_lock(&devlink->lock); + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); + list_del(&devlink_rate->list); + devlink_port->devlink_rate = NULL; + mutex_unlock(&devlink->lock); + kfree(devlink_rate); +} +EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy); + +/** + * devlink_rate_nodes_destroy - destroy all devlink rate nodes on device + * + * @devlink: devlink instance + * + * Unset parent for all rate objects and destroy all rate nodes + * on specified device. + * + * Context: Takes and release devlink->lock <mutex>. + */ +void devlink_rate_nodes_destroy(struct devlink *devlink) +{ + static struct devlink_rate *devlink_rate, *tmp; + const struct devlink_ops *ops = devlink->ops; + + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + if (!devlink_rate->parent) + continue; + + refcount_dec(&devlink_rate->parent->refcnt); + if (devlink_rate_is_leaf(devlink_rate)) + ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + else if (devlink_rate_is_node(devlink_rate)) + ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + } + list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { + if (devlink_rate_is_node(devlink_rate)) { + ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); + list_del(&devlink_rate->list); + kfree(devlink_rate->name); + kfree(devlink_rate); + } + } + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_rate_nodes_destroy); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { @@ -8630,12 +9333,18 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: + case DEVLINK_PORT_FLAVOUR_VIRTUAL: + n = snprintf(name, len, "p%u", attrs->phys.port_number); + if (n < len && attrs->split) + n += snprintf(name + n, len - n, "s%u", + attrs->phys.split_subport_number); if (!attrs->split) n = snprintf(name, len, "p%u", attrs->phys.port_number); else n = snprintf(name, len, "p%us%u", attrs->phys.port_number, attrs->phys.split_subport_number); + break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: @@ -8677,8 +9386,6 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; - case DEVLINK_PORT_FLAVOUR_VIRTUAL: - return -EOPNOTSUPP; } if (n >= len) diff --git a/net/core/filter.c b/net/core/filter.c index 65ab4e21c087..0b13d8157a8f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3235,7 +3235,7 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) return ret; } -static int bpf_skb_proto_4_to_6(struct sk_buff *skb) +static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); @@ -3264,7 +3264,9 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) } /* Due to IPv6 header, MSS needs to be downgraded. */ - skb_decrease_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_decrease_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3276,7 +3278,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) return 0; } -static int bpf_skb_proto_6_to_4(struct sk_buff *skb) +static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); @@ -3305,7 +3307,9 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) } /* Due to IPv4 header, MSS can be upgraded. */ - skb_increase_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_increase_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3317,17 +3321,17 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) return 0; } -static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto, u64 flags) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) - return bpf_skb_proto_4_to_6(skb); + return bpf_skb_proto_4_to_6(skb, flags); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) - return bpf_skb_proto_6_to_4(skb); + return bpf_skb_proto_6_to_4(skb, flags); return -ENOTSUPP; } @@ -3337,7 +3341,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, { int ret; - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO))) return -EINVAL; /* General idea is that this helper does the basic groundwork @@ -3357,7 +3361,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ - ret = bpf_skb_proto_xlat(skb, proto); + ret = bpf_skb_proto_xlat(skb, proto, flags); bpf_compute_data_pointers(skb); return ret; } @@ -3927,6 +3931,23 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); +void bpf_clear_redirect_map(struct bpf_map *map) +{ + struct bpf_redirect_info *ri; + int cpu; + + for_each_possible_cpu(cpu) { + ri = per_cpu_ptr(&bpf_redirect_info, cpu); + /* Avoid polluting remote cacheline due to writes if + * not needed. Once we pass this test, we need the + * cmpxchg() to make sure it hasn't been changed in + * the meantime by remote CPU. + */ + if (unlikely(READ_ONCE(ri->map) == map)) + cmpxchg(&ri->map, map, NULL); + } +} + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { @@ -3934,6 +3955,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ @@ -3943,7 +3965,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - err = dev_map_enqueue(fwd, xdp, dev); + map = READ_ONCE(ri->map); + if (unlikely(map)) { + WRITE_ONCE(ri->map, NULL); + err = dev_map_enqueue_multi(xdp, dev, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_enqueue(fwd, xdp, dev); + } break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_enqueue(fwd, xdp, dev); @@ -3985,13 +4014,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, enum bpf_map_type map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map; int err; switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - err = dev_map_generic_redirect(fwd, skb, xdp_prog); + map = READ_ONCE(ri->map); + if (unlikely(map)) { + WRITE_ONCE(ri->map, NULL); + err = dev_map_redirect_multi(dev, skb, xdp_prog, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_generic_redirect(fwd, skb, xdp_prog); + } if (unlikely(err)) goto err; break; @@ -10008,11 +10045,13 @@ out: static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; + reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; @@ -10021,12 +10060,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; - bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); action = BPF_PROG_RUN(prog, &reuse_kern); if (action == SK_PASS) @@ -10136,6 +10176,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; default: return bpf_base_func_proto(func_id); } @@ -10165,6 +10207,14 @@ sk_reuseport_is_valid_access(int off, int size, case offsetof(struct sk_reuseport_md, hash): return size == size_default; + case offsetof(struct sk_reuseport_md, sk): + info->reg_type = PTR_TO_SOCKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, migrating_sk): + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + return size == sizeof(__u64); + /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) @@ -10237,6 +10287,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; + + case offsetof(struct sk_reuseport_md, sk): + SK_REUSEPORT_LOAD_FIELD(sk); + break; + + case offsetof(struct sk_reuseport_md, migrating_sk): + SK_REUSEPORT_LOAD_FIELD(migrating_sk); + break; } return insn - insn_buf; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3ed7c98a98e1..2aadbfc5193b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -943,8 +943,8 @@ bool __skb_flow_dissect(const struct net *net, int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; - /* Tail taggers don't break flow dissection */ - if (!ops->tail_tag) { + /* Only DSA header taggers break flow dissection */ + if (ops->needed_headroom) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bf774575ad71..53e85c70c6e5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3142,7 +3142,7 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; - int bucket = state->bucket; + int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 283ddb2dbc7d..c40cd8dd75c7 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -60,3 +60,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c310c7c1cef7..0a6b04714558 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -36,6 +36,7 @@ #include <net/ip6_checksum.h> #include <asm/unaligned.h> #include <trace/events/napi.h> +#include <linux/kconfig.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -389,7 +390,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) static atomic_t ip_ident; struct ipv6hdr *ip6h; - WARN_ON_ONCE(!irqs_disabled()); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!irqs_disabled()); udp_len = len + sizeof(*udph); if (np->ipv6) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 3c4c4c7a0402..5e4eb45b139c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -17,6 +17,7 @@ #include <linux/dma-mapping.h> #include <linux/page-flags.h> #include <linux/mm.h> /* for __put_page() */ +#include <linux/poison.h> #include <trace/events/page_pool.h> @@ -221,6 +222,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; } + page->pp_magic |= PP_SIGNATURE; + /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); @@ -263,6 +266,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } + page->pp_magic |= PP_SIGNATURE; pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -341,6 +345,8 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: + page->pp_magic = 0; + /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ @@ -622,3 +628,25 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool page_pool_return_skb_page(struct page *page) +{ + struct page_pool *pp; + + page = compound_head(page); + if (unlikely(page->pp_magic != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page->pp = NULL; + page_pool_put_full_page(pp, page, false); + + return true; +} +EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 3fba429f1f57..7e258d255e90 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -467,7 +467,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, static int pktgen_device_event(struct notifier_block *, unsigned long, void *); static void pktgen_run_all_threads(struct pktgen_net *pn); static void pktgen_reset_all_threads(struct pktgen_net *pn); -static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn); +static void pktgen_stop_all_threads(struct pktgen_net *pn); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); @@ -516,14 +516,11 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ if (!strcmp(data, "stop")) - pktgen_stop_all_threads_ifs(pn); - + pktgen_stop_all_threads(pn); else if (!strcmp(data, "start")) pktgen_run_all_threads(pn); - else if (!strcmp(data, "reset")) pktgen_reset_all_threads(pn); - else return -EINVAL; @@ -3027,20 +3024,25 @@ static void pktgen_run(struct pktgen_thread *t) t->control &= ~(T_STOP); } -static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn) +static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags) { struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= T_STOP; + t->control |= (flags); mutex_unlock(&pktgen_thread_lock); } +static void pktgen_stop_all_threads(struct pktgen_net *pn) +{ + func_enter(); + + pktgen_handle_all_threads(pn, T_STOP); +} + static int thread_is_running(const struct pktgen_thread *t) { const struct pktgen_dev *pkt_dev; @@ -3103,16 +3105,9 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn) static void pktgen_run_all_threads(struct pktgen_net *pn) { - struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); - - list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= (T_RUN); - - mutex_unlock(&pktgen_thread_lock); + pktgen_handle_all_threads(pn, T_RUN); /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); @@ -3122,16 +3117,9 @@ static void pktgen_run_all_threads(struct pktgen_net *pn) static void pktgen_reset_all_threads(struct pktgen_net *pn) { - struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); - - list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= (T_REMDEVALL); - - mutex_unlock(&pktgen_thread_lock); + pktgen_handle_all_threads(pn, T_REMDEVALL); /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ec931b080156..745965e49f78 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -9,7 +9,7 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: - * Vitaly E. Lavrov RTA_OK arithmetics was wrong. + * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ #include <linux/bitops.h> @@ -234,7 +234,7 @@ unlock: * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Like rtnl_register, but for use by removable modules. */ @@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(rtnl_register_module); * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the @@ -376,12 +376,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (rtnl_link_ops_get(ops->kind)) return -EEXIST; - /* The check for setup is here because if ops + /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ - if (ops->setup && !ops->dellink) + if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); @@ -543,7 +543,9 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) { const struct rtnl_af_ops *ops; - list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + ASSERT_RTNL(); + + list_for_each_entry(ops, &rtnl_af_ops, list) { if (ops->family == family) return ops; } @@ -1819,6 +1821,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure; + if (dev->dev.parent && + nla_put_string(skb, IFLA_PARENT_DEV_NAME, + dev_name(dev->dev.parent))) + goto nla_put_failure; + + if (dev->dev.parent && dev->dev.parent->bus && + nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, + dev->dev.parent->bus->name)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1878,6 +1890,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), + [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2274,27 +2287,18 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; - rcu_read_lock(); af_ops = rtnl_af_lookup(nla_type(af)); - if (!af_ops) { - rcu_read_unlock(); + if (!af_ops) return -EAFNOSUPPORT; - } - if (!af_ops->set_link_af) { - rcu_read_unlock(); + if (!af_ops->set_link_af) return -EOPNOTSUPP; - } if (af_ops->validate_link_af) { err = af_ops->validate_link_af(dev, af); - if (err < 0) { - rcu_read_unlock(); + if (err < 0) return err; - } } - - rcu_read_unlock(); } } @@ -2574,7 +2578,7 @@ static int do_set_proto_down(struct net_device *dev, if (nl_proto_down) { proto_down = nla_get_u8(nl_proto_down); - /* Dont turn off protodown if there are active reasons */ + /* Don't turn off protodown if there are active reasons */ if (!proto_down && dev->proto_down_reason) { NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; @@ -2868,17 +2872,12 @@ static int do_setlink(const struct sk_buff *skb, nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; - rcu_read_lock(); - BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); err = af_ops->set_link_af(dev, af, extack); - if (err < 0) { - rcu_read_unlock(); + if (err < 0) goto errout; - } - rcu_read_unlock(); status |= DO_SETLINK_NOTIFY; } } @@ -3177,8 +3176,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, return ERR_PTR(-EINVAL); } - dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, - ops->setup, num_tx_queues, num_rx_queues); + if (ops->alloc) { + dev = ops->alloc(tb, ifname, name_assign_type, + num_tx_queues, num_rx_queues); + if (IS_ERR(dev)) + return dev; + } else { + dev = alloc_netdev_mqs(ops->priv_size, ifname, + name_assign_type, ops->setup, + num_tx_queues, num_rx_queues); + } + if (!dev) return ERR_PTR(-ENOMEM); @@ -3411,7 +3419,7 @@ replay: return -EOPNOTSUPP; } - if (!ops->setup) + if (!ops->alloc && !ops->setup) return -EOPNOTSUPP; if (!ifname[0]) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bbc3b4b62032..2531ac4ffa69 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,6 +70,7 @@ #include <net/xfrm.h> #include <net/mpls.h> #include <net/mptcp.h> +#include <net/page_pool.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -645,10 +646,13 @@ static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; - if (skb->head_frag) + if (skb->head_frag) { + if (skb_pp_recycle(skb, head)) + return; skb_free_frag(head); - else + } else { kfree(head); + } } static void skb_release_data(struct sk_buff *skb) @@ -664,7 +668,7 @@ static void skb_release_data(struct sk_buff *skb) skb_zcopy_clear(skb, true); for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i]); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); @@ -1046,6 +1050,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->nohdr = 0; n->peeked = 0; C(pfmemalloc); + C(pp_recycle); n->destructor = NULL; C(tail); C(end); @@ -3497,7 +3502,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom); + __skb_frag_unref(fragfrom, skb->pp_recycle); } /* Reposition in the original skb */ @@ -5287,6 +5292,13 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; + /* The page pool signature of struct page will eventually figure out + * which pages can be recycled or not but for now let's prohibit slab + * allocated and page_pool allocated SKBs from being coalesced. + */ + if (to->pp_recycle != from->pp_recycle) + return false; + if (len <= skb_tailroom(to)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 43ce17a6a585..f0b9decdf279 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,8 +399,7 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, - long timeo, int *err) +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; diff --git a/net/core/sock.c b/net/core/sock.c index 946888afef88..ddfa88082a2b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -776,6 +776,58 @@ void sock_enable_timestamps(struct sock *sk) } EXPORT_SYMBOL(sock_enable_timestamps); +void sock_set_timestamp(struct sock *sk, int optname, bool valbool) +{ + switch (optname) { + case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; + case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; + case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; + case SO_TIMESTAMPNS_NEW: + __sock_set_timestamps(sk, valbool, true, true); + break; + } +} + +int sock_set_timestamping(struct sock *sk, int optname, int val) +{ + if (val & ~SOF_TIMESTAMPING_MASK) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) { + if ((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN)) + return -EINVAL; + sk->sk_tskey = tcp_sk(sk)->snd_una; + } else { + sk->sk_tskey = 0; + } + } + + if (val & SOF_TIMESTAMPING_OPT_STATS && + !(val & SOF_TIMESTAMPING_OPT_TSONLY)) + return -EINVAL; + + sk->sk_tsflags = val; + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) + sock_enable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + else + sock_disable_timestamp(sk, + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + return 0; +} + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); @@ -997,54 +1049,15 @@ set_sndbuf: break; case SO_TIMESTAMP_OLD: - __sock_set_timestamps(sk, valbool, false, false); - break; case SO_TIMESTAMP_NEW: - __sock_set_timestamps(sk, valbool, true, false); - break; case SO_TIMESTAMPNS_OLD: - __sock_set_timestamps(sk, valbool, false, true); - break; case SO_TIMESTAMPNS_NEW: - __sock_set_timestamps(sk, valbool, true, true); + sock_set_timestamp(sk, valbool, optname); break; + case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: - if (val & ~SOF_TIMESTAMPING_MASK) { - ret = -EINVAL; - break; - } - - if (val & SOF_TIMESTAMPING_OPT_ID && - !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) { - if ((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN)) { - ret = -EINVAL; - break; - } - sk->sk_tskey = tcp_sk(sk)->snd_una; - } else { - sk->sk_tskey = 0; - } - } - - if (val & SOF_TIMESTAMPING_OPT_STATS && - !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { - ret = -EINVAL; - break; - } - - sk->sk_tsflags = val; - sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); - - if (val & SOF_TIMESTAMPING_RX_SOFTWARE) - sock_enable_timestamp(sk, - SOCK_TIMESTAMPING_RX_SOFTWARE); - else - sock_disable_timestamp(sk, - (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + ret = sock_set_timestamping(sk, optname, val); break; case SO_RCVLOWAT: diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index b065f0a103ed..de5ee3ae86d5 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -17,6 +17,74 @@ DEFINE_SPINLOCK(reuseport_lock); static DEFINE_IDA(reuseport_ida); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany); + +static int reuseport_sock_index(struct sock *sk, + const struct sock_reuseport *reuse, + bool closed) +{ + int left, right; + + if (!closed) { + left = 0; + right = reuse->num_socks; + } else { + left = reuse->max_socks - reuse->num_closed_socks; + right = reuse->max_socks; + } + + for (; left < right; left++) + if (reuse->socks[left] == sk) + return left; + return -1; +} + +static void __reuseport_add_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ + smp_wmb(); + reuse->num_socks++; +} + +static bool __reuseport_detach_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, false); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + + return true; +} + +static void __reuseport_add_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); +} + +static bool __reuseport_detach_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, true); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); + + return true; +} static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { @@ -49,6 +117,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (reuse) { + if (reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + ret = reuseport_resurrect(sk, reuse, NULL, bind_inany); + goto out; + } + /* Only set reuse->bind_inany if the bind_inany is true. * Otherwise, it will overwrite the reuse->bind_inany * which was set by the bind/hash path. @@ -72,9 +146,9 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) } reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; - reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -90,14 +164,30 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) u32 more_socks_size, i; more_socks_size = reuse->max_socks * 2U; - if (more_socks_size > U16_MAX) + if (more_socks_size > U16_MAX) { + if (reuse->num_closed_socks) { + /* Make room by removing a closed sk. + * The child has already been migrated. + * Only reqsk left at this point. + */ + struct sock *sk; + + sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL); + __reuseport_detach_closed_sock(sk, reuse); + + return reuse; + } + return NULL; + } more_reuse = __reuseport_alloc(more_socks_size); if (!more_reuse) return NULL; more_reuse->num_socks = reuse->num_socks; + more_reuse->num_closed_socks = reuse->num_closed_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; @@ -105,9 +195,13 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); + memcpy(more_reuse->socks + + (more_reuse->max_socks - more_reuse->num_closed_socks), + reuse->socks + (reuse->max_socks - reuse->num_closed_socks), + reuse->num_closed_socks * sizeof(struct sock *)); more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); - for (i = 0; i < reuse->num_socks; ++i) + for (i = 0; i < reuse->max_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, more_reuse); @@ -152,13 +246,21 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)); + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany); + + spin_unlock_bh(&reuseport_lock); + return err; + } + if (old_reuse && old_reuse->num_socks != 1) { spin_unlock_bh(&reuseport_lock); return -EBUSY; } - if (reuse->num_socks == reuse->max_socks) { + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); if (!reuse) { spin_unlock_bh(&reuseport_lock); @@ -166,10 +268,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } } - reuse->socks[reuse->num_socks] = sk; - /* paired with smp_rmb() in reuseport_select_sock() */ - smp_wmb(); - reuse->num_socks++; + __reuseport_add_sock(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); spin_unlock_bh(&reuseport_lock); @@ -180,15 +279,77 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } EXPORT_SYMBOL(reuseport_add_sock); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany) +{ + if (old_reuse == reuse) { + /* If sk was in the same reuseport group, just pop sk out of + * the closed section and push sk into the listening section. + */ + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, old_reuse); + return 0; + } + + if (!reuse) { + /* In bind()/listen() path, we cannot carry over the eBPF prog + * for the shutdown()ed socket. In setsockopt() path, we should + * not change the eBPF prog of listening sockets by attaching a + * prog to the shutdown()ed socket. Thus, we will allocate a new + * reuseport group and detach sk from the old group. + */ + int id; + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) + return -ENOMEM; + + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + return id; + } + + reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; + } else { + /* Move sk from the old group to the new one if + * - all the other listeners in the old group were close()d or + * shutdown()ed, and then sk2 has listen()ed on the same port + * OR + * - sk listen()ed without bind() (or with autobind), was + * shutdown()ed, and then listen()s on another port which + * sk2 listen()s on. + */ + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) + return -ENOMEM; + } + } + + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + if (old_reuse->num_socks + old_reuse->num_closed_socks == 0) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + + return 0; +} + void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; - int i; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + /* reuseport_grow() has detached a closed sk */ + if (!reuse) + goto out; + /* Notify the bpf side. The sk may be added to a sockarray * map. If so, sockarray logic will remove it from the map. * @@ -201,19 +362,52 @@ void reuseport_detach_sock(struct sock *sk) rcu_assign_pointer(sk->sk_reuseport_cb, NULL); - for (i = 0; i < reuse->num_socks; i++) { - if (reuse->socks[i] == sk) { - reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; - reuse->num_socks--; - if (reuse->num_socks == 0) - call_rcu(&reuse->rcu, reuseport_free_rcu); - break; - } - } + if (!__reuseport_detach_closed_sock(sk, reuse)) + __reuseport_detach_sock(sk, reuse); + + if (reuse->num_socks + reuse->num_closed_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); + +out: spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_detach_sock); +void reuseport_stop_listen_sock(struct sock *sk) +{ + if (sk->sk_protocol == IPPROTO_TCP) { + struct sock_reuseport *reuse; + struct bpf_prog *prog; + + spin_lock_bh(&reuseport_lock); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req || + (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { + /* Migration capable, move sk from the listening section + * to the closed section. + */ + bpf_sk_reuseport_detach(sk); + + __reuseport_detach_sock(sk, reuse); + __reuseport_add_closed_sock(sk, reuse); + + spin_unlock_bh(&reuseport_lock); + return; + } + + spin_unlock_bh(&reuseport_lock); + } + + /* Not capable to do migration, detach immediately */ + reuseport_detach_sock(sk); +} +EXPORT_SYMBOL(reuseport_stop_listen_sock); + static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, struct bpf_prog *prog, struct sk_buff *skb, int hdr_len) @@ -244,6 +438,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, return reuse->socks[index]; } +static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, + u32 hash, u16 num_socks) +{ + int i, j; + + i = j = reciprocal_scale(hash, num_socks); + while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + i++; + if (i >= num_socks) + i = 0; + if (i == j) + return NULL; + } + + return reuse->socks[i]; +} + /** * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. * @sk: First socket in the group. @@ -274,32 +485,21 @@ struct sock *reuseport_select_sock(struct sock *sk, prog = rcu_dereference(reuse->prog); socks = READ_ONCE(reuse->num_socks); if (likely(socks)) { - /* paired with smp_wmb() in reuseport_add_sock() */ + /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); if (!prog || !skb) goto select_by_hash; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash); else sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ - if (!sk2) { - int i, j; - - i = j = reciprocal_scale(hash, socks); - while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { - i++; - if (i >= socks) - i = 0; - if (i == j) - goto out; - } - sk2 = reuse->socks[i]; - } + if (!sk2) + sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); } out: @@ -308,14 +508,84 @@ out: } EXPORT_SYMBOL(reuseport_select_sock); +/** + * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. + * @sk: close()ed or shutdown()ed socket in the group. + * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or + * NEW_SYN_RECV request socket during 3WHS. + * @skb: skb to run through BPF filter. + * Returns a socket (with sk_refcnt +1) that should accept the child socket + * (or NULL on error). + */ +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb) +{ + struct sock_reuseport *reuse; + struct sock *nsk = NULL; + bool allocated = false; + struct bpf_prog *prog; + u16 socks; + u32 hash; + + rcu_read_lock(); + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (!reuse) + goto out; + + socks = READ_ONCE(reuse->num_socks); + if (unlikely(!socks)) + goto out; + + /* paired with smp_wmb() in __reuseport_add_sock() */ + smp_rmb(); + + hash = migrating_sk->sk_hash; + prog = rcu_dereference(reuse->prog); + if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + goto select_by_hash; + goto out; + } + + if (!skb) { + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto out; + allocated = true; + } + + nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash); + + if (allocated) + kfree_skb(skb); + +select_by_hash: + if (!nsk) + nsk = reuseport_select_sock_by_hash(reuse, hash, socks); + + if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) + nsk = NULL; + +out: + rcu_read_unlock(); + return nsk; +} +EXPORT_SYMBOL(reuseport_migrate_sock); + int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (sk_unhashed(sk) && sk->sk_reuseport) { - int err = reuseport_alloc(sk, false); + if (sk_unhashed(sk)) { + int err; + if (!sk->sk_reuseport) + return -EINVAL; + + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -341,13 +611,24 @@ int reuseport_detach_prog(struct sock *sk) struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return sk->sk_reuseport ? -ENOENT : -EINVAL; - old_prog = NULL; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* reuse must be checked after acquiring the reuseport_lock + * because reuseport_grow() can detach a closed sk. + */ + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return sk->sk_reuseport ? -ENOENT : -EINVAL; + } + + if (sk_unhashed(sk) && reuse->num_closed_socks) { + spin_unlock_bh(&reuseport_lock); + return -ENOENT; + } + old_prog = rcu_replace_pointer(reuse->prog, old_prog, lockdep_is_held(&reuseport_lock)); spin_unlock_bh(&reuseport_lock); diff --git a/net/core/xdp.c b/net/core/xdp.c index 858276e72c68..725d20f1b100 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -584,3 +584,31 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, return __xdp_build_skb_from_frame(xdpf, skb, dev); } EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); + +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) +{ + unsigned int headroom, totalsize; + struct xdp_frame *nxdpf; + struct page *page; + void *addr; + + headroom = xdpf->headroom + sizeof(*xdpf); + totalsize = headroom + xdpf->len; + + if (unlikely(totalsize > PAGE_SIZE)) + return NULL; + page = dev_alloc_page(); + if (!page) + return NULL; + addr = page_to_virt(page); + + memcpy(addr, xdpf, totalsize); + + nxdpf = addr; + nxdpf->data = addr + headroom; + nxdpf->frame_sz = PAGE_SIZE; + nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + nxdpf->mem.id = 0; + + return nxdpf; +} diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 653e3bc9c87b..b441ab330fd3 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1381,7 +1381,7 @@ static int dcbnl_notify(struct net_device *dev, int event, int cmd, skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh); if (!skb) - return -ENOBUFS; + return -ENOMEM; if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) err = dcbnl_ieee_fill(skb, dev); @@ -1781,7 +1781,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); if (!reply_skb) - return -ENOBUFS; + return -ENOMEM; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { @@ -2075,8 +2075,6 @@ EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); static int __init dcbnl_init(void) { - INIT_LIST_HEAD(&dcb_app_list); - rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index e2a337fa9ff7..92a8c6bea316 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -688,6 +688,7 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) /** * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% + * @loss_event_rate: loss event rate to invert * When @loss_event_rate is large, there is a chance that p is truncated to 0. * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ffc601a3b329..f81c1df761d3 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -977,7 +977,6 @@ static const struct net_protocol dccp_v4_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 1a12912b88d6..7ab788f41a3f 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -870,7 +870,7 @@ int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb) /* * Read out ack data here, this applies equally - * to data, other data, link serivce and both + * to data, other data, link service and both * ack data and ack otherdata. */ dn_process_ack(sk, skb, other); diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 00f2ed721ec1..eadc89583168 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -179,7 +179,7 @@ static void dn_nsp_rtt(struct sock *sk, long rtt) scp->nsp_srtt = 1; /* - * Add new rtt varience to smoothed varience + * Add new rtt variance to smoothed varience */ delta >>= 1; rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 32b1bed8ae51..729d3de6020d 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -604,7 +604,7 @@ drop_it: static int dn_route_discard(struct net *net, struct sock *sk, struct sk_buff *skb) { /* - * I know we drop the packet here, but thats considered success in + * I know we drop the packet here, but that's considered success in * this case */ kfree_skb(skb); diff --git a/net/devres.c b/net/devres.c index 1f9be2133787..5ccf6ca311dc 100644 --- a/net/devres.c +++ b/net/devres.c @@ -60,7 +60,7 @@ static int netdev_devres_match(struct device *dev, void *this, void *match_data) * @ndev: device to register * * This is a devres variant of register_netdev() for which the unregister - * function will be call automatically when the managing device is + * function will be called automatically when the managing device is * detached. Note: the net_device used must also be resource managed by * the same struct device. */ diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 92282de54230..b8b17474b72b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -154,6 +154,11 @@ const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); bool dsa_schedule_work(struct work_struct *work); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); +static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) +{ + return ops->needed_headroom + ops->needed_tailroom; +} + /* master.c */ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp); void dsa_master_teardown(struct net_device *dev); diff --git a/net/dsa/master.c b/net/dsa/master.c index 63adbc21a735..3fc90e36772d 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -346,10 +346,12 @@ static struct lock_class_key dsa_master_addr_list_lock_key; int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { - int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; + const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops; struct dsa_switch *ds = cpu_dp->ds; struct device_link *consumer_link; - int ret; + int mtu, ret; + + mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops); /* The DSA master must use SET_NETDEV_DEV for this to work. */ consumer_link = device_link_add(ds->dev, dev->dev.parent, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d4756b920108..798944aa847a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1569,7 +1569,7 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) mtu_limit = min_t(int, master->max_mtu, dev->max_mtu); old_master_mtu = master->mtu; - new_master_mtu = largest_mtu + cpu_dp->tag_ops->overhead; + new_master_mtu = largest_mtu + dsa_tag_protocol_overhead(cpu_dp->tag_ops); if (new_master_mtu > mtu_limit) return -ERANGE; @@ -1605,7 +1605,7 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) out_port_failed: if (new_master_mtu != old_master_mtu) dsa_port_mtu_change(cpu_dp, old_master_mtu - - cpu_dp->tag_ops->overhead, + dsa_tag_protocol_overhead(cpu_dp->tag_ops), true); out_cpu_failed: if (new_master_mtu != old_master_mtu) @@ -1749,7 +1749,8 @@ static void dsa_slave_phylink_fixed_state(struct phylink_config *config, } /* slave device setup *******************************************************/ -static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) +static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr, + u32 flags) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_switch *ds = dp->ds; @@ -1760,6 +1761,8 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) return -ENODEV; } + slave_dev->phydev->dev_flags |= flags; + return phylink_connect_phy(dp->pl, slave_dev->phydev); } @@ -1804,7 +1807,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) /* We could not connect to a designated PHY or SFP, so try to * use the switch internal MDIO bus instead */ - ret = dsa_slave_phy_connect(slave_dev, dp->index); + ret = dsa_slave_phy_connect(slave_dev, dp->index, phy_flags); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", @@ -1824,10 +1827,8 @@ void dsa_slave_setup_tagger(struct net_device *slave) const struct dsa_port *cpu_dp = dp->cpu_dp; struct net_device *master = cpu_dp->master; - if (cpu_dp->tag_ops->tail_tag) - slave->needed_tailroom = cpu_dp->tag_ops->overhead; - else - slave->needed_headroom = cpu_dp->tag_ops->overhead; + slave->needed_headroom = cpu_dp->tag_ops->needed_headroom; + slave->needed_tailroom = cpu_dp->tag_ops->needed_tailroom; /* Try to save one extra realloc later in the TX path (in the master) * by also inheriting the master's needed headroom and tailroom. * The 8021q driver also does this. diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 122ad5833fb1..4aa29f90ecea 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -471,4 +471,27 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *subvlan) +{ + u16 vid, tci; + + skb_push_rcsum(skb, ETH_HLEN); + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + __vlan_hwaccel_clear_tag(skb); + } else { + __skb_vlan_pop(skb, &tci); + } + skb_pull_rcsum(skb, ETH_HLEN); + + vid = tci & VLAN_VID_MASK; + + *source_port = dsa_8021q_rx_source_port(vid); + *switch_id = dsa_8021q_rx_switch_id(vid); + *subvlan = dsa_8021q_rx_subvlan(vid); + skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +} +EXPORT_SYMBOL_GPL(dsa_8021q_rcv); + MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 002cf7f952e2..0efae1a372b3 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -85,7 +85,7 @@ static const struct dsa_device_ops ar9331_netdev_ops = { .proto = DSA_TAG_PROTO_AR9331, .xmit = ar9331_tag_xmit, .rcv = ar9331_tag_rcv, - .overhead = AR9331_HDR_LEN, + .needed_headroom = AR9331_HDR_LEN, }; MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 40e9f3098c8d..0750af951fc9 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -205,7 +205,7 @@ static const struct dsa_device_ops brcm_netdev_ops = { .proto = DSA_TAG_PROTO_BRCM, .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, - .overhead = BRCM_TAG_LEN, + .needed_headroom = BRCM_TAG_LEN, }; DSA_TAG_DRIVER(brcm_netdev_ops); @@ -286,7 +286,7 @@ static const struct dsa_device_ops brcm_legacy_netdev_ops = { .proto = DSA_TAG_PROTO_BRCM_LEGACY, .xmit = brcm_leg_tag_xmit, .rcv = brcm_leg_tag_rcv, - .overhead = BRCM_LEG_TAG_LEN, + .needed_headroom = BRCM_LEG_TAG_LEN, }; DSA_TAG_DRIVER(brcm_legacy_netdev_ops); @@ -314,7 +314,7 @@ static const struct dsa_device_ops brcm_prepend_netdev_ops = { .proto = DSA_TAG_PROTO_BRCM_PREPEND, .xmit = brcm_tag_xmit_prepend, .rcv = brcm_tag_rcv_prepend, - .overhead = BRCM_TAG_LEN, + .needed_headroom = BRCM_TAG_LEN, }; DSA_TAG_DRIVER(brcm_prepend_netdev_ops); diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 7e7b7decdf39..a822355afc90 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -303,7 +303,7 @@ static const struct dsa_device_ops dsa_netdev_ops = { .proto = DSA_TAG_PROTO_DSA, .xmit = dsa_xmit, .rcv = dsa_rcv, - .overhead = DSA_HLEN, + .needed_headroom = DSA_HLEN, }; DSA_TAG_DRIVER(dsa_netdev_ops); @@ -346,7 +346,7 @@ static const struct dsa_device_ops edsa_netdev_ops = { .proto = DSA_TAG_PROTO_EDSA, .xmit = edsa_xmit, .rcv = edsa_rcv, - .overhead = EDSA_HLEN, + .needed_headroom = EDSA_HLEN, }; DSA_TAG_DRIVER(edsa_netdev_ops); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 2f5bd5e338ab..5985dab06ab8 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -103,7 +103,7 @@ static const struct dsa_device_ops gswip_netdev_ops = { .proto = DSA_TAG_PROTO_GSWIP, .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, - .overhead = GSWIP_RX_HEADER_LEN, + .needed_headroom = GSWIP_RX_HEADER_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index a09805c8e1ab..424130f85f59 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -54,8 +54,7 @@ static const struct dsa_device_ops hellcreek_netdev_ops = { .proto = DSA_TAG_PROTO_HELLCREEK, .xmit = hellcreek_xmit, .rcv = hellcreek_rcv, - .overhead = HELLCREEK_TAG_LEN, - .tail_tag = true, + .needed_tailroom = HELLCREEK_TAG_LEN, }; MODULE_LICENSE("Dual MIT/GPL"); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 4820dbcedfa2..53565f48934c 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -77,8 +77,7 @@ static const struct dsa_device_ops ksz8795_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ8795, .xmit = ksz8795_xmit, .rcv = ksz8795_rcv, - .overhead = KSZ_INGRESS_TAG_LEN, - .tail_tag = true, + .needed_tailroom = KSZ_INGRESS_TAG_LEN, }; DSA_TAG_DRIVER(ksz8795_netdev_ops); @@ -149,8 +148,7 @@ static const struct dsa_device_ops ksz9477_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9477, .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, - .overhead = KSZ9477_INGRESS_TAG_LEN, - .tail_tag = true, + .needed_tailroom = KSZ9477_INGRESS_TAG_LEN, }; DSA_TAG_DRIVER(ksz9477_netdev_ops); @@ -183,8 +181,7 @@ static const struct dsa_device_ops ksz9893_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9893, .xmit = ksz9893_xmit, .rcv = ksz9477_rcv, - .overhead = KSZ_INGRESS_TAG_LEN, - .tail_tag = true, + .needed_tailroom = KSZ_INGRESS_TAG_LEN, }; DSA_TAG_DRIVER(ksz9893_netdev_ops); diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index aa1318dccaf0..26207ef39ebc 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -125,7 +125,7 @@ static const struct dsa_device_ops lan9303_netdev_ops = { .proto = DSA_TAG_PROTO_LAN9303, .xmit = lan9303_xmit, .rcv = lan9303_rcv, - .overhead = LAN9303_TAG_LEN, + .needed_headroom = LAN9303_TAG_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index f9b2966d1936..cc3ba864ad5b 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -102,7 +102,7 @@ static const struct dsa_device_ops mtk_netdev_ops = { .proto = DSA_TAG_PROTO_MTK, .xmit = mtk_tag_xmit, .rcv = mtk_tag_rcv, - .overhead = MTK_HDR_LEN, + .needed_headroom = MTK_HDR_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 91f0fd1242cd..190f4bfd3bef 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -143,7 +143,7 @@ static const struct dsa_device_ops ocelot_netdev_ops = { .proto = DSA_TAG_PROTO_OCELOT, .xmit = ocelot_xmit, .rcv = ocelot_rcv, - .overhead = OCELOT_TOTAL_TAG_LEN, + .needed_headroom = OCELOT_TOTAL_TAG_LEN, .promisc_on_master = true, }; @@ -155,7 +155,7 @@ static const struct dsa_device_ops seville_netdev_ops = { .proto = DSA_TAG_PROTO_SEVILLE, .xmit = seville_xmit, .rcv = ocelot_rcv, - .overhead = OCELOT_TOTAL_TAG_LEN, + .needed_headroom = OCELOT_TOTAL_TAG_LEN, .promisc_on_master = true, }; diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 62a93303bd63..85ac85c3af8c 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -41,29 +41,15 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - int src_port, switch_id, qos_class; - u16 vid, tci; + int src_port, switch_id, subvlan; - skb_push_rcsum(skb, ETH_HLEN); - if (skb_vlan_tag_present(skb)) { - tci = skb_vlan_tag_get(skb); - __vlan_hwaccel_clear_tag(skb); - } else { - __skb_vlan_pop(skb, &tci); - } - skb_pull_rcsum(skb, ETH_HLEN); - - vid = tci & VLAN_VID_MASK; - src_port = dsa_8021q_rx_source_port(vid); - switch_id = dsa_8021q_rx_switch_id(vid); - qos_class = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + dsa_8021q_rcv(skb, &src_port, &switch_id, &subvlan); skb->dev = dsa_master_find_slave(netdev, switch_id, src_port); if (!skb->dev) return NULL; skb->offload_fwd_mark = 1; - skb->priority = qos_class; return skb; } @@ -73,7 +59,7 @@ static const struct dsa_device_ops ocelot_8021q_netdev_ops = { .proto = DSA_TAG_PROTO_OCELOT_8021Q, .xmit = ocelot_xmit, .rcv = ocelot_rcv, - .overhead = VLAN_HLEN, + .needed_headroom = VLAN_HLEN, .promisc_on_master = true, }; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 88181b52f480..693bda013065 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -91,7 +91,7 @@ static const struct dsa_device_ops qca_netdev_ops = { .proto = DSA_TAG_PROTO_QCA, .xmit = qca_tag_xmit, .rcv = qca_tag_rcv, - .overhead = QCA_HDR_LEN, + .needed_headroom = QCA_HDR_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index cf8ac316f4c7..57c46b4ab2b3 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -124,7 +124,7 @@ static const struct dsa_device_ops rtl4a_netdev_ops = { .proto = DSA_TAG_PROTO_RTL4_A, .xmit = rtl4a_tag_xmit, .rcv = rtl4a_tag_rcv, - .overhead = RTL4_A_HDR_LEN, + .needed_headroom = RTL4_A_HDR_LEN, }; module_dsa_tag_driver(rtl4a_netdev_ops); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 50496013cdb7..9c2df9ece01b 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -7,6 +7,52 @@ #include <linux/packing.h> #include "dsa_priv.h" +/* Is this a TX or an RX header? */ +#define SJA1110_HEADER_HOST_TO_SWITCH BIT(15) + +/* RX header */ +#define SJA1110_RX_HEADER_IS_METADATA BIT(14) +#define SJA1110_RX_HEADER_HOST_ONLY BIT(13) +#define SJA1110_RX_HEADER_HAS_TRAILER BIT(12) + +/* Trap-to-host format (no trailer present) */ +#define SJA1110_RX_HEADER_SRC_PORT(x) (((x) & GENMASK(7, 4)) >> 4) +#define SJA1110_RX_HEADER_SWITCH_ID(x) ((x) & GENMASK(3, 0)) + +/* Timestamp format (trailer present) */ +#define SJA1110_RX_HEADER_TRAILER_POS(x) ((x) & GENMASK(11, 0)) + +#define SJA1110_RX_TRAILER_SWITCH_ID(x) (((x) & GENMASK(7, 4)) >> 4) +#define SJA1110_RX_TRAILER_SRC_PORT(x) ((x) & GENMASK(3, 0)) + +/* Meta frame format (for 2-step TX timestamps) */ +#define SJA1110_RX_HEADER_N_TS(x) (((x) & GENMASK(8, 4)) >> 4) + +/* TX header */ +#define SJA1110_TX_HEADER_UPDATE_TC BIT(14) +#define SJA1110_TX_HEADER_TAKE_TS BIT(13) +#define SJA1110_TX_HEADER_TAKE_TS_CASC BIT(12) +#define SJA1110_TX_HEADER_HAS_TRAILER BIT(11) + +/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is false */ +#define SJA1110_TX_HEADER_PRIO(x) (((x) << 7) & GENMASK(10, 7)) +#define SJA1110_TX_HEADER_TSTAMP_ID(x) ((x) & GENMASK(7, 0)) + +/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is true */ +#define SJA1110_TX_HEADER_TRAILER_POS(x) ((x) & GENMASK(10, 0)) + +#define SJA1110_TX_TRAILER_TSTAMP_ID(x) (((x) << 24) & GENMASK(31, 24)) +#define SJA1110_TX_TRAILER_PRIO(x) (((x) << 21) & GENMASK(23, 21)) +#define SJA1110_TX_TRAILER_SWITCHID(x) (((x) << 12) & GENMASK(15, 12)) +#define SJA1110_TX_TRAILER_DESTPORTS(x) (((x) << 1) & GENMASK(11, 1)) + +#define SJA1110_META_TSTAMP_SIZE 10 + +#define SJA1110_HEADER_LEN 4 +#define SJA1110_RX_TRAILER_LEN 13 +#define SJA1110_TX_TRAILER_LEN 4 +#define SJA1110_MAX_PADDING_LEN 15 + /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ static inline bool sja1105_is_link_local(const struct sk_buff *skb) { @@ -140,6 +186,57 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } +static struct sk_buff *sja1110_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone; + struct dsa_port *dp = dsa_slave_to_port(netdev); + u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + struct ethhdr *eth_hdr; + __be32 *tx_trailer; + __be16 *tx_header; + int trailer_pos; + + /* Transmitting control packets is done using in-band control + * extensions, while data packets are transmitted using + * tag_8021q TX VLANs. + */ + if (likely(!sja1105_is_link_local(skb))) + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), + ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); + + skb_push(skb, SJA1110_HEADER_LEN); + + /* Move Ethernet header to the left, making space for DSA tag */ + memmove(skb->data, skb->data + SJA1110_HEADER_LEN, 2 * ETH_ALEN); + + trailer_pos = skb->len; + + /* On TX, skb->data points to skb_mac_header(skb) */ + eth_hdr = (struct ethhdr *)skb->data; + tx_header = (__be16 *)(eth_hdr + 1); + tx_trailer = skb_put(skb, SJA1110_TX_TRAILER_LEN); + + eth_hdr->h_proto = htons(ETH_P_SJA1110); + + *tx_header = htons(SJA1110_HEADER_HOST_TO_SWITCH | + SJA1110_TX_HEADER_HAS_TRAILER | + SJA1110_TX_HEADER_TRAILER_POS(trailer_pos)); + *tx_trailer = cpu_to_be32(SJA1110_TX_TRAILER_PRIO(pcp) | + SJA1110_TX_TRAILER_SWITCHID(dp->ds->index) | + SJA1110_TX_TRAILER_DESTPORTS(BIT(dp->index))); + if (clone) { + u8 ts_id = SJA1105_SKB_CB(clone)->ts_id; + + *tx_header |= htons(SJA1110_TX_HEADER_TAKE_TS); + *tx_trailer |= cpu_to_be32(SJA1110_TX_TRAILER_TSTAMP_ID(ts_id)); + } + + return skb; +} + static void sja1105_transfer_meta(struct sk_buff *skb, const struct sja1105_meta *meta) { @@ -147,7 +244,7 @@ static void sja1105_transfer_meta(struct sk_buff *skb, hdr->h_dest[3] = meta->dmac_byte_3; hdr->h_dest[4] = meta->dmac_byte_4; - SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp; + SJA1105_SKB_CB(skb)->tstamp = meta->tstamp; } /* This is a simple state machine which follows the hardware mechanism of @@ -275,46 +372,38 @@ static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); } +static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb) +{ + u16 tpid = ntohs(eth_hdr(skb)->h_proto); + + return tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q || + skb_vlan_tag_present(skb); +} + +static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb) +{ + return ntohs(eth_hdr(skb)->h_proto) == ETH_P_SJA1110; +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { + int source_port, switch_id, subvlan = 0; struct sja1105_meta meta = {0}; - int source_port, switch_id; struct ethhdr *hdr; - u16 tpid, vid, tci; bool is_link_local; - u16 subvlan = 0; - bool is_tagged; bool is_meta; hdr = eth_hdr(skb); - tpid = ntohs(hdr->h_proto); - is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q || - skb_vlan_tag_present(skb)); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); skb->offload_fwd_mark = 1; - if (is_tagged) { + if (sja1105_skb_has_tag_8021q(skb)) { /* Normal traffic path. */ - skb_push_rcsum(skb, ETH_HLEN); - if (skb_vlan_tag_present(skb)) { - tci = skb_vlan_tag_get(skb); - __vlan_hwaccel_clear_tag(skb); - } else { - __skb_vlan_pop(skb, &tci); - } - skb_pull_rcsum(skb, ETH_HLEN); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - - vid = tci & VLAN_VID_MASK; - source_port = dsa_8021q_rx_source_port(vid); - switch_id = dsa_8021q_rx_switch_id(vid); - skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; - subvlan = dsa_8021q_rx_subvlan(vid); + dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -346,6 +435,138 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, is_meta); } +static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header) +{ + int switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header); + int n_ts = SJA1110_RX_HEADER_N_TS(rx_header); + struct net_device *master = skb->dev; + struct dsa_port *cpu_dp; + u8 *buf = skb->data + 2; + struct dsa_switch *ds; + int i; + + cpu_dp = master->dsa_ptr; + ds = dsa_switch_find(cpu_dp->dst->index, switch_id); + if (!ds) { + net_err_ratelimited("%s: cannot find switch id %d\n", + master->name, switch_id); + return NULL; + } + + for (i = 0; i <= n_ts; i++) { + u8 ts_id, source_port, dir; + u64 tstamp; + + ts_id = buf[0]; + source_port = (buf[1] & GENMASK(7, 4)) >> 4; + dir = (buf[1] & BIT(3)) >> 3; + tstamp = be64_to_cpu(*(__be64 *)(buf + 2)); + + sja1110_process_meta_tstamp(ds, source_port, ts_id, dir, + tstamp); + + buf += SJA1110_META_TSTAMP_SIZE; + } + + /* Discard the meta frame, we've consumed the timestamps it contained */ + return NULL; +} + +static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, + int *source_port, + int *switch_id) +{ + u16 rx_header; + + if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN))) + return NULL; + + /* skb->data points to skb_mac_header(skb) + ETH_HLEN, which is exactly + * what we need because the caller has checked the EtherType (which is + * located 2 bytes back) and we just need a pointer to the header that + * comes afterwards. + */ + rx_header = ntohs(*(__be16 *)skb->data); + + if (rx_header & SJA1110_RX_HEADER_IS_METADATA) + return sja1110_rcv_meta(skb, rx_header); + + /* Timestamp frame, we have a trailer */ + if (rx_header & SJA1110_RX_HEADER_HAS_TRAILER) { + int start_of_padding = SJA1110_RX_HEADER_TRAILER_POS(rx_header); + u8 *rx_trailer = skb_tail_pointer(skb) - SJA1110_RX_TRAILER_LEN; + u64 *tstamp = &SJA1105_SKB_CB(skb)->tstamp; + u8 last_byte = rx_trailer[12]; + + /* The timestamp is unaligned, so we need to use packing() + * to get it + */ + packing(rx_trailer, tstamp, 63, 0, 8, UNPACK, 0); + + *source_port = SJA1110_RX_TRAILER_SRC_PORT(last_byte); + *switch_id = SJA1110_RX_TRAILER_SWITCH_ID(last_byte); + + /* skb->len counts from skb->data, while start_of_padding + * counts from the destination MAC address. Right now skb->data + * is still as set by the DSA master, so to trim away the + * padding and trailer we need to account for the fact that + * skb->data points to skb_mac_header(skb) + ETH_HLEN. + */ + pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN); + /* Trap-to-host frame, no timestamp trailer */ + } else { + *source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header); + *switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header); + } + + /* Advance skb->data past the DSA header */ + skb_pull_rcsum(skb, SJA1110_HEADER_LEN); + + /* Remove the DSA header */ + memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - SJA1110_HEADER_LEN, + 2 * ETH_ALEN); + + /* With skb->data in its final place, update the MAC header + * so that eth_hdr() continues to works properly. + */ + skb_set_mac_header(skb, -ETH_HLEN); + + return skb; +} + +static struct sk_buff *sja1110_rcv(struct sk_buff *skb, + struct net_device *netdev, + struct packet_type *pt) +{ + int source_port = -1, switch_id = -1, subvlan = 0; + + skb->offload_fwd_mark = 1; + + if (sja1110_skb_has_inband_control_extension(skb)) { + skb = sja1110_rcv_inband_control_extension(skb, &source_port, + &switch_id); + if (!skb) + return NULL; + } + + /* Packets with in-band control extensions might still have RX VLANs */ + if (likely(sja1105_skb_has_tag_8021q(skb))) + dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan); + + skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); + if (!skb->dev) { + netdev_warn(netdev, + "Couldn't decode source port %d and switch id %d\n", + source_port, switch_id); + return NULL; + } + + if (subvlan) + sja1105_decode_subvlan(skb, subvlan); + + return skb; +} + static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { @@ -356,18 +577,53 @@ static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto, dsa_tag_generic_flow_dissect(skb, proto, offset); } +static void sja1110_flow_dissect(const struct sk_buff *skb, __be16 *proto, + int *offset) +{ + /* Management frames have 2 DSA tags on RX, so the needed_headroom we + * declared is fine for the generic dissector adjustment procedure. + */ + if (unlikely(sja1105_is_link_local(skb))) + return dsa_tag_generic_flow_dissect(skb, proto, offset); + + /* For the rest, there is a single DSA tag, the tag_8021q one */ + *offset = VLAN_HLEN; + *proto = ((__be16 *)skb->data)[(VLAN_HLEN / 2) - 1]; +} + static const struct dsa_device_ops sja1105_netdev_ops = { .name = "sja1105", .proto = DSA_TAG_PROTO_SJA1105, .xmit = sja1105_xmit, .rcv = sja1105_rcv, .filter = sja1105_filter, - .overhead = VLAN_HLEN, + .needed_headroom = VLAN_HLEN, .flow_dissect = sja1105_flow_dissect, .promisc_on_master = true, }; -MODULE_LICENSE("GPL v2"); +DSA_TAG_DRIVER(sja1105_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105); -module_dsa_tag_driver(sja1105_netdev_ops); +static const struct dsa_device_ops sja1110_netdev_ops = { + .name = "sja1110", + .proto = DSA_TAG_PROTO_SJA1110, + .xmit = sja1110_xmit, + .rcv = sja1110_rcv, + .filter = sja1105_filter, + .flow_dissect = sja1110_flow_dissect, + .needed_headroom = SJA1110_HEADER_LEN + VLAN_HLEN, + .needed_tailroom = SJA1110_RX_TRAILER_LEN + SJA1110_MAX_PADDING_LEN, +}; + +DSA_TAG_DRIVER(sja1110_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110); + +static struct dsa_tag_driver *sja1105_tag_driver_array[] = { + &DSA_TAG_DRIVER_NAME(sja1105_netdev_ops), + &DSA_TAG_DRIVER_NAME(sja1110_netdev_ops), +}; + +module_dsa_tag_drivers(sja1105_tag_driver_array); + +MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 5b97ede56a0f..ba73804340a5 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -55,8 +55,7 @@ static const struct dsa_device_ops trailer_netdev_ops = { .proto = DSA_TAG_PROTO_TRAILER, .xmit = trailer_xmit, .rcv = trailer_rcv, - .overhead = 4, - .tail_tag = true, + .needed_tailroom = 4, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index 858cdf9d2913..a31ff7fcb45f 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -56,8 +56,7 @@ static const struct dsa_device_ops xrs700x_netdev_ops = { .proto = DSA_TAG_PROTO_XRS700X, .xmit = xrs700x_xmit, .rcv = xrs700x_rcv, - .overhead = 1, - .tail_tag = true, + .needed_tailroom = 1, }; MODULE_LICENSE("GPL"); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 88d8a0243f35..a7346346114f 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -315,9 +315,9 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) struct ethnl_req_info *req_info = NULL; const u8 cmd = info->genlhdr->cmd; const struct ethnl_request_ops *ops; + int hdr_len, reply_len; struct sk_buff *rskb; void *reply_payload; - int reply_len; int ret; ops = ethnl_default_requests[cmd]; @@ -346,15 +346,20 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; - reply_len = ret + ethnl_reply_header_size(); + reply_len = ret; ret = -ENOMEM; - rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd, + rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(), + req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); if (!rskb) goto err_cleanup; + hdr_len = rskb->len; ret = ops->fill_reply(rskb, req_info, reply_data); if (ret < 0) goto err_msg; + WARN_ONCE(rskb->len - hdr_len > reply_len, + "ethnl cmd %d: calculated reply length %d, but consumed %d\n", + cmd, reply_len, rskb->len - hdr_len); if (ops->cleanup_data) ops->cleanup_data(reply_data); diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 8abcbc10796c..90b10966b16b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -138,7 +138,7 @@ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, } /** - * ethnl_update_binary() - update binary data from NLA_BINARY atribute + * ethnl_update_binary() - update binary data from NLA_BINARY attribute * @dst: value to update * @len: destination buffer length * @attr: netlink attribute with new value or null diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index bb1351c38397..e31949479305 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -397,7 +397,8 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, * ensures entries of restarted nodes gets pruned so that they can * re-register and resume communications. */ - if (seq_nr_before(sequence_nr, node->seq_out[port->type])) + if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && + seq_nr_before(sequence_nr, node->seq_out[port->type])) return; node->time_in[port->type] = jiffies; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2f94d221c00e..54648181dd56 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -318,7 +318,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); - err = -ENOBUFS; + err = -ENOMEM; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; @@ -1720,7 +1720,6 @@ EXPORT_SYMBOL_GPL(snmp_fold_field64); #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, - .netns_ok = 1, }; #endif @@ -1733,7 +1732,6 @@ static struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; @@ -1746,14 +1744,12 @@ static struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, .no_policy = 1, - .netns_ok = 1, }; static __net_init int ipv4_mib_init_net(struct net *net) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index e0480c6cebaa..099259fc826a 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -187,8 +187,7 @@ static int __init cipso_v4_cache_init(void) * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache * * Description: - * Invalidates and frees any entries in the CIPSO cache. Returns zero on - * success and negative values on failure. + * Invalidates and frees any entries in the CIPSO cache. * */ void cipso_v4_cache_invalidate(void) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 1c6429c353a9..73721a4448bd 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1955,7 +1955,7 @@ static int inet_validate_link_af(const struct net_device *dev, struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; - if (dev && !__in_dev_get_rcu(dev)) + if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, @@ -1981,7 +1981,7 @@ static int inet_validate_link_af(const struct net_device *dev, static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { - struct in_device *in_dev = __in_dev_get_rcu(dev); + struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 84bb707bd88d..af8814a11378 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1122,10 +1122,8 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); - /* Add network specific broadcasts, when it takes a sense */ + /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, - prim, 0); fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); } @@ -1516,6 +1514,12 @@ static int __net_init ip_fib_net_init(struct net *net) if (err) return err; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* Default to 3-tuple */ + net->ipv4.sysctl_fib_multipath_hash_fields = + FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; +#endif + /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b58db1ca4bfb..e184bcb19943 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -25,7 +25,7 @@ struct fib_alias { #define FA_S_ACCESSED 0x01 -/* Dont write on fa_state unless needed, to keep it shared on all cpus */ +/* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5d1e6fe9d838..cbb2b4bb0dfa 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -195,7 +195,6 @@ static int gre_err(struct sk_buff *skb, u32 info) static const struct net_protocol net_gre_protocol = { .handler = gre_rcv, .err_handler = gre_err, - .netns_ok = 1, }; static int __init gre_init(void) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 752e392083e6..0a57f1892e7e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1066,7 +1066,7 @@ static bool icmp_echo(struct sk_buff *skb) if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + sizeof(struct in_addr)) goto send_mal_query; - dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr.s_addr); + dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); break; #if IS_ENABLED(CONFIG_IPV6) case ICMP_AFI_IP6: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fd472eae4f5c..0eea878edc30 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -135,10 +135,18 @@ static int inet_csk_bind_conflict(const struct sock *sk, bool relax, bool reuseport_ok) { struct sock *sk2; + bool reuseport_cb_ok; bool reuse = sk->sk_reuse; bool reuseport = !!sk->sk_reuseport; + struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); + rcu_read_lock(); + reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); + /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ + reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); + rcu_read_unlock(); + /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed @@ -156,14 +164,14 @@ static int inet_csk_bind_conflict(const struct sock *sk, if ((!relax || (!reuseport_ok && reuseport && sk2->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && + reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(uid, sock_i_uid(sk2))))) && inet_rcv_saddr_equal(sk, sk2, true)) break; } else if (!reuseport_ok || !reuseport || !sk2->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || + !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2)))) { if (inet_rcv_saddr_equal(sk, sk2, true)) @@ -687,6 +695,64 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +static struct request_sock *inet_reqsk_clone(struct request_sock *req, + struct sock *sk) +{ + struct sock *req_sk, *nreq_sk; + struct request_sock *nreq; + + nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); + if (!nreq) { + /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ + sock_put(sk); + return NULL; + } + + req_sk = req_to_sk(req); + nreq_sk = req_to_sk(nreq); + + memcpy(nreq_sk, req_sk, + offsetof(struct sock, sk_dontcopy_begin)); + memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, + req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); + + sk_node_init(&nreq_sk->sk_node); + nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; +#ifdef CONFIG_XPS + nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; +#endif + nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; + + nreq->rsk_listener = sk; + + /* We need not acquire fastopenq->lock + * because the child socket is locked in inet_csk_listen_stop(). + */ + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) + rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); + + return nreq; +} + +static void reqsk_queue_migrated(struct request_sock_queue *queue, + const struct request_sock *req) +{ + if (req->num_timeout == 0) + atomic_inc(&queue->young); + atomic_inc(&queue->qlen); +} + +static void reqsk_migrate_reset(struct request_sock *req) +{ + req->saved_syn = NULL; +#if IS_ENABLED(CONFIG_IPV6) + inet_rsk(req)->ipv6_opt = NULL; + inet_rsk(req)->pktopts = NULL; +#else + inet_rsk(req)->ireq_opt = NULL; +#endif +} + /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { @@ -727,15 +793,39 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); + struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; - struct net *net = sock_net(sk_listener); - struct inet_connection_sock *icsk = inet_csk(sk_listener); - struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct inet_connection_sock *icsk; + struct request_sock_queue *queue; + struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; - if (inet_sk_state_load(sk_listener) != TCP_LISTEN) - goto drop; + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { + struct sock *nsk; + + nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); + if (!nsk) + goto drop; + + nreq = inet_reqsk_clone(req, nsk); + if (!nreq) + goto drop; + /* The new timer for the cloned req can decrease the 2 + * by calling inet_csk_reqsk_queue_drop_and_put(), so + * hold another count to prevent use-after-free and + * call reqsk_put() just before return. + */ + refcount_set(&nreq->rsk_refcnt, 2 + 1); + timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); + reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); + + req = nreq; + sk_listener = nsk; + } + + icsk = inet_csk(sk_listener); + net = sock_net(sk_listener); max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. @@ -754,6 +844,7 @@ static void reqsk_timer_handler(struct timer_list *t) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ + queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; @@ -778,10 +869,36 @@ static void reqsk_timer_handler(struct timer_list *t) atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer(&req->rsk_timer, jiffies + timeo); + + if (!nreq) + return; + + if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { + /* delete timer */ + inet_csk_reqsk_queue_drop(sk_listener, nreq); + goto drop; + } + + reqsk_migrate_reset(oreq); + reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); + reqsk_put(oreq); + + reqsk_put(nreq); return; } + drop: - inet_csk_reqsk_queue_drop_and_put(sk_listener, req); + /* Even if we can clone the req, we may need not retransmit any more + * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another + * CPU may win the "own_req" race so that inet_ehash_insert() fails. + */ + if (nreq) { + reqsk_migrate_reset(nreq); + reqsk_queue_removed(queue, nreq); + __reqsk_free(nreq); + } + + inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static void reqsk_queue_hash_req(struct request_sock *req, @@ -997,12 +1114,40 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { - inet_csk_reqsk_queue_drop(sk, req); - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - if (inet_csk_reqsk_queue_add(sk, req, child)) + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + + if (sk != req->rsk_listener) { + /* another listening sk has been selected, + * migrate the req to it. + */ + struct request_sock *nreq; + + /* hold a refcnt for the nreq->rsk_listener + * which is assigned in inet_reqsk_clone() + */ + sock_hold(sk); + nreq = inet_reqsk_clone(req, sk); + if (!nreq) { + inet_child_forget(sk, req, child); + goto child_put; + } + + refcount_set(&nreq->rsk_refcnt, 1); + if (inet_csk_reqsk_queue_add(sk, nreq, child)) { + reqsk_migrate_reset(req); + reqsk_put(req); + return child; + } + + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; + } } /* Too bad, another child took ownership of the request, undo. */ +child_put: bh_unlock_sock(child); sock_put(child); return NULL; @@ -1028,14 +1173,36 @@ void inet_csk_listen_stop(struct sock *sk) * of the variants now. --ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { - struct sock *child = req->sk; + struct sock *child = req->sk, *nsk; + struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); + nsk = reuseport_migrate_sock(sk, child, NULL); + if (nsk) { + nreq = inet_reqsk_clone(req, nsk); + if (nreq) { + refcount_set(&nreq->rsk_refcnt, 1); + + if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { + reqsk_migrate_reset(req); + } else { + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } + + /* inet_csk_reqsk_queue_add() has already + * called inet_child_forget() on failure case. + */ + goto skip_child_forget; + } + } + inet_child_forget(sk, req, child); +skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 93474b1bea4e..e65f4ef024a4 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, - u16 nlmsg_flags) + u16 nlmsg_flags, bool net_admin) { struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; @@ -444,6 +444,12 @@ static int inet_twsk_diag_fill(struct sock *sk, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + tw->tw_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + nlmsg_end(skb, nlh); return 0; } @@ -494,7 +500,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, u16 nlmsg_flags, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags); + return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); @@ -801,6 +807,8 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.mark = sk->sk_mark; else if (sk->sk_state == TCP_NEW_SYN_RECV) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else if (sk->sk_state == TCP_TIME_WAIT) + entry.mark = inet_twsk(sk)->tw_mark; else entry.mark = 0; #ifdef CONFIG_SOCK_CGROUP_DATA diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c96866a53a66..80aeaf9e6e16 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -697,7 +697,7 @@ void inet_unhash(struct sock *sk) goto unlock; if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); + reuseport_stop_listen_sock(sk); if (ilb) { inet_unhash2(hashinfo, sk); ilb->count--; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 939792a38814..7b12a40dd465 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1317,7 +1317,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) } /* called from ip_ra_control(), before an RCU grace period, - * we dont need to call synchronize_rcu() here + * we don't need to call synchronize_rcu() here */ static void mrtsock_destruct(struct sock *sk) { @@ -1938,7 +1938,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; - /* For an (*,G) entry, we only check that the incomming + /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); @@ -3007,7 +3007,6 @@ static const struct seq_operations ipmr_mfc_seq_ops = { #ifdef CONFIG_IP_PIMSM_V2 static const struct net_protocol pim_protocol = { .handler = pim_rcv, - .netns_ok = 1, }; #endif diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index ff437e4ed6db..55fc23a8f7a7 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,7 +27,7 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; default: diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9a8c0892622b..6913979948d7 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -31,12 +31,6 @@ EXPORT_SYMBOL(inet_offloads); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - if (!prot->netns_ok) { - pr_err("Protocol %u is not namespace aware, cannot register.\n", - protocol); - return -EINVAL; - } - return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 0 : -1; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a36ac98476f..66aacb939d3e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1906,13 +1906,128 @@ out: hash_keys->addrs.v4addrs.dst = key_iph->daddr; } +static u32 fib_multipath_custom_hash_outer(const struct net *net, + const struct sk_buff *skb, + bool *p_has_inner) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys keys, hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); + return flow_hash_from_keys(&hash_keys); +} + +static u32 fib_multipath_custom_hash_inner(const struct net *net, + const struct sk_buff *skb, + bool has_inner) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys keys, hash_keys; + + /* We assume the packet carries an encapsulation, but if none was + * encountered during dissection of the outer flow, then there is no + * point in calling the flow dissector again. + */ + if (!has_inner) + return 0; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, 0); + + if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) + return 0; + + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + } + + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + return flow_hash_from_keys(&hash_keys); +} + +static u32 fib_multipath_custom_hash_skb(const struct net *net, + const struct sk_buff *skb) +{ + u32 mhash, mhash_inner; + bool has_inner = true; + + mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); + mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); + + return jhash_2words(mhash, mhash_inner, 0); +} + +static u32 fib_multipath_custom_hash_fl4(const struct net *net, + const struct flowi4 *fl4) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v4addrs.src = fl4->saddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v4addrs.dst = fl4->daddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = fl4->flowi4_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = fl4->fl4_dport; + + return flow_hash_from_keys(&hash_keys); +} + /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; - u32 mhash; + u32 mhash = 0; switch (net->ipv4.sysctl_fib_multipath_hash_policy) { case 0: @@ -1924,6 +2039,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } + mhash = flow_hash_from_keys(&hash_keys); break; case 1: /* skb is currently provided only when forwarding */ @@ -1957,6 +2073,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -1987,9 +2104,15 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } + mhash = flow_hash_from_keys(&hash_keys); + break; + case 3: + if (skb) + mhash = fib_multipath_custom_hash_skb(net, skb); + else + mhash = fib_multipath_custom_hash_fl4(net, fl4); break; } - mhash = flow_hash_from_keys(&hash_keys); if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a62934b9f15a..6f1e64d49232 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -19,6 +19,7 @@ #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> +#include <net/ip_fib.h> #include <net/route.h> #include <net/tcp.h> #include <net/udp.h> @@ -29,6 +30,7 @@ #include <net/netevent.h> static int two = 2; +static int three __maybe_unused = 3; static int four = 4; static int thousand = 1000; static int tcp_retr1_max = 255; @@ -48,6 +50,8 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static u32 fib_multipath_hash_fields_all_mask __maybe_unused = + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -461,6 +465,22 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, return ret; } + +static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_fields); + ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); + + return ret; +} #endif static struct ctl_table ipv4_table[] = { @@ -941,6 +961,15 @@ static struct ctl_table ipv4_net_table[] = { }, #endif { + .procname = "tcp_migrate_req", + .data = &init_net.ipv4.sysctl_tcp_migrate_req, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { .procname = "tcp_reordering", .data = &init_net.ipv4.sysctl_tcp_reordering, .maxlen = sizeof(int), @@ -1050,7 +1079,16 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, + }, + { + .procname = "fib_multipath_hash_fields", + .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_fib_multipath_hash_fields, + .extra1 = SYSCTL_ONE, + .extra2 = &fib_multipath_hash_fields_all_mask, }, #endif { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1c1f9e3de72..0e3f0e0e5b51 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1738,8 +1738,8 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); -static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) +void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) { if (skb->tstamp) tss->ts[0] = ktime_to_timespec64(skb->tstamp); @@ -2024,8 +2024,6 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) -static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping_internal *tss); static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) @@ -2197,8 +2195,8 @@ out: #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ -static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping_internal *tss) +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ad9d17923fc5..a80de92ea3b6 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -184,11 +184,11 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { - int data, err = 0; long timeo; + int data; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; @@ -196,14 +196,9 @@ msg_bytes_ready: sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); } - if (err) { - ret = err; - goto out; - } copied = -EAGAIN; } ret = copied; -out: release_sock(sk); sk_psock_put(sk, psock); return ret; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index af2814c9342a..47c32604d38f 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -526,7 +526,7 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) if (!tfo_da_times) return false; - /* Limit timout to max: 2^6 * initial timeout */ + /* Limit timeout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); timeout = multiplier * tfo_bh_timeout * HZ; if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cf4dd532d1c..7d5e59f688de 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2816,8 +2816,17 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, *rexmit = REXMIT_LOST; } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, + bool *do_lost) { struct tcp_sock *tp = tcp_sk(sk); @@ -2842,7 +2851,9 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); - return true; + } else { + /* Partial ACK arrived. Force fast retransmit. */ + *do_lost = tcp_force_fast_retransmit(sk); } return false; } @@ -2866,14 +2877,6 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) } } -static bool tcp_force_fast_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - return after(tcp_highest_sack_seq(tp), - tp->snd_una + tp->reordering * tp->mss_cache); -} - /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2943,17 +2946,21 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) tcp_add_reno_sack(sk, num_dupack, ece_ack); - } else { - if (tcp_try_undo_partial(sk, prior_snd_una)) - return; - /* Partial ACK arrived. Force fast retransmit. */ - do_lost = tcp_force_fast_retransmit(sk); - } - if (tcp_try_undo_dsack(sk)) { - tcp_try_keep_open(sk); + } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) return; - } + + if (tcp_try_undo_dsack(sk)) + tcp_try_keep_open(sk); + tcp_identify_packet_loss(sk, ack_flag); + if (icsk->icsk_ca_state != TCP_CA_Recovery) { + if (!tcp_time_to_recover(sk, flag)) + return; + /* Undo reverts the recovery state. If loss is evident, + * starts a new recovery (e.g. reordering then loss); + */ + tcp_enter_recovery(sk, ece_ack); + } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); @@ -5885,6 +5892,7 @@ step5: return; csum_error: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 312184cead57..6cb8e269f1ab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1731,6 +1731,7 @@ discard: return 0; csum_err: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1801,6 +1802,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); + trace_tcp_bad_csum(skb); __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); return true; @@ -2000,13 +2002,21 @@ process: goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ + sock_hold(sk); } - /* We own a reference on the listener, increase it again - * as we might lose it too soon. - */ - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { @@ -2098,6 +2108,7 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: + trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7513ba45553d..f258a4c0da71 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -775,8 +775,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, goto listen_overflow; if (own_req && rsk_drop_req(req)) { - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_drop_and_put(sk, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4ef08079ccfa..56b9d648f054 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -441,7 +441,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * This function gets called when the kernel timer for a TCP packet * of this socket expires. * - * It handles retransmission, timer adjustment and other necesarry measures. + * It handles retransmission, timer adjustment and other necessary measures. * * Returns: Nothing (void) */ @@ -766,7 +766,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) if (!sock_owned_by_user(sk)) { if (tp->compressed_ack) { /* Since we have to send one ack finally, - * substract one from tp->compressed_ack to keep + * subtract one from tp->compressed_ack to keep * LINUX_MIB_TCPACKCOMPRESSED accurate. */ tp->compressed_ack--; diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index e44aaf41a138..5048c47c79b2 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -218,7 +218,6 @@ static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, - .netns_ok = 1, }; #if IS_ENABLED(CONFIG_IPV6) @@ -226,7 +225,6 @@ static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, - .netns_ok = 1, }; #endif @@ -235,7 +233,6 @@ static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, - .netns_ok = 1, }; #endif diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 954c4591a6fd..b07e4b6dda25 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -43,21 +43,17 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { - int data, err = 0; long timeo; + int data; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; } - if (err) { - ret = err; - goto out; - } copied = -EAGAIN; } ret = copied; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index bd8773b49e72..cd1cd68adeec 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -31,7 +31,6 @@ static const struct net_protocol udplite_protocol = { .handler = udplite_rcv, .err_handler = udplite_err, .no_policy = 1, - .netns_ok = 1, }; struct proto udplite_prot = { diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index ea595c8549c7..2fe5860c21d6 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -181,21 +181,18 @@ static const struct net_protocol esp4_protocol = { .handler = xfrm4_esp_rcv, .err_handler = xfrm4_esp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol ah4_protocol = { .handler = xfrm4_ah_rcv, .err_handler = xfrm4_ah_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol ipcomp4_protocol = { .handler = xfrm4_ipcomp_rcv, .err_handler = xfrm4_ipcomp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct xfrm_input_afinfo xfrm4_input_afinfo = { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 701eb82acd1c..3bf685fe64b9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6903,10 +6903,10 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { - .procname = "addr_gen_mode", - .data = &ipv6_devconf.addr_gen_mode, - .maxlen = sizeof(int), - .mode = 0644, + .procname = "addr_gen_mode", + .data = &ipv6_devconf.addr_gen_mode, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = addrconf_sysctl_addr_gen_mode, }, { diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 8f9a83314de7..40f3e4f9f33a 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -467,7 +467,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { static int __net_init fib6_rules_net_init(struct net *net) { struct fib_rules_ops *ops; - int err = -ENOMEM; + int err; ops = fib_rules_register(&fib6_rules_ops_template, net); if (IS_ERR(ops)) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 679699e953f1..2d650dc24349 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -32,6 +32,7 @@ #include <net/lwtunnel.h> #include <net/fib_notifier.h> +#include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> @@ -2355,6 +2356,10 @@ static int __net_init fib6_net_init(struct net *net) if (err) return err; + /* Default to 3-tuple */ + net->ipv6.sysctl.multipath_hash_fields = + FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; + spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); @@ -2362,7 +2367,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) - goto out_timer; + goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); @@ -2407,7 +2412,7 @@ out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); -out_timer: +out_notifier: fib6_notifier_exit(net); return -ENOMEM; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d36ef9d25e73..54ec163fbafa 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1729,22 +1729,25 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { + u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, + 2, 0, 0, IPV6_TLV_PADN, 0 }; struct net_device *dev = idev->dev; - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; - struct sk_buff *skb; - struct mld2_report *pmr; - struct in6_addr addr_buf; - const struct in6_addr *saddr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - unsigned int size = mtu + hlen + tlen; + struct net *net = dev_net(dev); + const struct in6_addr *saddr; + struct in6_addr addr_buf; + struct mld2_report *pmr; + struct sk_buff *skb; + unsigned int size; + struct sock *sk; int err; - u8 ra[8] = { IPPROTO_ICMPV6, 0, - IPV6_TLV_ROUTERALERT, 2, 0, 0, - IPV6_TLV_PADN, 0 }; - /* we assume size > sizeof(ra) here */ + sk = net->ipv6.igmp_sk; + /* we assume size > sizeof(ra) here + * Also try to not allocate high-order pages for big MTU + */ + size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; skb = sock_alloc_send_skb(sk, size, 1, &err); if (!skb) return NULL; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e810a23baf99..de2cf3943b91 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -51,7 +51,7 @@ ip6_packet_match(const struct sk_buff *skb, const char *outdev, const struct ip6t_ip6 *ip6info, unsigned int *protoff, - int *fragoff, bool *hotdrop) + u16 *fragoff, bool *hotdrop) { unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 7969d1f3018d..ed69c768797e 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -28,7 +28,7 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; default: diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index af36acc1a644..2880dc7d9a49 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -15,29 +15,11 @@ static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { - const struct { - struct in6_addr dst; - struct in6_addr src; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .dst = *dst, - .src = *src, - }; - u32 hash, id; - - /* Note the following code is not safe, but this is okay. */ - if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) - get_random_bytes(&net->ipv4.ip_id_key, - sizeof(net->ipv4.ip_id_key)); - - hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key); - - /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, - * set the hight order instead thus minimizing possible future - * collisions. - */ - id = ip_idents_reserve(hash, 1); - if (unlikely(!id)) - id = 1 << 31; + u32 id; + + do { + id = prandom_u32(); + } while (!id); return id; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d417e514bd52..7b756a7dc036 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2326,12 +2326,131 @@ out: } } +static u32 rt6_multipath_custom_hash_outer(const struct net *net, + const struct sk_buff *skb, + bool *p_has_inner) +{ + u32 hash_fields = ip6_multipath_hash_fields(net); + struct flow_keys keys, hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); + return flow_hash_from_keys(&hash_keys); +} + +static u32 rt6_multipath_custom_hash_inner(const struct net *net, + const struct sk_buff *skb, + bool has_inner) +{ + u32 hash_fields = ip6_multipath_hash_fields(net); + struct flow_keys keys, hash_keys; + + /* We assume the packet carries an encapsulation, but if none was + * encountered during dissection of the outer flow, then there is no + * point in calling the flow dissector again. + */ + if (!has_inner) + return 0; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, 0); + + if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) + return 0; + + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + } + + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + return flow_hash_from_keys(&hash_keys); +} + +static u32 rt6_multipath_custom_hash_skb(const struct net *net, + const struct sk_buff *skb) +{ + u32 mhash, mhash_inner; + bool has_inner = true; + + mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); + mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); + + return jhash_2words(mhash, mhash_inner, 0); +} + +static u32 rt6_multipath_custom_hash_fl6(const struct net *net, + const struct flowi6 *fl6) +{ + u32 hash_fields = ip6_multipath_hash_fields(net); + struct flow_keys hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v6addrs.src = fl6->saddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v6addrs.dst = fl6->daddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = fl6->flowi6_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) + hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = fl6->fl6_dport; + + return flow_hash_from_keys(&hash_keys); +} + /* if skb is set it will be used and fl6 can be NULL */ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; - u32 mhash; + u32 mhash = 0; switch (ip6_multipath_hash_policy(net)) { case 0: @@ -2345,6 +2464,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 1: if (skb) { @@ -2376,6 +2496,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -2412,9 +2533,15 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); + break; + case 3: + if (skb) + mhash = rt6_multipath_custom_hash_skb(net, skb); + else + mhash = rt6_multipath_custom_hash_fl6(net, fl6); break; } - mhash = flow_hash_from_keys(&hash_keys); return mhash >> 1; } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 4ff38cb08f4b..60bf3b877957 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -87,10 +87,10 @@ struct seg6_end_dt_info { int vrf_ifindex; int vrf_table; - /* tunneled packet proto and family (IPv4 or IPv6) */ - __be16 proto; + /* tunneled packet family (IPv4 or IPv6). + * Protocol and header length are inferred from family. + */ u16 family; - int hdrlen; }; struct pcpu_seg6_local_counters { @@ -521,19 +521,6 @@ static int __seg6_end_dt_vrf_build(struct seg6_local_lwt *slwt, const void *cfg, info->net = net; info->vrf_ifindex = vrf_ifindex; - switch (family) { - case AF_INET: - info->proto = htons(ETH_P_IP); - info->hdrlen = sizeof(struct iphdr); - break; - case AF_INET6: - info->proto = htons(ETH_P_IPV6); - info->hdrlen = sizeof(struct ipv6hdr); - break; - default: - return -EINVAL; - } - info->family = family; info->mode = DT_VRF_MODE; @@ -622,22 +609,44 @@ error: } static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb, - struct seg6_local_lwt *slwt) + struct seg6_local_lwt *slwt, u16 family) { struct seg6_end_dt_info *info = &slwt->dt_info; struct net_device *vrf; + __be16 protocol; + int hdrlen; vrf = end_dt_get_vrf_rcu(skb, info); if (unlikely(!vrf)) goto drop; - skb->protocol = info->proto; + switch (family) { + case AF_INET: + protocol = htons(ETH_P_IP); + hdrlen = sizeof(struct iphdr); + break; + case AF_INET6: + protocol = htons(ETH_P_IPV6); + hdrlen = sizeof(struct ipv6hdr); + break; + case AF_UNSPEC: + fallthrough; + default: + goto drop; + } + + if (unlikely(info->family != AF_UNSPEC && info->family != family)) { + pr_warn_once("seg6local: SRv6 End.DT* family mismatch"); + goto drop; + } + + skb->protocol = protocol; skb_dst_drop(skb); - skb_set_transport_header(skb, info->hdrlen); + skb_set_transport_header(skb, hdrlen); - return end_dt_vrf_rcv(skb, info->family, vrf); + return end_dt_vrf_rcv(skb, family, vrf); drop: kfree_skb(skb); @@ -656,7 +665,7 @@ static int input_action_end_dt4(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - skb = end_dt_vrf_core(skb, slwt); + skb = end_dt_vrf_core(skb, slwt, AF_INET); if (!skb) /* packet has been processed and consumed by the VRF */ return 0; @@ -739,7 +748,7 @@ static int input_action_end_dt6(struct sk_buff *skb, goto legacy_mode; /* DT6_VRF_MODE */ - skb = end_dt_vrf_core(skb, slwt); + skb = end_dt_vrf_core(skb, slwt, AF_INET6); if (!skb) /* packet has been processed and consumed by the VRF */ return 0; @@ -767,6 +776,36 @@ drop: return -EINVAL; } +#ifdef CONFIG_NET_L3_MASTER_DEV +static int seg6_end_dt46_build(struct seg6_local_lwt *slwt, const void *cfg, + struct netlink_ext_ack *extack) +{ + return __seg6_end_dt_vrf_build(slwt, cfg, AF_UNSPEC, extack); +} + +static int input_action_end_dt46(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + unsigned int off = 0; + int nexthdr; + + nexthdr = ipv6_find_hdr(skb, &off, -1, NULL, NULL); + if (unlikely(nexthdr < 0)) + goto drop; + + switch (nexthdr) { + case IPPROTO_IPIP: + return input_action_end_dt4(skb, slwt); + case IPPROTO_IPV6: + return input_action_end_dt6(skb, slwt); + } + +drop: + kfree_skb(skb); + return -EINVAL; +} +#endif + /* push an SRH on top of the current one */ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -969,6 +1008,17 @@ static struct seg6_action_desc seg6_action_table[] = { .input = input_action_end_dt6, }, { + .action = SEG6_LOCAL_ACTION_END_DT46, + .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE), + .optattrs = SEG6_F_LOCAL_COUNTERS, +#ifdef CONFIG_NET_L3_MASTER_DEV + .input = input_action_end_dt46, + .slwt_ops = { + .build_state = seg6_end_dt46_build, + }, +#endif + }, + { .action = SEG6_LOCAL_ACTION_END_B6, .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH), .optattrs = SEG6_F_LOCAL_COUNTERS, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index f7c8110ece5f..e0a39b0bb4c1 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -973,7 +973,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (df) { mtu = dst_mtu(&rt->dst) - t_hlen; - if (mtu < 68) { + if (mtu < IPV4_MIN_MTU) { dev->stats.collisions++; ip_rt_put(rt); goto tx_error; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 27102c3d6e1d..d7cf26f730d7 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -17,13 +17,17 @@ #include <net/addrconf.h> #include <net/inet_frag.h> #include <net/netevent.h> +#include <net/ip_fib.h> #ifdef CONFIG_NETLABEL #include <net/calipso.h> #endif static int two = 2; +static int three = 3; static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; +static u32 rt6_multipath_hash_fields_all_mask = + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -40,6 +44,22 @@ static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, return ret; } +static int +proc_rt6_multipath_hash_fields(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv6.sysctl.multipath_hash_fields); + ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); + + return ret; +} + static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", @@ -149,7 +169,16 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, + }, + { + .procname = "fib_multipath_hash_fields", + .data = &init_net.ipv6.sysctl.multipath_hash_fields, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_rt6_multipath_hash_fields, + .extra1 = SYSCTL_ONE, + .extra2 = &rt6_multipath_hash_fields_all_mask, }, { .procname = "seg6_flowlabel", diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f47c0b6e3de..4d71464094b3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1538,6 +1538,7 @@ discard: kfree_skb(skb); return 0; csum_err: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1663,10 +1664,18 @@ process: goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + sock_hold(sk); } - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { @@ -1754,6 +1763,7 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: + trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 0fdb389c3390..44453b35c7b7 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -44,6 +44,7 @@ static struct proto iucv_proto = { }; static struct iucv_interface *pr_iucv; +static struct iucv_handler af_iucv_handler; /* special AF_IUCV IPRM messages */ static const u8 iprm_shutdown[8] = @@ -91,28 +92,11 @@ static void iucv_sock_close(struct sock *sk); static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify); -/* Call Back functions */ -static void iucv_callback_rx(struct iucv_path *, struct iucv_message *); -static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *); -static void iucv_callback_connack(struct iucv_path *, u8 *); -static int iucv_callback_connreq(struct iucv_path *, u8 *, u8 *); -static void iucv_callback_connrej(struct iucv_path *, u8 *); -static void iucv_callback_shutdown(struct iucv_path *, u8 *); - static struct iucv_sock_list iucv_sk_list = { .lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock), .autobind_name = ATOMIC_INIT(0) }; -static struct iucv_handler af_iucv_handler = { - .path_pending = iucv_callback_connreq, - .path_complete = iucv_callback_connack, - .path_severed = iucv_callback_connrej, - .message_pending = iucv_callback_rx, - .message_complete = iucv_callback_txdone, - .path_quiesced = iucv_callback_shutdown, -}; - static inline void high_nmcpy(unsigned char *dst, char *src) { memcpy(dst, src, 8); @@ -1817,6 +1801,15 @@ static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16]) bh_unlock_sock(sk); } +static struct iucv_handler af_iucv_handler = { + .path_pending = iucv_callback_connreq, + .path_complete = iucv_callback_connack, + .path_severed = iucv_callback_connrej, + .message_pending = iucv_callback_rx, + .message_complete = iucv_callback_txdone, + .path_quiesced = iucv_callback_shutdown, +}; + /***************** HiperSockets transport callbacks ********************/ static void afiucv_swap_src_dest(struct sk_buff *skb) { diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 97ae1255fcb6..b3edafa5fba4 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -488,7 +488,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - /* We dont need to clone dst here, it is guaranteed to not disappear. + /* We don't need to clone dst here, it is guaranteed to not disappear. * __dev_xmit_skb() might force a refcount if needed. */ skb_dst_set_noref(skb, &rt->dst); @@ -635,7 +635,6 @@ static struct inet_protosw l2tp_ip_protosw = { static struct net_protocol l2tp_ip_protocol __read_mostly = { .handler = l2tp_ip_recv, - .netns_ok = 1, }; static int __init l2tp_ip_init(void) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index aea85f91f059..bf35710127dd 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -226,7 +226,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int /* If the first two bytes are 0xFF03, consider that it is the PPP's * Address and Control fields and skip them. The L2TP module has always * worked this way, although, in theory, the use of these fields should - * be negociated and handled at the PPP layer. These fields are + * be negotiated and handled at the PPP layer. These fields are * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered * Information command with Poll/Final bit set to zero (RFC 1662). */ diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 1078e14f1acf..0971ca48ba15 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -80,11 +80,9 @@ static void __lapb_insert_cb(struct lapb_cb *lapb) static struct lapb_cb *__lapb_devtostruct(struct net_device *dev) { - struct list_head *entry; struct lapb_cb *lapb, *use = NULL; - list_for_each(entry, &lapb_list) { - lapb = list_entry(entry, struct lapb_cb, node); + list_for_each_entry(lapb, &lapb_list, node) { if (lapb->dev == dev) { use = lapb; break; diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 96ba616f59bf..6c2639bb9c19 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -4,7 +4,9 @@ * Copyright (c) 2019, Tessares SA. */ +#ifdef CONFIG_SYSCTL #include <linux/sysctl.h> +#endif #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -15,10 +17,13 @@ static int mptcp_pernet_id; struct mptcp_pernet { +#ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; +#endif - int mptcp_enabled; + u8 mptcp_enabled; unsigned int add_addr_timeout; + u8 checksum_enabled; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -36,15 +41,30 @@ unsigned int mptcp_get_add_addr_timeout(struct net *net) return mptcp_get_pernet(net)->add_addr_timeout; } +int mptcp_is_checksum_enabled(struct net *net) +{ + return mptcp_get_pernet(net)->checksum_enabled; +} + +static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) +{ + pernet->mptcp_enabled = 1; + pernet->add_addr_timeout = TCP_RTO_MAX; + pernet->checksum_enabled = 0; +} + +#ifdef CONFIG_SYSCTL static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, /* users with CAP_NET_ADMIN or root (not and) can change this * value, same as other sysctl or the 'net' tree. */ - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, { .procname = "add_addr_timeout", @@ -52,15 +72,17 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "checksum_enabled", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, {} }; -static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) -{ - pernet->mptcp_enabled = 1; - pernet->add_addr_timeout = TCP_RTO_MAX; -} - static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { struct ctl_table_header *hdr; @@ -75,6 +97,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; + table[2].data = &pernet->checksum_enabled; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) @@ -100,6 +123,17 @@ static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) kfree(table); } +#else + +static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) +{ + return 0; +} + +static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {} + +#endif /* CONFIG_SYSCTL */ + static int __net_init mptcp_net_init(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index eb2dc6dbe212..e7e60bc1fb96 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -25,6 +25,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index f0da4f060fe1..92e56c0cfbdd 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -18,6 +18,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */ diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f16d9b5ee978..8f88ddeab6a2 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -144,6 +144,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_write_seq = READ_ONCE(msk->write_seq); info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); unlock_sock_fast(sk, slow); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 9b263f27ce9b..25189595ed1d 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -44,7 +44,20 @@ static void mptcp_parse_option(const struct sk_buff *skb, else expected_opsize = TCPOLEN_MPTCP_MPC_SYN; } - if (opsize != expected_opsize) + + /* Cfr RFC 8684 Section 3.3.0: + * If a checksum is present but its use had + * not been negotiated in the MP_CAPABLE handshake, the receiver MUST + * close the subflow with a RST, as it is not behaving as negotiated. + * If a checksum is not present when its use has been negotiated, the + * receiver MUST close the subflow with a RST, as it is considered + * broken + * We parse even option with mismatching csum presence, so that + * later in subflow_data_ready we can trigger the reset. + */ + if (opsize != expected_opsize && + (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || + opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ @@ -66,16 +79,9 @@ static void mptcp_parse_option(const struct sk_buff *skb, * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." - * - * Section 3.3.0: - * "If a checksum is not present when its use has been - * negotiated, the receiver MUST close the subflow with a RST as - * it is considered broken." - * - * We don't implement DSS checksum - fall back to TCP. */ if (flags & MPTCP_CAP_CHECKSUM_REQD) - break; + mp_opt->csum_reqd = 1; mp_opt->mp_capable = 1; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { @@ -86,7 +92,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } - if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used @@ -98,9 +104,14 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } - pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum_reqd = 1; + ptr += 2; + } + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u", version, flags, opsize, mp_opt->sndr_key, - mp_opt->rcvr_key, mp_opt->data_len); + mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: @@ -171,10 +182,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } - /* RFC 6824, Section 3.3: - * If a checksum is present, but its use had - * not been negotiated in the MP_CAPABLE handshake, - * the checksum field MUST be ignored. + /* Always parse any csum presence combination, we will enforce + * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) @@ -209,9 +218,15 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; - pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { + mp_opt->csum_reqd = 1; + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + ptr += 2; + } + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", mp_opt->data_seq, mp_opt->subflow_seq, - mp_opt->data_len); + mp_opt->data_len, mp_opt->csum_reqd, mp_opt->csum); } break; @@ -323,9 +338,12 @@ static void mptcp_parse_option(const struct sk_buff *skb, } } -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; @@ -341,6 +359,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->dss = 0; mp_opt->mp_prio = 0; mp_opt->reset = 0; + mp_opt->csum_reqd = READ_ONCE(msk->csum_enabled); length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -382,6 +401,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { @@ -437,8 +457,10 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; + u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than @@ -467,16 +489,26 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ - if (data_len > 0) - *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); - else + if (data_len > 0) { + len = TCPOLEN_MPTCP_MPC_ACK_DATA; + if (opts->csum_reqd) { + /* we need to propagate more info to csum the pseudo hdr */ + opts->ext_copy.data_seq = mpext->data_seq; + opts->ext_copy.subflow_seq = mpext->subflow_seq; + opts->ext_copy.csum = mpext->csum; + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } + *size = ALIGN(len, 4); + } else { *size = TCPOLEN_MPTCP_MPC_ACK; + } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", subflow, subflow->local_key, subflow->remote_key, @@ -537,18 +569,21 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool ret = false; u64 ack_seq; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { - unsigned int map_size; + unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; - map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; + if (mpext) { + if (opts->csum_reqd) + map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; - remaining -= map_size; - dss_size = map_size; - if (mpext) opts->ext_copy = *mpext; + } + remaining -= map_size; + dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); ret = true; @@ -791,6 +826,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; + opts->csum_reqd = subflow_req->csum_reqd; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); @@ -1009,7 +1045,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return; } - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; @@ -1101,6 +1137,10 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; + mpext->csum_reqd = mp_opt.csum_reqd; + + if (mpext->csum_reqd) + mpext->csum = mp_opt.csum; } } @@ -1120,25 +1160,50 @@ static void mptcp_set_rwin(const struct tcp_sock *tp) WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } +static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +{ + struct csum_pseudo_header header; + __wsum csum; + + /* cfr RFC 8684 3.3.1.: + * the data sequence number used in the pseudo-header is + * always the 64-bit value, irrespective of what length is used in the + * DSS option itself. + */ + header.data_seq = cpu_to_be64(mpext->data_seq); + header.subflow_seq = htonl(mpext->subflow_seq); + header.data_len = htons(mpext->data_len); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum)); + return (__force u16)csum_fold(csum); +} + void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { - u8 len; + u8 len, flag = MPTCP_CAP_HMAC_SHA256; - if (OPTION_MPTCP_MPC_SYN & opts->suboptions) + if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; - else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; - else if (opts->ext_copy.data_len) + } else if (opts->ext_copy.data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; - else + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } else { len = TCPOLEN_MPTCP_MPC_ACK; + } + + if (opts->csum_reqd) + flag |= MPTCP_CAP_CHECKSUM_REQD; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, - MPTCP_CAP_HMAC_SHA256); + flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) @@ -1154,8 +1219,13 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, if (!opts->ext_copy.data_len) goto mp_capable_done; - put_unaligned_be32(opts->ext_copy.data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + if (opts->csum_reqd) { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + mptcp_make_csum(&opts->ext_copy), ptr); + } else { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } ptr += 1; } @@ -1307,6 +1377,9 @@ mp_capable_done: flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; + + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); @@ -1326,8 +1399,13 @@ mp_capable_done: ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; - put_unaligned_be32(mpext->data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + if (opts->csum_reqd) { + put_unaligned_be32(mpext->data_len << 16 | + mptcp_make_csum(mpext), ptr); + } else { + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } } } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2469e06a3a9d..09722598994d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -971,8 +971,14 @@ skip_family: if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); - if (tb[MPTCP_PM_ADDR_ATTR_PORT]) + if (tb[MPTCP_PM_ADDR_ATTR_PORT]) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags must have signal when using port"); + return -EINVAL; + } entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); + } return 0; } @@ -1913,10 +1919,13 @@ static int __net_init pm_nl_init_net(struct net *net) struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); - __reset_counters(pernet); pernet->next_id = 1; - bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); spin_lock_init(&pernet->lock); + + /* No need to initialize other pernet fields, the struct is zeroed at + * allocation time. + */ + return 0; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 632350018fb6..b5f2f504b85b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -39,10 +39,15 @@ struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; + u8 has_rxtstamp:1; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) +enum { + MPTCP_CMSG_TS = BIT(0), +}; + static struct percpu_counter mptcp_sockets_allocated; static void __mptcp_destroy_sock(struct sock *sk); @@ -272,6 +277,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; struct sk_buff *tail; + bool has_rxtstamp; __skb_unlink(skb, &ssk->sk_receive_queue); @@ -289,6 +295,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, sk->sk_forward_alloc += amount; } + has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value @@ -296,6 +304,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; + MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ @@ -1298,6 +1307,18 @@ static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); } +/* note: this always recompute the csum on the whole skb, even + * if we just appended a single frag. More status info needed + */ +static void mptcp_update_data_checksum(struct sk_buff *skb, int added) +{ + struct mptcp_ext *mpext = mptcp_get_ext(skb); + __wsum csum = ~csum_unfold(mpext->csum); + int offset = skb->len - added; + + mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1392,10 +1413,14 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (zero_window_probe) { mptcp_subflow_ctx(ssk)->rel_write_seq += ret; mpext->frozen = 1; - ret = 0; + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); tcp_push_pending_frames(ssk); + return 0; } out: + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; } @@ -1770,7 +1795,9 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, - size_t len, int flags) + size_t len, int flags, + struct scm_timestamping_internal *tss, + int *cmsg_flags) { struct sk_buff *skb, *tmp; int copied = 0; @@ -1790,6 +1817,11 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, } } + if (MPTCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + *cmsg_flags |= MPTCP_CMSG_TS; + } + copied += count; if (count < data_len) { @@ -1980,7 +2012,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - int copied = 0; + struct scm_timestamping_internal tss; + int copied = 0, cmsg_flags = 0; int target; long timeo; @@ -2002,7 +2035,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int bytes_read; - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags); + bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2083,6 +2116,11 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, set_bit(MPTCP_DATA_READY, &msk->flags); } out_err: + if (cmsg_flags && copied >= 0) { + if (cmsg_flags & MPTCP_CMSG_TS) + tcp_recv_timestamp(msg, sk, &tss); + } + pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), skb_queue_empty_lockless(&sk->sk_receive_queue), copied); @@ -2339,8 +2377,8 @@ static void __mptcp_retrans(struct sock *sk) /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; - info.limit = dfrag->already_sent; - while (info.sent < dfrag->already_sent) { + info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; + while (info.sent < info.limit) { if (!mptcp_alloc_tx_skb(sk, ssk)) break; @@ -2352,9 +2390,11 @@ static void __mptcp_retrans(struct sock *sk) copied += ret; info.sent += ret; } - if (copied) + if (copied) { + dfrag->already_sent = max(dfrag->already_sent, info.sent); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + } mptcp_set_timeout(sk, ssk); release_sock(ssk); @@ -2433,6 +2473,7 @@ static int __mptcp_init_sock(struct sock *sk) msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; + WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_init(msk); @@ -2773,6 +2814,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->token = subflow_req->token; msk->subflow = NULL; WRITE_ONCE(msk->fully_established, false); + if (mp_opt->csum_reqd) + WRITE_ONCE(msk->csum_enabled, true); msk->write_seq = subflow_req->idsn + 1; msk->snd_nxt = msk->write_seq; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 385796f0ef19..160d716ebc2b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -68,6 +68,8 @@ #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 +#define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) + /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) #define MPTCPOPT_HMAC_LEN 20 @@ -124,6 +126,7 @@ struct mptcp_options_received { u64 data_seq; u32 subflow_seq; u16 data_len; + __sum16 csum; u16 mp_capable : 1, mp_join : 1, fastclose : 1, @@ -133,6 +136,7 @@ struct mptcp_options_received { rm_addr : 1, mp_prio : 1, echo : 1, + csum_reqd : 1, backup : 1; u32 token; u32 nonce; @@ -234,6 +238,7 @@ struct mptcp_sock { bool snd_data_fin_enable; bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ + bool csum_enabled; spinlock_t join_list_lock; struct sock *ack_hint; struct work_struct work; @@ -335,11 +340,19 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } +struct csum_pseudo_header { + __be64 data_seq; + __be32 subflow_seq; + __be16 data_len; + __sum16 csum; +}; + struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, - backup : 1; + backup : 1, + csum_reqd : 1; u8 local_id; u8 remote_id; u64 local_key; @@ -386,6 +399,8 @@ struct mptcp_subflow_context { u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; + __wsum map_data_csum; + u32 map_csum_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, @@ -395,6 +410,8 @@ struct mptcp_subflow_context { pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, + map_csum_reqd : 1, + map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, @@ -524,6 +541,7 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su int mptcp_is_enabled(struct net *net); unsigned int mptcp_get_add_addr_timeout(struct net *net); +int mptcp_is_checksum_enabled(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); @@ -575,7 +593,8 @@ int __init mptcp_proto_v6_init(void); struct sock *mptcp_sk_clone(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct request_sock *req); -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); @@ -626,6 +645,8 @@ static inline void mptcp_write_space(struct sock *sk) void mptcp_destroy_common(struct mptcp_sock *msk); +#define MPTCP_TOKEN_MAX_RETRIES 4 + void __init mptcp_token_init(void); static inline void mptcp_token_init_request(struct request_sock *req) { diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a79798189599..092d1f635d27 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -140,6 +140,43 @@ static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val) mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val); } +static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val) +{ + sockptr_t optval = KERNEL_SOCKPTR(&val); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int ret; + + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, + optval, sizeof(val)); + if (ret) + return ret; + + lock_sock(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + switch (optname) { + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + sock_set_timestamp(sk, optname, !!val); + break; + case SO_TIMESTAMPING_NEW: + case SO_TIMESTAMPING_OLD: + sock_set_timestamping(sk, optname, val); + break; + } + + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); + return 0; +} + static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -164,6 +201,13 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, case SO_INCOMING_CPU: mptcp_so_incoming_cpu(msk, val); return 0; + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val); } return -ENOPROTOOPT; @@ -251,9 +295,23 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_MARK: case SO_INCOMING_CPU: case SO_DEBUG: + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); case SO_LINGER: return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); + case SO_RCVLOWAT: + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + case SO_BUSY_POLL: + case SO_PREFER_BUSY_POLL: + case SO_BUSY_POLL_BUDGET: + /* No need to copy: only relevant for msk */ + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); case SO_NO_CHECK: case SO_DONTROUTE: case SO_BROADCAST: @@ -267,7 +325,24 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return 0; } - return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); + /* SO_OOBINLINE is not supported, let's avoid the related mess + * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, + * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, + * we must be careful with subflows + * + * SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks + * explicitly the sk_protocol field + * + * SO_PEEK_OFF is unsupported, as it is for plain TCP + * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows + * SO_CNX_ADVICE is currently unsupported, could possibly be relevant, + * but likely needs careful design + * + * SO_ZEROCOPY is currently unsupported, TODO in sndmsg + * SO_TXTIME is currently unsupported + */ + + return -EOPNOTSUPP; } static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, @@ -299,72 +374,6 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, static bool mptcp_supported_sockopt(int level, int optname) { - if (level == SOL_SOCKET) { - switch (optname) { - case SO_DEBUG: - case SO_REUSEPORT: - case SO_REUSEADDR: - - /* the following ones need a better implementation, - * but are quite common we want to preserve them - */ - case SO_BINDTODEVICE: - case SO_SNDBUF: - case SO_SNDBUFFORCE: - case SO_RCVBUF: - case SO_RCVBUFFORCE: - case SO_KEEPALIVE: - case SO_PRIORITY: - case SO_LINGER: - case SO_TIMESTAMP_OLD: - case SO_TIMESTAMP_NEW: - case SO_TIMESTAMPNS_OLD: - case SO_TIMESTAMPNS_NEW: - case SO_TIMESTAMPING_OLD: - case SO_TIMESTAMPING_NEW: - case SO_RCVLOWAT: - case SO_RCVTIMEO_OLD: - case SO_RCVTIMEO_NEW: - case SO_SNDTIMEO_OLD: - case SO_SNDTIMEO_NEW: - case SO_MARK: - case SO_INCOMING_CPU: - case SO_BINDTOIFINDEX: - case SO_BUSY_POLL: - case SO_PREFER_BUSY_POLL: - case SO_BUSY_POLL_BUDGET: - - /* next ones are no-op for plain TCP */ - case SO_NO_CHECK: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_BSDCOMPAT: - case SO_PASSCRED: - case SO_PASSSEC: - case SO_RXQ_OVFL: - case SO_WIFI_STATUS: - case SO_NOFCS: - case SO_SELECT_ERR_QUEUE: - return true; - } - - /* SO_OOBINLINE is not supported, let's avoid the related mess */ - /* SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, - * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, - * we must be careful with subflows - */ - /* SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks - * explicitly the sk_protocol field - */ - /* SO_PEEK_OFF is unsupported, as it is for plain TCP */ - /* SO_MAX_PACING_RATE is unsupported, we must be careful with subflows */ - /* SO_CNX_ADVICE is currently unsupported, could possibly be relevant, - * but likely needs careful design - */ - /* SO_ZEROCOPY is currently unsupported, TODO in sndmsg */ - /* SO_TXTIME is currently unsupported */ - return false; - } if (level == SOL_IP) { switch (optname) { /* should work fine */ @@ -574,12 +583,12 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, pr_debug("msk=%p", msk); - if (!mptcp_supported_sockopt(level, optname)) - return -ENOPROTOOPT; - if (level == SOL_SOCKET) return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + if (!mptcp_supported_sockopt(level, optname)) + return -ENOPROTOOPT; + /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index be1de4084196..8976ff586b87 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -108,6 +108,7 @@ static void subflow_init_req(struct request_sock *req, const struct sock *sk_lis subflow_req->mp_capable = 0; subflow_req->mp_join = 0; + subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } @@ -150,7 +151,7 @@ static int subflow_check_req(struct request_sock *req, return -EINVAL; #endif - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); if (mp_opt.mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -162,7 +163,7 @@ static int subflow_check_req(struct request_sock *req, } if (mp_opt.mp_capable && listener->request_mptcp) { - int err, retries = 4; + int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: @@ -247,7 +248,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, int err; subflow_init_req(req, sk_listener); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); if (mp_opt.mp_capable && mp_opt.mp_join) return -EINVAL; @@ -394,7 +395,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (subflow->request_mptcp) { if (!mp_opt.mp_capable) { MPTCP_INC_STATS(sock_net(sk), @@ -404,6 +405,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto fallback; } + if (mp_opt.csum_reqd) + WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; @@ -430,15 +433,15 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto do_reset; } + if (!mptcp_finish_join(sk)) + goto do_reset; + subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, hmac); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); - if (!mptcp_finish_join(sk)) - goto do_reset; - subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); @@ -638,7 +641,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * reordered MPC will cause fallback, but we don't have other * options. */ - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!mp_opt.mp_capable) { fallback = true; goto create_child; @@ -648,7 +651,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); @@ -824,10 +827,92 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) return true; } +static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, + bool csum_reqd) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct csum_pseudo_header header; + u32 offset, seq, delta; + __wsum csum; + int len; + + if (!csum_reqd) + return MAPPING_OK; + + /* mapping already validated on previous traversal */ + if (subflow->map_csum_len == subflow->map_data_len) + return MAPPING_OK; + + /* traverse the receive queue, ensuring it contains a full + * DSS mapping and accumulating the related csum. + * Preserve the accoumlate csum across multiple calls, to compute + * the csum only once + */ + delta = subflow->map_data_len - subflow->map_csum_len; + for (;;) { + seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; + offset = seq - TCP_SKB_CB(skb)->seq; + + /* if the current skb has not been accounted yet, csum its contents + * up to the amount covered by the current DSS + */ + if (offset < skb->len) { + __wsum csum; + + len = min(skb->len - offset, delta); + csum = skb_checksum(skb, offset, len, 0); + subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, + subflow->map_csum_len); + + delta -= len; + subflow->map_csum_len += len; + } + if (delta == 0) + break; + + if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { + /* if this subflow is closed, the partial mapping + * will be never completed; flush the pending skbs, so + * that subflow_sched_work_if_closed() can kick in + */ + if (unlikely(ssk->sk_state == TCP_CLOSE)) + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); + + /* not enough data to validate the csum */ + return MAPPING_EMPTY; + } + + /* the DSS mapping for next skbs will be validated later, + * when a get_mapping_status call will process such skb + */ + skb = skb->next; + } + + /* note that 'map_data_len' accounts only for the carried data, does + * not include the eventual seq increment due to the data fin, + * while the pseudo header requires the original DSS data len, + * including that + */ + header.data_seq = cpu_to_be64(subflow->map_seq); + header.subflow_seq = htonl(subflow->map_subflow_seq); + header.data_len = htons(subflow->map_data_len + subflow->map_data_fin); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), subflow->map_data_csum); + if (unlikely(csum_fold(csum))) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); + return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; + } + + return MAPPING_OK; +} + static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -920,9 +1005,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk, /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && - subflow->map_data_len == data_len) { + subflow->map_data_len == data_len && + subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + goto validate_csum; } /* If this skb data are fully covered by the current mapping, @@ -934,17 +1020,27 @@ static enum mapping_status get_mapping_status(struct sock *ssk, } /* will validate the next map after consuming the current one */ - return MAPPING_OK; + goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; + subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; - pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", + subflow->map_csum_reqd = mpext->csum_reqd; + subflow->map_csum_len = 0; + subflow->map_data_csum = csum_unfold(mpext->csum); + + /* Cfr RFC 8684 Section 3.3.0 */ + if (unlikely(subflow->map_csum_reqd != csum_reqd)) + return MAPPING_INVALID; + + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", subflow->map_seq, subflow->map_subflow_seq, - subflow->map_data_len); + subflow->map_data_len, subflow->map_csum_reqd, + subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure @@ -954,7 +1050,9 @@ validate_seq: return MAPPING_INVALID; skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + +validate_csum: + return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 8f0270a780ce..a98e554b034f 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -33,7 +33,6 @@ #include <net/mptcp.h> #include "protocol.h" -#define TOKEN_MAX_RETRIES 4 #define TOKEN_MAX_CHAIN_LEN 4 struct token_bucket { @@ -153,12 +152,9 @@ int mptcp_token_new_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - int retries = TOKEN_MAX_RETRIES; + int retries = MPTCP_TOKEN_MAX_RETRIES; struct token_bucket *bucket; - pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); - again: mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, &subflow->idsn); @@ -172,6 +168,9 @@ again: goto again; } + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); + WRITE_ONCE(msk->token, subflow->token); __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); bucket->chain_len++; diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 49031f804276..cbbb0de4750a 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -238,7 +238,7 @@ struct ncsi_package { struct ncsi_dev_priv *ndp; /* NCSI device */ spinlock_t lock; /* Protect the package */ unsigned int channel_num; /* Number of channels */ - struct list_head channels; /* List of chanels */ + struct list_head channels; /* List of channels */ struct list_head node; /* Form list of packages */ bool multi_channel; /* Enable multiple channels */ @@ -339,7 +339,7 @@ struct ncsi_cmd_arg { unsigned char type; /* Command in the NCSI packet */ unsigned char id; /* Request ID (sequence number) */ unsigned char package; /* Destination package ID */ - unsigned char channel; /* Detination channel ID or 0x1f */ + unsigned char channel; /* Destination channel ID or 0x1f */ unsigned short payload; /* Command packet payload length */ unsigned int req_flags; /* NCSI request properties */ union { diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index ffff8da707b8..ca04b6df1341 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -627,7 +627,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, return 0; } -/* Find an outstanding VLAN tag and constuct a "Set VLAN Filter - Enable" +/* Find an outstanding VLAN tag and construct a "Set VLAN Filter - Enable" * packet. */ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 56a2531a3402..54395266339d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -19,6 +19,16 @@ config NETFILTER_FAMILY_BRIDGE config NETFILTER_FAMILY_ARP bool +config NETFILTER_NETLINK_HOOK + tristate "Netfilter base hook dump support" + depends on NETFILTER_ADVANCED + depends on NF_TABLES + select NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + to list the base netfilter hooks via NFNETLINK. + This is helpful for debugging. + config NETFILTER_NETLINK_ACCT tristate "Netfilter NFACCT over NFNETLINK interface" depends on NETFILTER_ADVANCED @@ -816,7 +826,7 @@ config NETFILTER_XT_TARGET_CLASSIFY the priority of a packet. Some qdiscs can use this value for classification, among these are: - atm, cbq, dsmark, pfifo_fast, htb, prio + atm, cbq, dsmark, pfifo_fast, htb, prio To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e80e010354b1..87112dad1fd4 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o +obj-$(CONFIG_NETFILTER_NETLINK_HOOK) += nfnetlink_hook.o # connection tracking obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index de2d20c37cda..16ae92054baa 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1685,8 +1685,8 @@ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { }; static int -call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, - struct nlattr *tb[], enum ipset_adt adt, +call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, + struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 flags, bool use_lineno) { int ret; @@ -1738,8 +1738,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, *errline = lineno; - netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1783,7 +1782,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, flags, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; @@ -1794,7 +1793,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1859,7 +1858,6 @@ static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -1885,12 +1883,7 @@ static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -1945,12 +1938,7 @@ static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info, nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -1971,7 +1959,6 @@ static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, { struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0; if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; @@ -1990,12 +1977,7 @@ static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2014,7 +1996,6 @@ static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -2038,12 +2019,7 @@ static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2065,7 +2041,6 @@ static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_INDEX])) @@ -2091,12 +2066,7 @@ static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index d61886874940..271da8447b29 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -318,7 +318,7 @@ config IP_VS_MH_TAB_INDEX comment 'IPVS application helper' config IP_VS_FTP - tristate "FTP protocol helper" + tristate "FTP protocol helper" depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ NF_CONNTRACK_FTP select IP_VS_NFCT diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e0befcf8113a..96ba19fc8155 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -55,8 +55,6 @@ #include "nf_internals.h" -extern unsigned int nf_conntrack_net_id; - __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; EXPORT_SYMBOL_GPL(nf_conntrack_locks); @@ -87,8 +85,6 @@ static __read_mostly bool nf_conntrack_locks_all; static struct conntrack_gc_work conntrack_gc_work; -extern unsigned int nf_conntrack_net_id; - void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { /* 1) Acquire the lock */ @@ -1404,7 +1400,7 @@ static void gc_worker(struct work_struct *work) continue; net = nf_ct_net(tmp); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) < nf_conntrack_max95) continue; @@ -1484,7 +1480,7 @@ __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); unsigned int ct_count; struct nf_conn *ct; @@ -1556,7 +1552,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); kmem_cache_free(nf_conntrack_cachep, ct); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); smp_mb__before_atomic(); atomic_dec(&cnet->count); @@ -1614,7 +1610,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, GFP_ATOMIC); local_bh_disable(); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); @@ -2317,7 +2313,7 @@ __nf_ct_unconfirmed_destroy(struct net *net) void nf_ct_unconfirmed_destroy(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); might_sleep(); @@ -2333,7 +2329,7 @@ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct iter_data d; might_sleep(); @@ -2367,7 +2363,7 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) down_read(&net_rwsem); for_each_net(net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) == 0) continue; @@ -2449,7 +2445,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); nf_ct_iterate_cleanup(kill_all, net, 0, 0); if (atomic_read(&cnet->count) != 0) @@ -2733,7 +2729,7 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); int ret = -ENOMEM; int cpu; diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 759d87aef95f..296e4a171bd1 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -27,8 +27,6 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> -extern unsigned int nf_conntrack_net_id; - static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) @@ -348,7 +346,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (state == NFCT_ECACHE_DESTROY_FAIL && !delayed_work_pending(&cnet->ecache_dwork)) { @@ -371,7 +369,7 @@ static const struct nf_ct_ext_type event_extend = { void nf_conntrack_ecache_pernet_init(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); net->ct.sysctl_events = nf_ct_events; cnet->ct_net = &net->ct; @@ -380,7 +378,7 @@ void nf_conntrack_ecache_pernet_init(struct net *net) void nf_conntrack_ecache_pernet_fini(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); cancel_delayed_work_sync(&cnet->ecache_dwork); } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index efdd391b3f72..1e851bc2e61a 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -43,8 +43,6 @@ unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static unsigned int nf_ct_expect_hashrnd __read_mostly; -extern unsigned int nf_conntrack_net_id; - /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) @@ -58,7 +56,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, hlist_del_rcu(&exp->hnode); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); cnet->expect_count--; hlist_del_rcu(&exp->lnode); @@ -123,7 +121,7 @@ __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; @@ -164,7 +162,7 @@ nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; @@ -397,7 +395,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); @@ -468,7 +466,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, } } - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index aafaff00baf1..2eb31ffb3d14 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -194,7 +194,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, if (tcpdatalen == 4) { /* Separate TPKT header */ /* Netmeeting sends TPKT header and data separately */ pr_debug("nf_ct_h323: separate TPKT header indicates " - "there will be TPKT data of %hu bytes\n", + "there will be TPKT data of %d bytes\n", tpktlen - 4); info->tpkt_len[dir] = tpktlen - 4; return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index ac396cc8bfae..ae4488a13c70 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -43,8 +43,6 @@ MODULE_PARM_DESC(nf_conntrack_helper, static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; -extern unsigned int nf_conntrack_net_id; - /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) @@ -214,7 +212,7 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); static struct nf_conntrack_helper * nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (!cnet->sysctl_auto_assign_helper) { if (cnet->auto_assign_helper_warned) @@ -560,7 +558,7 @@ static const struct nf_ct_ext_type helper_extend = { void nf_conntrack_helper_pernet_init(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8690fc07030f..4e1a9dba7077 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1528,7 +1528,7 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u8 family = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; @@ -1541,12 +1541,12 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, - nfmsg->nfgen_family, &zone); + family, &zone); else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, - nfmsg->nfgen_family, &zone); + family, &zone); else { - u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; + u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC; return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, @@ -1586,8 +1586,7 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; @@ -1628,9 +1627,8 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, ct = nf_ct_tuplehash_to_ctrack(h); - err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_put(ct); return -ENOMEM; } @@ -1640,21 +1638,12 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); - if (err <= 0) - goto free; - - err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static int ctnetlink_done_list(struct netlink_callback *cb) @@ -2373,10 +2362,9 @@ static int ctnetlink_new_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_zone zone; struct nf_conn *ct; int err; @@ -2590,21 +2578,12 @@ static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), sock_net(skb->sk)); - if (err <= 0) - goto free; - - err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { @@ -3278,8 +3257,7 @@ static int ctnetlink_get_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; @@ -3329,11 +3307,10 @@ static int ctnetlink_get_expect(struct sk_buff *skb, } } - err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_expect_put(exp); - goto out; + return -ENOMEM; } rcu_read_lock(); @@ -3342,21 +3319,12 @@ static int ctnetlink_get_expect(struct sk_buff *skb, exp); rcu_read_unlock(); nf_ct_expect_put(exp); - if (err <= 0) - goto free; - - err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) @@ -3378,8 +3346,7 @@ static int ctnetlink_del_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; @@ -3630,8 +3597,7 @@ static int ctnetlink_new_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index dc9ca12b0489..be14e0bea4c8 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -42,8 +42,6 @@ #include <net/ipv6.h> #include <net/inet_frag.h> -extern unsigned int nf_conntrack_net_id; - static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL @@ -446,7 +444,7 @@ static struct nf_ct_bridge_info *nf_ct_bridge_info; static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); bool fixup_needed = false, retry = true; int err = 0; retry: @@ -531,7 +529,7 @@ retry: static void nf_ct_netns_do_put(struct net *net, u8 nfproto) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 34e22416a721..de840fc41a2e 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1441,6 +1441,11 @@ void nf_conntrack_tcp_init_net(struct net *net) * will be started. */ tn->tcp_max_retrans = 3; + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + tn->offload_timeout = 30 * HZ; + tn->offload_pickup = 120 * HZ; +#endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index af402f458ee0..68911fcaa0f1 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -270,6 +270,11 @@ void nf_conntrack_udp_init_net(struct net *net) for (i = 0; i < UDP_CT_MAX; i++) un->timeouts[i] = udp_timeouts[i]; + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + un->offload_timeout = 30 * HZ; + un->offload_pickup = 30 * HZ; +#endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index aaa55246d0ca..f57a951c9b5e 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -512,9 +512,7 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) u32 nf_conntrack_count(const struct net *net) { - const struct nf_conntrack_net *cnet; - - cnet = net_generic(net, nf_conntrack_net_id); + const struct nf_conntrack_net *cnet = nf_ct_pernet(net); return atomic_read(&cnet->count); } @@ -575,11 +573,19 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP, +#endif NF_SYSCTL_CT_PROTO_TCP_LOOSE, NF_SYSCTL_CT_PROTO_TCP_LIBERAL, NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD, + NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP, +#endif NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, #ifdef CONFIG_NF_CT_PROTO_SCTP @@ -762,6 +768,20 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = { + .procname = "nf_flowtable_tcp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP] = { + .procname = "nf_flowtable_tcp_pickup", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", .maxlen = sizeof(u8), @@ -796,6 +816,20 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#if IS_ENABLED(CONFIG_NFT_FLOW_OFFLOAD) + [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = { + .procname = "nf_flowtable_udp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP] = { + .procname = "nf_flowtable_udp_pickup", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { .procname = "nf_conntrack_icmp_timeout", .maxlen = sizeof(unsigned int), @@ -971,6 +1005,12 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, XASSIGN(LIBERAL, &tn->tcp_be_liberal); XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans); #undef XASSIGN + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout; + table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP].data = &tn->offload_pickup; +#endif + } static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, @@ -1032,7 +1072,7 @@ static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, static int nf_conntrack_standalone_init_sysctl(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; @@ -1059,6 +1099,10 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout; + table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP].data = &un->offload_pickup; +#endif nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); @@ -1085,7 +1129,7 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 1d02650dd715..1e50908b1b7e 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -178,12 +178,10 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) tcp->seen[1].td_maxwin = 0; } -#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) -#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) - static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; + struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); unsigned int timeout; @@ -191,12 +189,17 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) if (!l4proto) return; - if (l4num == IPPROTO_TCP) - timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; - else if (l4num == IPPROTO_UDP) - timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; - else + if (l4num == IPPROTO_TCP) { + struct nf_tcp_net *tn = nf_tcp_pernet(net); + + timeout = tn->offload_pickup; + } else if (l4num == IPPROTO_UDP) { + struct nf_udp_net *tn = nf_udp_pernet(net); + + timeout = tn->offload_pickup; + } else { return; + } if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) ct->timeout = nfct_time_stamp + timeout; @@ -268,11 +271,35 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { .automatic_shrinking = true, }; +unsigned long flow_offload_get_timeout(struct flow_offload *flow) +{ + const struct nf_conntrack_l4proto *l4proto; + unsigned long timeout = NF_FLOW_TIMEOUT; + struct net *net = nf_ct_net(flow->ct); + int l4num = nf_ct_protonum(flow->ct); + + l4proto = nf_ct_l4proto_find(l4num); + if (!l4proto) + return timeout; + + if (l4num == IPPROTO_TCP) { + struct nf_tcp_net *tn = nf_tcp_pernet(net); + + timeout = tn->offload_timeout; + } else if (l4num == IPPROTO_UDP) { + struct nf_udp_net *tn = nf_udp_pernet(net); + + timeout = tn->offload_timeout; + } + + return timeout; +} + int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; - flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, @@ -304,7 +331,7 @@ EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload *flow) { - flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); if (likely(!nf_flowtable_hw_offload(flow_table))) return; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 528b2f172684..f92006cec94c 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -937,7 +937,7 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, - lastused + NF_FLOW_TIMEOUT); + lastused + flow_offload_get_timeout(offload->flow)); if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) { if (stats[0].pkts) @@ -1041,7 +1041,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, __s32 delta; delta = nf_flow_timeout_delta(flow->timeout); - if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10)) + if ((delta >= (9 * flow_offload_get_timeout(flow)) / 10)) return; offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bf4d6ec9fc55..d6214242fe7f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -862,10 +862,9 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct sk_buff *skb2; @@ -1068,10 +1067,9 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -1263,10 +1261,9 @@ out: static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -1636,10 +1633,9 @@ done: static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; struct net *net = info->net; struct nft_table *table; @@ -2015,11 +2011,12 @@ static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, const struct nft_chain_hook *hook, struct nft_chain *chain) { - ops->pf = family; - ops->hooknum = hook->num; - ops->priority = hook->priority; - ops->priv = chain; - ops->hook = hook->type->hooks[ops->hooknum]; + ops->pf = family; + ops->hooknum = hook->num; + ops->priority = hook->priority; + ops->priv = chain; + ops->hook = hook->type->hooks[ops->hooknum]; + ops->hook_ops_type = NF_HOOK_OP_NF_TABLES; } static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, @@ -2371,10 +2368,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; const struct nlattr *attr; @@ -2469,10 +2465,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -3096,10 +3091,9 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; const struct nft_rule *rule; struct net *net = info->net; @@ -3237,13 +3231,12 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; unsigned int size, i, n, ulen = 0, usize = 0; u8 genmask = nft_genmask_next(info->net); struct nft_rule *rule, *old_rule = NULL; struct nft_expr_info *expr_info = NULL; - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; struct nft_flow_rule *flow; struct nft_userdata *udata; @@ -3477,15 +3470,15 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; - int family = nfmsg->nfgen_family, err = 0; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; struct nft_table *table; struct nft_rule *rule; struct nft_ctx ctx; + int err = 0; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, NETLINK_CB(skb).portid); @@ -3665,30 +3658,6 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, }; -static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack, - u8 genmask, u32 nlpid) -{ - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family; - struct nft_table *table = NULL; - - if (nla[NFTA_SET_TABLE] != NULL) { - table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask, nlpid); - if (IS_ERR(table)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); - return PTR_ERR(table); - } - } - - nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); - return 0; -} - static struct nft_set *nft_set_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask) { @@ -4068,20 +4037,26 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; + struct nft_table *table = NULL; struct net *net = info->net; const struct nft_set *set; struct sk_buff *skb2; struct nft_ctx ctx; int err; - /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, 0); - if (err < 0) - return err; + if (nla[NFTA_SET_TABLE]) { + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask, 0); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); + return PTR_ERR(table); + } + } + + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -4096,12 +4071,12 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, } /* Only accept unspec with dump */ - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -4189,11 +4164,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u32 ktype, dtype, flags, policy, gc_int, objtype; struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_set_ops *ops; struct nft_expr *expr = NULL; struct net *net = info->net; @@ -4494,31 +4468,31 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - int err; - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; - if (nla[NFTA_SET_TABLE] == NULL) - return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); + return PTR_ERR(table); + } if (nla[NFTA_SET_HANDLE]) { attr = nla[NFTA_SET_HANDLE]; - set = nft_set_lookup_byhandle(ctx.table, attr, genmask); + set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_SET_NAME]; - set = nft_set_lookup(ctx.table, attr, genmask); + set = nft_set_lookup(table, attr, genmask); } if (IS_ERR(set)) { @@ -4532,6 +4506,8 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, return -EBUSY; } + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + return nft_delset(&ctx, set); } @@ -4733,28 +4709,6 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack, - u8 genmask, u32 nlpid) -{ - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family; - struct nft_table *table; - - table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, - genmask, nlpid); - if (IS_ERR(table)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); - return PTR_ERR(table); - } - - nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); - return 0; -} - static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_ext *ext) @@ -5212,21 +5166,27 @@ static int nf_tables_getsetelem(struct sk_buff *skb, { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; + struct nft_table *table; struct nft_set *set; struct nlattr *attr; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, @@ -5995,8 +5955,10 @@ static int nf_tables_newsetelem(struct sk_buff *skb, struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err; @@ -6004,12 +5966,14 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET], nla[NFTA_SET_ELEM_LIST_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -6017,6 +5981,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); if (err < 0) @@ -6024,7 +5990,7 @@ static int nf_tables_newsetelem(struct sk_buff *skb, } if (nft_net->validate_state == NFT_VALIDATE_DO) - return nft_table_validate(net, ctx.table); + return nft_table_validate(net, table); return 0; } @@ -6262,23 +6228,29 @@ static int nf_tables_delsetelem(struct sk_buff *skb, { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return nft_set_flush(&ctx, set, genmask); @@ -6546,11 +6518,10 @@ err_free_trans: static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nft_object_type *type; - int family = nfmsg->nfgen_family; struct net *net = info->net; struct nft_table *table; struct nft_object *obj; @@ -6802,10 +6773,9 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct nft_object *obj; @@ -6892,10 +6862,9 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -7323,12 +7292,11 @@ static int nf_tables_newflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; struct nft_flowtable_hook flowtable_hook; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nf_flowtable_type *type; - int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct nft_hook *hook, *next; struct net *net = info->net; @@ -7512,10 +7480,9 @@ static int nf_tables_delflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct net *net = info->net; const struct nlattr *attr; @@ -7707,9 +7674,8 @@ static int nf_tables_getflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; const struct nft_table *table; struct net *net = info->net; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index dbc2e945c98e..7780342e2f2d 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -81,7 +81,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, else { if (!pkt->tprot_set) return false; - ptr = skb_network_header(skb) + pkt->xt.thoff; + ptr = skb_network_header(skb) + nft_thoff(pkt); } ptr += priv->offset; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 0cf3278007ba..e4fe2f0780eb 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -113,17 +113,17 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, int off = skb_network_offset(skb); unsigned int len, nh_end; - nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len; + nh_end = pkt->tprot_set ? nft_thoff(pkt) : skb->len; len = min_t(unsigned int, nh_end - skb_network_offset(skb), NFT_TRACETYPE_NETWORK_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) return -1; if (pkt->tprot_set) { - len = min_t(unsigned int, skb->len - pkt->xt.thoff, + len = min_t(unsigned int, skb->len - nft_thoff(pkt), NFT_TRACETYPE_TRANSPORT_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, - pkt->xt.thoff, len)) + nft_thoff(pkt), len)) return -1; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e8dbd8379027..7e2c8dd01408 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -68,6 +68,7 @@ static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = { [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper", [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables", [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat", + [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook", }; static const int nfnl_group2type[NFNLGRP_MAX+1] = { @@ -256,6 +257,7 @@ replay: .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, + .nfmsg = nlmsg_data(nlh), .extack = extack, }; @@ -491,6 +493,7 @@ replay_abort: .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, + .nfmsg = nlmsg_data(nlh), .extack = &extack, }; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 3c8cf8748cfb..505f46a32173 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -314,14 +314,11 @@ static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info, kfree_skb(skb2); break; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 752b10cae524..5c622f55c9d6 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -667,14 +667,10 @@ static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info, break; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 38848ad68899..c57673d499be 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -287,14 +287,11 @@ static int cttimeout_get_timeout(struct sk_buff *skb, kfree_skb(skb2); break; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } @@ -427,9 +424,9 @@ static int cttimeout_default_get(struct sk_buff *skb, const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; - int ret, err; __u16 l3num; __u8 l4num; + int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; @@ -438,9 +435,8 @@ static int cttimeout_default_get(struct sk_buff *skb, l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); - err = -EOPNOTSUPP; if (l4proto->l4proto != l4num) - goto err; + return -EOPNOTSUPP; switch (l4proto->l4proto) { case IPPROTO_ICMP: @@ -480,13 +476,11 @@ static int cttimeout_default_get(struct sk_buff *skb, } if (!timeouts) - goto err; + return -EOPNOTSUPP; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { - err = -ENOMEM; - goto err; - } + if (!skb2) + return -ENOMEM; ret = cttimeout_default_fill_info(info->net, skb2, NETLINK_CB(skb).portid, @@ -496,18 +490,10 @@ static int cttimeout_default_get(struct sk_buff *skb, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); - err = -ENOMEM; - goto err; + return -ENOMEM; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; -err: - return err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c new file mode 100644 index 000000000000..58fda6ac663b --- /dev/null +++ b/net/netfilter/nfnetlink_hook.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021 Red Hat GmbH + * + * Author: Florian Westphal <fw@strlen.de> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_hook.h> + +#include <net/netfilter/nf_tables.h> +#include <net/sock.h> + +static const struct nla_policy nfnl_hook_nla_policy[NFNLA_HOOK_MAX + 1] = { + [NFNLA_HOOK_HOOKNUM] = { .type = NLA_U32 }, + [NFNLA_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFNLA_HOOK_DEV] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, + [NFNLA_HOOK_FUNCTION_NAME] = { .type = NLA_NUL_STRING, + .len = KSYM_NAME_LEN, }, + [NFNLA_HOOK_MODULE_NAME] = { .type = NLA_NUL_STRING, + .len = MODULE_NAME_LEN, }, + [NFNLA_HOOK_CHAIN_INFO] = { .type = NLA_NESTED, }, +}; + +static int nf_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *c) +{ + int err; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + rcu_read_unlock(); + err = netlink_dump_start(nlsk, skb, nlh, c); + rcu_read_lock(); + module_put(THIS_MODULE); + + return err; +} + +struct nfnl_dump_hook_data { + char devname[IFNAMSIZ]; + unsigned long headv; + u8 hook; +}; + +static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + const struct nf_hook_ops *ops) +{ + struct net *net = sock_net(nlskb->sk); + struct nlattr *nest, *nest2; + struct nft_chain *chain; + int ret = 0; + + if (ops->hook_ops_type != NF_HOOK_OP_NF_TABLES) + return 0; + + chain = ops->priv; + if (WARN_ON_ONCE(!chain)) + return 0; + + if (!nft_is_active(net, chain)) + return 0; + + nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO); + if (!nest) + return -EMSGSIZE; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, + htonl(NFNL_HOOK_TYPE_NFTABLES)); + if (ret) + goto cancel_nest; + + nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC); + if (!nest2) + goto cancel_nest; + + ret = nla_put_string(nlskb, NFTA_CHAIN_TABLE, chain->table->name); + if (ret) + goto cancel_nest; + + ret = nla_put_string(nlskb, NFTA_CHAIN_NAME, chain->name); + if (ret) + goto cancel_nest; + + nla_nest_end(nlskb, nest2); + nla_nest_end(nlskb, nest); + return ret; + +cancel_nest: + nla_nest_cancel(nlskb, nest); + return -EMSGSIZE; +} + +static int nfnl_hook_dump_one(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + const struct nf_hook_ops *ops, + unsigned int seq) +{ + u16 event = nfnl_msg_type(NFNL_SUBSYS_HOOK, NFNL_MSG_HOOK_GET); + unsigned int portid = NETLINK_CB(nlskb).portid; + struct nlmsghdr *nlh; + int ret = -EMSGSIZE; +#ifdef CONFIG_KALLSYMS + char sym[KSYM_SYMBOL_LEN]; + char *module_name; +#endif + nlh = nfnl_msg_put(nlskb, portid, seq, event, + NLM_F_MULTI, ops->pf, NFNETLINK_V0, 0); + if (!nlh) + goto nla_put_failure; + +#ifdef CONFIG_KALLSYMS + ret = snprintf(sym, sizeof(sym), "%ps", ops->hook); + if (ret < 0 || ret > (int)sizeof(sym)) + goto nla_put_failure; + + module_name = strstr(sym, " ["); + if (module_name) { + char *end; + + module_name += 2; + end = strchr(module_name, ']'); + if (end) { + *end = 0; + + ret = nla_put_string(nlskb, NFNLA_HOOK_MODULE_NAME, module_name); + if (ret) + goto nla_put_failure; + } + } + + ret = nla_put_string(nlskb, NFNLA_HOOK_FUNCTION_NAME, sym); + if (ret) + goto nla_put_failure; +#endif + + ret = nla_put_be32(nlskb, NFNLA_HOOK_HOOKNUM, htonl(ops->hooknum)); + if (ret) + goto nla_put_failure; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_PRIORITY, htonl(ops->priority)); + if (ret) + goto nla_put_failure; + + ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops); + if (ret) + goto nla_put_failure; + + nlmsg_end(nlskb, nlh); + return 0; +nla_put_failure: + nlmsg_trim(nlskb, nlh); + return ret; +} + +static const struct nf_hook_entries * +nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev) +{ + const struct nf_hook_entries *hook_head = NULL; + struct net_device *netdev; + + switch (pf) { + case NFPROTO_IPV4: + if (hook >= ARRAY_SIZE(net->nf.hooks_ipv4)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); + break; + case NFPROTO_IPV6: + if (hook >= ARRAY_SIZE(net->nf.hooks_ipv6)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); + break; + case NFPROTO_ARP: +#ifdef CONFIG_NETFILTER_FAMILY_ARP + if (hook >= ARRAY_SIZE(net->nf.hooks_arp)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_arp[hook]); +#endif + break; + case NFPROTO_BRIDGE: +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + if (hook >= ARRAY_SIZE(net->nf.hooks_bridge)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); +#endif + break; +#if IS_ENABLED(CONFIG_DECNET) + case NFPROTO_DECNET: + if (hook >= ARRAY_SIZE(net->nf.hooks_decnet)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); + break; +#endif +#ifdef CONFIG_NETFILTER_INGRESS + case NFPROTO_NETDEV: + if (hook != NF_NETDEV_INGRESS) + return ERR_PTR(-EOPNOTSUPP); + + if (!dev) + return ERR_PTR(-ENODEV); + + netdev = dev_get_by_name_rcu(net, dev); + if (!netdev) + return ERR_PTR(-ENODEV); + + return rcu_dereference(netdev->nf_hooks_ingress); +#endif + default: + return ERR_PTR(-EPROTONOSUPPORT); + } + + return hook_head; +} + +static int nfnl_hook_dump(struct sk_buff *nlskb, + struct netlink_callback *cb) +{ + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nfnl_dump_hook_data *ctx = cb->data; + int err, family = nfmsg->nfgen_family; + struct net *net = sock_net(nlskb->sk); + struct nf_hook_ops * const *ops; + const struct nf_hook_entries *e; + unsigned int i = cb->args[0]; + + rcu_read_lock(); + + e = nfnl_hook_entries_head(family, ctx->hook, net, ctx->devname); + if (!e) + goto done; + + if (IS_ERR(e)) { + cb->seq++; + goto done; + } + + if ((unsigned long)e != ctx->headv || i >= e->num_hook_entries) + cb->seq++; + + ops = nf_hook_entries_get_hook_ops(e); + + for (; i < e->num_hook_entries; i++) { + err = nfnl_hook_dump_one(nlskb, ctx, ops[i], cb->seq); + if (err) + break; + } + +done: + nl_dump_check_consistent(cb, nlmsg_hdr(nlskb)); + rcu_read_unlock(); + cb->args[0] = i; + return nlskb->len; +} + +static int nfnl_hook_dump_start(struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nlattr * const *nla = cb->data; + struct nfnl_dump_hook_data *ctx = NULL; + struct net *net = sock_net(cb->skb->sk); + u8 family = nfmsg->nfgen_family; + char name[IFNAMSIZ] = ""; + const void *head; + u32 hooknum; + + hooknum = ntohl(nla_get_be32(nla[NFNLA_HOOK_HOOKNUM])); + if (hooknum > 255) + return -EINVAL; + + if (family == NFPROTO_NETDEV) { + if (!nla[NFNLA_HOOK_DEV]) + return -EINVAL; + + nla_strscpy(name, nla[NFNLA_HOOK_DEV], sizeof(name)); + } + + rcu_read_lock(); + /* Not dereferenced; for consistency check only */ + head = nfnl_hook_entries_head(family, hooknum, net, name); + rcu_read_unlock(); + + if (head && IS_ERR(head)) + return PTR_ERR(head); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + strscpy(ctx->devname, name, sizeof(ctx->devname)); + ctx->headv = (unsigned long)head; + ctx->hook = hooknum; + + cb->seq = 1; + cb->data = ctx; + + return 0; +} + +static int nfnl_hook_dump_stop(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + +static int nfnl_hook_get(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + if (!nla[NFNLA_HOOK_HOOKNUM]) + return -EINVAL; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nfnl_hook_dump_start, + .done = nfnl_hook_dump_stop, + .dump = nfnl_hook_dump, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nf_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + return -EOPNOTSUPP; +} + +static const struct nfnl_callback nfnl_hook_cb[NFNL_MSG_HOOK_MAX] = { + [NFNL_MSG_HOOK_GET] = { + .call = nfnl_hook_get, + .type = NFNL_CB_RCU, + .attr_count = NFNLA_HOOK_MAX, + .policy = nfnl_hook_nla_policy + }, +}; + +static const struct nfnetlink_subsystem nfhook_subsys = { + .name = "nfhook", + .subsys_id = NFNL_SUBSYS_HOOK, + .cb_count = NFNL_MSG_HOOK_MAX, + .cb = nfnl_hook_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_HOOK); + +static int __init nfnetlink_hook_init(void) +{ + return nfnetlink_subsys_register(&nfhook_subsys); +} + +static void __exit nfnetlink_hook_exit(void) +{ + nfnetlink_subsys_unregister(&nfhook_subsys); +} + +module_init(nfnetlink_hook_init); +module_exit(nfnetlink_hook_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("nfnetlink_hook: list registered netfilter hooks"); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 587086b18c36..691ef4cffdd9 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -871,15 +871,14 @@ static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfula[]) { struct nfnl_log_net *log = nfnl_log_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int16_t group_num = ntohs(nfmsg->res_id); + u_int16_t group_num = ntohs(info->nfmsg->res_id); struct nfulnl_msg_config_cmd *cmd = NULL; struct nfulnl_instance *inst; u16 flags = 0; int ret = 0; if (nfula[NFULA_CFG_CMD]) { - u_int8_t pf = nfmsg->nfgen_family; + u_int8_t pf = info->nfmsg->nfgen_family; cmd = nla_data(nfula[NFULA_CFG_CMD]); /* Commands without queue context */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f37a575ebd7f..f774de0fc24f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1051,8 +1051,7 @@ static int nfqnl_recv_verdict_batch(struct sk_buff *skb, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u16 queue_num = ntohs(nfmsg->res_id); + u16 queue_num = ntohs(info->nfmsg->res_id); struct nf_queue_entry *entry, *tmp; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; @@ -1160,8 +1159,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); + u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_verdict_hdr *vhdr; enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; @@ -1243,8 +1241,7 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); + u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 363bdd7044ec..5b02408a920b 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -18,7 +18,7 @@ static unsigned int nft_do_chain_ipv4(void *priv, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); return nft_do_chain(&pkt, priv); } @@ -62,7 +62,7 @@ static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); return nft_do_chain(&pkt, priv); } @@ -102,7 +102,7 @@ static unsigned int nft_do_chain_ipv6(void *priv, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); return nft_do_chain(&pkt, priv); } @@ -149,10 +149,10 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, switch (state->pf) { case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); break; case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); break; default: break; @@ -174,7 +174,7 @@ static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb, ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); - if (nft_set_pktinfo_ipv4_ingress(&pkt, skb) < 0) + if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0) return NF_DROP; break; case htons(ETH_P_IPV6): @@ -182,7 +182,7 @@ static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb, ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); - if (nft_set_pktinfo_ipv6_ingress(&pkt, skb) < 0) + if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0) return NF_DROP; break; default: @@ -238,13 +238,13 @@ nft_do_chain_bridge(void *priv, switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); + nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); + nft_set_pktinfo_ipv6_validate(&pkt); break; default: - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); break; } @@ -293,13 +293,13 @@ static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); + nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); + nft_set_pktinfo_ipv6_validate(&pkt); break; default: - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); break; } diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index eac4a901233f..98e4946100c5 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -17,12 +17,12 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, switch (state->pf) { #ifdef CONFIG_NF_TABLES_IPV4 case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); break; #endif #ifdef CONFIG_NF_TABLES_IPV6 case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); break; #endif default: diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c index edd02cda57fc..925db0dce48d 100644 --- a/net/netfilter/nft_chain_route.c +++ b/net/netfilter/nft_chain_route.c @@ -26,7 +26,7 @@ static unsigned int nf_route_table_hook4(void *priv, u8 tos; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); mark = skb->mark; iph = ip_hdr(skb); @@ -74,7 +74,7 @@ static unsigned int nf_route_table_hook6(void *priv, int err; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 5415ab14400d..639c337c885b 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -57,8 +57,13 @@ union nft_entry { }; static inline void -nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) +nft_compat_set_par(struct xt_action_param *par, + const struct nft_pktinfo *pkt, + const void *xt, const void *xt_info) { + par->state = pkt->state; + par->thoff = nft_thoff(pkt); + par->fragoff = pkt->fragoff; par->target = xt; par->targinfo = xt_info; par->hotdrop = false; @@ -71,13 +76,14 @@ static void nft_target_eval_xt(const struct nft_expr *expr, void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; int ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + nft_compat_set_par(&xt, pkt, target, info); - ret = target->target(skb, &pkt->xt); + ret = target->target(skb, &xt); - if (pkt->xt.hotdrop) + if (xt.hotdrop) ret = NF_DROP; switch (ret) { @@ -97,13 +103,14 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; int ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + nft_compat_set_par(&xt, pkt, target, info); - ret = target->target(skb, &pkt->xt); + ret = target->target(skb, &xt); - if (pkt->xt.hotdrop) + if (xt.hotdrop) ret = NF_DROP; switch (ret) { @@ -350,13 +357,14 @@ static void __nft_match_eval(const struct nft_expr *expr, { struct xt_match *match = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; bool ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info); + nft_compat_set_par(&xt, pkt, match, info); - ret = match->match(skb, (struct xt_action_param *)&pkt->xt); + ret = match->match(skb, &xt); - if (pkt->xt.hotdrop) { + if (xt.hotdrop) { regs->verdict.code = NF_DROP; return; } @@ -617,7 +625,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { - struct nfgenmsg *nfmsg; + u8 family = info->nfmsg->nfgen_family; const char *name, *fmt; struct sk_buff *skb2; int ret = 0, target; @@ -632,9 +640,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); - nfmsg = nlmsg_data(info->nlh); - - switch(nfmsg->nfgen_family) { + switch(family) { case AF_INET: fmt = "ipt_%s"; break; @@ -648,8 +654,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, fmt = "arpt_%s"; break; default: - pr_err("nft_compat: unsupported protocol %d\n", - nfmsg->nfgen_family); + pr_err("nft_compat: unsupported protocol %d\n", family); return -EINVAL; } @@ -657,9 +662,8 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, return -EINVAL; rcu_read_unlock(); - try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, - rev, target, &ret), - fmt, name); + try_then_request_module(xt_find_revision(family, name, rev, target, &ret), + fmt, name); if (ret < 0) goto out_put; @@ -674,8 +678,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_COMPAT_GET, - nfmsg->nfgen_family, - name, ret, target) <= 0) { + family, name, ret, target) <= 0) { kfree_skb(skb2); goto out_put; } diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index f64f0017e9a5..7f705b5c09de 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -10,8 +10,10 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/sctp.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +#include <net/sctp/sctp.h> #include <net/tcp.h> struct nft_exthdr { @@ -165,7 +167,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) return NULL; - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buffer); + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer); if (!tcph) return NULL; @@ -173,7 +175,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len) return NULL; - return skb_header_pointer(pkt->skb, pkt->xt.thoff, *tcphdr_len, buffer); + return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer); } static void nft_exthdr_tcp_eval(const struct nft_expr *expr, @@ -249,7 +251,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, return; if (skb_ensure_writable(pkt->skb, - pkt->xt.thoff + i + priv->len)) + nft_thoff(pkt) + i + priv->len)) return; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, @@ -300,6 +302,45 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, } } +static void nft_exthdr_sctp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr); + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + + do { + sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch); + if (!sch || !sch->length) + break; + + if (sch->type == priv->type) { + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + nft_reg_store8(dest, true); + return; + } + if (priv->offset + priv->len > ntohs(sch->length) || + offset + ntohs(sch->length) > pkt->skb->len) + break; + + dest[priv->len / NFT_REG32_SIZE] = 0; + if (skb_copy_bits(pkt->skb, offset + priv->offset, + dest, priv->len) < 0) + break; + return; + } + offset += SCTP_PAD4(ntohs(sch->length)); + } while (offset < pkt->skb->len); + + if (priv->flags & NFT_EXTHDR_F_PRESENT) + nft_reg_store8(dest, false); + else + regs->verdict.code = NFT_BREAK; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -499,6 +540,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { .dump = nft_exthdr_dump_set, }; +static const struct nft_expr_ops nft_exthdr_sctp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_sctp_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -529,6 +578,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, return &nft_exthdr_ipv4_ops; } break; + case NFT_EXTHDR_OP_SCTP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_sctp_ops; + break; } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 4843dd2b410c..0af34ad41479 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -291,7 +291,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst)) goto out; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index a479f8a1270c..90becbf5bff3 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -23,6 +23,37 @@ struct nft_lookup { struct nft_set_binding binding; }; +#ifdef CONFIG_RETPOLINE +bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + if (set->ops == &nft_set_hash_fast_type.ops) + return nft_hash_lookup_fast(net, set, key, ext); + if (set->ops == &nft_set_hash_type.ops) + return nft_hash_lookup(net, set, key, ext); + + if (set->ops == &nft_set_rhash_type.ops) + return nft_rhash_lookup(net, set, key, ext); + + if (set->ops == &nft_set_bitmap_type.ops) + return nft_bitmap_lookup(net, set, key, ext); + + if (set->ops == &nft_set_pipapo_type.ops) + return nft_pipapo_lookup(net, set, key, ext); +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + if (set->ops == &nft_set_pipapo_avx2_type.ops) + return nft_pipapo_avx2_lookup(net, set, key, ext); +#endif + + if (set->ops == &nft_set_rbtree_type.ops) + return nft_rbtree_lookup(net, set, key, ext); + + WARN_ON_ONCE(1); + return set->ops->lookup(net, set, key, ext); +} +EXPORT_SYMBOL_GPL(nft_set_do_lookup); +#endif + void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -33,8 +64,8 @@ void nft_lookup_eval(const struct nft_expr *expr, const struct net *net = nft_net(pkt); bool found; - found = set->ops->lookup(net, set, ®s->data[priv->sreg], &ext) ^ - priv->invert; + found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext) ^ + priv->invert; if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 7e47edee88ee..94b2327e71dc 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -9,7 +9,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) @@ -110,7 +110,7 @@ static void nft_objref_map_eval(const struct nft_expr *expr, struct nft_object *obj; bool found; - found = set->ops->lookup(net, set, ®s->data[priv->sreg], &ext); + found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext); if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 501c5b24cc39..a44b14f6c0dc 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -110,7 +110,7 @@ void nft_payload_eval(const struct nft_expr *expr, case NFT_PAYLOAD_TRANSPORT_HEADER: if (!pkt->tprot_set) goto err; - offset = pkt->xt.thoff; + offset = nft_thoff(pkt); break; default: BUG(); @@ -507,7 +507,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, *l4csum_offset = offsetof(struct tcphdr, check); break; case IPPROTO_UDP: - if (!nft_payload_udp_checksum(skb, pkt->xt.thoff)) + if (!nft_payload_udp_checksum(skb, nft_thoff(pkt))) return -1; fallthrough; case IPPROTO_UDPLITE: @@ -520,7 +520,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, return -1; } - *l4csum_offset += pkt->xt.thoff; + *l4csum_offset += nft_thoff(pkt); return 0; } @@ -612,7 +612,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, case NFT_PAYLOAD_TRANSPORT_HEADER: if (!pkt->tprot_set) goto err; - offset = pkt->xt.thoff; + offset = nft_thoff(pkt); break; default: BUG(); @@ -643,7 +643,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && pkt->tprot == IPPROTO_SCTP && skb->ip_summed != CHECKSUM_PARTIAL) { - if (nft_payload_csum_sctp(skb, pkt->xt.thoff)) + if (nft_payload_csum_sctp(skb, nft_thoff(pkt))) goto err; } diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 95090186ee90..554caf967baa 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -28,7 +28,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->xt.state->sk, + nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: @@ -45,7 +45,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, + nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 2a81ea421819..e7ae5914971e 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -73,8 +73,9 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask) return (bitmap[idx] & (0x3 << off)) & (genmask << off); } -static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { const struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 7b3d0a78c569..df40314de21f 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -74,8 +74,9 @@ static const struct rhashtable_params nft_rhash_params = { .automatic_shrinking = true, }; -static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_rhash *priv = nft_set_priv(set); const struct nft_rhash_elem *he; @@ -446,8 +447,9 @@ struct nft_hash_elem { struct nft_set_ext ext; }; -static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -484,9 +486,10 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set, return ERR_PTR(-ENOENT); } -static bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_hash_lookup_fast(const struct net *net, + const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index d84afb8fa79a..25a75591583e 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -178,8 +178,6 @@ struct nft_pipapo_elem { int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, union nft_pipapo_map_bucket *mt, bool match_only); -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); /** * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index eabdb8d552ee..e517663e0cd1 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -142,7 +142,6 @@ static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len) * @map: Bitmap to be scanned for set bits * @dst: Destination bitmap * @mt: Mapping table containing bit set specifiers - * @len: Length of bitmap in longs * @last: Return index of first set bit, if this is the last field * * This is an alternative implementation of pipapo_refill() suitable for usage @@ -1109,7 +1108,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @key: nftables API element representation containing key data * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. @@ -1136,8 +1135,13 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, m = rcu_dereference(priv->match); - /* This also protects access to all data related to scratch maps */ - kernel_fpu_begin(); + /* This also protects access to all data related to scratch maps. + * + * Note that we don't need a valid MXCSR state for any of the + * operations we use here, so pass 0 as mask and spare a LDMXCSR + * instruction. + */ + kernel_fpu_begin_mask(0); scratch = *raw_cpu_ptr(m->scratch_aligned); if (unlikely(!scratch)) { diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h index 394bcb704db7..dbb6aaca8a7a 100644 --- a/net/netfilter/nft_set_pipapo_avx2.h +++ b/net/netfilter/nft_set_pipapo_avx2.h @@ -5,8 +5,6 @@ #include <asm/fpu/xstate.h> #define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE) -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */ diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9e36eb4a7429..d600a566da32 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -107,8 +107,9 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set return false; } -static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 4fda8b3f1762..a0109fa1e92d 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -109,7 +109,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); const struct tcphdr *tcp; struct tcphdr _tcph; @@ -123,7 +123,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, return; } - tcp = skb_header_pointer(skb, pkt->xt.thoff, + tcp = skb_header_pointer(skb, thoff, sizeof(struct tcphdr), &_tcph); if (!tcp) { diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index accef672088c..18e79c0fd3cf 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -82,9 +82,9 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct ipv6hdr *iph = ipv6_hdr(skb); - struct in6_addr taddr; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); struct udphdr _hdr, *hp; + struct in6_addr taddr; __be16 tport = 0; struct sock *sk; int l4proto; diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 9cdc16b0d0d8..b6a015aee0ce 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -117,7 +117,7 @@ static int audit_tg_check(const struct xt_tgchk_param *par) const struct xt_audit_info *info = par->targinfo; if (info->type > XT_AUDIT_TYPE_MAX) { - pr_info_ratelimited("Audit type out of range (valid range: 0..%hhu)\n", + pr_info_ratelimited("Audit type out of range (valid range: 0..%u)\n", XT_AUDIT_TYPE_MAX); return -ERANGE; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index d4deee39158b..12404d221026 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -172,7 +172,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err2; } - ret = 0; if ((info->ct_events || info->exp_events) && !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, GFP_KERNEL)) { diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 24d4afb9988d..8b4fd27857f2 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -8,16 +8,14 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_limit.h> struct xt_limit_priv { - spinlock_t lock; unsigned long prev; - uint32_t credit; + u32 credit; }; MODULE_LICENSE("GPL"); @@ -66,22 +64,31 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateinfo *r = par->matchinfo; struct xt_limit_priv *priv = r->master; - unsigned long now = jiffies; - - spin_lock_bh(&priv->lock); - priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; - if (priv->credit > r->credit_cap) - priv->credit = r->credit_cap; - - if (priv->credit >= r->cost) { - /* We're not limited. */ - priv->credit -= r->cost; - spin_unlock_bh(&priv->lock); - return true; - } - - spin_unlock_bh(&priv->lock); - return false; + unsigned long now; + u32 old_credit, new_credit, credit_increase = 0; + bool ret; + + /* fastpath if there is nothing to update */ + if ((READ_ONCE(priv->credit) < r->cost) && (READ_ONCE(priv->prev) == jiffies)) + return false; + + do { + now = jiffies; + credit_increase += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; + old_credit = READ_ONCE(priv->credit); + new_credit = old_credit; + new_credit += credit_increase; + if (new_credit > r->credit_cap) + new_credit = r->credit_cap; + if (new_credit >= r->cost) { + ret = true; + new_credit -= r->cost; + } else { + ret = false; + } + } while (cmpxchg(&priv->credit, old_credit, new_credit) != old_credit); + + return ret; } /* Precision saver. */ @@ -122,7 +129,6 @@ static int limit_mt_check(const struct xt_mtchk_param *par) r->credit_cap = priv->credit; /* Credits full. */ r->cost = user2credits(r->avg); } - spin_lock_init(&priv->lock); return 0; } diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index f28c8947c730..91a19c3ea1a3 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -105,7 +105,7 @@ static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) !info->attrs[NLBL_CALIPSO_A_MTYPE]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { case CALIPSO_MAP_PASS: ret_val = netlbl_calipso_add_pass(info, &audit_info); @@ -287,7 +287,7 @@ static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_CALIPSO_A_DOI]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 4f50a64315cf..baf235721c43 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -410,7 +410,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) !info->attrs[NLBL_CIPSOV4_A_MTYPE]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE])) { case CIPSO_V4_MAP_TRANS: ret_val = netlbl_cipsov4_add_std(info, &audit_info); @@ -709,7 +709,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_CIPSOV4_A_DOI]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index dc8c39f51f7d..8158a25972b4 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -929,7 +929,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain, * @cb_arg: argument for the callback function * * Description: - * Interate over the domain mapping hash table, skipping the first @skip_bkt + * Iterate over the domain mapping hash table, skipping the first @skip_bkt * buckets and @skip_chain entries. For each entry in the table call * @callback, if @callback returns a negative value stop 'walking' through the * table and return. Updates the values in @skip_bkt and @skip_chain on diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 5e1239cef000..beb0e573266d 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -719,7 +719,7 @@ int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset) * it in @bitmap. The @offset must be aligned to an unsigned long and will be * updated on return if different from what was requested; if the catmap is * empty at the requested offset and beyond, the @offset is set to (u32)-1. - * Returns zero on sucess, negative values on failure. + * Returns zero on success, negative values on failure. * */ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index ca52f5085989..032b7d7b32c7 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -76,6 +76,7 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { static int netlbl_mgmt_add_common(struct genl_info *info, struct netlbl_audit *audit_info) { + void *pmap = NULL; int ret_val = -EINVAL; struct netlbl_domaddr_map *addrmap = NULL; struct cipso_v4_doi *cipsov4 = NULL; @@ -175,6 +176,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_addrmap; } + pmap = map; map->list.addr = addr->s_addr & mask->s_addr; map->list.mask = mask->s_addr; map->list.valid = 1; @@ -183,10 +185,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->def.cipso = cipsov4; ret_val = netlbl_af4list_add(&map->list, &addrmap->list4); - if (ret_val != 0) { - kfree(map); - goto add_free_addrmap; - } + if (ret_val != 0) + goto add_free_map; entry->family = AF_INET; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; @@ -223,6 +223,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_addrmap; } + pmap = map; map->list.addr = *addr; map->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; map->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; @@ -235,10 +236,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->def.calipso = calipso; ret_val = netlbl_af6list_add(&map->list, &addrmap->list6); - if (ret_val != 0) { - kfree(map); - goto add_free_addrmap; - } + if (ret_val != 0) + goto add_free_map; entry->family = AF_INET6; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; @@ -248,10 +247,12 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) - goto add_free_addrmap; + goto add_free_map; return 0; +add_free_map: + kfree(pmap); add_free_addrmap: kfree(addrmap); add_doi_put_def: @@ -434,7 +435,7 @@ static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info) (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } @@ -457,7 +458,7 @@ static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_MGMT_A_DOMAIN]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]); return netlbl_domhsh_remove(domain, AF_UNSPEC, &audit_info); @@ -557,7 +558,7 @@ static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info) (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } @@ -576,7 +577,7 @@ static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info) { struct netlbl_audit audit_info; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_domhsh_remove_default(AF_UNSPEC, &audit_info); } diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 3e6ac9b790b1..2483df0bbd7c 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -814,7 +814,7 @@ static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); if (value == 1 || value == 0) { - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); netlbl_unlabel_acceptflg_set(value, &audit_info); return 0; } @@ -897,7 +897,7 @@ static int netlbl_unlabel_staticadd(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -947,7 +947,7 @@ static int netlbl_unlabel_staticadddef(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -994,7 +994,7 @@ static int netlbl_unlabel_staticremove(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -1034,7 +1034,7 @@ static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h index b9ba8112b3c5..6190cbf94bf0 100644 --- a/net/netlabel/netlabel_user.h +++ b/net/netlabel/netlabel_user.h @@ -28,11 +28,9 @@ /** * netlbl_netlink_auditinfo - Fetch the audit information from a NETLINK msg - * @skb: the packet * @audit_info: NetLabel audit information */ -static inline void netlbl_netlink_auditinfo(struct sk_buff *skb, - struct netlbl_audit *audit_info) +static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info) { security_task_getsecid_subj(current, &audit_info->secid); audit_info->loginuid = audit_get_loginuid(current); diff --git a/net/nfc/hci/command.c b/net/nfc/hci/command.c index e02b9befce0b..3a89bd9b89fc 100644 --- a/net/nfc/hci/command.c +++ b/net/nfc/hci/command.c @@ -34,7 +34,7 @@ static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, * HCI command execution completion callback. * err will be a standard linux error (may be converted from HCI response) * skb contains the response data and must be disposed, or may be NULL if - * an error occured + * an error occurred */ static void nfc_hci_execute_cb(void *context, struct sk_buff *skb, int err) { diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 43811b5219b5..3481941be70b 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -705,7 +705,7 @@ static void hci_transceive_cb(void *context, struct sk_buff *skb, int err) /* * TODO: Check RF Error indicator to make sure data is valid. * It seems that HCI cmd can complete without error, but data - * can be invalid if an RF error occured? Ignore for now. + * can be invalid if an RF error occurred? Ignore for now. */ if (err == 0) skb_trim(skb, skb->len - 1); /* RF Err ind */ diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index c0c8fea3a186..1e3a90049da9 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -406,7 +406,7 @@ static void llc_shdlc_rcv_u_frame(struct llc_shdlc *shdlc, case SHDLC_NEGOTIATING: case SHDLC_CONNECTING: /* - * We sent RSET, but chip wants to negociate or we + * We sent RSET, but chip wants to negotiate or we * got RSET before we managed to send out our. */ if (skb->len > 0) diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 96865142104f..d6732e5e8958 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -161,8 +161,6 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, *(u8 *)skb_push(skb, 1) = data_type; do { - len = conn_info->max_pkt_payload_len; - /* If last packet add NCI_HFP_NO_CHAINING */ if (i + conn_info->max_pkt_payload_len - (skb->len + 1) >= data_len) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index cadb6a29b285..1b5eae57bc90 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -967,8 +967,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, /* Associate skb with specified zone. */ if (tmpl) { - if (skb_nfct(skb)) - nf_conntrack_put(skb_nfct(skb)); + nf_conntrack_put(skb_nfct(skb)); nf_conntrack_get(&tmpl->ct_general); nf_ct_set(skb, tmpl, IP_CT_NEW); } @@ -1329,11 +1328,9 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) { - if (skb_nfct(skb)) { - nf_conntrack_put(skb_nfct(skb)); - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - ovs_ct_fill_key(skb, key, false); - } + nf_conntrack_put(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + ovs_ct_fill_key(skb, key, false); return 0; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 330ba68828e7..77b0cdab3810 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3934,12 +3934,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, return -EFAULT; lock_sock(sk); - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { - ret = -EBUSY; - } else { + if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) po->tp_tx_has_off = !!val; - ret = 0; - } + release_sock(sk); return 0; } diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 8d00dfe8139e..1990d496fcfc 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -775,8 +775,10 @@ int qrtr_ns_init(void) } qrtr_ns.workqueue = alloc_workqueue("qrtr_ns_handler", WQ_UNBOUND, 1); - if (!qrtr_ns.workqueue) + if (!qrtr_ns.workqueue) { + ret = -ENOMEM; goto err_sock; + } qrtr_ns.sock->sk->sk_data_ready = qrtr_ns_data_ready; diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c index ff97e8eda858..006b2e441418 100644 --- a/net/rds/ib_ring.c +++ b/net/rds/ib_ring.c @@ -141,7 +141,7 @@ int rds_ib_ring_low(struct rds_ib_work_ring *ring) } /* - * returns the oldest alloced ring entry. This will be the next one + * returns the oldest allocated ring entry. This will be the next one * freed. This can't be called if there are none allocated. */ u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring) diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 42c5ff1eda95..f4ee13da90c7 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -177,7 +177,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, goto out; } tc->t_tinc = tinc; - rdsdebug("alloced tinc %p\n", tinc); + rdsdebug("allocated tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, &cp->cp_conn->c_faddr); tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 3ce6d628cd75..19e929c7c38b 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -77,7 +77,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, } /* - * Process event packets targetted at a local endpoint. + * Process event packets targeted at a local endpoint. */ void rxrpc_process_local_events(struct rxrpc_local *local) { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f6d5755d669e..d17a66aab8ee 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -381,7 +381,8 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, } mutex_unlock(&idrinfo->lock); - if (nla_put_u32(skb, TCA_FCNT, n_i)) + ret = nla_put_u32(skb, TCA_FCNT, n_i); + if (ret) goto nla_put_failure; nla_nest_end(skb, nest); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 1cac3c6fbb49..71f2015c70ca 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -70,7 +70,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, /* replace the vid */ tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid; /* replace prio bits, if tcfv_push_prio specified */ - if (p->tcfv_push_prio) { + if (p->tcfv_push_prio_exists) { tci &= ~VLAN_PRIO_MASK; tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT; } @@ -121,6 +121,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tcf_chain *goto_ch = NULL; + bool push_prio_exists = false; struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; @@ -189,7 +190,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, push_proto = htons(ETH_P_8021Q); } - if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY]) + push_prio_exists = !!tb[TCA_VLAN_PUSH_VLAN_PRIORITY]; + if (push_prio_exists) push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; case TCA_VLAN_ACT_POP_ETH: @@ -241,6 +243,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, p->tcfv_action = action; p->tcfv_push_vid = push_vid; p->tcfv_push_prio = push_prio; + p->tcfv_push_prio_exists = push_prio_exists || action == TCA_VLAN_ACT_PUSH; p->tcfv_push_proto = push_proto; if (action == TCA_VLAN_ACT_PUSH_ETH) { @@ -304,8 +307,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, p->tcfv_push_proto) || - (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, - p->tcfv_push_prio)))) + (p->tcfv_push_prio_exists && + nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, p->tcfv_push_prio)))) goto nla_put_failure; if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 279f9e2a2319..d73b5c5514a9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1531,7 +1531,7 @@ static inline int __tcf_classify(struct sk_buff *skb, u32 *last_executed_chain) { #ifdef CONFIG_NET_CLS_ACT - const int max_reclassify_loop = 4; + const int max_reclassify_loop = 16; const struct tcf_proto *first_tp; int limit = 0; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d7869a984881..2e704c7a105a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1531,14 +1531,13 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.n_proto)); + mask->basic.n_proto = cpu_to_be16(0); } else { key->basic.n_proto = ethertype; - mask->basic.n_proto = cpu_to_be16(~0); } } } else { key->basic.n_proto = ethertype; - mask->basic.n_proto = cpu_to_be16(~0); } } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 2e288f88ff02..27a4b6dbcf57 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -7,7 +7,7 @@ /* Comparing to general packet classification problem, - RSVP needs only sevaral relatively simple rules: + RSVP needs only several relatively simple rules: * (dst, protocol) are always specified, so that we are able to hash them. diff --git a/net/sched/ematch.c b/net/sched/ematch.c index f885bea5b452..4ce681361851 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -141,7 +141,7 @@ errout: EXPORT_SYMBOL(tcf_em_register); /** - * tcf_em_unregister - unregster and extended match + * tcf_em_unregister - unregister and extended match * * @ops: ematch operations lookup table * diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index fc8b56bcabf3..e9c0afc8becc 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -540,6 +540,24 @@ void netif_carrier_off(struct net_device *dev) } EXPORT_SYMBOL(netif_carrier_off); +/** + * netif_carrier_event - report carrier state event + * @dev: network device + * + * Device has detected a carrier event but the carrier state wasn't changed. + * Use in drivers when querying carrier state asynchronously, to avoid missing + * events (link flaps) if link recovers before it's queried. + */ +void netif_carrier_event(struct net_device *dev) +{ + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + atomic_inc(&dev->carrier_up_count); + atomic_inc(&dev->carrier_down_count); + linkwatch_fire_event(dev); +} +EXPORT_SYMBOL_GPL(netif_carrier_event); + /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index f4132dc25ac0..621dc6afde8f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -6,7 +6,7 @@ * * 991129: - Bug fix with grio mode * - a better sing. AvgQ mode with Grio(WRED) - * - A finer grained VQ dequeue based on sugestion + * - A finer grained VQ dequeue based on suggestion * from Ren Liu * - More error checks * diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 8827987ba903..5f7ac27a5264 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -52,7 +52,7 @@ */ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */ -#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ +#define HTB_VER 0x30011 /* major must be matched with number supplied by TC as version */ #if HTB_VER >> 16 != TC_HTB_PROTOVER #error "Mismatched sch_htb.c and pkt_sch.h" @@ -273,6 +273,9 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, /** * htb_add_to_id_tree - adds class to the round robin list + * @root: the root of the tree + * @cl: the class to add + * @prio: the give prio in class * * Routine adds class to the list (actually tree) sorted by classid. * Make sure that class is not already on such list for given prio. @@ -298,6 +301,9 @@ static void htb_add_to_id_tree(struct rb_root *root, /** * htb_add_to_wait_tree - adds class to the event queue with delay + * @q: the priority event queue + * @cl: the class to add + * @delay: delay in microseconds * * The class is added to priority event queue to indicate that class will * change its mode in cl->pq_key microseconds. Make sure that class is not @@ -331,6 +337,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q, /** * htb_next_rb_node - finds next node in binary tree + * @n: the current node in binary tree * * When we are past last key we return NULL. * Average complexity is 2 steps per call. @@ -342,6 +349,9 @@ static inline void htb_next_rb_node(struct rb_node **n) /** * htb_add_class_to_row - add class to its row + * @q: the priority event queue + * @cl: the class to add + * @mask: the given priorities in class in bitmap * * The class is added to row at priorities marked in mask. * It does nothing if mask == 0. @@ -371,6 +381,9 @@ static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root) /** * htb_remove_class_from_row - removes class from its row + * @q: the priority event queue + * @cl: the class to add + * @mask: the given priorities in class in bitmap * * The class is removed from row at priorities marked in mask. * It does nothing if mask == 0. @@ -398,6 +411,8 @@ static inline void htb_remove_class_from_row(struct htb_sched *q, /** * htb_activate_prios - creates active classe's feed chain + * @q: the priority event queue + * @cl: the class to activate * * The class is connected to ancestors and/or appropriate rows * for priorities it is participating on. cl->cmode must be new @@ -433,6 +448,8 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) /** * htb_deactivate_prios - remove class from feed chain + * @q: the priority event queue + * @cl: the class to deactivate * * cl->cmode must represent old mode (before deactivation). It does * nothing if cl->prio_activity == 0. Class is removed from all feed @@ -493,6 +510,8 @@ static inline s64 htb_hiwater(const struct htb_class *cl) /** * htb_class_mode - computes and returns current class mode + * @cl: the target class + * @diff: diff time in microseconds * * It computes cl's mode at time cl->t_c+diff and returns it. If mode * is not HTB_CAN_SEND then cl->pq_key is updated to time difference @@ -521,9 +540,12 @@ htb_class_mode(struct htb_class *cl, s64 *diff) /** * htb_change_class_mode - changes classe's mode + * @q: the priority event queue + * @cl: the target class + * @diff: diff time in microseconds * * This should be the only way how to change classe's mode under normal - * cirsumstances. Routine will update feed lists linkage, change mode + * circumstances. Routine will update feed lists linkage, change mode * and add class to the wait event queue if appropriate. New mode should * be different from old one and cl->pq_key has to be valid if changing * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). @@ -553,6 +575,8 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) /** * htb_activate - inserts leaf cl into appropriate active feeds + * @q: the priority event queue + * @cl: the target class * * Routine learns (new) priority of leaf and activates feed chain * for the prio. It can be called on already active leaf safely. @@ -570,6 +594,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) /** * htb_deactivate - remove leaf cl from active feeds + * @q: the priority event queue + * @cl: the target class * * Make sure that leaf is active. In the other words it can't be called * with non-active leaf. It also removes class from the drop list. @@ -649,6 +675,10 @@ static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff) /** * htb_charge_class - charges amount "bytes" to leaf and ancestors + * @q: the priority event queue + * @cl: the class to start iterate + * @level: the minimum level to account + * @skb: the socket buffer * * Routine assumes that packet "bytes" long was dequeued from leaf cl * borrowing from "level". It accounts bytes to ceil leaky bucket for @@ -698,6 +728,9 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, /** * htb_do_events - make mode changes to classes at the level + * @q: the priority event queue + * @level: which wait_pq in 'q->hlevel' + * @start: start jiffies * * Scans event queue for pending events and applies them. Returns time of * next pending event (0 for no event in pq, q->now for too many events). @@ -766,6 +799,8 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, /** * htb_lookup_leaf - returns next leaf class in DRR order + * @hprio: the current one + * @prio: which prio in class * * Find leaf where current feed pointers points to. */ diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 5c91df52b8c2..66fe2b82af9a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -114,9 +114,6 @@ static void taprio_free_sched_cb(struct rcu_head *head) struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; - if (!sched) - return; - list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); @@ -438,6 +435,11 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child; int queue; + if (unlikely(FULL_OFFLOAD_IS_ENABLED(q->flags))) { + WARN_ONCE(1, "Trying to enqueue skb into the root of a taprio qdisc configured with full offload\n"); + return qdisc_drop(skb, sch, to_free); + } + queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; @@ -529,23 +531,7 @@ static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) static struct sk_buff *taprio_peek_offload(struct Qdisc *sch) { - struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - struct sk_buff *skb; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - - if (unlikely(!child)) - continue; - - skb = child->ops->peek(child); - if (!skb) - continue; - - return skb; - } + WARN_ONCE(1, "Trying to peek into the root of a taprio qdisc configured with full offload\n"); return NULL; } @@ -654,27 +640,7 @@ done: static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch) { - struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - struct sk_buff *skb; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - - if (unlikely(!child)) - continue; - - skb = child->ops->dequeue(child); - if (unlikely(!skb)) - continue; - - qdisc_bstats_update(sch, skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - - return skb; - } + WARN_ONCE(1, "Trying to dequeue from the root of a taprio qdisc configured with full offload\n"); return NULL; } @@ -1759,6 +1725,37 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, return taprio_change(sch, opt, extack); } +static void taprio_attach(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx; + + /* Attach underlying qdisc */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct Qdisc *qdisc = q->qdiscs[ntx]; + struct Qdisc *old; + + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (ntx < dev->real_num_tx_queues) + qdisc_hash_add(qdisc, false); + } else { + old = dev_graft_qdisc(qdisc->dev_queue, sch); + qdisc_refcount_inc(sch); + } + if (old) + qdisc_put(old); + } + + /* access to the child qdiscs is not needed in offload mode */ + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + kfree(q->qdiscs); + q->qdiscs = NULL; + } +} + static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { @@ -1785,8 +1782,12 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl, if (dev->flags & IFF_UP) dev_deactivate(dev); - *old = q->qdiscs[cl - 1]; - q->qdiscs[cl - 1] = new; + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + *old = dev_graft_qdisc(dev_queue, new); + } else { + *old = q->qdiscs[cl - 1]; + q->qdiscs[cl - 1] = new; + } if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; @@ -2020,6 +2021,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, + .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6f2bbfeec3a4..baa4e770e4ba 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1171,7 +1171,6 @@ static const struct net_protocol sctp_protocol = { .handler = sctp4_rcv, .err_handler = sctp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index fd1e319eda00..4f30388a0dd0 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -361,7 +361,7 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, /* If the INIT is coming toward a closing socket, we'll send back * and ABORT. Essentially, this catches the race of INIT being - * backloged to the socket at the same time as the user isses close(). + * backloged to the socket at the same time as the user issues close(). * Since the socket and all its associations are going away, we * can treat this OOTB */ @@ -608,8 +608,8 @@ enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_COOKIE_ECHOED)); - /* SCTP-AUTH: genereate the assocition shared keys so that - * we can potentially signe the COOKIE-ECHO. + /* SCTP-AUTH: generate the association shared keys so that + * we can potentially sign the COOKIE-ECHO. */ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_SHKEY, SCTP_NULL()); @@ -787,7 +787,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, goto nomem_init; /* SCTP-AUTH: Now that we've populate required fields in - * sctp_process_init, set up the assocaition shared keys as + * sctp_process_init, set up the association shared keys as * necessary so that we can potentially authenticate the ACK */ error = sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC); @@ -838,7 +838,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, /* Add all the state machine commands now since we've created * everything. This way we don't introduce memory corruptions - * during side-effect processing and correclty count established + * during side-effect processing and correctly count established * associations. */ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); @@ -923,7 +923,7 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, commands); /* Reset init error count upon receipt of COOKIE-ACK, - * to avoid problems with the managemement of this + * to avoid problems with the management of this * counter in stale cookie situations when a transition back * from the COOKIE-ECHOED state to the COOKIE-WAIT * state is performed. @@ -2950,7 +2950,7 @@ enum sctp_disposition sctp_sf_do_9_2_reshutack( commands); /* Since we are not going to really process this INIT, there - * is no point in verifying chunk boundries. Just generate + * is no point in verifying chunk boundaries. Just generate * the SHUTDOWN ACK. */ reply = sctp_make_shutdown_ack(asoc, chunk); @@ -3560,7 +3560,7 @@ enum sctp_disposition sctp_sf_do_9_2_final(struct net *net, goto nomem_chunk; /* Do all the commands now (after allocation), so that we - * have consistent state if memory allocation failes + * have consistent state if memory allocation fails */ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); @@ -3747,7 +3747,7 @@ static enum sctp_disposition sctp_sf_shut_8_4_5( return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* We need to discard the rest of the packet to prevent - * potential bomming attacks from additional bundled chunks. + * potential boomming attacks from additional bundled chunks. * This is documented in SCTP Threats ID. */ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -4257,7 +4257,7 @@ gen_shutdown: } /* - * SCTP-AUTH Section 6.3 Receiving authenticated chukns + * SCTP-AUTH Section 6.3 Receiving authenticated chunks * * The receiver MUST use the HMAC algorithm indicated in the HMAC * Identifier field. If this algorithm was not specified by the @@ -4812,7 +4812,7 @@ static enum sctp_disposition sctp_sf_violation_ctsn( /* Handle protocol violation of an invalid chunk bundling. For example, * when we have an association and we receive bundled INIT-ACK, or - * SHUDOWN-COMPLETE, our peer is clearly violationg the "MUST NOT bundle" + * SHUTDOWN-COMPLETE, our peer is clearly violating the "MUST NOT bundle" * statement from the specs. Additionally, there might be an attacker * on the path and we may not want to continue this communication. */ @@ -5208,7 +5208,7 @@ enum sctp_disposition sctp_sf_cookie_wait_prm_shutdown( * Inputs * (endpoint, asoc) * - * The RFC does not explcitly address this issue, but is the route through the + * The RFC does not explicitly address this issue, but is the route through the * state table when someone issues a shutdown while in COOKIE_ECHOED state. * * Outputs @@ -5932,7 +5932,7 @@ enum sctp_disposition sctp_sf_t1_cookie_timer_expire( /* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN * with the updated last sequential TSN received from its peer. * - * An endpoint should limit the number of retransmissions of the + * An endpoint should limit the number of retransmission of the * SHUTDOWN chunk to the protocol parameter 'Association.Max.Retrans'. * If this threshold is exceeded the endpoint should destroy the TCB and * MUST report the peer endpoint unreachable to the upper layer (and @@ -6010,7 +6010,7 @@ nomem: } /* - * ADDIP Section 4.1 ASCONF CHunk Procedures + * ADDIP Section 4.1 ASCONF Chunk Procedures * If the T4 RTO timer expires the endpoint should do B1 to B5 */ enum sctp_disposition sctp_sf_t4_timer_expire( @@ -6441,7 +6441,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, chunk->ecn_ce_done = 1; if (af->is_ce(sctp_gso_headskb(chunk->skb))) { - /* Do real work as sideffect. */ + /* Do real work as side effect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, SCTP_U32(tsn)); } diff --git a/net/smc/Makefile b/net/smc/Makefile index 77e54fe42b1c..99a0186cba5b 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5eff7cccceff..e41fdac606d4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -49,6 +49,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_stats.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -508,9 +509,44 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->r0.qp_mtu; } -static void smc_switch_to_fallback(struct smc_sock *smc) +static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, + struct smc_stats_fback *fback_arr) +{ + int cnt; + + for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) { + if (fback_arr[cnt].fback_code == smc->fallback_rsn) { + fback_arr[cnt].count++; + break; + } + if (!fback_arr[cnt].fback_code) { + fback_arr[cnt].fback_code = smc->fallback_rsn; + fback_arr[cnt].count++; + break; + } + } +} + +static void smc_stat_fallback(struct smc_sock *smc) +{ + struct net *net = sock_net(&smc->sk); + + mutex_lock(&net->smc.mutex_fback_rsn); + if (smc->listen_smc) { + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); + net->smc.fback_rsn->srv_fback_cnt++; + } else { + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); + net->smc.fback_rsn->clnt_fback_cnt++; + } + mutex_unlock(&net->smc.mutex_fback_rsn); +} + +static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { smc->use_fallback = true; + smc->fallback_rsn = reason_code; + smc_stat_fallback(smc); if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; @@ -522,8 +558,7 @@ static void smc_switch_to_fallback(struct smc_sock *smc) /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = reason_code; + smc_switch_to_fallback(smc, reason_code); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) @@ -535,9 +570,11 @@ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, u8 version) { + struct net *net = sock_net(&smc->sk); int rc; if (reason_code < 0) { /* error, fallback is not possible */ + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return reason_code; @@ -545,6 +582,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, if (reason_code != SMC_CLC_DECL_PEERDECL) { rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; @@ -992,6 +1030,7 @@ static int __smc_connect(struct smc_sock *smc) if (rc) goto vlan_cleanup; + SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); smc_connect_ism_vlan_cleanup(smc, ini); kfree(buf); kfree(ini); @@ -1307,7 +1346,9 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) static void smc_listen_out_err(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; + struct net *net = sock_net(newsmcsk); + this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt); if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; @@ -1325,8 +1366,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, smc_listen_out_err(new_smc); return; } - smc_switch_to_fallback(new_smc); - new_smc->fallback_rsn = reason_code; + smc_switch_to_fallback(new_smc, reason_code); if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); @@ -1699,8 +1739,7 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - smc_switch_to_fallback(new_smc); - new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; + smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); smc_listen_out_connected(new_smc); return; } @@ -1778,6 +1817,7 @@ static void smc_listen_work(struct work_struct *work) } smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); + SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; out_unlock: @@ -1984,18 +2024,19 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; goto out; } } - if (smc->use_fallback) + if (smc->use_fallback) { rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); - else + } else { rc = smc_tx_sendmsg(smc, msg, len); + SMC_STAT_TX_PAYLOAD(smc, len, rc); + } out: release_sock(sk); return rc; @@ -2030,6 +2071,7 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } else { msg->msg_namelen = 0; rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); + SMC_STAT_RX_PAYLOAD(smc, rc, rc); } out: @@ -2194,8 +2236,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } @@ -2204,18 +2245,22 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (val) + if (val) { + SMC_STAT_INC(smc, ndly_cnt); mod_delayed_work(smc->conn.lgr->tx_wq, &smc->conn.tx_work, 0); + } } break; case TCP_CORK: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (!val) + if (!val) { + SMC_STAT_INC(smc, cork_cnt); mod_delayed_work(smc->conn.lgr->tx_wq, &smc->conn.tx_work, 0); + } } break; case TCP_DEFER_ACCEPT: @@ -2338,11 +2383,13 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, goto out; } release_sock(sk); - if (smc->use_fallback) + if (smc->use_fallback) { rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); - else + } else { + SMC_STAT_INC(smc, sendpage_cnt); rc = sock_no_sendpage(sock, page, offset, size, flags); + } out: return rc; @@ -2391,6 +2438,7 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, flags = MSG_DONTWAIT; else flags = 0; + SMC_STAT_INC(smc, splice_cnt); rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: @@ -2479,6 +2527,16 @@ static void __net_exit smc_net_exit(struct net *net) smc_pnet_net_exit(net); } +static __net_init int smc_net_stat_init(struct net *net) +{ + return smc_stats_init(net); +} + +static void __net_exit smc_net_stat_exit(struct net *net) +{ + smc_stats_exit(net); +} + static struct pernet_operations smc_net_ops = { .init = smc_net_init, .exit = smc_net_exit, @@ -2486,6 +2544,11 @@ static struct pernet_operations smc_net_ops = { .size = sizeof(struct smc_net), }; +static struct pernet_operations smc_net_stat_ops = { + .init = smc_net_stat_init, + .exit = smc_net_stat_exit, +}; + static int __init smc_init(void) { int rc; @@ -2494,6 +2557,10 @@ static int __init smc_init(void) if (rc) return rc; + rc = register_pernet_subsys(&smc_net_stat_ops); + if (rc) + return rc; + smc_ism_init(); smc_clc_init(); @@ -2595,6 +2662,7 @@ static void __exit smc_exit(void) proto_unregister(&smc_proto); smc_pnet_exit(); smc_nl_exit(); + unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0df85a12651e..cd0d7c908b2a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -33,6 +33,7 @@ #include "smc_close.h" #include "smc_ism.h" #include "smc_netlink.h" +#include "smc_stats.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) @@ -1235,20 +1236,6 @@ static void smc_lgr_free(struct smc_link_group *lgr) kfree(lgr); } -static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) -{ - int i; - - for (i = 0; i < SMC_RMBE_SIZES; i++) { - struct smc_buf_desc *buf_desc; - - list_for_each_entry(buf_desc, &lgr->rmbs[i], list) { - buf_desc->len += sizeof(struct smcd_cdc_msg); - smc_ism_unregister_dmb(lgr->smcd, buf_desc); - } - } -} - static void smc_sk_wake_ups(struct smc_sock *smc) { smc->sk.sk_write_space(&smc->sk); @@ -1285,7 +1272,6 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) { if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); - smcd_unregister_all_dmbs(lgr); } else { u32 rsn = lgr->llc_termination_rsn; @@ -2044,6 +2030,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + bool is_dgraded = false; struct mutex *lock; /* lock buffer list */ int sk_buf_size; @@ -2071,6 +2058,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ } @@ -2082,9 +2071,16 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (PTR_ERR(buf_desc) == -ENOMEM) break; - if (IS_ERR(buf_desc)) + if (IS_ERR(buf_desc)) { + if (!is_dgraded) { + is_dgraded = true; + SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); + } continue; + } + SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); buf_desc->used = 1; mutex_lock(lock); list_add(&buf_desc->list, buf_list); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 967712ba52a0..9cb2df289963 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -470,7 +470,6 @@ void smcd_unregister_dev(struct smcd_dev *smcd) mutex_unlock(&smcd_dev_list.mutex); smcd->going_away = 1; smc_smcd_terminate_all(smcd); - flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); device_del(&smcd->dev); diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index 140419a19dbf..6fb6f96c1d17 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -19,6 +19,7 @@ #include "smc_core.h" #include "smc_ism.h" #include "smc_ib.h" +#include "smc_stats.h" #include "smc_netlink.h" #define SMC_CMD_MAX_ATTR 1 @@ -55,6 +56,16 @@ static const struct genl_ops smc_gen_nl_ops[] = { /* can be retrieved by unprivileged users */ .dumpit = smcr_nl_get_device, }, + { + .cmd = SMC_NETLINK_GET_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_stats, + }, + { + .cmd = SMC_NETLINK_GET_FBACK_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_fback_stats, + }, }; static const struct nla_policy smc_gen_nl_policy[2] = { diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index 3477265cba6c..5ce2c0a89ccd 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -18,7 +18,7 @@ extern struct genl_family smc_gen_nl_family; struct smc_nl_dmp_ctx { - int pos[2]; + int pos[3]; }; static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index fcfac59f8b72..170b733bc736 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -21,6 +21,7 @@ #include "smc_cdc.h" #include "smc_tx.h" /* smc_tx_consumer_update() */ #include "smc_rx.h" +#include "smc_stats.h" /* callback implementation to wakeup consumers blocked with smc_rx_wait(). * indirectly called by smc_cdc_msg_recv_action(). @@ -227,6 +228,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, conn->urg_state == SMC_URG_READ) return -EINVAL; + SMC_STAT_INC(smc, urg_data_cnt); if (conn->urg_state == SMC_URG_VALID) { if (!(flags & MSG_PEEK)) smc->conn.urg_state = SMC_URG_READ; @@ -303,6 +305,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + readable = atomic_read(&conn->bytes_to_rcv); + if (readable >= conn->rmb_desc->len) + SMC_STAT_RMB_RX_FULL(smc, !conn->lnk); + + if (len < readable) + SMC_STAT_RMB_RX_SIZE_SMALL(smc, !conn->lnk); /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c new file mode 100644 index 000000000000..614013e3b574 --- /dev/null +++ b/net/smc/smc_stats.c @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * SMC statistics netlink routines + * + * Copyright IBM Corp. 2021 + * + * Author(s): Guvenc Gulce + */ +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <linux/smc.h> +#include <net/genetlink.h> +#include <net/sock.h> +#include "smc_netlink.h" +#include "smc_stats.h" + +int smc_stats_init(struct net *net) +{ + net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); + if (!net->smc.fback_rsn) + goto err_fback; + net->smc.smc_stats = alloc_percpu(struct smc_stats); + if (!net->smc.smc_stats) + goto err_stats; + mutex_init(&net->smc.mutex_fback_rsn); + return 0; + +err_stats: + kfree(net->smc.fback_rsn); +err_fback: + return -ENOMEM; +} + +void smc_stats_exit(struct net *net) +{ + kfree(net->smc.fback_rsn); + if (net->smc.smc_stats) + free_percpu(net->smc.smc_stats); +} + +static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_rmbcnt *stats_rmb_cnt; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TX_RMB_STATS) + stats_rmb_cnt = &stats->smc[tech].rmb_tx; + else + stats_rmb_cnt = &stats->smc[tech].rmb_rx; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT, + stats_rmb_cnt->reuse_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, + stats_rmb_cnt->buf_size_small_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT, + stats_rmb_cnt->buf_size_small_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT, + stats_rmb_cnt->buf_full_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT, + stats_rmb_cnt->buf_full_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_ALLOC_CNT, + stats_rmb_cnt->alloc_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT, + stats_rmb_cnt->dgrade_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_memsize *stats_pload; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE) + stats_pload = &stats->smc[tech].tx_pd; + else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE) + stats_pload = &stats->smc[tech].rx_pd; + else if (type == SMC_NLA_STATS_T_TX_RMB_SIZE) + stats_pload = &stats->smc[tech].tx_rmbsize; + else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE) + stats_pload = &stats->smc[tech].rx_rmbsize; + else + goto errout; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K, + stats_pload->buf[SMC_BUF_8K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K, + stats_pload->buf[SMC_BUF_16K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K, + stats_pload->buf[SMC_BUF_32K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K, + stats_pload->buf[SMC_BUF_64K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K, + stats_pload->buf[SMC_BUF_128K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K, + stats_pload->buf[SMC_BUF_256K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K, + stats_pload->buf[SMC_BUF_512K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K, + stats_pload->buf[SMC_BUF_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K, + stats_pload->buf[SMC_BUF_G_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, + struct smc_stats *stats, int tech) +{ + struct smc_stats_tech *smc_tech; + struct nlattr *attrs; + + smc_tech = &stats->smc[tech]; + if (tech == SMC_TYPE_D) + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH); + else + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH); + + if (!attrs) + goto errout; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_SIZE)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC, + smc_tech->clnt_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC, + smc_tech->clnt_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC, + smc_tech->srv_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC, + smc_tech->srv_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_BYTES, + smc_tech->rx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES, + smc_tech->tx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, + smc_tech->rx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT, + smc_tech->tx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT, + smc_tech->sendpage_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, + smc_tech->cork_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT, + smc_tech->ndly_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT, + smc_tech->splice_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT, + smc_tech->urg_data_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +int smc_nl_get_stats(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + struct smc_stats *stats; + struct nlattr *attrs; + int cpu, i, size; + void *nlh; + u64 *src; + u64 *sum; + + if (cb_ctx->pos[0]) + goto errmsg; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_STATS); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_STATS); + if (!attrs) + goto errnest; + stats = kzalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) + goto erralloc; + size = sizeof(*stats) / sizeof(u64); + for_each_possible_cpu(cpu) { + src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu); + sum = (u64 *)stats; + for (i = 0; i < size; i++) + *(sum++) += *(src++); + } + if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D)) + goto errattr; + if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT, + stats->clnt_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT, + stats->srv_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + kfree(stats); + return skb->len; + +errattr: + kfree(stats); +erralloc: + nla_nest_cancel(skb, attrs); +errnest: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_nl_get_fback_details(struct sk_buff *skb, + struct netlink_callback *cb, int pos, + bool is_srv) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + int cnt_reported = cb_ctx->pos[2]; + struct smc_stats_fback *trgt_arr; + struct nlattr *attrs; + int rc = 0; + void *nlh; + + if (is_srv) + trgt_arr = &net->smc.fback_rsn->srv[0]; + else + trgt_arr = &net->smc.fback_rsn->clnt[0]; + if (!trgt_arr[pos].fback_code) + return -ENODATA; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_FBACK_STATS); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS); + if (!attrs) + goto errout; + if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv)) + goto errattr; + if (!cnt_reported) { + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, + net->smc.fback_rsn->srv_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, + net->smc.fback_rsn->clnt_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + cnt_reported = 1; + } + + if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE, + trgt_arr[pos].fback_code)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT, + trgt_arr[pos].count)) + goto errattr; + + cb_ctx->pos[2] = cnt_reported; + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return rc; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + int rc_srv = 0, rc_clnt = 0, k; + int skip_serv = cb_ctx->pos[1]; + int snum = cb_ctx->pos[0]; + bool is_srv = true; + + mutex_lock(&net->smc.mutex_fback_rsn); + for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { + if (k < snum) + continue; + if (!skip_serv) { + rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); + if (rc_srv && rc_srv != ENODATA) + break; + } else { + skip_serv = 0; + } + rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); + if (rc_clnt && rc_clnt != ENODATA) { + skip_serv = 1; + break; + } + if (rc_clnt == ENODATA && rc_srv == ENODATA) + break; + } + mutex_unlock(&net->smc.mutex_fback_rsn); + cb_ctx->pos[1] = skip_serv; + cb_ctx->pos[0] = k; + return skb->len; +} diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h new file mode 100644 index 000000000000..84b7ecd8c05c --- /dev/null +++ b/net/smc/smc_stats.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Macros for SMC statistics + * + * Copyright IBM Corp. 2021 + * + * Author(s): Guvenc Gulce + */ + +#ifndef NET_SMC_SMC_STATS_H_ +#define NET_SMC_SMC_STATS_H_ +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <linux/smc.h> + +#include "smc_clc.h" + +#define SMC_MAX_FBACK_RSN_CNT 30 + +enum { + SMC_BUF_8K, + SMC_BUF_16K, + SMC_BUF_32K, + SMC_BUF_64K, + SMC_BUF_128K, + SMC_BUF_256K, + SMC_BUF_512K, + SMC_BUF_1024K, + SMC_BUF_G_1024K, + SMC_BUF_MAX, +}; + +struct smc_stats_fback { + int fback_code; + u16 count; +}; + +struct smc_stats_rsn { + struct smc_stats_fback srv[SMC_MAX_FBACK_RSN_CNT]; + struct smc_stats_fback clnt[SMC_MAX_FBACK_RSN_CNT]; + u64 srv_fback_cnt; + u64 clnt_fback_cnt; +}; + +struct smc_stats_rmbcnt { + u64 buf_size_small_peer_cnt; + u64 buf_size_small_cnt; + u64 buf_full_peer_cnt; + u64 buf_full_cnt; + u64 reuse_cnt; + u64 alloc_cnt; + u64 dgrade_cnt; +}; + +struct smc_stats_memsize { + u64 buf[SMC_BUF_MAX]; +}; + +struct smc_stats_tech { + struct smc_stats_memsize tx_rmbsize; + struct smc_stats_memsize rx_rmbsize; + struct smc_stats_memsize tx_pd; + struct smc_stats_memsize rx_pd; + struct smc_stats_rmbcnt rmb_tx; + struct smc_stats_rmbcnt rmb_rx; + u64 clnt_v1_succ_cnt; + u64 clnt_v2_succ_cnt; + u64 srv_v1_succ_cnt; + u64 srv_v2_succ_cnt; + u64 sendpage_cnt; + u64 urg_data_cnt; + u64 splice_cnt; + u64 cork_cnt; + u64 ndly_cnt; + u64 rx_bytes; + u64 tx_bytes; + u64 rx_cnt; + u64 tx_cnt; +}; + +struct smc_stats { + struct smc_stats_tech smc[2]; + u64 clnt_hshake_err_cnt; + u64 srv_hshake_err_cnt; +}; + +#define SMC_STAT_PAYLOAD_SUB(_smc_stats, _tech, key, _len, _rc) \ +do { \ + typeof(_smc_stats) stats = (_smc_stats); \ + typeof(_tech) t = (_tech); \ + typeof(_len) l = (_len); \ + int _pos = fls64((l) >> 13); \ + typeof(_rc) r = (_rc); \ + int m = SMC_BUF_MAX - 1; \ + this_cpu_inc((*stats).smc[t].key ## _cnt); \ + if (r <= 0) \ + break; \ + _pos = (_pos < m) ? ((l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ + this_cpu_inc((*stats).smc[t].key ## _pd.buf[_pos]); \ + this_cpu_add((*stats).smc[t].key ## _bytes, r); \ +} \ +while (0) + +#define SMC_STAT_TX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, tx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, tx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, rx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, rx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \ +do { \ + typeof(_len) _l = (_len); \ + typeof(_tech) t = (_tech); \ + int _pos = fls((_l) >> 13); \ + int m = SMC_BUF_MAX - 1; \ + _pos = (_pos < m) ? ((_l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ + this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \ +} \ +while (0) + +#define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \ + this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt) + +#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \ +do { \ + struct net *_net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + typeof(_len) l = (_len); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \ +} \ +while (0) + +#define SMC_STAT_RMB(_smc, type, _is_smcd, _is_rx) \ +do { \ + struct net *net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = net->smc.smc_stats; \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, rx); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, tx); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, rx); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, tx); \ +} \ +while (0) + +#define SMC_STAT_BUF_REUSE(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, reuse, is_smcd, is_rx) + +#define SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, alloc, is_smcd, is_rx) + +#define SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, dgrade, is_smcd, is_rx) + +#define SMC_STAT_RMB_TX_PEER_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, false) + +#define SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, false) + +#define SMC_STAT_RMB_RX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, true) + +#define SMC_STAT_RMB_RX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, true) + +#define SMC_STAT_INC(_smc, type) \ +do { \ + typeof(_smc) __smc = _smc; \ + bool is_smcd = !(__smc)->conn.lnk; \ + struct net *net = sock_net(&(__smc)->sk); \ + struct smc_stats __percpu *smc_stats = net->smc.smc_stats; \ + if ((is_smcd)) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].type); \ + else \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].type); \ +} \ +while (0) + +#define SMC_STAT_CLNT_SUCC_INC(net, _aclc) \ +do { \ + typeof(_aclc) acl = (_aclc); \ + bool is_v2 = (acl->hdr.version == SMC_V2); \ + bool is_smcd = (acl->hdr.typev1 == SMC_TYPE_D); \ + struct smc_stats __percpu *smc_stats = (net)->smc.smc_stats; \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v1_succ_cnt); \ +} \ +while (0) + +#define SMC_STAT_SERV_SUCC_INC(net, _ini) \ +do { \ + typeof(_ini) i = (_ini); \ + bool is_v2 = (i->smcd_version & SMC_V2); \ + bool is_smcd = (i->is_smcd); \ + typeof(net->smc.smc_stats) smc_stats = (net)->smc.smc_stats; \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v1_succ_cnt); \ +} \ +while (0) + +int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb); +int smc_stats_init(struct net *net); +void smc_stats_exit(struct net *net); + +#endif /* NET_SMC_SMC_STATS_H_ */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 4532c16bf85e..075c4f4b41cf 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -27,6 +27,7 @@ #include "smc_close.h" #include "smc_ism.h" #include "smc_tx.h" +#include "smc_stats.h" #define SMC_TX_WORK_DELAY 0 #define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ @@ -45,6 +46,8 @@ static void smc_tx_write_space(struct sock *sk) /* similar to sk_stream_write_space */ if (atomic_read(&smc->conn.sndbuf_space) && sock) { + if (test_bit(SOCK_NOSPACE, &sock->flags)) + SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk); clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); @@ -151,6 +154,15 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) goto out_err; } + if (len > conn->sndbuf_desc->len) + SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk); + + if (len > conn->peer_rmbe_size) + SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk); + + if (msg->msg_flags & MSG_OOB) + SMC_STAT_INC(smc, urg_data_cnt); + while (msg_data_left(msg)) { if (sk->sk_state == SMC_INIT) return -ENOTCONN; @@ -419,8 +431,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* destination: RMBE */ /* cf. snd_wnd */ rmbespace = atomic_read(&conn->peer_rmbe_space); - if (rmbespace <= 0) + if (rmbespace <= 0) { + struct smc_sock *smc = container_of(conn, struct smc_sock, + conn); + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); return 0; + } smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); diff --git a/net/tipc/link.c b/net/tipc/link.c index c44b4bfaaee6..5b6181277cc5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -912,7 +912,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, dnode, l->addr, dport, 0, 0); if (!skb) - return -ENOBUFS; + return -ENOMEM; msg_set_dest_droppable(buf_msg(skb), true); TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr); skb_queue_tail(&l->wakeupq, skb); @@ -1030,7 +1030,7 @@ void tipc_link_reset(struct tipc_link *l) * * Consumes the buffer chain. * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted - * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS + * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS or -ENOMEM */ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) @@ -1088,7 +1088,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (!_skb) { kfree_skb(skb); __skb_queue_purge(list); - return -ENOBUFS; + return -ENOMEM; } __skb_queue_tail(transmq, skb); tipc_link_set_skb_retransmit_time(skb, l); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index fecab516bf41..01396dd1c899 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -673,12 +673,12 @@ exit: * Returns a list of local sockets */ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, - bool exact, struct list_head *dports) + struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; - u32 scope = ua->scope; + u8 scope = ua->scope; rcu_read_lock(); sc = tipc_service_find(net, ua); @@ -688,7 +688,7 @@ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { - if (p->scope == scope || (!exact && p->scope < scope)) + if (scope == p->scope || scope == TIPC_ANY_SCOPE) tipc_dest_push(dports, 0, p->sk.ref); } } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index c7c9a3ddd420..259f95e3d99c 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -51,6 +51,8 @@ struct tipc_uaddr; #define TIPC_PUBL_SCOPE_NUM (TIPC_NODE_SCOPE + 1) #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ +#define TIPC_ANY_SCOPE 10 /* Both node and cluster scope will match */ + /** * struct publication - info about a published service address or range * @sr: service range represented by this publication @@ -113,7 +115,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk); void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, - bool exact, struct list_head *dports); + struct list_head *dports); void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, struct tipc_nlist *nodes); bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, diff --git a/net/tipc/node.c b/net/tipc/node.c index 81af92954c6c..9947b7dfe1d2 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1214,7 +1214,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. - * It may also be an cloned or malicious peer having + * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 53af72824c9c..34a97ea36cc8 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -73,9 +73,6 @@ struct sockaddr_pair { /** * struct tipc_sock - TIPC socket structure * @sk: socket - interacts with 'port' and with user via the socket API - * @conn_type: TIPC type used when connection was established - * @conn_instance: TIPC instance used when connection was established - * @published: non-zero if port has one or more associated names * @max_pkt: maximum packet size "hint" used when building messages sent by port * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table @@ -106,11 +103,11 @@ struct sockaddr_pair { * @expect_ack: whether this TIPC socket is expecting an ack * @nodelay: setsockopt() TIPC_NODELAY setting * @group_is_open: TIPC socket group is fully open (FIXME) + * @published: true if port has one or more associated names + * @conn_addrtype: address type used when establishing connection */ struct tipc_sock { struct sock sk; - u32 conn_type; - u32 conn_instance; u32 max_pkt; u32 maxnagle; u32 portid; @@ -141,6 +138,7 @@ struct tipc_sock { bool nodelay; bool group_is_open; bool published; + u8 conn_addrtype; }; static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); @@ -664,7 +662,7 @@ static int tipc_release(struct socket *sock) * @skaddr: socket address describing name(s) and desired operation * @alen: size of socket address data structure * - * Name and name sequence binding is indicated using a positive scope value; + * Name and name sequence binding are indicated using a positive scope value; * a negative scope value unbinds the specified name. Specifying no name * (i.e. a socket address length of 0) unbinds all names from the socket. * @@ -1202,12 +1200,12 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct tipc_msg *hdr; struct tipc_uaddr ua; int user, mtyp, hlen; - bool exact; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); ua.addrtype = TIPC_SERVICE_RANGE; + /* tipc_skb_peek() increments the head skb's reference counter */ skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { hdr = buf_msg(skb); @@ -1216,6 +1214,12 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, hlen = skb_headroom(skb) + msg_hdr_sz(hdr); onode = msg_orignode(hdr); ua.sr.type = msg_nametype(hdr); + ua.sr.lower = msg_namelower(hdr); + ua.sr.upper = msg_nameupper(hdr); + if (onode == self) + ua.scope = TIPC_ANY_SCOPE; + else + ua.scope = TIPC_CLUSTER_SCOPE; if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { spin_lock_bh(&inputq->lock); @@ -1233,20 +1237,10 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, ua.sr.lower = 0; ua.sr.upper = ~0; ua.scope = msg_lookup_scope(hdr); - exact = true; - } else { - /* TIPC_NODE_SCOPE means "any scope" in this context */ - if (onode == self) - ua.scope = TIPC_NODE_SCOPE; - else - ua.scope = TIPC_CLUSTER_SCOPE; - exact = false; - ua.sr.lower = msg_namelower(hdr); - ua.sr.upper = msg_nameupper(hdr); } /* Create destination port list: */ - tipc_nametbl_lookup_mcast_sockets(net, &ua, exact, &dports); + tipc_nametbl_lookup_mcast_sockets(net, &ua, &dports); /* Clone message per destination */ while (tipc_dest_pop(&dports, NULL, &portid)) { @@ -1258,13 +1252,11 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, } pr_warn("Failed to clone mcast rcv buffer\n"); } - /* Append to inputq if not already done by other thread */ + /* Append clones to inputq only if skb is still head of arrvq */ spin_lock_bh(&inputq->lock); if (skb_peek(arrvq) == skb) { skb_queue_splice_tail_init(&tmpq, inputq); - /* Decrease the skb's refcnt as increasing in the - * function tipc_skb_peek - */ + /* Decrement the skb's refcnt */ kfree_skb(__skb_dequeue(arrvq)); } spin_unlock_bh(&inputq->lock); @@ -1463,10 +1455,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return -EISCONN; if (tsk->published) return -EOPNOTSUPP; - if (atype == TIPC_SERVICE_ADDR) { - tsk->conn_type = ua->sa.type; - tsk->conn_instance = ua->sa.instance; - } + if (atype == TIPC_SERVICE_ADDR) + tsk->conn_addrtype = atype; msg_set_syn(hdr, 1); } @@ -1737,67 +1727,58 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, struct tipc_sock *tsk) { - struct tipc_msg *msg; - u32 anc_data[3]; - u32 err; - u32 dest_type; - int has_name; - int res; + struct tipc_msg *hdr; + u32 data[3] = {0,}; + bool has_addr; + int dlen, rc; if (likely(m->msg_controllen == 0)) return 0; - msg = buf_msg(skb); - /* Optionally capture errored message object(s) */ - err = msg ? msg_errcode(msg) : 0; - if (unlikely(err)) { - anc_data[0] = err; - anc_data[1] = msg_data_sz(msg); - res = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, anc_data); - if (res) - return res; - if (anc_data[1]) { - if (skb_linearize(skb)) - return -ENOMEM; - msg = buf_msg(skb); - res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1], - msg_data(msg)); - if (res) - return res; - } + hdr = buf_msg(skb); + dlen = msg_data_sz(hdr); + + /* Capture errored message object, if any */ + if (msg_errcode(hdr)) { + if (skb_linearize(skb)) + return -ENOMEM; + hdr = buf_msg(skb); + data[0] = msg_errcode(hdr); + data[1] = dlen; + rc = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, data); + if (rc || !dlen) + return rc; + rc = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, dlen, msg_data(hdr)); + if (rc) + return rc; } - /* Optionally capture message destination object */ - dest_type = msg ? msg_type(msg) : TIPC_DIRECT_MSG; - switch (dest_type) { + /* Capture TIPC_SERVICE_ADDR/RANGE destination address, if any */ + switch (msg_type(hdr)) { case TIPC_NAMED_MSG: - has_name = 1; - anc_data[0] = msg_nametype(msg); - anc_data[1] = msg_namelower(msg); - anc_data[2] = msg_namelower(msg); + has_addr = true; + data[0] = msg_nametype(hdr); + data[1] = msg_namelower(hdr); + data[2] = data[1]; break; case TIPC_MCAST_MSG: - has_name = 1; - anc_data[0] = msg_nametype(msg); - anc_data[1] = msg_namelower(msg); - anc_data[2] = msg_nameupper(msg); + has_addr = true; + data[0] = msg_nametype(hdr); + data[1] = msg_namelower(hdr); + data[2] = msg_nameupper(hdr); break; case TIPC_CONN_MSG: - has_name = (tsk->conn_type != 0); - anc_data[0] = tsk->conn_type; - anc_data[1] = tsk->conn_instance; - anc_data[2] = tsk->conn_instance; + has_addr = !!tsk->conn_addrtype; + data[0] = msg_nametype(&tsk->phdr); + data[1] = msg_nameinst(&tsk->phdr); + data[2] = data[1]; break; default: - has_name = 0; + has_addr = false; } - if (has_name) { - res = put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, anc_data); - if (res) - return res; - } - - return 0; + if (!has_addr) + return 0; + return put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, data); } static struct sk_buff *tipc_sk_build_ack(struct tipc_sock *tsk) @@ -2750,8 +2731,9 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, tsk_set_importance(new_sk, msg_importance(msg)); if (msg_named(msg)) { - new_tsock->conn_type = msg_nametype(msg); - new_tsock->conn_instance = msg_nameinst(msg); + new_tsock->conn_addrtype = TIPC_SERVICE_ADDR; + msg_set_nametype(&new_tsock->phdr, msg_nametype(msg)); + msg_set_nameinst(&new_tsock->phdr, msg_nameinst(msg)); } /* @@ -3455,13 +3437,14 @@ void tipc_socket_stop(void) /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk) { - u32 peer_node; - u32 peer_port; + u32 peer_node, peer_port; + u32 conn_type, conn_instance; struct nlattr *nest; peer_node = tsk_peer_node(tsk); peer_port = tsk_peer_port(tsk); - + conn_type = msg_nametype(&tsk->phdr); + conn_instance = msg_nameinst(&tsk->phdr); nest = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_CON); if (!nest) return -EMSGSIZE; @@ -3471,12 +3454,12 @@ static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk) if (nla_put_u32(skb, TIPC_NLA_CON_SOCK, peer_port)) goto msg_full; - if (tsk->conn_type != 0) { + if (tsk->conn_addrtype != 0) { if (nla_put_flag(skb, TIPC_NLA_CON_FLAG)) goto msg_full; - if (nla_put_u32(skb, TIPC_NLA_CON_TYPE, tsk->conn_type)) + if (nla_put_u32(skb, TIPC_NLA_CON_TYPE, conn_type)) goto msg_full; - if (nla_put_u32(skb, TIPC_NLA_CON_INST, tsk->conn_instance)) + if (nla_put_u32(skb, TIPC_NLA_CON_INST, conn_instance)) goto msg_full; } nla_nest_end(skb, nest); @@ -3866,9 +3849,9 @@ bool tipc_sk_filtering(struct sock *sk) } if (!tipc_sk_type_connectionless(sk)) { - type = tsk->conn_type; - lower = tsk->conn_instance; - upper = tsk->conn_instance; + type = msg_nametype(&tsk->phdr); + lower = msg_nameinst(&tsk->phdr); + upper = lower; } if ((_type && _type != type) || (_lower && _lower != lower) || @@ -3933,6 +3916,7 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) { int i = 0; size_t sz = (dqueues) ? SK_LMAX : SK_LMIN; + u32 conn_type, conn_instance; struct tipc_sock *tsk; struct publication *p; bool tsk_connected; @@ -3953,8 +3937,10 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) if (tsk_connected) { i += scnprintf(buf + i, sz - i, " %x", tsk_peer_node(tsk)); i += scnprintf(buf + i, sz - i, " %u", tsk_peer_port(tsk)); - i += scnprintf(buf + i, sz - i, " %u", tsk->conn_type); - i += scnprintf(buf + i, sz - i, " %u", tsk->conn_instance); + conn_type = msg_nametype(&tsk->phdr); + conn_instance = msg_nameinst(&tsk->phdr); + i += scnprintf(buf + i, sz - i, " %u", conn_type); + i += scnprintf(buf + i, sz - i, " %u", conn_instance); } i += scnprintf(buf + i, sz - i, " | %u", tsk->published); if (tsk->published) { diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 8e00d739f03a..05d49ad81290 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -66,7 +66,7 @@ static void tipc_sub_send_event(struct tipc_subscription *sub, /** * tipc_sub_check_overlap - test for subscription overlap with the given values * @subscribed: the service range subscribed for - * @found: the service range we are checning for match + * @found: the service range we are checking for match * * Returns true if there is overlap, otherwise false. */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bd9f1567aa39..b932469ee69c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -128,7 +128,7 @@ static void destroy_record(struct tls_record_info *record) int i; for (i = 0; i < record->num_frags; i++) - __skb_frag_unref(&record->frags[i]); + __skb_frag_unref(&record->frags[i], false); kfree(record); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 694de024d0ee..f0fbb079cbaa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2019,8 +2019,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, if (copied < 0) goto splice_read_end; - if (likely(!(flags & MSG_PEEK))) - tls_sw_advance_skb(sk, skb, copied); + tls_sw_advance_skb(sk, skb, copied); splice_read_end: release_sock(sk); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5d1192ceb139..c9dfec7b71e7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1393,7 +1393,7 @@ restart: unix_state_unlock(sk); - /* take ten and and send info to listening sock */ + /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 92a72f0e0d94..21ccf450e249 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -415,8 +415,8 @@ static void vsock_deassign_transport(struct vsock_sock *vsk) /* Assign a transport to a socket and call the .init transport callback. * - * Note: for stream socket this must be called when vsk->remote_addr is set - * (e.g. during the connect() or when a connection request on a listener + * Note: for connection oriented socket this must be called when vsk->remote_addr + * is set (e.g. during the connect() or when a connection request on a listener * socket is received). * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if @@ -452,6 +452,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) new_transport = transport_dgram; break; case SOCK_STREAM: + case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g || @@ -469,10 +470,10 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) return 0; /* transport->release() must be called with sock lock acquired. - * This path can only be taken during vsock_stream_connect(), - * where we have already held the sock lock. - * In the other cases, this function is called on a new socket - * which is not assigned to any transport. + * This path can only be taken during vsock_connect(), where we + * have already held the sock lock. In the other cases, this + * function is called on a new socket which is not assigned to + * any transport. */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); @@ -484,6 +485,14 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) if (!new_transport || !try_module_get(new_transport->module)) return -ENODEV; + if (sk->sk_type == SOCK_SEQPACKET) { + if (!new_transport->seqpacket_allow || + !new_transport->seqpacket_allow(remote_cid)) { + module_put(new_transport->module); + return -ESOCKTNOSUPPORT; + } + } + ret = new_transport->init(vsk, psk); if (ret) { module_put(new_transport->module); @@ -604,8 +613,8 @@ out: /**** SOCKET OPERATIONS ****/ -static int __vsock_bind_stream(struct vsock_sock *vsk, - struct sockaddr_vm *addr) +static int __vsock_bind_connectible(struct vsock_sock *vsk, + struct sockaddr_vm *addr) { static u32 port; struct sockaddr_vm new_addr; @@ -649,9 +658,10 @@ static int __vsock_bind_stream(struct vsock_sock *vsk, vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); - /* Remove stream sockets from the unbound list and add them to the hash - * table for easy lookup by its address. The unbound list is simply an - * extra entry at the end of the hash table, a trick used by AF_UNIX. + /* Remove connection oriented sockets from the unbound list and add them + * to the hash table for easy lookup by its address. The unbound list + * is simply an extra entry at the end of the hash table, a trick used + * by AF_UNIX. */ __vsock_remove_bound(vsk); __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); @@ -684,8 +694,9 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) switch (sk->sk_socket->type) { case SOCK_STREAM: + case SOCK_SEQPACKET: spin_lock_bh(&vsock_table_lock); - retval = __vsock_bind_stream(vsk, addr); + retval = __vsock_bind_connectible(vsk, addr); spin_unlock_bh(&vsock_table_lock); break; @@ -768,6 +779,11 @@ static struct sock *__vsock_create(struct net *net, return sk; } +static bool sock_type_connectible(u16 type) +{ + return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET); +} + static void __vsock_release(struct sock *sk, int level) { if (sk) { @@ -786,7 +802,7 @@ static void __vsock_release(struct sock *sk, int level) if (vsk->transport) vsk->transport->release(vsk); - else if (sk->sk_type == SOCK_STREAM) + else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); sock_orphan(sk); @@ -844,6 +860,16 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(vsock_stream_has_data); +static s64 vsock_connectible_has_data(struct vsock_sock *vsk) +{ + struct sock *sk = sk_vsock(vsk); + + if (sk->sk_type == SOCK_SEQPACKET) + return vsk->transport->seqpacket_has_data(vsk); + else + return vsock_stream_has_data(vsk); +} + s64 vsock_stream_has_space(struct vsock_sock *vsk) { return vsk->transport->stream_has_space(vsk); @@ -937,10 +963,10 @@ static int vsock_shutdown(struct socket *sock, int mode) if ((mode & ~SHUTDOWN_MASK) || !mode) return -EINVAL; - /* If this is a STREAM socket and it is not connected then bail out - * immediately. If it is a DGRAM socket then we must first kick the - * socket so that it wakes up from any sleeping calls, for example - * recv(), and then afterwards return the error. + /* If this is a connection oriented socket and it is not connected then + * bail out immediately. If it is a DGRAM socket then we must first + * kick the socket so that it wakes up from any sleeping calls, for + * example recv(), and then afterwards return the error. */ sk = sock->sk; @@ -948,7 +974,7 @@ static int vsock_shutdown(struct socket *sock, int mode) lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; - if (sk->sk_type == SOCK_STREAM) + if (sock_type_connectible(sk->sk_type)) goto out; } else { sock->state = SS_DISCONNECTING; @@ -961,7 +987,7 @@ static int vsock_shutdown(struct socket *sock, int mode) sk->sk_shutdown |= mode; sk->sk_state_change(sk); - if (sk->sk_type == SOCK_STREAM) { + if (sock_type_connectible(sk->sk_type)) { sock_reset_flag(sk, SOCK_DONE); vsock_send_shutdown(sk, mode); } @@ -1016,7 +1042,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; - } else if (sock->type == SOCK_STREAM) { + } else if (sock_type_connectible(sk->sk_type)) { const struct vsock_transport *transport; lock_sock(sk); @@ -1263,8 +1289,8 @@ static void vsock_connect_timeout(struct work_struct *work) sock_put(sk); } -static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, - int addr_len, int flags) +static int vsock_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags) { int err; struct sock *sk; @@ -1414,7 +1440,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, lock_sock(listener); - if (sock->type != SOCK_STREAM) { + if (!sock_type_connectible(sock->type)) { err = -EOPNOTSUPP; goto out; } @@ -1491,7 +1517,7 @@ static int vsock_listen(struct socket *sock, int backlog) lock_sock(sk); - if (sock->type != SOCK_STREAM) { + if (!sock_type_connectible(sk->sk_type)) { err = -EOPNOTSUPP; goto out; } @@ -1535,11 +1561,11 @@ static void vsock_update_buffer_size(struct vsock_sock *vsk, vsk->buffer_size = val; } -static int vsock_stream_setsockopt(struct socket *sock, - int level, - int optname, - sockptr_t optval, - unsigned int optlen) +static int vsock_connectible_setsockopt(struct socket *sock, + int level, + int optname, + sockptr_t optval, + unsigned int optlen) { int err; struct sock *sk; @@ -1617,10 +1643,10 @@ exit: return err; } -static int vsock_stream_getsockopt(struct socket *sock, - int level, int optname, - char __user *optval, - int __user *optlen) +static int vsock_connectible_getsockopt(struct socket *sock, + int level, int optname, + char __user *optval, + int __user *optlen) { int err; int len; @@ -1688,8 +1714,8 @@ static int vsock_stream_getsockopt(struct socket *sock, return 0; } -static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, - size_t len) +static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk; struct vsock_sock *vsk; @@ -1712,7 +1738,9 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, transport = vsk->transport; - /* Callers should not provide a destination with stream sockets. */ + /* Callers should not provide a destination with connection oriented + * sockets. + */ if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; @@ -1803,9 +1831,13 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, * responsibility to check how many bytes we were able to send. */ - written = transport->stream_enqueue( - vsk, msg, - len - total_written); + if (sk->sk_type == SOCK_SEQPACKET) { + written = transport->seqpacket_enqueue(vsk, + msg, len - total_written); + } else { + written = transport->stream_enqueue(vsk, + msg, len - total_written); + } if (written < 0) { err = -ENOMEM; goto out_err; @@ -1821,72 +1853,98 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, } out_err: - if (total_written > 0) - err = total_written; + if (total_written > 0) { + /* Return number of written bytes only if: + * 1) SOCK_STREAM socket. + * 2) SOCK_SEQPACKET socket when whole buffer is sent. + */ + if (sk->sk_type == SOCK_STREAM || total_written == len) + err = total_written; + } out: release_sock(sk); return err; } - -static int -vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +static int vsock_connectible_wait_data(struct sock *sk, + struct wait_queue_entry *wait, + long timeout, + struct vsock_transport_recv_notify_data *recv_data, + size_t target) { - struct sock *sk; - struct vsock_sock *vsk; const struct vsock_transport *transport; + struct vsock_sock *vsk; + s64 data; int err; - size_t target; - ssize_t copied; - long timeout; - struct vsock_transport_recv_notify_data recv_data; - - DEFINE_WAIT(wait); - sk = sock->sk; vsk = vsock_sk(sk); err = 0; + transport = vsk->transport; - lock_sock(sk); + while ((data = vsock_connectible_has_data(vsk)) == 0) { + prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); - transport = vsk->transport; + if (sk->sk_err != 0 || + (sk->sk_shutdown & RCV_SHUTDOWN) || + (vsk->peer_shutdown & SEND_SHUTDOWN)) { + break; + } - if (!transport || sk->sk_state != TCP_ESTABLISHED) { - /* Recvmsg is supposed to return 0 if a peer performs an - * orderly shutdown. Differentiate between that case and when a - * peer has not connected or a local shutdown occurred with the - * SOCK_DONE flag. - */ - if (sock_flag(sk, SOCK_DONE)) - err = 0; - else - err = -ENOTCONN; + /* Don't wait for non-blocking sockets. */ + if (timeout == 0) { + err = -EAGAIN; + break; + } - goto out; - } + if (recv_data) { + err = transport->notify_recv_pre_block(vsk, target, recv_data); + if (err < 0) + break; + } - if (flags & MSG_OOB) { - err = -EOPNOTSUPP; - goto out; - } + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); - /* We don't check peer_shutdown flag here since peer may actually shut - * down, but there can be data in the queue that a local socket can - * receive. - */ - if (sk->sk_shutdown & RCV_SHUTDOWN) { - err = 0; - goto out; + if (signal_pending(current)) { + err = sock_intr_errno(timeout); + break; + } else if (timeout == 0) { + err = -EAGAIN; + break; + } } - /* It is valid on Linux to pass in a zero-length receive buffer. This - * is not an error. We may as well bail out now. + finish_wait(sk_sleep(sk), wait); + + if (err) + return err; + + /* Internal transport error when checking for available + * data. XXX This should be changed to a connection + * reset in a later change. */ - if (!len) { - err = 0; - goto out; - } + if (data < 0) + return -ENOMEM; + + return data; +} + +static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) +{ + struct vsock_transport_recv_notify_data recv_data; + const struct vsock_transport *transport; + struct vsock_sock *vsk; + ssize_t copied; + size_t target; + long timeout; + int err; + + DEFINE_WAIT(wait); + + vsk = vsock_sk(sk); + transport = vsk->transport; /* We must not copy less than target bytes into the user's buffer * before returning successfully, so we wait for the consume queue to @@ -1908,94 +1966,158 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, while (1) { - s64 ready; + ssize_t read; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - ready = vsock_stream_has_data(vsk); + err = vsock_connectible_wait_data(sk, &wait, timeout, + &recv_data, target); + if (err <= 0) + break; - if (ready == 0) { - if (sk->sk_err != 0 || - (sk->sk_shutdown & RCV_SHUTDOWN) || - (vsk->peer_shutdown & SEND_SHUTDOWN)) { - finish_wait(sk_sleep(sk), &wait); - break; - } - /* Don't wait for non-blocking sockets. */ - if (timeout == 0) { - err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); - break; - } + err = transport->notify_recv_pre_dequeue(vsk, target, + &recv_data); + if (err < 0) + break; - err = transport->notify_recv_pre_block( - vsk, target, &recv_data); - if (err < 0) { - finish_wait(sk_sleep(sk), &wait); - break; - } - release_sock(sk); - timeout = schedule_timeout(timeout); - lock_sock(sk); + read = transport->stream_dequeue(vsk, msg, len - copied, flags); + if (read < 0) { + err = -ENOMEM; + break; + } - if (signal_pending(current)) { - err = sock_intr_errno(timeout); - finish_wait(sk_sleep(sk), &wait); - break; - } else if (timeout == 0) { - err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); - break; - } - } else { - ssize_t read; + copied += read; - finish_wait(sk_sleep(sk), &wait); + err = transport->notify_recv_post_dequeue(vsk, target, read, + !(flags & MSG_PEEK), &recv_data); + if (err < 0) + goto out; - if (ready < 0) { - /* Invalid queue pair content. XXX This should - * be changed to a connection reset in a later - * change. - */ + if (read >= target || flags & MSG_PEEK) + break; - err = -ENOMEM; - goto out; - } + target -= read; + } - err = transport->notify_recv_pre_dequeue( - vsk, target, &recv_data); - if (err < 0) - break; + if (sk->sk_err) + err = -sk->sk_err; + else if (sk->sk_shutdown & RCV_SHUTDOWN) + err = 0; - read = transport->stream_dequeue( - vsk, msg, - len - copied, flags); - if (read < 0) { - err = -ENOMEM; - break; - } + if (copied > 0) + err = copied; + +out: + return err; +} - copied += read; +static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) +{ + const struct vsock_transport *transport; + struct vsock_sock *vsk; + ssize_t record_len; + long timeout; + int err = 0; + DEFINE_WAIT(wait); - err = transport->notify_recv_post_dequeue( - vsk, target, read, - !(flags & MSG_PEEK), &recv_data); - if (err < 0) - goto out; + vsk = vsock_sk(sk); + transport = vsk->transport; - if (read >= target || flags & MSG_PEEK) - break; + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - target -= read; - } + err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0); + if (err <= 0) + goto out; + + record_len = transport->seqpacket_dequeue(vsk, msg, flags); + + if (record_len < 0) { + err = -ENOMEM; + goto out; } - if (sk->sk_err) + if (sk->sk_err) { err = -sk->sk_err; - else if (sk->sk_shutdown & RCV_SHUTDOWN) + } else if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; + } else { + /* User sets MSG_TRUNC, so return real length of + * packet. + */ + if (flags & MSG_TRUNC) + err = record_len; + else + err = len - msg_data_left(msg); - if (copied > 0) - err = copied; + /* Always set MSG_TRUNC if real length of packet is + * bigger than user's buffer. + */ + if (record_len > len) + msg->msg_flags |= MSG_TRUNC; + } + +out: + return err; +} + +static int +vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk; + struct vsock_sock *vsk; + const struct vsock_transport *transport; + int err; + + DEFINE_WAIT(wait); + + sk = sock->sk; + vsk = vsock_sk(sk); + err = 0; + + lock_sock(sk); + + transport = vsk->transport; + + if (!transport || sk->sk_state != TCP_ESTABLISHED) { + /* Recvmsg is supposed to return 0 if a peer performs an + * orderly shutdown. Differentiate between that case and when a + * peer has not connected or a local shutdown occurred with the + * SOCK_DONE flag. + */ + if (sock_flag(sk, SOCK_DONE)) + err = 0; + else + err = -ENOTCONN; + + goto out; + } + + if (flags & MSG_OOB) { + err = -EOPNOTSUPP; + goto out; + } + + /* We don't check peer_shutdown flag here since peer may actually shut + * down, but there can be data in the queue that a local socket can + * receive. + */ + if (sk->sk_shutdown & RCV_SHUTDOWN) { + err = 0; + goto out; + } + + /* It is valid on Linux to pass in a zero-length receive buffer. This + * is not an error. We may as well bail out now. + */ + if (!len) { + err = 0; + goto out; + } + + if (sk->sk_type == SOCK_STREAM) + err = __vsock_stream_recvmsg(sk, msg, len, flags); + else + err = __vsock_seqpacket_recvmsg(sk, msg, len, flags); out: release_sock(sk); @@ -2007,7 +2129,7 @@ static const struct proto_ops vsock_stream_ops = { .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, - .connect = vsock_stream_connect, + .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, @@ -2015,10 +2137,31 @@ static const struct proto_ops vsock_stream_ops = { .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, - .setsockopt = vsock_stream_setsockopt, - .getsockopt = vsock_stream_getsockopt, - .sendmsg = vsock_stream_sendmsg, - .recvmsg = vsock_stream_recvmsg, + .setsockopt = vsock_connectible_setsockopt, + .getsockopt = vsock_connectible_getsockopt, + .sendmsg = vsock_connectible_sendmsg, + .recvmsg = vsock_connectible_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct proto_ops vsock_seqpacket_ops = { + .family = PF_VSOCK, + .owner = THIS_MODULE, + .release = vsock_release, + .bind = vsock_bind, + .connect = vsock_connect, + .socketpair = sock_no_socketpair, + .accept = vsock_accept, + .getname = vsock_getname, + .poll = vsock_poll, + .ioctl = sock_no_ioctl, + .listen = vsock_listen, + .shutdown = vsock_shutdown, + .setsockopt = vsock_connectible_setsockopt, + .getsockopt = vsock_connectible_getsockopt, + .sendmsg = vsock_connectible_sendmsg, + .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; @@ -2043,6 +2186,9 @@ static int vsock_create(struct net *net, struct socket *sock, case SOCK_STREAM: sock->ops = &vsock_stream_ops; break; + case SOCK_SEQPACKET: + sock->ops = &vsock_seqpacket_ops; + break; default: return -ESOCKTNOSUPPORT; } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 2700a63ab095..e73ce652bf3c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -62,6 +62,7 @@ struct virtio_vsock { struct virtio_vsock_event event_list[8]; u32 guest_cid; + bool seqpacket_allow; }; static u32 virtio_transport_get_local_cid(void) @@ -443,6 +444,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } +static bool virtio_transport_seqpacket_allow(u32 remote_cid); + static struct virtio_transport virtio_transport = { .transport = { .module = THIS_MODULE, @@ -469,6 +472,11 @@ static struct virtio_transport virtio_transport = { .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, + .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, + .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, + .seqpacket_allow = virtio_transport_seqpacket_allow, + .seqpacket_has_data = virtio_transport_seqpacket_has_data, + .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, @@ -485,6 +493,19 @@ static struct virtio_transport virtio_transport = { .send_pkt = virtio_transport_send_pkt, }; +static bool virtio_transport_seqpacket_allow(u32 remote_cid) +{ + struct virtio_vsock *vsock; + bool seqpacket_allow; + + rcu_read_lock(); + vsock = rcu_dereference(the_virtio_vsock); + seqpacket_allow = vsock->seqpacket_allow; + rcu_read_unlock(); + + return seqpacket_allow; +} + static void virtio_transport_rx_work(struct work_struct *work) { struct virtio_vsock *vsock = @@ -608,10 +629,14 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->event_run = true; mutex_unlock(&vsock->event_lock); + if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET)) + vsock->seqpacket_allow = true; + vdev->priv = vsock; rcu_assign_pointer(the_virtio_vsock, vsock); mutex_unlock(&the_virtio_vsock_mutex); + return 0; out: @@ -695,6 +720,7 @@ static struct virtio_device_id id_table[] = { }; static unsigned int features[] = { + VIRTIO_VSOCK_F_SEQPACKET }; static struct virtio_driver virtio_vsock_driver = { diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 902cb6dd710b..f014ccfdd9c2 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -74,6 +74,10 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, err = memcpy_from_msg(pkt->buf, info->msg, len); if (err) goto out; + + if (msg_data_left(info->msg) == 0 && + info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) + pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); } trace_virtio_transport_alloc_pkt(src_cid, src_port, @@ -165,6 +169,14 @@ void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) } EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); +static u16 virtio_transport_get_type(struct sock *sk) +{ + if (sk->sk_type == SOCK_STREAM) + return VIRTIO_VSOCK_TYPE_STREAM; + else + return VIRTIO_VSOCK_TYPE_SEQPACKET; +} + /* This function can only be used on connecting/connected sockets, * since a socket assigned to a transport is required. * @@ -179,6 +191,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; + info->type = virtio_transport_get_type(sk_vsock(vsk)); + t_ops = virtio_transport_get_ops(vsk); if (unlikely(!t_ops)) return -EFAULT; @@ -269,13 +283,10 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) } EXPORT_SYMBOL_GPL(virtio_transport_put_credit); -static int virtio_transport_send_credit_update(struct vsock_sock *vsk, - int type, - struct virtio_vsock_hdr *hdr) +static int virtio_transport_send_credit_update(struct vsock_sock *vsk) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, - .type = type, .vsk = vsk, }; @@ -383,11 +394,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * messages, we set the limit to a high value. TODO: experiment * with different values. */ - if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { - virtio_transport_send_credit_update(vsk, - VIRTIO_VSOCK_TYPE_STREAM, - NULL); - } + if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) + virtio_transport_send_credit_update(vsk); return total; @@ -397,6 +405,75 @@ out: return err; } +static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + int flags) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + int dequeued_len = 0; + size_t user_buf_len = msg_data_left(msg); + bool msg_ready = false; + + spin_lock_bh(&vvs->rx_lock); + + if (vvs->msg_count == 0) { + spin_unlock_bh(&vvs->rx_lock); + return 0; + } + + while (!msg_ready) { + pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list); + + if (dequeued_len >= 0) { + size_t pkt_len; + size_t bytes_to_copy; + + pkt_len = (size_t)le32_to_cpu(pkt->hdr.len); + bytes_to_copy = min(user_buf_len, pkt_len); + + if (bytes_to_copy) { + int err; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy); + if (err) { + /* Copy of message failed. Rest of + * fragments will be freed without copy. + */ + dequeued_len = err; + } else { + user_buf_len -= bytes_to_copy; + } + + spin_lock_bh(&vvs->rx_lock); + } + + if (dequeued_len >= 0) + dequeued_len += pkt_len; + } + + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) { + msg_ready = true; + vvs->msg_count--; + } + + virtio_transport_dec_rx_pkt(vvs, pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + spin_unlock_bh(&vvs->rx_lock); + + virtio_transport_send_credit_update(vsk); + + return dequeued_len; +} + ssize_t virtio_transport_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, @@ -409,6 +486,38 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); +ssize_t +virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); +} +EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); + +int +virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + spin_lock_bh(&vvs->tx_lock); + + if (len > vvs->peer_buf_alloc) { + spin_unlock_bh(&vvs->tx_lock); + return -EMSGSIZE; + } + + spin_unlock_bh(&vvs->tx_lock); + + return virtio_transport_stream_enqueue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_enqueue); + int virtio_transport_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, @@ -431,6 +540,19 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); +u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + u32 msg_count; + + spin_lock_bh(&vvs->rx_lock); + msg_count = vvs->msg_count; + spin_unlock_bh(&vvs->rx_lock); + + return msg_count; +} +EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_has_data); + static s64 virtio_transport_has_space(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; @@ -496,8 +618,7 @@ void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val) vvs->buf_alloc = *val; - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, - NULL); + virtio_transport_send_credit_update(vsk); } EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size); @@ -624,7 +745,6 @@ int virtio_transport_connect(struct vsock_sock *vsk) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_REQUEST, - .type = VIRTIO_VSOCK_TYPE_STREAM, .vsk = vsk, }; @@ -636,7 +756,6 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_SHUTDOWN, - .type = VIRTIO_VSOCK_TYPE_STREAM, .flags = (mode & RCV_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | (mode & SEND_SHUTDOWN ? @@ -665,7 +784,6 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk, { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RW, - .type = VIRTIO_VSOCK_TYPE_STREAM, .msg = msg, .pkt_len = len, .vsk = vsk, @@ -688,7 +806,6 @@ static int virtio_transport_reset(struct vsock_sock *vsk, { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, - .type = VIRTIO_VSOCK_TYPE_STREAM, .reply = !!pkt, .vsk = vsk, }; @@ -848,7 +965,7 @@ void virtio_transport_release(struct vsock_sock *vsk) struct sock *sk = &vsk->sk; bool remove_sock = true; - if (sk->sk_type == SOCK_STREAM) + if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) remove_sock = virtio_transport_close(vsk); if (remove_sock) { @@ -912,6 +1029,9 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, goto out; } + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) + vvs->msg_count++; + /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small * payload. @@ -923,13 +1043,18 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, struct virtio_vsock_pkt, list); /* If there is space in the last packet queued, we copy the - * new packet in its buffer. + * new packet in its buffer. We avoid this if the last packet + * queued has VIRTIO_VSOCK_SEQ_EOR set, because this is + * delimiter of SEQPACKET record, so 'pkt' is the first packet + * of a new record. */ - if (pkt->len <= last_pkt->buf_len - last_pkt->len) { + if ((pkt->len <= last_pkt->buf_len - last_pkt->len) && + !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)) { memcpy(last_pkt->buf + last_pkt->len, pkt->buf, pkt->len); last_pkt->len += pkt->len; free_pkt = true; + last_pkt->hdr.flags |= pkt->hdr.flags; goto out; } } @@ -1000,7 +1125,6 @@ virtio_transport_send_response(struct vsock_sock *vsk, { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RESPONSE, - .type = VIRTIO_VSOCK_TYPE_STREAM, .remote_cid = le64_to_cpu(pkt->hdr.src_cid), .remote_port = le32_to_cpu(pkt->hdr.src_port), .reply = true, @@ -1096,6 +1220,12 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, return 0; } +static bool virtio_transport_valid_type(u16 type) +{ + return (type == VIRTIO_VSOCK_TYPE_STREAM) || + (type == VIRTIO_VSOCK_TYPE_SEQPACKET); +} + /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex * lock. */ @@ -1121,7 +1251,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, le32_to_cpu(pkt->hdr.buf_alloc), le32_to_cpu(pkt->hdr.fwd_cnt)); - if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { + if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) { (void)virtio_transport_reset_no_sock(t, pkt); goto free_pkt; } @@ -1138,6 +1268,12 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, } } + if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) { + (void)virtio_transport_reset_no_sock(t, pkt); + sock_put(sk); + goto free_pkt; + } + vsk = vsock_sk(sk); lock_sock(sk); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index c99bc4ce78e2..e617ed93f06b 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1248,7 +1248,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, vsock_remove_pending(listener, pending); vsock_enqueue_accept(listener, pending); - /* Callers of accept() will be be waiting on the listening socket, not + /* Callers of accept() will be waiting on the listening socket, not * the pending socket. */ listener->sk_data_ready(listener); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index a45f7ffca8c5..169a8cf65b39 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -63,6 +63,8 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) return 0; } +static bool vsock_loopback_seqpacket_allow(u32 remote_cid); + static struct virtio_transport loopback_transport = { .transport = { .module = THIS_MODULE, @@ -89,6 +91,11 @@ static struct virtio_transport loopback_transport = { .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, + .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, + .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, + .seqpacket_allow = vsock_loopback_seqpacket_allow, + .seqpacket_has_data = virtio_transport_seqpacket_has_data, + .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, @@ -105,6 +112,11 @@ static struct virtio_transport loopback_transport = { .send_pkt = vsock_loopback_send_pkt, }; +static bool vsock_loopback_seqpacket_allow(u32 remote_cid) +{ + return true; +} + static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 1816899499ce..3583354a7d7f 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -366,7 +366,7 @@ static void x25_destroy_timer(struct timer_list *t) /* * This is called from user mode and the timers. Thus it protects itself - * against interrupt users but doesn't worry about being called during + * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c index d48ad6d29197..21b30b56e889 100644 --- a/net/x25/x25_forward.c +++ b/net/x25/x25_forward.c @@ -19,7 +19,6 @@ int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, { struct x25_route *rt; struct x25_neigh *neigh_new = NULL; - struct list_head *entry; struct x25_forward *x25_frwd, *new_frwd; struct sk_buff *skbn; short same_lci = 0; @@ -46,8 +45,7 @@ int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, * established LCI? It shouldn't happen, just in case.. */ read_lock_bh(&x25_forward_list_lock); - list_for_each(entry, &x25_forward_list) { - x25_frwd = list_entry(entry, struct x25_forward, node); + list_for_each_entry(x25_frwd, &x25_forward_list, node) { if (x25_frwd->lci == lci) { pr_warn("call request for lci which is already registered!, transmitting but not registering new pair\n"); same_lci = 1; @@ -92,15 +90,13 @@ out_no_route: int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb) { struct x25_forward *frwd; - struct list_head *entry; struct net_device *peer = NULL; struct x25_neigh *nb; struct sk_buff *skbn; int rc = 0; read_lock_bh(&x25_forward_list_lock); - list_for_each(entry, &x25_forward_list) { - frwd = list_entry(entry, struct x25_forward, node); + list_for_each_entry(frwd, &x25_forward_list, node) { if (frwd->lci == lci) { /* The call is established, either side can send */ if (from->dev == frwd->dev1) { diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 57a81100c5da..5460b9146dd8 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -332,12 +332,9 @@ void x25_link_device_down(struct net_device *dev) struct x25_neigh *x25_get_neigh(struct net_device *dev) { struct x25_neigh *nb, *use = NULL; - struct list_head *entry; read_lock_bh(&x25_neigh_list_lock); - list_for_each(entry, &x25_neigh_list) { - nb = list_entry(entry, struct x25_neigh, node); - + list_for_each_entry(nb, &x25_neigh_list, node) { if (nb->dev == dev) { use = nb; break; diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index 9fbe4bb38d94..647f325ed867 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -27,14 +27,11 @@ static int x25_add_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; - struct list_head *entry; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); - list_for_each(entry, &x25_route_list) { - rt = list_entry(entry, struct x25_route, node); - + list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits) goto out; @@ -78,14 +75,11 @@ static int x25_del_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; - struct list_head *entry; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); - list_for_each(entry, &x25_route_list) { - rt = list_entry(entry, struct x25_route, node); - + list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits && rt->dev == dev) { __x25_remove_route(rt); @@ -141,13 +135,10 @@ struct net_device *x25_dev_get(char *devname) struct x25_route *x25_get_route(struct x25_address *addr) { struct x25_route *rt, *use = NULL; - struct list_head *entry; read_lock_bh(&x25_route_list_lock); - list_for_each(entry, &x25_route_list) { - rt = list_entry(entry, struct x25_route, node); - + list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, addr, rt->sigdigits)) { if (!use) use = rt; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 56a28a686988..f01ef6bda390 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -27,7 +27,7 @@ static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); - kfree(umem->pgs); + kvfree(umem->pgs); umem->pgs = NULL; } @@ -99,8 +99,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) long npgs; int err; - umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), - GFP_KERNEL | __GFP_NOWARN); + umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; @@ -123,7 +122,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) out_pin: xdp_umem_unpin_pages(umem); out_pgs: - kfree(umem->pgs); + kvfree(umem->pgs); umem->pgs = NULL; return err; } diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 67b4ce504852..9df75ea4a567 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -226,7 +226,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem); + return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + __xsk_map_lookup_elem); } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, |