diff options
Diffstat (limited to 'net')
231 files changed, 8533 insertions, 4971 deletions
diff --git a/net/atm/lec.c b/net/atm/lec.c index b570ef919c28..dbabb65d8b67 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1070,7 +1070,7 @@ module_exit(lane_module_cleanup); /* * LANE2: 3.1.3, LE_RESOLVE.request * Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs. - * If sizeoftlvs == NULL the default TLVs associated with with this + * If sizeoftlvs == NULL the default TLVs associated with this * lec will be used. * If dst_mac == NULL, targetless LE_ARP will be sent */ diff --git a/net/atm/signaling.c b/net/atm/signaling.c index fbd0c5e7b299..5de06ab8ed75 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -52,7 +52,7 @@ static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg) msg->type = as_okay; } /* - * Should probably just turn around the old skb. But the, the buffer + * Should probably just turn around the old skb. But then, the buffer * space accounting needs to follow the change too. Maybe later. */ while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL))) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a4faf5f904d9..206d0b424712 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -27,6 +27,7 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> +#include <linux/prandom.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index d35aca0e969a..79a7dfc32e76 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -20,6 +20,7 @@ #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/nl80211.h> +#include <linux/prandom.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 717fe657561d..8c1148fc73d7 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -20,6 +20,7 @@ #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/netdevice.h> +#include <linux/prandom.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index c350ab63cd54..ba0027d1f2df 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1863,7 +1863,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work); - /* backbone_gw is unreferenced in the report work function function + /* backbone_gw is unreferenced in the report work function * if queue_work() call was successful */ if (!ret) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 9fdbe3068153..9a47ef8b95c4 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -306,7 +306,7 @@ free: * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb * to NULL; 3) Error: Return false and free skb. * - * Return: true when the packet is merged or buffered, false when skb is not not + * Return: true when the packet is merged or buffered, false when skb is not * used. */ bool batadv_frag_skb_buffer(struct sk_buff **skb, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index fa06b51c0144..dad99641df2a 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -599,7 +599,7 @@ out: /* report to the other components the maximum amount of bytes that * batman-adv can send over the wire (without considering the payload * overhead). For example, this value is used by TT to compute the - * maximum local table table size + * maximum local table size */ atomic_set(&bat_priv->packet_size_max, min_mtu); @@ -977,23 +977,6 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) } /** - * batadv_hardif_remove_interfaces() - Remove all hard interfaces - */ -void batadv_hardif_remove_interfaces(void) -{ - struct batadv_hard_iface *hard_iface, *hard_iface_tmp; - - rtnl_lock(); - list_for_each_entry_safe(hard_iface, hard_iface_tmp, - &batadv_hardif_list, list) { - list_del_rcu(&hard_iface->list); - batadv_hardif_generation++; - batadv_hardif_remove_interface(hard_iface); - } - rtnl_unlock(); -} - -/** * batadv_hard_if_event_softif() - Handle events for soft interfaces * @event: NETDEV_* event to handle * @net_dev: net_device which generated an event diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index bad2e50135e8..b1855d9d0b06 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -100,7 +100,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct net *net, const char *iface_name); void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, enum batadv_hard_if_cleanup autodel); -void batadv_hardif_remove_interfaces(void); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); void batadv_hardif_release(struct kref *ref); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 519c08c2cfba..70fee9b42e25 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -137,7 +137,6 @@ static void __exit batadv_exit(void) batadv_netlink_unregister(); rtnl_link_unregister(&batadv_link_ops); unregister_netdevice_notifier(&batadv_hard_if_notifier); - batadv_hardif_remove_interfaces(); flush_workqueue(batadv_event_workqueue); destroy_workqueue(batadv_event_workqueue); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 0393bb9ed3d0..a47dc332d796 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2020.3" +#define BATADV_SOURCE_VERSION "2020.4" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index ca24a2e522b7..0746fe2c2c04 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -208,7 +208,7 @@ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; /* TODO: ask the bridge if a multicast router is present (the bridge - * is capable of performing proper RFC4286 multicast multicast router + * is capable of performing proper RFC4286 multicast router * discovery) instead of searching for a ff02::2 listener here */ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 48d707850f3e..61ddd6d709a0 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -26,8 +26,8 @@ #include <linux/lockdep.h> #include <linux/net.h> #include <linux/netdevice.h> +#include <linux/prandom.h> #include <linux/printk.h> -#include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> @@ -250,7 +250,7 @@ static void batadv_nc_path_put(struct batadv_nc_path *nc_path) /** * batadv_nc_packet_free() - frees nc packet * @nc_packet: the nc packet to free - * @dropped: whether the packet is freed because is is dropped + * @dropped: whether the packet is freed because is dropped */ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet, bool dropped) diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index d267b94800d6..87017332b567 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -461,7 +461,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, /** * batadv_forw_packet_free() - free a forwarding packet * @forw_packet: The packet to free - * @dropped: whether the packet is freed because is is dropped + * @dropped: whether the packet is freed because is dropped * * This frees a forwarding packet and releases any resources it might * have claimed. diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index cdde943c1b83..82e7ca886605 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -648,7 +648,7 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, /** * batadv_interface_add_vid() - ndo_add_vid API implementation * @dev: the netdev of the mesh interface - * @proto: protocol of the the vlan id + * @proto: protocol of the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the @@ -706,7 +706,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, /** * batadv_interface_kill_vid() - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface - * @proto: protocol of the the vlan id + * @proto: protocol of the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ed519efa3c36..965336a3b89d 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1492,7 +1492,7 @@ struct batadv_tp_vars { /** @unacked_lock: protect unacked_list */ spinlock_t unacked_lock; - /** @last_recv_time: time time (jiffies) a msg was received */ + /** @last_recv_time: time (jiffies) a msg was received */ unsigned long last_recv_time; /** @refcount: number of context where the object is used */ @@ -1996,7 +1996,7 @@ struct batadv_tt_change_node { */ struct batadv_tt_req_node { /** - * @addr: mac address address of the originator this request was sent to + * @addr: mac address of the originator this request was sent to */ u8 addr[ETH_ALEN]; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 9832f8445d43..d0c1024bf600 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1388,7 +1388,7 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) return 0; } -/* Encrypt the the link */ +/* Encrypt the link */ static void hci_conn_encrypt(struct hci_conn *conn) { BT_DBG("hcon %p", conn); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 68bfe57b6625..b0209e35284a 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -808,7 +808,7 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) * Delete Stored Link Key command. They are clearly indicating its * absence in the bit mask of supported commands. * - * Check the supported commands and only if the the command is marked + * Check the supported commands and only if the command is marked * as supported send it. If not supported assume that the controller * does not have actual support for stored link keys which makes this * command redundant anyway. diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index 73d0b12789f1..8ad0233ce497 100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -2,6 +2,7 @@ menuconfig BPFILTER bool "BPF based packet filtering framework (BPFILTER)" depends on NET && BPF && INET + select USERMODE_DRIVER help This builds experimental bpfilter framework that is aiming to provide netfilter compatible functionality via BPF diff --git a/net/bridge/br.c b/net/bridge/br.c index b6fe30e3768f..401eeb9142eb 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -183,6 +183,11 @@ static int br_switchdev_event(struct notifier_block *unused, br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; + case SWITCHDEV_FDB_FLUSH_TO_BRIDGE: + fdb_info = ptr; + /* Don't delete static entries */ + br_fdb_delete_by_port(br, p, fdb_info->vid, 0); + break; } out: diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 5e71fc8b826f..2db800fc27ca 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -103,7 +103,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) /* * Legacy ioctl's through SIOCDEVPRIVATE - * This interface is deprecated because it was too difficult to + * This interface is deprecated because it was too difficult * to do the translation for 32/64bit ioctl compatibility. */ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index da5ed4cf9233..00f1651a6aba 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -77,10 +77,67 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) #endif } +static int __mdb_fill_srcs(struct sk_buff *skb, + struct net_bridge_port_group *p) +{ + struct net_bridge_group_src *ent; + struct nlattr *nest, *nest_ent; + + if (hlist_empty(&p->src_list)) + return 0; + + nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST); + if (!nest) + return -EMSGSIZE; + + hlist_for_each_entry_rcu(ent, &p->src_list, node, + lockdep_is_held(&p->port->br->multicast_lock)) { + nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); + if (!nest_ent) + goto out_cancel_err; + switch (ent->addr.proto) { + case htons(ETH_P_IP): + if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, + ent->addr.u.ip4)) { + nla_nest_cancel(skb, nest_ent); + goto out_cancel_err; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, + &ent->addr.u.ip6)) { + nla_nest_cancel(skb, nest_ent); + goto out_cancel_err; + } + break; +#endif + default: + nla_nest_cancel(skb, nest_ent); + continue; + } + if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, + br_timer_value(&ent->timer))) { + nla_nest_cancel(skb, nest_ent); + goto out_cancel_err; + } + nla_nest_end(skb, nest_ent); + } + + nla_nest_end(skb, nest); + + return 0; + +out_cancel_err: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int __mdb_fill_info(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *p) { + bool dump_srcs_mode = false; struct timer_list *mtimer; struct nlattr *nest_ent; struct br_mdb_entry e; @@ -119,6 +176,23 @@ static int __mdb_fill_info(struct sk_buff *skb, nla_nest_cancel(skb, nest_ent); return -EMSGSIZE; } + switch (mp->addr.proto) { + case htons(ETH_P_IP): + dump_srcs_mode = !!(p && mp->br->multicast_igmp_version == 3); + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + dump_srcs_mode = !!(p && mp->br->multicast_mld_version == 2); + break; +#endif + } + if (dump_srcs_mode && + (__mdb_fill_srcs(skb, p) || + nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) { + nla_nest_cancel(skb, nest_ent); + return -EMSGSIZE; + } + nla_nest_end(skb, nest_ent); return 0; @@ -127,7 +201,7 @@ static int __mdb_fill_info(struct sk_buff *skb, static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { - int idx = 0, s_idx = cb->args[1], err = 0; + int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2]; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; @@ -152,7 +226,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, break; } - if (mp->host_joined) { + if (!s_pidx && mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) { nla_nest_cancel(skb, nest2); @@ -164,13 +238,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, pp = &p->next) { if (!p->port) continue; + if (pidx < s_pidx) + goto skip_pg; err = __mdb_fill_info(skb, mp, p); if (err) { - nla_nest_cancel(skb, nest2); + nla_nest_end(skb, nest2); goto out; } +skip_pg: + pidx++; } + pidx = 0; + s_pidx = 0; nla_nest_end(skb, nest2); skip: idx++; @@ -178,6 +258,7 @@ skip: out: cb->args[1] = idx; + cb->args[2] = pidx; nla_nest_end(skb, nest); return err; } @@ -263,14 +344,15 @@ out: static int nlmsg_populate_mdb_fill(struct sk_buff *skb, struct net_device *dev, - struct br_mdb_entry *entry, u32 pid, - u32 seq, int type, unsigned int flags) + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type) { struct nlmsghdr *nlh; struct br_port_msg *bpm; struct nlattr *nest, *nest2; - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); + nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; @@ -285,7 +367,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb, if (nest2 == NULL) goto end; - if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(*entry), entry)) + if (__mdb_fill_info(skb, mp, pg)) goto end; nla_nest_end(skb, nest2); @@ -300,10 +382,49 @@ cancel: return -EMSGSIZE; } -static inline size_t rtnl_mdb_nlmsg_size(void) +static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) { - return NLMSG_ALIGN(sizeof(struct br_port_msg)) - + nla_total_size(sizeof(struct br_mdb_entry)); + size_t nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) + + nla_total_size(sizeof(struct br_mdb_entry)) + + nla_total_size(sizeof(u32)); + struct net_bridge_group_src *ent; + size_t addr_size = 0; + + if (!pg) + goto out; + + switch (pg->addr.proto) { + case htons(ETH_P_IP): + if (pg->port->br->multicast_igmp_version == 2) + goto out; + addr_size = sizeof(__be32); + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + if (pg->port->br->multicast_mld_version == 1) + goto out; + addr_size = sizeof(struct in6_addr); + break; +#endif + } + + /* MDBA_MDB_EATTR_GROUP_MODE */ + nlmsg_size += nla_total_size(sizeof(u8)); + + /* MDBA_MDB_EATTR_SRC_LIST nested attr */ + if (!hlist_empty(&pg->src_list)) + nlmsg_size += nla_total_size(0); + + hlist_for_each_entry(ent, &pg->src_list, node) { + /* MDBA_MDB_SRCLIST_ENTRY nested attr + + * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER + */ + nlmsg_size += nla_total_size(0) + + nla_total_size(addr_size) + + nla_total_size(sizeof(u32)); + } +out: + return nlmsg_size; } struct br_mdb_complete_info { @@ -341,21 +462,22 @@ err: static void br_mdb_switchdev_host_port(struct net_device *dev, struct net_device *lower_dev, - struct br_mdb_entry *entry, int type) + struct net_bridge_mdb_entry *mp, + int type) { struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_HOST_MDB, .flags = SWITCHDEV_F_DEFER, }, - .vid = entry->vid, + .vid = mp->addr.vid, }; - if (entry->addr.proto == htons(ETH_P_IP)) - ip_eth_mc_map(entry->addr.u.ip4, mdb.addr); + if (mp->addr.proto == htons(ETH_P_IP)) + ip_eth_mc_map(mp->addr.u.ip4, mdb.addr); #if IS_ENABLED(CONFIG_IPV6) else - ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr); + ipv6_eth_mc_map(&mp->addr.u.ip6, mdb.addr); #endif mdb.obj.orig_dev = dev; @@ -370,17 +492,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev, } static void br_mdb_switchdev_host(struct net_device *dev, - struct br_mdb_entry *entry, int type) + struct net_bridge_mdb_entry *mp, int type) { struct net_device *lower_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, lower_dev, iter) - br_mdb_switchdev_host_port(dev, lower_dev, entry, type); + br_mdb_switchdev_host_port(dev, lower_dev, mp, type); } -static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, - struct br_mdb_entry *entry, int type) +void br_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type) { struct br_mdb_complete_info *complete_info; struct switchdev_obj_port_mdb mdb = { @@ -388,44 +512,45 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, .id = SWITCHDEV_OBJ_ID_PORT_MDB, .flags = SWITCHDEV_F_DEFER, }, - .vid = entry->vid, + .vid = mp->addr.vid, }; - struct net_device *port_dev; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - port_dev = __dev_get_by_index(net, entry->ifindex); - if (entry->addr.proto == htons(ETH_P_IP)) - ip_eth_mc_map(entry->addr.u.ip4, mdb.addr); + if (pg) { + if (mp->addr.proto == htons(ETH_P_IP)) + ip_eth_mc_map(mp->addr.u.ip4, mdb.addr); #if IS_ENABLED(CONFIG_IPV6) - else - ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr); + else + ipv6_eth_mc_map(&mp->addr.u.ip6, mdb.addr); #endif - - mdb.obj.orig_dev = port_dev; - if (p && port_dev && type == RTM_NEWMDB) { - complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); - if (complete_info) { - complete_info->port = p; - __mdb_entry_to_br_ip(entry, &complete_info->ip); + mdb.obj.orig_dev = pg->port->dev; + switch (type) { + case RTM_NEWMDB: + complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); + if (!complete_info) + break; + complete_info->port = pg->port; + complete_info->ip = mp->addr; mdb.obj.complete_priv = complete_info; mdb.obj.complete = br_mdb_complete; - if (switchdev_port_obj_add(port_dev, &mdb.obj, NULL)) + if (switchdev_port_obj_add(pg->port->dev, &mdb.obj, NULL)) kfree(complete_info); + break; + case RTM_DELMDB: + switchdev_port_obj_del(pg->port->dev, &mdb.obj); + break; } - } else if (p && port_dev && type == RTM_DELMDB) { - switchdev_port_obj_del(port_dev, &mdb.obj); + } else { + br_mdb_switchdev_host(dev, mp, type); } - if (!p) - br_mdb_switchdev_host(dev, entry, type); - - skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC); + skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) goto errout; - err = nlmsg_populate_mdb_fill(skb, dev, entry, 0, 0, type, NTF_SELF); + err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type); if (err < 0) { kfree_skb(skb); goto errout; @@ -437,26 +562,6 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } -void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type, u8 flags) -{ - struct br_mdb_entry entry; - - memset(&entry, 0, sizeof(entry)); - if (port) - entry.ifindex = port->dev->ifindex; - else - entry.ifindex = dev->ifindex; - entry.addr.proto = group->proto; - entry.addr.u.ip4 = group->u.ip4; -#if IS_ENABLED(CONFIG_IPV6) - entry.addr.u.ip6 = group->u.ip6; -#endif - entry.vid = group->vid; - __mdb_entry_fill_flags(&entry, flags); - __br_mdb_notify(dev, port, &entry, type); -} - static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u32 pid, @@ -600,7 +705,7 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, unsigned char state) + struct br_ip *group, struct br_mdb_entry *entry) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; @@ -619,12 +724,13 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, /* host join */ if (!port) { /* don't allow any flags for host-joined groups */ - if (state) + if (entry->state) return -EINVAL; if (mp->host_joined) return -EEXIST; br_multicast_host_join(mp, false); + br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); return 0; } @@ -638,12 +744,14 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, break; } - p = br_multicast_new_port_group(port, group, *pp, state, NULL); + p = br_multicast_new_port_group(port, group, *pp, entry->state, NULL, + MCAST_EXCLUDE); if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); - if (state == MDB_TEMPORARY) + if (entry->state == MDB_TEMPORARY) mod_timer(&p->timer, now + br->multicast_membership_interval); + br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); return 0; } @@ -672,7 +780,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, __mdb_entry_to_br_ip(entry, &ip); spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, &ip, entry->state); + ret = br_mdb_add_group(br, p, &ip, entry); spin_unlock_bh(&br->multicast_lock); return ret; } @@ -717,12 +825,9 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = __br_mdb_add(net, br, entry); if (err) break; - __br_mdb_notify(dev, p, entry, RTM_NEWMDB); } } else { err = __br_mdb_add(net, br, entry); - if (!err) - __br_mdb_notify(dev, p, entry, RTM_NEWMDB); } return err; @@ -750,6 +855,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) { br_multicast_host_leave(mp, false); err = 0; + br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); goto unlock; @@ -764,16 +870,8 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (p->port->state == BR_STATE_DISABLED) goto unlock; - __mdb_entry_fill_flags(entry, p->flags); - rcu_assign_pointer(*pp, p->next); - hlist_del_init(&p->mglist); - del_timer(&p->timer); - kfree_rcu(p, rcu); + br_multicast_del_pg(mp, p, pp); err = 0; - - if (!mp->ports && !mp->host_joined && - netif_running(br->dev)) - mod_timer(&mp->timer, jiffies); break; } @@ -820,13 +918,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; err = __br_mdb_del(br, entry); - if (!err) - __br_mdb_notify(dev, p, entry, RTM_DELMDB); } } else { err = __br_mdb_del(br, entry); - if (!err) - __br_mdb_notify(dev, p, entry, RTM_DELMDB); } return err; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 4c4a93abde68..e77f1e27caf7 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -50,6 +50,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, __be32 group, __u16 vid, const unsigned char *src); +static void br_multicast_port_group_rexmit(struct timer_list *t); static void __del_port_router(struct net_bridge_port *p); #if IS_ENABLED(CONFIG_IPV6) @@ -139,6 +140,29 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return br_mdb_ip_get_rcu(br, &ip); } +static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc) +{ + struct net_bridge_mdb_entry *mp; + + mp = container_of(gc, struct net_bridge_mdb_entry, mcast_gc); + WARN_ON(!hlist_unhashed(&mp->mdb_node)); + WARN_ON(mp->ports); + + del_timer_sync(&mp->timer); + kfree_rcu(mp, rcu); +} + +static void br_multicast_del_mdb_entry(struct net_bridge_mdb_entry *mp) +{ + struct net_bridge *br = mp->br; + + rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + hlist_del_init_rcu(&mp->mdb_node); + hlist_add_head(&mp->mcast_gc.gc_node, &br->mcast_gc_list); + queue_work(system_long_wq, &br->mcast_gc_work); +} + static void br_multicast_group_expired(struct timer_list *t) { struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer); @@ -152,23 +176,71 @@ static void br_multicast_group_expired(struct timer_list *t) if (mp->ports) goto out; + br_multicast_del_mdb_entry(mp); +out: + spin_unlock(&br->multicast_lock); +} - rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, - br_mdb_rht_params); - hlist_del_rcu(&mp->mdb_node); +static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc) +{ + struct net_bridge_group_src *src; - kfree_rcu(mp, rcu); + src = container_of(gc, struct net_bridge_group_src, mcast_gc); + WARN_ON(!hlist_unhashed(&src->node)); -out: - spin_unlock(&br->multicast_lock); + del_timer_sync(&src->timer); + kfree_rcu(src, rcu); } -static void br_multicast_del_pg(struct net_bridge *br, - struct net_bridge_port_group *pg) +static void br_multicast_del_group_src(struct net_bridge_group_src *src) { + struct net_bridge *br = src->pg->port->br; + + hlist_del_init_rcu(&src->node); + src->pg->src_ents--; + hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list); + queue_work(system_long_wq, &br->mcast_gc_work); +} + +static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc) +{ + struct net_bridge_port_group *pg; + + pg = container_of(gc, struct net_bridge_port_group, mcast_gc); + WARN_ON(!hlist_unhashed(&pg->mglist)); + WARN_ON(!hlist_empty(&pg->src_list)); + + del_timer_sync(&pg->rexmit_timer); + del_timer_sync(&pg->timer); + kfree_rcu(pg, rcu); +} + +void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + struct net_bridge_port_group __rcu **pp) +{ + struct net_bridge *br = pg->port->br; + struct net_bridge_group_src *ent; + struct hlist_node *tmp; + + rcu_assign_pointer(*pp, pg->next); + hlist_del_init(&pg->mglist); + hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) + br_multicast_del_group_src(ent); + br_mdb_notify(br->dev, mp, pg, RTM_DELMDB); + hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list); + queue_work(system_long_wq, &br->mcast_gc_work); + + if (!mp->ports && !mp->host_joined && netif_running(br->dev)) + mod_timer(&mp->timer, jiffies); +} + +static void br_multicast_find_del_pg(struct net_bridge *br, + struct net_bridge_port_group *pg) +{ + struct net_bridge_port_group __rcu **pp; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; - struct net_bridge_port_group __rcu **pp; mp = br_mdb_ip_get(br, &pg->addr); if (WARN_ON(!mp)) @@ -180,17 +252,7 @@ static void br_multicast_del_pg(struct net_bridge *br, if (p != pg) continue; - rcu_assign_pointer(*pp, p->next); - hlist_del_init(&p->mglist); - del_timer(&p->timer); - br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, - p->flags); - kfree_rcu(p, rcu); - - if (!mp->ports && !mp->host_joined && - netif_running(br->dev)) - mod_timer(&mp->timer, jiffies); - + br_multicast_del_pg(mp, pg, pp); return; } @@ -200,35 +262,95 @@ static void br_multicast_del_pg(struct net_bridge *br, static void br_multicast_port_group_expired(struct timer_list *t) { struct net_bridge_port_group *pg = from_timer(pg, t, timer); + struct net_bridge_group_src *src_ent; struct net_bridge *br = pg->port->br; + struct hlist_node *tmp; + bool changed; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&pg->timer) || hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT) goto out; - br_multicast_del_pg(br, pg); + changed = !!(pg->filter_mode == MCAST_EXCLUDE); + pg->filter_mode = MCAST_INCLUDE; + hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) { + if (!timer_pending(&src_ent->timer)) { + br_multicast_del_group_src(src_ent); + changed = true; + } + } + + if (hlist_empty(&pg->src_list)) { + br_multicast_find_del_pg(br, pg); + } else if (changed) { + struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->addr); + if (WARN_ON(!mp)) + goto out; + br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB); + } out: spin_unlock(&br->multicast_lock); } -static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, - __be32 group, - u8 *igmp_type) +static void br_multicast_gc(struct hlist_head *head) { + struct net_bridge_mcast_gc *gcent; + struct hlist_node *tmp; + + hlist_for_each_entry_safe(gcent, tmp, head, gc_node) { + hlist_del_init(&gcent->gc_node); + gcent->destroy(gcent); + } +} + +static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, + struct net_bridge_port_group *pg, + __be32 ip_dst, __be32 group, + bool with_srcs, bool over_lmqt, + u8 sflag, u8 *igmp_type, + bool *need_rexmit) +{ + struct net_bridge_port *p = pg ? pg->port : NULL; + struct net_bridge_group_src *ent; + size_t pkt_size, igmp_hdr_size; + unsigned long now = jiffies; struct igmpv3_query *ihv3; - size_t igmp_hdr_size; + void *csum_start = NULL; + __sum16 *csum = NULL; struct sk_buff *skb; struct igmphdr *ih; struct ethhdr *eth; + unsigned long lmqt; struct iphdr *iph; + u16 lmqt_srcs = 0; igmp_hdr_size = sizeof(*ih); - if (br->multicast_igmp_version == 3) + if (br->multicast_igmp_version == 3) { igmp_hdr_size = sizeof(*ihv3); - skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) + - igmp_hdr_size + 4); + if (pg && with_srcs) { + lmqt = now + (br->multicast_last_member_interval * + br->multicast_last_member_count); + hlist_for_each_entry(ent, &pg->src_list, node) { + if (over_lmqt == time_after(ent->timer.expires, + lmqt) && + ent->src_query_rexmit_cnt > 0) + lmqt_srcs++; + } + + if (!lmqt_srcs) + return NULL; + igmp_hdr_size += lmqt_srcs * sizeof(__be32); + } + } + + pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size; + if ((p && pkt_size > p->dev->mtu) || + pkt_size > br->dev->mtu) + return NULL; + + skb = netdev_alloc_skb_ip_align(br->dev, pkt_size); if (!skb) goto out; @@ -238,29 +360,24 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, eth = eth_hdr(skb); ether_addr_copy(eth->h_source, br->dev->dev_addr); - eth->h_dest[0] = 1; - eth->h_dest[1] = 0; - eth->h_dest[2] = 0x5e; - eth->h_dest[3] = 0; - eth->h_dest[4] = 0; - eth->h_dest[5] = 1; + ip_eth_mc_map(ip_dst, eth->h_dest); eth->h_proto = htons(ETH_P_IP); skb_put(skb, sizeof(*eth)); skb_set_network_header(skb, skb->len); iph = ip_hdr(skb); + iph->tot_len = htons(pkt_size - sizeof(*eth)); iph->version = 4; iph->ihl = 6; iph->tos = 0xc0; - iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4); iph->id = 0; iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->protocol = IPPROTO_IGMP; iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0; - iph->daddr = htonl(INADDR_ALLHOSTS_GROUP); + iph->daddr = ip_dst; ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; @@ -280,7 +397,8 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, (HZ / IGMP_TIMER_SCALE); ih->group = group; ih->csum = 0; - ih->csum = ip_compute_csum((void *)ih, sizeof(*ih)); + csum = &ih->csum; + csum_start = (void *)ih; break; case 3: ihv3 = igmpv3_query_hdr(skb); @@ -290,15 +408,40 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, (HZ / IGMP_TIMER_SCALE); ihv3->group = group; ihv3->qqic = br->multicast_query_interval / HZ; - ihv3->nsrcs = 0; + ihv3->nsrcs = htons(lmqt_srcs); ihv3->resv = 0; - ihv3->suppress = 0; + ihv3->suppress = sflag; ihv3->qrv = 2; ihv3->csum = 0; - ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3)); + csum = &ihv3->csum; + csum_start = (void *)ihv3; + if (!pg || !with_srcs) + break; + + lmqt_srcs = 0; + hlist_for_each_entry(ent, &pg->src_list, node) { + if (over_lmqt == time_after(ent->timer.expires, + lmqt) && + ent->src_query_rexmit_cnt > 0) { + ihv3->srcs[lmqt_srcs++] = ent->addr.u.ip4; + ent->src_query_rexmit_cnt--; + if (need_rexmit && ent->src_query_rexmit_cnt) + *need_rexmit = true; + } + } + if (WARN_ON(lmqt_srcs != ntohs(ihv3->nsrcs))) { + kfree_skb(skb); + return NULL; + } break; } + if (WARN_ON(!csum || !csum_start)) { + kfree_skb(skb); + return NULL; + } + + *csum = ip_compute_csum(csum_start, igmp_hdr_size); skb_put(skb, igmp_hdr_size); __skb_pull(skb, sizeof(*eth)); @@ -308,23 +451,54 @@ out: #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, - const struct in6_addr *grp, - u8 *igmp_type) -{ + struct net_bridge_port_group *pg, + const struct in6_addr *ip6_dst, + const struct in6_addr *group, + bool with_srcs, bool over_llqt, + u8 sflag, u8 *igmp_type, + bool *need_rexmit) +{ + struct net_bridge_port *p = pg ? pg->port : NULL; + struct net_bridge_group_src *ent; + size_t pkt_size, mld_hdr_size; + unsigned long now = jiffies; struct mld2_query *mld2q; + void *csum_start = NULL; unsigned long interval; + __sum16 *csum = NULL; struct ipv6hdr *ip6h; struct mld_msg *mldq; - size_t mld_hdr_size; struct sk_buff *skb; + unsigned long llqt; struct ethhdr *eth; + u16 llqt_srcs = 0; u8 *hopopt; mld_hdr_size = sizeof(*mldq); - if (br->multicast_mld_version == 2) + if (br->multicast_mld_version == 2) { mld_hdr_size = sizeof(*mld2q); - skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) + - 8 + mld_hdr_size); + if (pg && with_srcs) { + llqt = now + (br->multicast_last_member_interval * + br->multicast_last_member_count); + hlist_for_each_entry(ent, &pg->src_list, node) { + if (over_llqt == time_after(ent->timer.expires, + llqt) && + ent->src_query_rexmit_cnt > 0) + llqt_srcs++; + } + + if (!llqt_srcs) + return NULL; + mld_hdr_size += llqt_srcs * sizeof(struct in6_addr); + } + } + + pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size; + if ((p && pkt_size > p->dev->mtu) || + pkt_size > br->dev->mtu) + return NULL; + + skb = netdev_alloc_skb_ip_align(br->dev, pkt_size); if (!skb) goto out; @@ -346,7 +520,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->payload_len = htons(8 + mld_hdr_size); ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; - ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); + ip6h->daddr = *ip6_dst; if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, &ip6h->saddr)) { kfree_skb(skb); @@ -371,7 +545,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, /* ICMPv6 */ skb_set_transport_header(skb, skb->len); - interval = ipv6_addr_any(grp) ? + interval = ipv6_addr_any(group) ? br->multicast_query_response_interval : br->multicast_last_member_interval; *igmp_type = ICMPV6_MGM_QUERY; @@ -383,12 +557,9 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, mldq->mld_cksum = 0; mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); mldq->mld_reserved = 0; - mldq->mld_mca = *grp; - mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - sizeof(*mldq), IPPROTO_ICMPV6, - csum_partial(mldq, - sizeof(*mldq), - 0)); + mldq->mld_mca = *group; + csum = &mldq->mld_cksum; + csum_start = (void *)mldq; break; case 2: mld2q = (struct mld2_query *)icmp6_hdr(skb); @@ -398,21 +569,43 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, mld2q->mld2q_cksum = 0; mld2q->mld2q_resv1 = 0; mld2q->mld2q_resv2 = 0; - mld2q->mld2q_suppress = 0; + mld2q->mld2q_suppress = sflag; mld2q->mld2q_qrv = 2; - mld2q->mld2q_nsrcs = 0; + mld2q->mld2q_nsrcs = htons(llqt_srcs); mld2q->mld2q_qqic = br->multicast_query_interval / HZ; - mld2q->mld2q_mca = *grp; - mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - sizeof(*mld2q), - IPPROTO_ICMPV6, - csum_partial(mld2q, - sizeof(*mld2q), - 0)); + mld2q->mld2q_mca = *group; + csum = &mld2q->mld2q_cksum; + csum_start = (void *)mld2q; + if (!pg || !with_srcs) + break; + + llqt_srcs = 0; + hlist_for_each_entry(ent, &pg->src_list, node) { + if (over_llqt == time_after(ent->timer.expires, + llqt) && + ent->src_query_rexmit_cnt > 0) { + mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.u.ip6; + ent->src_query_rexmit_cnt--; + if (need_rexmit && ent->src_query_rexmit_cnt) + *need_rexmit = true; + } + } + if (WARN_ON(llqt_srcs != ntohs(mld2q->mld2q_nsrcs))) { + kfree_skb(skb); + return NULL; + } break; } - skb_put(skb, mld_hdr_size); + if (WARN_ON(!csum || !csum_start)) { + kfree_skb(skb); + return NULL; + } + + *csum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, mld_hdr_size, + IPPROTO_ICMPV6, + csum_partial(csum_start, mld_hdr_size, 0)); + skb_put(skb, mld_hdr_size); __skb_pull(skb, sizeof(*eth)); out: @@ -421,16 +614,39 @@ out: #endif static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, - struct br_ip *addr, - u8 *igmp_type) + struct net_bridge_port_group *pg, + struct br_ip *ip_dst, + struct br_ip *group, + bool with_srcs, bool over_lmqt, + u8 sflag, u8 *igmp_type, + bool *need_rexmit) { - switch (addr->proto) { + __be32 ip4_dst; + + switch (group->proto) { case htons(ETH_P_IP): - return br_ip4_multicast_alloc_query(br, addr->u.ip4, igmp_type); + ip4_dst = ip_dst ? ip_dst->u.ip4 : htonl(INADDR_ALLHOSTS_GROUP); + return br_ip4_multicast_alloc_query(br, pg, + ip4_dst, group->u.ip4, + with_srcs, over_lmqt, + sflag, igmp_type, + need_rexmit); #if IS_ENABLED(CONFIG_IPV6) - case htons(ETH_P_IPV6): - return br_ip6_multicast_alloc_query(br, &addr->u.ip6, - igmp_type); + case htons(ETH_P_IPV6): { + struct in6_addr ip6_dst; + + if (ip_dst) + ip6_dst = ip_dst->u.ip6; + else + ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0, + htonl(1)); + + return br_ip6_multicast_alloc_query(br, pg, + &ip6_dst, &group->u.ip6, + with_srcs, over_lmqt, + sflag, igmp_type, + need_rexmit); + } #endif } return NULL; @@ -457,6 +673,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, mp->br = br; mp->addr = *group; + mp->mcast_gc.destroy = br_multicast_destroy_mdb_entry; timer_setup(&mp->timer, br_multicast_group_expired, 0); err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode, br_mdb_rht_params); @@ -470,12 +687,97 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, return mp; } +static void br_multicast_group_src_expired(struct timer_list *t) +{ + struct net_bridge_group_src *src = from_timer(src, t, timer); + struct net_bridge_port_group *pg; + struct net_bridge *br = src->br; + + spin_lock(&br->multicast_lock); + if (hlist_unhashed(&src->node) || !netif_running(br->dev) || + timer_pending(&src->timer)) + goto out; + + pg = src->pg; + if (pg->filter_mode == MCAST_INCLUDE) { + br_multicast_del_group_src(src); + if (!hlist_empty(&pg->src_list)) + goto out; + br_multicast_find_del_pg(br, pg); + } +out: + spin_unlock(&br->multicast_lock); +} + +static struct net_bridge_group_src * +br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip) +{ + struct net_bridge_group_src *ent; + + switch (ip->proto) { + case htons(ETH_P_IP): + hlist_for_each_entry(ent, &pg->src_list, node) + if (ip->u.ip4 == ent->addr.u.ip4) + return ent; + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + hlist_for_each_entry(ent, &pg->src_list, node) + if (!ipv6_addr_cmp(&ent->addr.u.ip6, &ip->u.ip6)) + return ent; + break; +#endif + } + + return NULL; +} + +static struct net_bridge_group_src * +br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip) +{ + struct net_bridge_group_src *grp_src; + + if (unlikely(pg->src_ents >= PG_SRC_ENT_LIMIT)) + return NULL; + + switch (src_ip->proto) { + case htons(ETH_P_IP): + if (ipv4_is_zeronet(src_ip->u.ip4) || + ipv4_is_multicast(src_ip->u.ip4)) + return NULL; + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + if (ipv6_addr_any(&src_ip->u.ip6) || + ipv6_addr_is_multicast(&src_ip->u.ip6)) + return NULL; + break; +#endif + } + + grp_src = kzalloc(sizeof(*grp_src), GFP_ATOMIC); + if (unlikely(!grp_src)) + return NULL; + + grp_src->pg = pg; + grp_src->br = pg->port->br; + grp_src->addr = *src_ip; + grp_src->mcast_gc.destroy = br_multicast_destroy_group_src; + timer_setup(&grp_src->timer, br_multicast_group_src_expired, 0); + + hlist_add_head_rcu(&grp_src->node, &pg->src_list); + pg->src_ents++; + + return grp_src; +} + struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, - const unsigned char *src) + const unsigned char *src, + u8 filter_mode) { struct net_bridge_port_group *p; @@ -486,9 +788,13 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->addr = *group; p->port = port; p->flags = flags; + p->filter_mode = filter_mode; + p->mcast_gc.destroy = br_multicast_destroy_port_group; + INIT_HLIST_HEAD(&p->src_list); rcu_assign_pointer(p->next, next); - hlist_add_head(&p->mglist, &port->mglist); timer_setup(&p->timer, br_multicast_port_group_expired, 0); + timer_setup(&p->rexmit_timer, br_multicast_port_group_rexmit, 0); + hlist_add_head(&p->mglist, &port->mglist); if (src) memcpy(p->eth_addr, src, ETH_ALEN); @@ -516,8 +822,7 @@ void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) if (!mp->host_joined) { mp->host_joined = true; if (notify) - br_mdb_notify(mp->br->dev, NULL, &mp->addr, - RTM_NEWMDB, 0); + br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB); } mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval); } @@ -529,13 +834,15 @@ void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) mp->host_joined = false; if (notify) - br_mdb_notify(mp->br->dev, NULL, &mp->addr, RTM_DELMDB, 0); + br_mdb_notify(mp->br->dev, mp, NULL, RTM_DELMDB); } static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, struct br_ip *group, - const unsigned char *src) + const unsigned char *src, + u8 filter_mode, + bool igmpv2_mldv1) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; @@ -567,14 +874,16 @@ static int br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src); + p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB, 0); + br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); found: - mod_timer(&p->timer, now + br->multicast_membership_interval); + if (igmpv2_mldv1) + mod_timer(&p->timer, now + br->multicast_membership_interval); + out: err = 0; @@ -587,9 +896,11 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, __be32 group, __u16 vid, - const unsigned char *src) + const unsigned char *src, + bool igmpv2) { struct br_ip br_group; + u8 filter_mode; if (ipv4_is_local_multicast(group)) return 0; @@ -598,8 +909,10 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, br_group.u.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; + filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE; - return br_multicast_add_group(br, port, &br_group, src); + return br_multicast_add_group(br, port, &br_group, src, filter_mode, + igmpv2); } #if IS_ENABLED(CONFIG_IPV6) @@ -607,9 +920,11 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, const struct in6_addr *group, __u16 vid, - const unsigned char *src) + const unsigned char *src, + bool mldv1) { struct br_ip br_group; + u8 filter_mode; if (ipv6_addr_is_ll_all_nodes(group)) return 0; @@ -618,8 +933,10 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; + filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE; - return br_multicast_add_group(br, port, &br_group, src); + return br_multicast_add_group(br, port, &br_group, src, filter_mode, + mldv1); } #endif @@ -711,12 +1028,21 @@ static void br_multicast_select_own_querier(struct net_bridge *br, static void __br_multicast_send_query(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *ip) -{ + struct net_bridge_port_group *pg, + struct br_ip *ip_dst, + struct br_ip *group, + bool with_srcs, + u8 sflag, + bool *need_rexmit) +{ + bool over_lmqt = !!sflag; struct sk_buff *skb; u8 igmp_type; - skb = br_multicast_alloc_query(br, ip, &igmp_type); +again_under_lmqt: + skb = br_multicast_alloc_query(br, pg, ip_dst, group, with_srcs, + over_lmqt, sflag, &igmp_type, + need_rexmit); if (!skb) return; @@ -727,8 +1053,13 @@ static void __br_multicast_send_query(struct net_bridge *br, NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, dev_net(port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); + + if (over_lmqt && with_srcs && sflag) { + over_lmqt = false; + goto again_under_lmqt; + } } else { - br_multicast_select_own_querier(br, ip, skb); + br_multicast_select_own_querier(br, group, skb); br_multicast_count(br, port, skb, igmp_type, BR_MCAST_DIR_RX); netif_rx(skb); @@ -764,7 +1095,8 @@ static void br_multicast_send_query(struct net_bridge *br, if (!other_query || timer_pending(&other_query->timer)) return; - __br_multicast_send_query(br, port, &br_group); + __br_multicast_send_query(br, port, NULL, NULL, &br_group, false, 0, + NULL); time = jiffies; time += own_query->startup_sent < br->multicast_startup_query_count ? @@ -809,6 +1141,44 @@ static void br_ip6_multicast_port_query_expired(struct timer_list *t) } #endif +static void br_multicast_port_group_rexmit(struct timer_list *t) +{ + struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer); + struct bridge_mcast_other_query *other_query = NULL; + struct net_bridge *br = pg->port->br; + bool need_rexmit = false; + + spin_lock(&br->multicast_lock); + if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) || + !br_opt_get(br, BROPT_MULTICAST_ENABLED) || + !br_opt_get(br, BROPT_MULTICAST_QUERIER)) + goto out; + + if (pg->addr.proto == htons(ETH_P_IP)) + other_query = &br->ip4_other_query; +#if IS_ENABLED(CONFIG_IPV6) + else + other_query = &br->ip6_other_query; +#endif + + if (!other_query || timer_pending(&other_query->timer)) + goto out; + + if (pg->grp_query_rexmit_cnt) { + pg->grp_query_rexmit_cnt--; + __br_multicast_send_query(br, pg->port, pg, &pg->addr, + &pg->addr, false, 1, NULL); + } + __br_multicast_send_query(br, pg->port, pg, &pg->addr, + &pg->addr, true, 0, &need_rexmit); + + if (pg->grp_query_rexmit_cnt || need_rexmit) + mod_timer(&pg->rexmit_timer, jiffies + + br->multicast_last_member_interval); +out: + spin_unlock(&br->multicast_lock); +} + static void br_mc_disabled_update(struct net_device *dev, bool value) { struct switchdev_attr attr = { @@ -847,13 +1217,16 @@ void br_multicast_del_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; struct net_bridge_port_group *pg; + HLIST_HEAD(deleted_head); struct hlist_node *n; /* Take care of the remaining groups, only perm ones should be left */ spin_lock_bh(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - br_multicast_del_pg(br, pg); + br_multicast_find_del_pg(br, pg); + hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); + br_multicast_gc(&deleted_head); del_timer_sync(&port->multicast_router_timer); free_percpu(port->mcast_stats); } @@ -901,7 +1274,7 @@ void br_multicast_disable_port(struct net_bridge_port *port) spin_lock(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) - br_multicast_del_pg(br, pg); + br_multicast_find_del_pg(br, pg); __del_port_router(port); @@ -913,20 +1286,561 @@ void br_multicast_disable_port(struct net_bridge_port *port) spin_unlock(&br->multicast_lock); } +static int __grp_src_delete_marked(struct net_bridge_port_group *pg) +{ + struct net_bridge_group_src *ent; + struct hlist_node *tmp; + int deleted = 0; + + hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) + if (ent->flags & BR_SGRP_F_DELETE) { + br_multicast_del_group_src(ent); + deleted++; + } + + return deleted; +} + +static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) +{ + struct bridge_mcast_other_query *other_query = NULL; + struct net_bridge *br = pg->port->br; + u32 lmqc = br->multicast_last_member_count; + unsigned long lmqt, lmi, now = jiffies; + struct net_bridge_group_src *ent; + + if (!netif_running(br->dev) || + !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + return; + + if (pg->addr.proto == htons(ETH_P_IP)) + other_query = &br->ip4_other_query; +#if IS_ENABLED(CONFIG_IPV6) + else + other_query = &br->ip6_other_query; +#endif + + lmqt = now + br_multicast_lmqt(br); + hlist_for_each_entry(ent, &pg->src_list, node) { + if (ent->flags & BR_SGRP_F_SEND) { + ent->flags &= ~BR_SGRP_F_SEND; + if (ent->timer.expires > lmqt) { + if (br_opt_get(br, BROPT_MULTICAST_QUERIER) && + other_query && + !timer_pending(&other_query->timer)) + ent->src_query_rexmit_cnt = lmqc; + mod_timer(&ent->timer, lmqt); + } + } + } + + if (!br_opt_get(br, BROPT_MULTICAST_QUERIER) || + !other_query || timer_pending(&other_query->timer)) + return; + + __br_multicast_send_query(br, pg->port, pg, &pg->addr, + &pg->addr, true, 1, NULL); + + lmi = now + br->multicast_last_member_interval; + if (!timer_pending(&pg->rexmit_timer) || + time_after(pg->rexmit_timer.expires, lmi)) + mod_timer(&pg->rexmit_timer, lmi); +} + +static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) +{ + struct bridge_mcast_other_query *other_query = NULL; + struct net_bridge *br = pg->port->br; + unsigned long now = jiffies, lmi; + + if (!netif_running(br->dev) || + !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + return; + + if (pg->addr.proto == htons(ETH_P_IP)) + other_query = &br->ip4_other_query; +#if IS_ENABLED(CONFIG_IPV6) + else + other_query = &br->ip6_other_query; +#endif + + if (br_opt_get(br, BROPT_MULTICAST_QUERIER) && + other_query && !timer_pending(&other_query->timer)) { + lmi = now + br->multicast_last_member_interval; + pg->grp_query_rexmit_cnt = br->multicast_last_member_count - 1; + __br_multicast_send_query(br, pg->port, pg, &pg->addr, + &pg->addr, false, 0, NULL); + if (!timer_pending(&pg->rexmit_timer) || + time_after(pg->rexmit_timer.expires, lmi)) + mod_timer(&pg->rexmit_timer, lmi); + } + + if (pg->filter_mode == MCAST_EXCLUDE && + (!timer_pending(&pg->timer) || + time_after(pg->timer.expires, now + br_multicast_lmqt(br)))) + mod_timer(&pg->timer, now + br_multicast_lmqt(br)); +} + +/* State Msg type New state Actions + * INCLUDE (A) IS_IN (B) INCLUDE (A+B) (B)=GMI + * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI + * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI + */ +static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge *br = pg->port->br; + struct net_bridge_group_src *ent; + unsigned long now = jiffies; + bool changed = false; + struct br_ip src_ip; + u32 src_idx; + + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&src_ip.u, srcs, src_size); + ent = br_multicast_find_group_src(pg, &src_ip); + if (!ent) { + ent = br_multicast_new_group_src(pg, &src_ip); + if (ent) + changed = true; + } + + if (ent) + mod_timer(&ent->timer, now + br_multicast_gmi(br)); + srcs += src_size; + } + + return changed; +} + +/* State Msg type New state Actions + * INCLUDE (A) IS_EX (B) EXCLUDE (A*B,B-A) (B-A)=0 + * Delete (A-B) + * Group Timer=GMI + */ +static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge_group_src *ent; + struct br_ip src_ip; + u32 src_idx; + + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags |= BR_SGRP_F_DELETE; + + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&src_ip.u, srcs, src_size); + ent = br_multicast_find_group_src(pg, &src_ip); + if (ent) + ent->flags &= ~BR_SGRP_F_DELETE; + else + br_multicast_new_group_src(pg, &src_ip); + srcs += src_size; + } + + __grp_src_delete_marked(pg); +} + +/* State Msg type New state Actions + * EXCLUDE (X,Y) IS_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=GMI + * Delete (X-A) + * Delete (Y-A) + * Group Timer=GMI + */ +static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge *br = pg->port->br; + struct net_bridge_group_src *ent; + unsigned long now = jiffies; + bool changed = false; + struct br_ip src_ip; + u32 src_idx; + + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags |= BR_SGRP_F_DELETE; + + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&src_ip.u, srcs, src_size); + ent = br_multicast_find_group_src(pg, &src_ip); + if (ent) { + ent->flags &= ~BR_SGRP_F_DELETE; + } else { + ent = br_multicast_new_group_src(pg, &src_ip); + if (ent) { + mod_timer(&ent->timer, + now + br_multicast_gmi(br)); + changed = true; + } + } + srcs += src_size; + } + + if (__grp_src_delete_marked(pg)) + changed = true; + + return changed; +} + +static bool br_multicast_isexc(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge *br = pg->port->br; + bool changed = false; + + switch (pg->filter_mode) { + case MCAST_INCLUDE: + __grp_src_isexc_incl(pg, srcs, nsrcs, src_size); + changed = true; + break; + case MCAST_EXCLUDE: + changed = __grp_src_isexc_excl(pg, srcs, nsrcs, src_size); + break; + } + + pg->filter_mode = MCAST_EXCLUDE; + mod_timer(&pg->timer, jiffies + br_multicast_gmi(br)); + + return changed; +} + +/* State Msg type New state Actions + * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI + * Send Q(G,A-B) + */ +static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge *br = pg->port->br; + u32 src_idx, to_send = pg->src_ents; + struct net_bridge_group_src *ent; + unsigned long now = jiffies; + bool changed = false; + struct br_ip src_ip; + + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags |= BR_SGRP_F_SEND; + + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&src_ip.u, srcs, src_size); + ent = br_multicast_find_group_src(pg, &src_ip); + if (ent) { + ent->flags &= ~BR_SGRP_F_SEND; + to_send--; + } else { + ent = br_multicast_new_group_src(pg, &src_ip); + if (ent) + changed = true; + } + if (ent) + mod_timer(&ent->timer, now + br_multicast_gmi(br)); + srcs += src_size; + } + + if (to_send) + __grp_src_query_marked_and_rexmit(pg); + + return changed; +} + +/* State Msg type New state Actions + * EXCLUDE (X,Y) TO_IN (A) EXCLUDE (X+A,Y-A) (A)=GMI + * Send Q(G,X-A) + * Send Q(G) + */ +static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge *br = pg->port->br; + u32 src_idx, to_send = pg->src_ents; + struct net_bridge_group_src *ent; + unsigned long now = jiffies; + bool changed = false; + struct br_ip src_ip; + + hlist_for_each_entry(ent, &pg->src_list, node) + if (timer_pending(&ent->timer)) + ent->flags |= BR_SGRP_F_SEND; + + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&src_ip.u, srcs, src_size); + ent = br_multicast_find_group_src(pg, &src_ip); + if (ent) { + if (timer_pending(&ent->timer)) { + ent->flags &= ~BR_SGRP_F_SEND; + to_send--; + } + } else { + ent = br_multicast_new_group_src(pg, &src_ip); + if (ent) + changed = true; + } + if (ent) + mod_timer(&ent->timer, now + br_multicast_gmi(br)); + srcs += src_size; + } + + if (to_send) + __grp_src_query_marked_and_rexmit(pg); + + __grp_send_query_and_rexmit(pg); + + return changed; +} + +static bool br_multicast_toin(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + bool changed = false; + + switch (pg->filter_mode) { + case MCAST_INCLUDE: + changed = __grp_src_toin_incl(pg, srcs, nsrcs, src_size); + break; + case MCAST_EXCLUDE: + changed = __grp_src_toin_excl(pg, srcs, nsrcs, src_size); + break; + } + + return changed; +} + +/* State Msg type New state Actions + * INCLUDE (A) TO_EX (B) EXCLUDE (A*B,B-A) (B-A)=0 + * Delete (A-B) + * Send Q(G,A*B) + * Group Timer=GMI + */ +static void __grp_src_toex_incl(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge_group_src *ent; + u32 src_idx, to_send = 0; + struct br_ip src_ip; + + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; + + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&src_ip.u, srcs, src_size); + ent = br_multicast_find_group_src(pg, &src_ip); + if (ent) { + ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) | + BR_SGRP_F_SEND; + to_send++; + } else { + br_multicast_new_group_src(pg, &src_ip); + } + srcs += src_size; + } + + __grp_src_delete_marked(pg); + if (to_send) + __grp_src_query_marked_and_rexmit(pg); +} + +/* State Msg type New state Actions + * EXCLUDE (X,Y) TO_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=Group Timer + * Delete (X-A) + * Delete (Y-A) + * Send Q(G,A-Y) + * Group Timer=GMI + */ +static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge_group_src *ent; + u32 src_idx, to_send = 0; + bool changed = false; + struct br_ip src_ip; + + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; + + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&src_ip.u, srcs, src_size); + ent = br_multicast_find_group_src(pg, &src_ip); + if (ent) { + ent->flags &= ~BR_SGRP_F_DELETE; + } else { + ent = br_multicast_new_group_src(pg, &src_ip); + if (ent) { + mod_timer(&ent->timer, pg->timer.expires); + changed = true; + } + } + if (ent && timer_pending(&ent->timer)) { + ent->flags |= BR_SGRP_F_SEND; + to_send++; + } + srcs += src_size; + } + + if (__grp_src_delete_marked(pg)) + changed = true; + if (to_send) + __grp_src_query_marked_and_rexmit(pg); + + return changed; +} + +static bool br_multicast_toex(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge *br = pg->port->br; + bool changed = false; + + switch (pg->filter_mode) { + case MCAST_INCLUDE: + __grp_src_toex_incl(pg, srcs, nsrcs, src_size); + changed = true; + break; + case MCAST_EXCLUDE: + changed = __grp_src_toex_excl(pg, srcs, nsrcs, src_size); + break; + } + + pg->filter_mode = MCAST_EXCLUDE; + mod_timer(&pg->timer, jiffies + br_multicast_gmi(br)); + + return changed; +} + +/* State Msg type New state Actions + * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B) + */ +static void __grp_src_block_incl(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge_group_src *ent; + u32 src_idx, to_send = 0; + struct br_ip src_ip; + + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags &= ~BR_SGRP_F_SEND; + + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&src_ip.u, srcs, src_size); + ent = br_multicast_find_group_src(pg, &src_ip); + if (ent) { + ent->flags |= BR_SGRP_F_SEND; + to_send++; + } + srcs += src_size; + } + + if (to_send) + __grp_src_query_marked_and_rexmit(pg); + + if (pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list)) + br_multicast_find_del_pg(pg->port->br, pg); +} + +/* State Msg type New state Actions + * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer + * Send Q(G,A-Y) + */ +static bool __grp_src_block_excl(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + struct net_bridge_group_src *ent; + u32 src_idx, to_send = 0; + bool changed = false; + struct br_ip src_ip; + + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags &= ~BR_SGRP_F_SEND; + + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&src_ip.u, srcs, src_size); + ent = br_multicast_find_group_src(pg, &src_ip); + if (!ent) { + ent = br_multicast_new_group_src(pg, &src_ip); + if (ent) { + mod_timer(&ent->timer, pg->timer.expires); + changed = true; + } + } + if (ent && timer_pending(&ent->timer)) { + ent->flags |= BR_SGRP_F_SEND; + to_send++; + } + srcs += src_size; + } + + if (to_send) + __grp_src_query_marked_and_rexmit(pg); + + return changed; +} + +static bool br_multicast_block(struct net_bridge_port_group *pg, + void *srcs, u32 nsrcs, size_t src_size) +{ + bool changed = false; + + switch (pg->filter_mode) { + case MCAST_INCLUDE: + __grp_src_block_incl(pg, srcs, nsrcs, src_size); + break; + case MCAST_EXCLUDE: + changed = __grp_src_block_excl(pg, srcs, nsrcs, src_size); + break; + } + + return changed; +} + +static struct net_bridge_port_group * +br_multicast_find_port(struct net_bridge_mdb_entry *mp, + struct net_bridge_port *p, + const unsigned char *src) +{ + struct net_bridge *br __maybe_unused = mp->br; + struct net_bridge_port_group *pg; + + for (pg = mlock_dereference(mp->ports, br); + pg; + pg = mlock_dereference(pg->next, br)) + if (br_port_group_equal(pg, p, src)) + return pg; + + return NULL; +} + static int br_ip4_multicast_igmp3_report(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid) { + bool igmpv2 = br->multicast_igmp_version == 2; + struct net_bridge_mdb_entry *mdst; + struct net_bridge_port_group *pg; const unsigned char *src; struct igmpv3_report *ih; struct igmpv3_grec *grec; - int i; - int len; - int num; - int type; - int err = 0; + int i, len, num, type; + bool changed = false; __be32 group; + int err = 0; u16 nsrcs; ih = igmpv3_report_hdr(skb); @@ -947,7 +1861,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, if (!ip_mc_may_pull(skb, len)) return -EINVAL; - /* We treat this as an IGMPv2 report for now. */ switch (type) { case IGMPV3_MODE_IS_INCLUDE: case IGMPV3_MODE_IS_EXCLUDE: @@ -962,16 +1875,62 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, } src = eth_hdr(skb)->h_source; - if ((type == IGMPV3_CHANGE_TO_INCLUDE || - type == IGMPV3_MODE_IS_INCLUDE) && - nsrcs == 0) { - br_ip4_multicast_leave_group(br, port, group, vid, src); + if (nsrcs == 0 && + (type == IGMPV3_CHANGE_TO_INCLUDE || + type == IGMPV3_MODE_IS_INCLUDE)) { + if (!port || igmpv2) { + br_ip4_multicast_leave_group(br, port, group, vid, src); + continue; + } } else { err = br_ip4_multicast_add_group(br, port, group, vid, - src); + src, igmpv2); if (err) break; } + + if (!port || igmpv2) + continue; + + spin_lock_bh(&br->multicast_lock); + mdst = br_mdb_ip4_get(br, group, vid); + if (!mdst) + goto unlock_continue; + pg = br_multicast_find_port(mdst, port, src); + if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) + goto unlock_continue; + /* reload grec */ + grec = (void *)(skb->data + len - sizeof(*grec) - (nsrcs * 4)); + switch (type) { + case IGMPV3_ALLOW_NEW_SOURCES: + changed = br_multicast_isinc_allow(pg, grec->grec_src, + nsrcs, sizeof(__be32)); + break; + case IGMPV3_MODE_IS_INCLUDE: + changed = br_multicast_isinc_allow(pg, grec->grec_src, nsrcs, + sizeof(__be32)); + break; + case IGMPV3_MODE_IS_EXCLUDE: + changed = br_multicast_isexc(pg, grec->grec_src, nsrcs, + sizeof(__be32)); + break; + case IGMPV3_CHANGE_TO_INCLUDE: + changed = br_multicast_toin(pg, grec->grec_src, nsrcs, + sizeof(__be32)); + break; + case IGMPV3_CHANGE_TO_EXCLUDE: + changed = br_multicast_toex(pg, grec->grec_src, nsrcs, + sizeof(__be32)); + break; + case IGMPV3_BLOCK_OLD_SOURCES: + changed = br_multicast_block(pg, grec->grec_src, nsrcs, + sizeof(__be32)); + break; + } + if (changed) + br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB); +unlock_continue: + spin_unlock_bh(&br->multicast_lock); } return err; @@ -983,14 +1942,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, struct sk_buff *skb, u16 vid) { + bool mldv1 = br->multicast_mld_version == 1; + struct net_bridge_mdb_entry *mdst; + struct net_bridge_port_group *pg; unsigned int nsrcs_offset; const unsigned char *src; struct icmp6hdr *icmp6h; struct mld2_grec *grec; unsigned int grec_len; - int i; - int len; - int num; + bool changed = false; + int i, len, num; int err = 0; if (!ipv6_mc_may_pull(skb, sizeof(*icmp6h))) @@ -1024,7 +1985,6 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, grec = (struct mld2_grec *)(skb->data + len); len += grec_len; - /* We treat these as MLDv1 reports for now. */ switch (grec->grec_type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: @@ -1042,15 +2002,61 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && nsrcs == 0) { - br_ip6_multicast_leave_group(br, port, &grec->grec_mca, - vid, src); + if (!port || mldv1) { + br_ip6_multicast_leave_group(br, port, + &grec->grec_mca, + vid, src); + continue; + } } else { err = br_ip6_multicast_add_group(br, port, &grec->grec_mca, vid, - src); + src, mldv1); if (err) break; } + + if (!port || mldv1) + continue; + + spin_lock_bh(&br->multicast_lock); + mdst = br_mdb_ip6_get(br, &grec->grec_mca, vid); + if (!mdst) + goto unlock_continue; + pg = br_multicast_find_port(mdst, port, src); + if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) + goto unlock_continue; + switch (grec->grec_type) { + case MLD2_ALLOW_NEW_SOURCES: + changed = br_multicast_isinc_allow(pg, grec->grec_src, + nsrcs, + sizeof(struct in6_addr)); + break; + case MLD2_MODE_IS_INCLUDE: + changed = br_multicast_isinc_allow(pg, grec->grec_src, nsrcs, + sizeof(struct in6_addr)); + break; + case MLD2_MODE_IS_EXCLUDE: + changed = br_multicast_isexc(pg, grec->grec_src, nsrcs, + sizeof(struct in6_addr)); + break; + case MLD2_CHANGE_TO_INCLUDE: + changed = br_multicast_toin(pg, grec->grec_src, nsrcs, + sizeof(struct in6_addr)); + break; + case MLD2_CHANGE_TO_EXCLUDE: + changed = br_multicast_toex(pg, grec->grec_src, nsrcs, + sizeof(struct in6_addr)); + break; + case MLD2_BLOCK_OLD_SOURCES: + changed = br_multicast_block(pg, grec->grec_src, nsrcs, + sizeof(struct in6_addr)); + break; + } + if (changed) + br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB); +unlock_continue: + spin_unlock_bh(&br->multicast_lock); } return err; @@ -1245,7 +2251,8 @@ static void br_ip4_multicast_query(struct net_bridge *br, } } else if (transport_len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); - if (ih3->nsrcs) + if (ih3->nsrcs || + (br->multicast_igmp_version == 3 && group && ih3->suppress)) goto out; max_delay = ih3->code ? @@ -1280,7 +2287,9 @@ static void br_ip4_multicast_query(struct net_bridge *br, pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : - try_to_del_timer_sync(&p->timer) >= 0) + try_to_del_timer_sync(&p->timer) >= 0 && + (br->multicast_igmp_version == 2 || + p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } @@ -1330,6 +2339,10 @@ static int br_ip6_multicast_query(struct net_bridge *br, mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; + if (br->multicast_mld_version == 2 && + !ipv6_addr_any(&mld2q->mld2q_mca) && + mld2q->mld2q_suppress) + goto out; max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL); } @@ -1363,7 +2376,9 @@ static int br_ip6_multicast_query(struct net_bridge *br, pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : - try_to_del_timer_sync(&p->timer) >= 0) + try_to_del_timer_sync(&p->timer) >= 0 && + (br->multicast_mld_version == 1 || + p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } @@ -1407,16 +2422,8 @@ br_multicast_leave_group(struct net_bridge *br, if (p->flags & MDB_PG_FLAGS_PERMANENT) break; - rcu_assign_pointer(*pp, p->next); - hlist_del_init(&p->mglist); - del_timer(&p->timer); - kfree_rcu(p, rcu); - br_mdb_notify(br->dev, port, group, RTM_DELMDB, - p->flags | MDB_PG_FLAGS_FAST_LEAVE); - - if (!mp->ports && !mp->host_joined && - netif_running(br->dev)) - mod_timer(&mp->timer, jiffies); + p->flags |= MDB_PG_FLAGS_FAST_LEAVE; + br_multicast_del_pg(mp, p, pp); } goto out; } @@ -1425,7 +2432,8 @@ br_multicast_leave_group(struct net_bridge *br, goto out; if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) { - __br_multicast_send_query(br, port, &mp->addr); + __br_multicast_send_query(br, port, NULL, NULL, &mp->addr, + false, 0, NULL); time = jiffies + br->multicast_last_member_count * br->multicast_last_member_interval; @@ -1627,7 +2635,8 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip4_multicast_add_group(br, port, ih->group, vid, src); + err = br_ip4_multicast_add_group(br, port, ih->group, vid, src, + true); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: err = br_ip4_multicast_igmp3_report(br, port, skb, vid); @@ -1706,7 +2715,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid, - src); + src, true); break; case ICMPV6_MLD2_REPORT: err = br_ip6_multicast_mld2_report(br, port, skb, vid); @@ -1781,6 +2790,19 @@ static void br_ip6_multicast_query_expired(struct timer_list *t) } #endif +static void br_multicast_gc_work(struct work_struct *work) +{ + struct net_bridge *br = container_of(work, struct net_bridge, + mcast_gc_work); + HLIST_HEAD(deleted_head); + + spin_lock_bh(&br->multicast_lock); + hlist_move_list(&br->mcast_gc_list, &deleted_head); + spin_unlock_bh(&br->multicast_lock); + + br_multicast_gc(&deleted_head); +} + void br_multicast_init(struct net_bridge *br) { br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; @@ -1821,6 +2843,8 @@ void br_multicast_init(struct net_bridge *br) br_ip6_multicast_query_expired, 0); #endif INIT_HLIST_HEAD(&br->mdb_list); + INIT_HLIST_HEAD(&br->mcast_gc_list); + INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work); } static void br_ip4_multicast_join_snoopers(struct net_bridge *br) @@ -1924,18 +2948,18 @@ void br_multicast_stop(struct net_bridge *br) void br_multicast_dev_del(struct net_bridge *br) { struct net_bridge_mdb_entry *mp; + HLIST_HEAD(deleted_head); struct hlist_node *tmp; spin_lock_bh(&br->multicast_lock); - hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) { - del_timer(&mp->timer); - rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, - br_mdb_rht_params); - hlist_del_rcu(&mp->mdb_node); - kfree_rcu(mp, rcu); - } + hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) + br_multicast_del_mdb_entry(mp); + hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); + br_multicast_gc(&deleted_head); + cancel_work_sync(&br->mcast_gc_work); + rcu_barrier(); } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 147d52596e17..8a71c60fa357 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1095,8 +1095,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, - [IFLA_BR_MULTI_BOOLOPT] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_boolopt_multi) }, + [IFLA_BR_MULTI_BOOLOPT] = + NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)), }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index baa1500f384f..a23d2bae56e1 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -214,26 +214,61 @@ struct net_bridge_fdb_entry { #define MDB_PG_FLAGS_OFFLOAD BIT(1) #define MDB_PG_FLAGS_FAST_LEAVE BIT(2) +#define PG_SRC_ENT_LIMIT 32 + +#define BR_SGRP_F_DELETE BIT(0) +#define BR_SGRP_F_SEND BIT(1) + +struct net_bridge_mcast_gc { + struct hlist_node gc_node; + void (*destroy)(struct net_bridge_mcast_gc *gc); +}; + +struct net_bridge_group_src { + struct hlist_node node; + + struct br_ip addr; + struct net_bridge_port_group *pg; + u8 flags; + u8 src_query_rexmit_cnt; + struct timer_list timer; + + struct net_bridge *br; + struct net_bridge_mcast_gc mcast_gc; + struct rcu_head rcu; +}; + struct net_bridge_port_group { struct net_bridge_port *port; struct net_bridge_port_group __rcu *next; - struct hlist_node mglist; - struct rcu_head rcu; - struct timer_list timer; struct br_ip addr; unsigned char eth_addr[ETH_ALEN] __aligned(2); unsigned char flags; + unsigned char filter_mode; + unsigned char grp_query_rexmit_cnt; + + struct hlist_head src_list; + unsigned int src_ents; + struct timer_list timer; + struct timer_list rexmit_timer; + struct hlist_node mglist; + + struct net_bridge_mcast_gc mcast_gc; + struct rcu_head rcu; }; struct net_bridge_mdb_entry { struct rhash_head rhnode; struct net_bridge *br; struct net_bridge_port_group __rcu *ports; - struct rcu_head rcu; - struct timer_list timer; struct br_ip addr; bool host_joined; + + struct timer_list timer; struct hlist_node mdb_node; + + struct net_bridge_mcast_gc mcast_gc; + struct rcu_head rcu; }; struct net_bridge_port { @@ -406,6 +441,7 @@ struct net_bridge { struct rhashtable mdb_hash_tbl; + struct hlist_head mcast_gc_list; struct hlist_head mdb_list; struct hlist_head router_list; @@ -419,6 +455,7 @@ struct net_bridge { struct bridge_mcast_own_query ip6_own_query; struct bridge_mcast_querier ip6_querier; #endif /* IS_ENABLED(CONFIG_IPV6) */ + struct work_struct mcast_gc_work; #endif struct timer_list hello_timer; @@ -766,13 +803,17 @@ br_multicast_new_group(struct net_bridge *br, struct br_ip *group); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char flags, const unsigned char *src); + unsigned char flags, const unsigned char *src, + u8 filter_mode); int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); -void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type, u8 flags); +void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, int type); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, int type); +void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + struct net_bridge_port_group __rcu **pp); void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir); int br_multicast_init_stats(struct net_bridge *br); @@ -836,6 +877,19 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->igmp; } + +static inline unsigned long br_multicast_lmqt(const struct net_bridge *br) +{ + return br->multicast_last_member_interval * + br->multicast_last_member_count; +} + +static inline unsigned long br_multicast_gmi(const struct net_bridge *br) +{ + /* use the RFC default of 2 for QRV */ + return 2 * br->multicast_query_interval + + br->multicast_query_response_interval; +} #else static inline int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 61c94cefa843..002bbc93209d 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -140,7 +140,7 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, return err == -EOPNOTSUPP ? 0 : err; } -/* Returns a master vlan, if it didn't exist it gets created. In all cases a +/* Returns a master vlan, if it didn't exist it gets created. In all cases * a reference is taken to the master vlan before returning. */ static struct net_bridge_vlan * @@ -1891,8 +1891,8 @@ out_err: } static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { - [BRIDGE_VLANDB_ENTRY_INFO] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct bridge_vlan_info) }, + [BRIDGE_VLANDB_ENTRY_INFO] = + NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)), [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 0d6d20c9105e..8f68afda5f81 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -15,7 +15,6 @@ #include <linux/netfilter_bridge/ebt_stp.h> #define BPDU_TYPE_CONFIG 0 -#define BPDU_TYPE_TCN 0x80 struct stp_header { u8 dsap; diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c index d0a4d0ac7045..9cef9496a707 100644 --- a/net/caif/cfsrvl.c +++ b/net/caif/cfsrvl.c @@ -21,7 +21,6 @@ #define SRVL_FLOW_OFF 0x81 #define SRVL_FLOW_ON 0x80 #define SRVL_SET_PIN 0x82 -#define SRVL_CTRL_PKT_SIZE 1 #define container_obj(layr) container_of(layr, struct cfsrvl, layer) diff --git a/net/can/af_can.c b/net/can/af_can.c index 5c06404bdf3e..ea29a6d97ef5 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -358,7 +358,7 @@ static unsigned int effhash(canid_t can_id) * * Return: * Pointer to optimal filterlist for the given can_id/mask pair. - * Constistency checked mask. + * Consistency checked mask. * Reduced can_id to have a preprocessed filter compare value. */ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask, @@ -411,7 +411,7 @@ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask, /** * can_rx_register - subscribe CAN frames from a specific interface * @net: the applicable net namespace - * @dev: pointer to netdevice (NULL => subcribe from 'all' CAN devices list) + * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list) * @can_id: CAN identifier (see description) * @mask: CAN mask (see description) * @func: callback function on filter match diff --git a/net/can/bcm.c b/net/can/bcm.c index d14ea12affb1..4253915800e6 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * diff --git a/net/can/gw.c b/net/can/gw.c index 65d60c93af29..49b4e3d91ad6 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* gw.c - CAN frame Gateway/Router/Bridge with netlink interface * * Copyright (c) 2019 Volkswagen Group Electronic Research diff --git a/net/can/proc.c b/net/can/proc.c index e6881bfc3ed1..a4eb06c9eb70 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* * proc.c - procfs support for Protocol family CAN core module * diff --git a/net/can/raw.c b/net/can/raw.c index 94a9405658dc..24db4b4afdc7 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* raw.c - Raw sockets for protocol family CAN * * Copyright (c) 2002-2007 Volkswagen Group Electronic Research @@ -154,16 +154,16 @@ static void raw_rcv(struct sk_buff *oskb, void *data) if (!skb) return; - /* Put the datagram to the queue so that raw_recvmsg() can - * get it from there. We need to pass the interface index to - * raw_recvmsg(). We pass a whole struct sockaddr_can in skb->cb - * containing the interface index. + /* Put the datagram to the queue so that raw_recvmsg() can get + * it from there. We need to pass the interface index to + * raw_recvmsg(). We pass a whole struct sockaddr_can in + * skb->cb containing the interface index. */ sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); - addr->can_family = AF_CAN; + addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; /* add CAN specific message flags for raw_recvmsg() */ @@ -290,8 +290,8 @@ static int raw_notifier(struct notifier_block *nb, kfree(ro->filter); ro->ifindex = 0; - ro->bound = 0; - ro->count = 0; + ro->bound = 0; + ro->count = 0; release_sock(sk); sk->sk_err = ENODEV; @@ -374,8 +374,8 @@ static int raw_release(struct socket *sock) kfree(ro->filter); ro->ifindex = 0; - ro->bound = 0; - ro->count = 0; + ro->bound = 0; + ro->count = 0; free_percpu(ro->uniq); sock_orphan(sk); @@ -773,7 +773,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb_setup_tx_timestamp(skb, sk->sk_tsflags); skb->dev = dev; - skb->sk = sk; + skb->sk = sk; skb->priority = sk->sk_priority; err = can_send(skb, ro->loopback); @@ -801,8 +801,8 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int err = 0; int noblock; - noblock = flags & MSG_DONTWAIT; - flags &= ~MSG_DONTWAIT; + noblock = flags & MSG_DONTWAIT; + flags &= ~MSG_DONTWAIT; skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index b988f48153a4..a0d1a3265b71 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -7,97 +7,14 @@ #include <linux/spinlock.h> #include <linux/bpf.h> #include <linux/btf_ids.h> +#include <linux/bpf_local_storage.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> -#define SK_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NO_PREALLOC | BPF_F_CLONE) - -struct bucket { - struct hlist_head list; - raw_spinlock_t lock; -}; - -/* Thp map is not the primary owner of a bpf_sk_storage_elem. - * Instead, the sk->sk_bpf_storage is. - * - * The map (bpf_sk_storage_map) is for two purposes - * 1. Define the size of the "sk local storage". It is - * the map's value_size. - * - * 2. Maintain a list to keep track of all elems such - * that they can be cleaned up during the map destruction. - * - * When a bpf local storage is being looked up for a - * particular sk, the "bpf_map" pointer is actually used - * as the "key" to search in the list of elem in - * sk->sk_bpf_storage. - * - * Hence, consider sk->sk_bpf_storage is the mini-map - * with the "bpf_map" pointer as the searching key. - */ -struct bpf_sk_storage_map { - struct bpf_map map; - /* Lookup elem does not require accessing the map. - * - * Updating/Deleting requires a bucket lock to - * link/unlink the elem from the map. Having - * multiple buckets to improve contention. - */ - struct bucket *buckets; - u32 bucket_log; - u16 elem_size; - u16 cache_idx; -}; - -struct bpf_sk_storage_data { - /* smap is used as the searching key when looking up - * from sk->sk_bpf_storage. - * - * Put it in the same cacheline as the data to minimize - * the number of cachelines access during the cache hit case. - */ - struct bpf_sk_storage_map __rcu *smap; - u8 data[] __aligned(8); -}; - -/* Linked to bpf_sk_storage and bpf_sk_storage_map */ -struct bpf_sk_storage_elem { - struct hlist_node map_node; /* Linked to bpf_sk_storage_map */ - struct hlist_node snode; /* Linked to bpf_sk_storage */ - struct bpf_sk_storage __rcu *sk_storage; - struct rcu_head rcu; - /* 8 bytes hole */ - /* The data is stored in aother cacheline to minimize - * the number of cachelines access during a cache hit. - */ - struct bpf_sk_storage_data sdata ____cacheline_aligned; -}; - -#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata) -#define SDATA(_SELEM) (&(_SELEM)->sdata) -#define BPF_SK_STORAGE_CACHE_SIZE 16 - -static DEFINE_SPINLOCK(cache_idx_lock); -static u64 cache_idx_usage_counts[BPF_SK_STORAGE_CACHE_SIZE]; - -struct bpf_sk_storage { - struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE]; - struct hlist_head list; /* List of bpf_sk_storage_elem */ - struct sock *sk; /* The sk that owns the the above "list" of - * bpf_sk_storage_elem. - */ - struct rcu_head rcu; - raw_spinlock_t lock; /* Protect adding/removing from the "list" */ -}; - -static struct bucket *select_bucket(struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *selem) -{ - return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; -} +DEFINE_BPF_STORAGE_CACHE(sk_cache); static int omem_charge(struct sock *sk, unsigned int size) { @@ -111,445 +28,38 @@ static int omem_charge(struct sock *sk, unsigned int size) return -ENOMEM; } -static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem) -{ - return !hlist_unhashed(&selem->snode); -} - -static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem) -{ - return !hlist_unhashed(&selem->map_node); -} - -static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap, - struct sock *sk, void *value, - bool charge_omem) -{ - struct bpf_sk_storage_elem *selem; - - if (charge_omem && omem_charge(sk, smap->elem_size)) - return NULL; - - selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (selem) { - if (value) - memcpy(SDATA(selem)->data, value, smap->map.value_size); - return selem; - } - - if (charge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); - - return NULL; -} - -/* sk_storage->lock must be held and selem->sk_storage == sk_storage. - * The caller must ensure selem->smap is still valid to be - * dereferenced for its smap->elem_size and smap->cache_idx. - */ -static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage, - struct bpf_sk_storage_elem *selem, - bool uncharge_omem) -{ - struct bpf_sk_storage_map *smap; - bool free_sk_storage; - struct sock *sk; - - smap = rcu_dereference(SDATA(selem)->smap); - sk = sk_storage->sk; - - /* All uncharging on sk->sk_omem_alloc must be done first. - * sk may be freed once the last selem is unlinked from sk_storage. - */ - if (uncharge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); - - free_sk_storage = hlist_is_singular_node(&selem->snode, - &sk_storage->list); - if (free_sk_storage) { - atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc); - sk_storage->sk = NULL; - /* After this RCU_INIT, sk may be freed and cannot be used */ - RCU_INIT_POINTER(sk->sk_bpf_storage, NULL); - - /* sk_storage is not freed now. sk_storage->lock is - * still held and raw_spin_unlock_bh(&sk_storage->lock) - * will be done by the caller. - * - * Although the unlock will be done under - * rcu_read_lock(), it is more intutivie to - * read if kfree_rcu(sk_storage, rcu) is done - * after the raw_spin_unlock_bh(&sk_storage->lock). - * - * Hence, a "bool free_sk_storage" is returned - * to the caller which then calls the kfree_rcu() - * after unlock. - */ - } - hlist_del_init_rcu(&selem->snode); - if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) == - SDATA(selem)) - RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL); - - kfree_rcu(selem, rcu); - - return free_sk_storage; -} - -static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) -{ - struct bpf_sk_storage *sk_storage; - bool free_sk_storage = false; - - if (unlikely(!selem_linked_to_sk(selem))) - /* selem has already been unlinked from sk */ - return; - - sk_storage = rcu_dereference(selem->sk_storage); - raw_spin_lock_bh(&sk_storage->lock); - if (likely(selem_linked_to_sk(selem))) - free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); - raw_spin_unlock_bh(&sk_storage->lock); - - if (free_sk_storage) - kfree_rcu(sk_storage, rcu); -} - -static void __selem_link_sk(struct bpf_sk_storage *sk_storage, - struct bpf_sk_storage_elem *selem) -{ - RCU_INIT_POINTER(selem->sk_storage, sk_storage); - hlist_add_head(&selem->snode, &sk_storage->list); -} - -static void selem_unlink_map(struct bpf_sk_storage_elem *selem) -{ - struct bpf_sk_storage_map *smap; - struct bucket *b; - - if (unlikely(!selem_linked_to_map(selem))) - /* selem has already be unlinked from smap */ - return; - - smap = rcu_dereference(SDATA(selem)->smap); - b = select_bucket(smap, selem); - raw_spin_lock_bh(&b->lock); - if (likely(selem_linked_to_map(selem))) - hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_bh(&b->lock); -} - -static void selem_link_map(struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *selem) -{ - struct bucket *b = select_bucket(smap, selem); - - raw_spin_lock_bh(&b->lock); - RCU_INIT_POINTER(SDATA(selem)->smap, smap); - hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_bh(&b->lock); -} - -static void selem_unlink(struct bpf_sk_storage_elem *selem) -{ - /* Always unlink from map before unlinking from sk_storage - * because selem will be freed after successfully unlinked from - * the sk_storage. - */ - selem_unlink_map(selem); - selem_unlink_sk(selem); -} - -static struct bpf_sk_storage_data * -__sk_storage_lookup(struct bpf_sk_storage *sk_storage, - struct bpf_sk_storage_map *smap, - bool cacheit_lockit) -{ - struct bpf_sk_storage_data *sdata; - struct bpf_sk_storage_elem *selem; - - /* Fast path (cache hit) */ - sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]); - if (sdata && rcu_access_pointer(sdata->smap) == smap) - return sdata; - - /* Slow path (cache miss) */ - hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) - if (rcu_access_pointer(SDATA(selem)->smap) == smap) - break; - - if (!selem) - return NULL; - - sdata = SDATA(selem); - if (cacheit_lockit) { - /* spinlock is needed to avoid racing with the - * parallel delete. Otherwise, publishing an already - * deleted sdata to the cache will become a use-after-free - * problem in the next __sk_storage_lookup(). - */ - raw_spin_lock_bh(&sk_storage->lock); - if (selem_linked_to_sk(selem)) - rcu_assign_pointer(sk_storage->cache[smap->cache_idx], - sdata); - raw_spin_unlock_bh(&sk_storage->lock); - } - - return sdata; -} - -static struct bpf_sk_storage_data * +static struct bpf_local_storage_data * sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_map *smap; sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) return NULL; - smap = (struct bpf_sk_storage_map *)map; - return __sk_storage_lookup(sk_storage, smap, cacheit_lockit); -} - -static int check_flags(const struct bpf_sk_storage_data *old_sdata, - u64 map_flags) -{ - if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) - /* elem already exists */ - return -EEXIST; - - if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) - /* elem doesn't exist, cannot update it */ - return -ENOENT; - - return 0; -} - -static int sk_storage_alloc(struct sock *sk, - struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *first_selem) -{ - struct bpf_sk_storage *prev_sk_storage, *sk_storage; - int err; - - err = omem_charge(sk, sizeof(*sk_storage)); - if (err) - return err; - - sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN); - if (!sk_storage) { - err = -ENOMEM; - goto uncharge; - } - INIT_HLIST_HEAD(&sk_storage->list); - raw_spin_lock_init(&sk_storage->lock); - sk_storage->sk = sk; - - __selem_link_sk(sk_storage, first_selem); - selem_link_map(smap, first_selem); - /* Publish sk_storage to sk. sk->sk_lock cannot be acquired. - * Hence, atomic ops is used to set sk->sk_bpf_storage - * from NULL to the newly allocated sk_storage ptr. - * - * From now on, the sk->sk_bpf_storage pointer is protected - * by the sk_storage->lock. Hence, when freeing - * the sk->sk_bpf_storage, the sk_storage->lock must - * be held before setting sk->sk_bpf_storage to NULL. - */ - prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage, - NULL, sk_storage); - if (unlikely(prev_sk_storage)) { - selem_unlink_map(first_selem); - err = -EAGAIN; - goto uncharge; - - /* Note that even first_selem was linked to smap's - * bucket->list, first_selem can be freed immediately - * (instead of kfree_rcu) because - * bpf_sk_storage_map_free() does a - * synchronize_rcu() before walking the bucket->list. - * Hence, no one is accessing selem from the - * bucket->list under rcu_read_lock(). - */ - } - - return 0; - -uncharge: - kfree(sk_storage); - atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc); - return err; -} - -/* sk cannot be going away because it is linking new elem - * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). - * Otherwise, it will become a leak (and other memory issues - * during map destruction). - */ -static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk, - struct bpf_map *map, - void *value, - u64 map_flags) -{ - struct bpf_sk_storage_data *old_sdata = NULL; - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_map *smap; - int err; - - /* BPF_EXIST and BPF_NOEXIST cannot be both set */ - if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || - /* BPF_F_LOCK can only be used in a value with spin_lock */ - unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) - return ERR_PTR(-EINVAL); - - smap = (struct bpf_sk_storage_map *)map; - sk_storage = rcu_dereference(sk->sk_bpf_storage); - if (!sk_storage || hlist_empty(&sk_storage->list)) { - /* Very first elem for this sk */ - err = check_flags(NULL, map_flags); - if (err) - return ERR_PTR(err); - - selem = selem_alloc(smap, sk, value, true); - if (!selem) - return ERR_PTR(-ENOMEM); - - err = sk_storage_alloc(sk, smap, selem); - if (err) { - kfree(selem); - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); - return ERR_PTR(err); - } - - return SDATA(selem); - } - - if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { - /* Hoping to find an old_sdata to do inline update - * such that it can avoid taking the sk_storage->lock - * and changing the lists. - */ - old_sdata = __sk_storage_lookup(sk_storage, smap, false); - err = check_flags(old_sdata, map_flags); - if (err) - return ERR_PTR(err); - if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) { - copy_map_value_locked(map, old_sdata->data, - value, false); - return old_sdata; - } - } - - raw_spin_lock_bh(&sk_storage->lock); - - /* Recheck sk_storage->list under sk_storage->lock */ - if (unlikely(hlist_empty(&sk_storage->list))) { - /* A parallel del is happening and sk_storage is going - * away. It has just been checked before, so very - * unlikely. Return instead of retry to keep things - * simple. - */ - err = -EAGAIN; - goto unlock_err; - } - - old_sdata = __sk_storage_lookup(sk_storage, smap, false); - err = check_flags(old_sdata, map_flags); - if (err) - goto unlock_err; - - if (old_sdata && (map_flags & BPF_F_LOCK)) { - copy_map_value_locked(map, old_sdata->data, value, false); - selem = SELEM(old_sdata); - goto unlock; - } - - /* sk_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during __selem_unlink_sk(). - */ - selem = selem_alloc(smap, sk, value, !old_sdata); - if (!selem) { - err = -ENOMEM; - goto unlock_err; - } - - /* First, link the new selem to the map */ - selem_link_map(smap, selem); - - /* Second, link (and publish) the new selem to sk_storage */ - __selem_link_sk(sk_storage, selem); - - /* Third, remove old selem, SELEM(old_sdata) */ - if (old_sdata) { - selem_unlink_map(SELEM(old_sdata)); - __selem_unlink_sk(sk_storage, SELEM(old_sdata), false); - } - -unlock: - raw_spin_unlock_bh(&sk_storage->lock); - return SDATA(selem); - -unlock_err: - raw_spin_unlock_bh(&sk_storage->lock); - return ERR_PTR(err); + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); } static int sk_storage_delete(struct sock *sk, struct bpf_map *map) { - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage_data *sdata; sdata = sk_storage_lookup(sk, map, false); if (!sdata) return -ENOENT; - selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata)); return 0; } -static u16 cache_idx_get(void) -{ - u64 min_usage = U64_MAX; - u16 i, res = 0; - - spin_lock(&cache_idx_lock); - - for (i = 0; i < BPF_SK_STORAGE_CACHE_SIZE; i++) { - if (cache_idx_usage_counts[i] < min_usage) { - min_usage = cache_idx_usage_counts[i]; - res = i; - - /* Found a free cache_idx */ - if (!min_usage) - break; - } - } - cache_idx_usage_counts[res]++; - - spin_unlock(&cache_idx_lock); - - return res; -} - -static void cache_idx_free(u16 idx) -{ - spin_lock(&cache_idx_lock); - cache_idx_usage_counts[idx]--; - spin_unlock(&cache_idx_lock); -} - /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage *sk_storage; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *sk_storage; bool free_sk_storage = false; struct hlist_node *n; @@ -565,7 +75,7 @@ void bpf_sk_storage_free(struct sock *sk) * Thus, no elem can be added-to or deleted-from the * sk_storage->list by the bpf_prog or by the bpf-map's syscall. * - * It is racing with bpf_sk_storage_map_free() alone + * It is racing with bpf_local_storage_map_free() alone * when unlinking elem from the sk_storage->list and * the map's bucket->list. */ @@ -574,8 +84,9 @@ void bpf_sk_storage_free(struct sock *sk) /* Always unlink from map before unlinking from * sk_storage. */ - selem_unlink_map(selem); - free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); + bpf_selem_unlink_map(selem); + free_sk_storage = bpf_selem_unlink_storage_nolock(sk_storage, + selem, true); } raw_spin_unlock_bh(&sk_storage->lock); rcu_read_unlock(); @@ -584,132 +95,24 @@ void bpf_sk_storage_free(struct sock *sk) kfree_rcu(sk_storage, rcu); } -static void bpf_sk_storage_map_free(struct bpf_map *map) -{ - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage_map *smap; - struct bucket *b; - unsigned int i; - - smap = (struct bpf_sk_storage_map *)map; - - cache_idx_free(smap->cache_idx); - - /* Note that this map might be concurrently cloned from - * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone - * RCU read section to finish before proceeding. New RCU - * read sections should be prevented via bpf_map_inc_not_zero. - */ - synchronize_rcu(); - - /* bpf prog and the userspace can no longer access this map - * now. No new selem (of this map) can be added - * to the sk->sk_bpf_storage or to the map bucket's list. - * - * The elem of this map can be cleaned up here - * or - * by bpf_sk_storage_free() during __sk_destruct(). - */ - for (i = 0; i < (1U << smap->bucket_log); i++) { - b = &smap->buckets[i]; - - rcu_read_lock(); - /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_sk_storage_elem, - map_node))) { - selem_unlink(selem); - cond_resched_rcu(); - } - rcu_read_unlock(); - } - - /* bpf_sk_storage_free() may still need to access the map. - * e.g. bpf_sk_storage_free() has unlinked selem from the map - * which then made the above while((selem = ...)) loop - * exited immediately. - * - * However, the bpf_sk_storage_free() still needs to access - * the smap->elem_size to do the uncharging in - * __selem_unlink_sk(). - * - * Hence, wait another rcu grace period for the - * bpf_sk_storage_free() to finish. - */ - synchronize_rcu(); - - kvfree(smap->buckets); - kfree(map); -} - -/* U16_MAX is much more than enough for sk local storage - * considering a tcp_sock is ~2k. - */ -#define MAX_VALUE_SIZE \ - min_t(u32, \ - (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \ - (U16_MAX - sizeof(struct bpf_sk_storage_elem))) - -static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) +static void sk_storage_map_free(struct bpf_map *map) { - if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || - !(attr->map_flags & BPF_F_NO_PREALLOC) || - attr->max_entries || - attr->key_size != sizeof(int) || !attr->value_size || - /* Enforce BTF for userspace sk dumping */ - !attr->btf_key_type_id || !attr->btf_value_type_id) - return -EINVAL; - - if (!bpf_capable()) - return -EPERM; + struct bpf_local_storage_map *smap; - if (attr->value_size > MAX_VALUE_SIZE) - return -E2BIG; - - return 0; + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); } -static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) +static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr) { - struct bpf_sk_storage_map *smap; - unsigned int i; - u32 nbuckets; - u64 cost; - int ret; - - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); - if (!smap) - return ERR_PTR(-ENOMEM); - bpf_map_init_from_attr(&smap->map, attr); - - nbuckets = roundup_pow_of_two(num_possible_cpus()); - /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ - nbuckets = max_t(u32, 2, nbuckets); - smap->bucket_log = ilog2(nbuckets); - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); - - ret = bpf_map_charge_init(&smap->map.memory, cost); - if (ret < 0) { - kfree(smap); - return ERR_PTR(ret); - } - - smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, - GFP_USER | __GFP_NOWARN); - if (!smap->buckets) { - bpf_map_charge_finish(&smap->map.memory); - kfree(smap); - return ERR_PTR(-ENOMEM); - } + struct bpf_local_storage_map *smap; - for (i = 0; i < nbuckets; i++) { - INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); - } - - smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; - smap->cache_idx = cache_idx_get(); + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); return &smap->map; } @@ -719,26 +122,9 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int bpf_sk_storage_map_check_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type) -{ - u32 int_data; - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) - return -EINVAL; - - return 0; -} - static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) { - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage_data *sdata; struct socket *sock; int fd, err; @@ -756,14 +142,16 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage_data *sdata; struct socket *sock; int fd, err; fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - sdata = sk_storage_update(sock->sk, map, value, map_flags); + sdata = bpf_local_storage_update( + sock->sk, (struct bpf_local_storage_map *)map, value, + map_flags); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -787,14 +175,14 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) return err; } -static struct bpf_sk_storage_elem * +static struct bpf_local_storage_elem * bpf_sk_storage_clone_elem(struct sock *newsk, - struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *selem) + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) { - struct bpf_sk_storage_elem *copy_selem; + struct bpf_local_storage_elem *copy_selem; - copy_selem = selem_alloc(smap, newsk, NULL, true); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true); if (!copy_selem) return NULL; @@ -810,9 +198,9 @@ bpf_sk_storage_clone_elem(struct sock *newsk, int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) { - struct bpf_sk_storage *new_sk_storage = NULL; - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_elem *selem; + struct bpf_local_storage *new_sk_storage = NULL; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; int ret = 0; RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); @@ -824,8 +212,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) goto out; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { - struct bpf_sk_storage_elem *copy_selem; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage_elem *copy_selem; + struct bpf_local_storage_map *smap; struct bpf_map *map; smap = rcu_dereference(SDATA(selem)->smap); @@ -833,7 +221,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) continue; /* Note that for lockless listeners adding new element - * here can race with cleanup in bpf_sk_storage_map_free. + * here can race with cleanup in bpf_local_storage_map_free. * Try to grab map refcnt to make sure that it's still * alive and prevent concurrent removal. */ @@ -849,10 +237,10 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - selem_link_map(smap, copy_selem); - __selem_link_sk(new_sk_storage, copy_selem); + bpf_selem_link_map(smap, copy_selem); + bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { - ret = sk_storage_alloc(newsk, smap, copy_selem); + ret = bpf_local_storage_alloc(newsk, smap, copy_selem); if (ret) { kfree(copy_selem); atomic_sub(smap->elem_size, @@ -861,7 +249,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) goto out; } - new_sk_storage = rcu_dereference(copy_selem->sk_storage); + new_sk_storage = + rcu_dereference(copy_selem->local_storage); } bpf_map_put(map); } @@ -879,7 +268,7 @@ out: BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage_data *sdata; if (flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; @@ -895,7 +284,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, * destruction). */ refcount_inc_not_zero(&sk->sk_refcnt)) { - sdata = sk_storage_update(sk, map, value, BPF_NOEXIST); + sdata = bpf_local_storage_update( + sk, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ @@ -920,18 +311,44 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } +static int sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + return omem_charge(owner, size); +} + +static void sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + struct sock *sk = owner; + + atomic_sub(size, &sk->sk_omem_alloc); +} + +static struct bpf_local_storage __rcu ** +sk_storage_ptr(void *owner) +{ + struct sock *sk = owner; + + return &sk->sk_bpf_storage; +} + static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { - .map_alloc_check = bpf_sk_storage_map_alloc_check, - .map_alloc = bpf_sk_storage_map_alloc, - .map_free = bpf_sk_storage_map_free, + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = sk_storage_map_alloc, + .map_free = sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, - .map_check_btf = bpf_sk_storage_map_check_btf, - .map_btf_name = "bpf_sk_storage_map", + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", .map_btf_id = &sk_storage_map_btf_id, + .map_local_storage_charge = sk_storage_charge, + .map_local_storage_uncharge = sk_storage_uncharge, + .map_owner_storage_ptr = sk_storage_ptr, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { @@ -962,6 +379,30 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_SOCKET, }; +BTF_ID_LIST(sk_storage_btf_ids) +BTF_ID_UNUSED +BTF_ID(struct, sock) + +const struct bpf_func_proto sk_storage_get_btf_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .btf_id = sk_storage_btf_ids, +}; + +const struct bpf_func_proto sk_storage_delete_btf_proto = { + .func = bpf_sk_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .btf_id = sk_storage_btf_ids, +}; + struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; @@ -1022,7 +463,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) u32 nr_maps = 0; int rem, err; - /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as + /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. */ if (!bpf_capable()) @@ -1072,13 +513,13 @@ err_free: } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); -static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb) +static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) { struct nlattr *nla_stg, *nla_value; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage_map *smap; /* It cannot exceed max nlattr's payload */ - BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE); + BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); if (!nla_stg) @@ -1114,9 +555,9 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, { /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ unsigned int diag_size = nla_total_size(0); - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; struct nlattr *nla_stgs; unsigned int saved_len; int err = 0; @@ -1169,8 +610,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, { /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ unsigned int diag_size = nla_total_size(0); - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_data *sdata; struct nlattr *nla_stgs; unsigned int saved_len; int err = 0; @@ -1197,8 +638,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, saved_len = skb->len; for (i = 0; i < diag->nr_maps; i++) { - sdata = __sk_storage_lookup(sk_storage, - (struct bpf_sk_storage_map *)diag->maps[i], + sdata = bpf_local_storage_lookup(sk_storage, + (struct bpf_local_storage_map *)diag->maps[i], false); if (!sdata) @@ -1235,19 +676,19 @@ struct bpf_iter_seq_sk_storage_map_info { unsigned skip_elems; }; -static struct bpf_sk_storage_elem * +static struct bpf_local_storage_elem * bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, - struct bpf_sk_storage_elem *prev_selem) + struct bpf_local_storage_elem *prev_selem) { - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_elem *selem; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; u32 skip_elems = info->skip_elems; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage_map *smap; u32 bucket_id = info->bucket_id; u32 i, count, n_buckets; - struct bucket *b; + struct bpf_local_storage_map_bucket *b; - smap = (struct bpf_sk_storage_map *)info->map; + smap = (struct bpf_local_storage_map *)info->map; n_buckets = 1U << smap->bucket_log; if (bucket_id >= n_buckets) return NULL; @@ -1257,7 +698,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, count = 0; while (selem) { selem = hlist_entry_safe(selem->map_node.next, - struct bpf_sk_storage_elem, map_node); + struct bpf_local_storage_elem, map_node); if (!selem) { /* not found, unlock and go to the next bucket */ b = &smap->buckets[bucket_id++]; @@ -1265,7 +706,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, skip_elems = 0; break; } - sk_storage = rcu_dereference_raw(selem->sk_storage); + sk_storage = rcu_dereference_raw(selem->local_storage); if (sk_storage) { info->skip_elems = skip_elems + count; return selem; @@ -1278,7 +719,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, raw_spin_lock_bh(&b->lock); count = 0; hlist_for_each_entry(selem, &b->list, map_node) { - sk_storage = rcu_dereference_raw(selem->sk_storage); + sk_storage = rcu_dereference_raw(selem->local_storage); if (sk_storage && count >= skip_elems) { info->bucket_id = i; info->skip_elems = count; @@ -1297,7 +738,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos) { - struct bpf_sk_storage_elem *selem; + struct bpf_local_storage_elem *selem; selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL); if (!selem) @@ -1330,11 +771,11 @@ DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta, void *value) static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, - struct bpf_sk_storage_elem *selem) + struct bpf_local_storage_elem *selem) { struct bpf_iter_seq_sk_storage_map_info *info = seq->private; struct bpf_iter__bpf_sk_storage_map ctx = {}; - struct bpf_sk_storage *sk_storage; + struct bpf_local_storage *sk_storage; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret = 0; @@ -1345,8 +786,8 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, ctx.meta = &meta; ctx.map = info->map; if (selem) { - sk_storage = rcu_dereference_raw(selem->sk_storage); - ctx.sk = sk_storage->sk; + sk_storage = rcu_dereference_raw(selem->local_storage); + ctx.sk = sk_storage->owner; ctx.value = SDATA(selem)->data; } ret = bpf_iter_run_prog(prog, &ctx); @@ -1363,13 +804,13 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_seq_sk_storage_map_info *info = seq->private; - struct bpf_sk_storage_map *smap; - struct bucket *b; + struct bpf_local_storage_map *smap; + struct bpf_local_storage_map_bucket *b; if (!v) { (void)__bpf_sk_storage_map_seq_show(seq, v); } else { - smap = (struct bpf_sk_storage_map *)info->map; + smap = (struct bpf_local_storage_map *)info->map; b = &smap->buckets[info->bucket_id]; raw_spin_unlock_bh(&b->lock); } @@ -1437,6 +878,8 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { .target = "bpf_sk_storage_map", .attach_target = bpf_iter_attach_map, .detach_target = bpf_iter_detach_map, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), diff --git a/net/core/datagram.c b/net/core/datagram.c index 639745d4f3b9..9fcaa544f11a 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -623,10 +623,11 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, while (length && iov_iter_count(from)) { struct page *pages[MAX_SKB_FRAGS]; + struct page *last_head = NULL; size_t start; ssize_t copied; unsigned long truesize; - int n = 0; + int refs, n = 0; if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; @@ -649,13 +650,37 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } - while (copied) { + for (refs = 0; copied != 0; start = 0) { int size = min_t(int, copied, PAGE_SIZE - start); - skb_fill_page_desc(skb, frag++, pages[n], start, size); - start = 0; + struct page *head = compound_head(pages[n]); + + start += (pages[n] - head) << PAGE_SHIFT; copied -= size; n++; + if (frag) { + skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1]; + + if (head == skb_frag_page(last) && + start == skb_frag_off(last) + skb_frag_size(last)) { + skb_frag_size_add(last, size); + /* We combined this page, we need to release + * a reference. Since compound pages refcount + * is shared among many pages, batch the refcount + * adjustments to limit false sharing. + */ + last_head = head; + refs++; + continue; + } + } + if (refs) { + page_ref_sub(last_head, refs); + refs = 0; + } + skb_fill_page_desc(skb, frag++, head, start, size); } + if (refs) + page_ref_sub(last_head, refs); } return 0; } diff --git a/net/core/dev.c b/net/core/dev.c index 266073e300b5..38a172a63318 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -98,6 +98,7 @@ #include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> +#include <net/dsa.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/pkt_sched.h> @@ -1130,7 +1131,7 @@ EXPORT_SYMBOL(__dev_get_by_flags); * @name: name string * * Network device names need to be valid file names to - * to allow sysfs to work. We also disallow any kind of + * allow sysfs to work. We also disallow any kind of * whitespace. */ bool dev_valid_name(const char *name) @@ -5192,7 +5193,7 @@ skip_classify: } } - if (unlikely(skb_vlan_tag_present(skb))) { + if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't @@ -5621,17 +5622,60 @@ static void flush_backlog(struct work_struct *work) local_bh_enable(); } +static bool flush_required(int cpu) +{ +#if IS_ENABLED(CONFIG_RPS) + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + bool do_flush; + + local_irq_disable(); + rps_lock(sd); + + /* as insertion into process_queue happens with the rps lock held, + * process_queue access may race only with dequeue + */ + do_flush = !skb_queue_empty(&sd->input_pkt_queue) || + !skb_queue_empty_lockless(&sd->process_queue); + rps_unlock(sd); + local_irq_enable(); + + return do_flush; +#endif + /* without RPS we can't safely check input_pkt_queue: during a + * concurrent remote skb_queue_splice() we can detect as empty both + * input_pkt_queue and process_queue even if the latter could end-up + * containing a lot of packets. + */ + return true; +} + static void flush_all_backlogs(void) { + static cpumask_t flush_cpus; unsigned int cpu; + /* since we are under rtnl lock protection we can use static data + * for the cpumask and avoid allocating on stack the possibly + * large mask + */ + ASSERT_RTNL(); + get_online_cpus(); - for_each_online_cpu(cpu) - queue_work_on(cpu, system_highpri_wq, - per_cpu_ptr(&flush_works, cpu)); + cpumask_clear(&flush_cpus); + for_each_online_cpu(cpu) { + if (flush_required(cpu)) { + queue_work_on(cpu, system_highpri_wq, + per_cpu_ptr(&flush_works, cpu)); + cpumask_set_cpu(cpu, &flush_cpus); + } + } - for_each_online_cpu(cpu) + /* we can have in flight packet[s] on the cpus we are not flushing, + * synchronize_net() in rollback_registered_many() will take care of + * them + */ + for_each_cpu(cpu, &flush_cpus) flush_work(per_cpu_ptr(&flush_works, cpu)); put_online_cpus(); @@ -6293,7 +6337,7 @@ EXPORT_SYMBOL(__napi_schedule); * @n: napi context * * Test if NAPI routine is already running, and if not mark - * it as running. This is used as a condition variable + * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ @@ -6533,8 +6577,7 @@ EXPORT_SYMBOL(napi_busy_loop); static void napi_hash_add(struct napi_struct *napi) { - if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || - test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; spin_lock(&napi_hash_lock); @@ -6555,20 +6598,14 @@ static void napi_hash_add(struct napi_struct *napi) /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ -bool napi_hash_del(struct napi_struct *napi) +static void napi_hash_del(struct napi_struct *napi) { - bool rcu_sync_needed = false; - spin_lock(&napi_hash_lock); - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { - rcu_sync_needed = true; - hlist_del_rcu(&napi->napi_hash_node); - } + hlist_del_init_rcu(&napi->napi_hash_node); + spin_unlock(&napi_hash_lock); - return rcu_sync_needed; } -EXPORT_SYMBOL_GPL(napi_hash_del); static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { @@ -6600,7 +6637,11 @@ static void init_gro_hash(struct napi_struct *napi) void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { + if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) + return; + INIT_LIST_HEAD(&napi->poll_list); + INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; init_gro_hash(napi); @@ -6653,18 +6694,19 @@ static void flush_gro_hash(struct napi_struct *napi) } /* Must be called in process context */ -void netif_napi_del(struct napi_struct *napi) +void __netif_napi_del(struct napi_struct *napi) { - might_sleep(); - if (napi_hash_del(napi)) - synchronize_net(); - list_del_init(&napi->dev_list); + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) + return; + + napi_hash_del(napi); + list_del_rcu(&napi->dev_list); napi_free_frags(napi); flush_gro_hash(napi); napi->gro_bitmask = 0; } -EXPORT_SYMBOL(netif_napi_del); +EXPORT_SYMBOL(__netif_napi_del); static int napi_poll(struct napi_struct *n, struct list_head *repoll) { @@ -9470,7 +9512,7 @@ int __netdev_update_features(struct net_device *dev) /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); - /* some features can't be enabled if they're off an an upper device */ + /* some features can't be enabled if they're off on an upper device */ netdev_for_each_upper_dev_rcu(dev, upper, iter) features = netdev_sync_upper_features(dev, upper, features); @@ -9986,10 +10028,12 @@ EXPORT_SYMBOL(netdev_refcnt_read); * We can get stuck here if buggy protocols don't correctly * call dev_put. */ +#define WAIT_REFS_MIN_MSECS 1 +#define WAIT_REFS_MAX_MSECS 250 static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; - int refcnt; + int wait = 0, refcnt; linkwatch_forget_dev(dev); @@ -10023,7 +10067,13 @@ static void netdev_wait_allrefs(struct net_device *dev) rebroadcast_time = jiffies; } - msleep(250); + if (!wait) { + rcu_barrier(); + wait = WAIT_REFS_MIN_MSECS; + } else { + msleep(wait); + wait = min(wait << 1, WAIT_REFS_MAX_MSECS); + } refcnt = netdev_refcnt_read(dev); diff --git a/net/core/devlink.c b/net/core/devlink.c index 80ec1cd81c64..045468390480 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -523,15 +523,20 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_pf.pf)) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_pf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) + return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_vf.pf) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, - attrs->pci_vf.vf)) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_vf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) + return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: @@ -3017,9 +3022,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) static int devlink_nl_flash_update_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, - const char *status_msg, - const char *component, - unsigned long done, unsigned long total) + struct devlink_flash_notify *params) { void *hdr; @@ -3033,19 +3036,22 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg, if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) goto out; - if (status_msg && + if (params->status_msg && nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, - status_msg)) + params->status_msg)) goto nla_put_failure; - if (component && + if (params->component && nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, - component)) + params->component)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, - done, DEVLINK_ATTR_PAD)) + params->done, DEVLINK_ATTR_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, - total, DEVLINK_ATTR_PAD)) + params->total, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, + params->timeout, DEVLINK_ATTR_PAD)) goto nla_put_failure; out: @@ -3059,10 +3065,7 @@ nla_put_failure: static void __devlink_flash_update_notify(struct devlink *devlink, enum devlink_command cmd, - const char *status_msg, - const char *component, - unsigned long done, - unsigned long total) + struct devlink_flash_notify *params) { struct sk_buff *msg; int err; @@ -3075,8 +3078,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink, if (!msg) return; - err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg, - component, done, total); + err = devlink_nl_flash_update_fill(msg, devlink, cmd, params); if (err) goto out_free_msg; @@ -3090,17 +3092,21 @@ out_free_msg: void devlink_flash_update_begin_notify(struct devlink *devlink) { + struct devlink_flash_notify params = { 0 }; + __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE, - NULL, NULL, 0, 0); + ¶ms); } EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); void devlink_flash_update_end_notify(struct devlink *devlink) { + struct devlink_flash_notify params = { 0 }; + __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_END, - NULL, NULL, 0, 0); + ¶ms); } EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); @@ -3110,12 +3116,36 @@ void devlink_flash_update_status_notify(struct devlink *devlink, unsigned long done, unsigned long total) { + struct devlink_flash_notify params = { + .status_msg = status_msg, + .component = component, + .done = done, + .total = total, + }; + __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_STATUS, - status_msg, component, done, total); + ¶ms); } EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); +void devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long timeout) +{ + struct devlink_flash_notify params = { + .status_msg = status_msg, + .component = component, + .timeout = timeout, + }; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + ¶ms); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); + static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { @@ -4317,7 +4347,7 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) } } - err = region->ops->snapshot(devlink, info->extack, &data); + err = region->ops->snapshot(devlink, region->ops, info->extack, &data); if (err) goto err_snapshot_capture; @@ -5895,6 +5925,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(port, &devlink->port_list, list) { mutex_lock(&port->reporters_lock); list_for_each_entry(reporter, &port->reporter_list, list) { @@ -5909,12 +5940,14 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&port->reporters_lock); + mutex_unlock(&devlink->lock); goto out; } idx++; } mutex_unlock(&port->reporters_lock); } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -6088,6 +6121,28 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, return 0; } +static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->test) { + devlink_health_reporter_put(reporter); + return -EOPNOTSUPP; + } + + err = reporter->ops->test(reporter, info->extack); + + devlink_health_reporter_put(reporter); + return err; +} + struct devlink_stats { u64 rx_bytes; u64 rx_packets; @@ -7309,6 +7364,14 @@ static const struct genl_ops devlink_nl_ops[] = { DEVLINK_NL_FLAG_NO_LOCK, }, { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_health_reporter_test_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | + DEVLINK_NL_FLAG_NO_LOCK, + }, + { .cmd = DEVLINK_CMD_FLASH_UPDATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_flash_update, @@ -7555,11 +7618,11 @@ int devlink_port_register(struct devlink *devlink, devlink_port->index = port_index; devlink_port->registered = true; spin_lock_init(&devlink_port->type_lock); + INIT_LIST_HEAD(&devlink_port->reporter_list); + mutex_init(&devlink_port->reporters_lock); list_add_tail(&devlink_port->list, &devlink->port_list); INIT_LIST_HEAD(&devlink_port->param_list); mutex_unlock(&devlink->lock); - INIT_LIST_HEAD(&devlink_port->reporter_list); - mutex_init(&devlink_port->reporters_lock); INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); @@ -7576,13 +7639,13 @@ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; - WARN_ON(!list_empty(&devlink_port->reporter_list)); - mutex_destroy(&devlink_port->reporters_lock); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); mutex_lock(&devlink->lock); list_del(&devlink_port->list); mutex_unlock(&devlink->lock); + WARN_ON(!list_empty(&devlink_port->reporter_list)); + mutex_destroy(&devlink_port->reporters_lock); } EXPORT_SYMBOL_GPL(devlink_port_unregister); @@ -7600,14 +7663,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } -/** - * devlink_port_type_eth_set - Set port type to Ethernet - * - * @devlink_port: devlink port - * @netdev: related netdevice - */ -void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev) +static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, + struct net_device *netdev) { const struct net_device_ops *ops = netdev->netdev_ops; @@ -7641,6 +7698,24 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port, err = ops->ndo_get_port_parent_id(netdev, &ppid); WARN_ON(err != -EOPNOTSUPP); } +} + +/** + * devlink_port_type_eth_set - Set port type to Ethernet + * + * @devlink_port: devlink port + * @netdev: related netdevice + */ +void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev) +{ + if (netdev) + devlink_port_type_netdev_checks(devlink_port, netdev); + else + dev_warn(devlink_port->devlink->dev, + "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", + devlink_port->index); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); @@ -7712,9 +7787,12 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance + * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf) +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -7723,8 +7801,9 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf) DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; - + attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; + attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); @@ -7732,11 +7811,13 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes * * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, - u16 pf, u16 vf) +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -7745,8 +7826,10 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; + attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; + attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); @@ -7777,9 +7860,23 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, WARN_ON(1); return -EINVAL; case DEVLINK_PORT_FLAVOUR_PCI_PF: + if (attrs->pci_pf.external) { + n = snprintf(name, len, "c%u", attrs->pci_pf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); break; case DEVLINK_PORT_FLAVOUR_PCI_VF: + if (attrs->pci_vf.external) { + n = snprintf(name, len, "c%u", attrs->pci_vf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; diff --git a/net/core/filter.c b/net/core/filter.c index 21eaf3b182f2..08f577114acc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4459,6 +4459,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; if (optlen != sizeof(int)) return -EINVAL; @@ -4480,6 +4481,20 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, tp->snd_ssthresh = val; } break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; case TCP_SAVE_SYN: if (val < 0 || val > 1) ret = -EINVAL; @@ -4550,9 +4565,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, tp = tcp_sk(sk); if (optlen <= 0 || !tp->saved_syn || - optlen > tp->saved_syn[0]) + optlen > tcp_saved_syn_len(tp->saved_syn)) goto err_clear; - memcpy(optval, tp->saved_syn + 1, optlen); + memcpy(optval, tp->saved_syn->data, optlen); break; default: goto err_clear; @@ -4654,9 +4669,99 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, + int optname, const u8 **start) +{ + struct sk_buff *syn_skb = bpf_sock->syn_skb; + const u8 *hdr_start; + int ret; + + if (syn_skb) { + /* sk is a request_sock here */ + + if (optname == TCP_BPF_SYN) { + hdr_start = syn_skb->data; + ret = tcp_hdrlen(syn_skb); + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = skb_network_header(syn_skb); + ret = skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } else { + /* optname == TCP_BPF_SYN_MAC */ + hdr_start = skb_mac_header(syn_skb); + ret = skb_mac_header_len(syn_skb) + + skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } + } else { + struct sock *sk = bpf_sock->sk; + struct saved_syn *saved_syn; + + if (sk->sk_state == TCP_NEW_SYN_RECV) + /* synack retransmit. bpf_sock->syn_skb will + * not be available. It has to resort to + * saved_syn (if it is saved). + */ + saved_syn = inet_reqsk(sk)->saved_syn; + else + saved_syn = tcp_sk(sk)->saved_syn; + + if (!saved_syn) + return -ENOENT; + + if (optname == TCP_BPF_SYN) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen + + saved_syn->network_hdrlen; + ret = saved_syn->tcp_hdrlen; + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen; + ret = saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } else { + /* optname == TCP_BPF_SYN_MAC */ + + /* TCP_SAVE_SYN may not have saved the mac hdr */ + if (!saved_syn->mac_hdrlen) + return -ENOENT; + + hdr_start = saved_syn->data; + ret = saved_syn->mac_hdrlen + + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } + } + + *start = hdr_start; + return ret; +} + BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && + optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { + int ret, copy_len = 0; + const u8 *start; + + ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); + if (ret > 0) { + copy_len = ret; + if (optlen < copy_len) { + copy_len = optlen; + ret = -ENOSPC; + } + + memcpy(optval, start, copy_len); + } + + /* Zero out unused buffer at the end */ + memset(optval + copy_len, 0, optlen - copy_len); + + return ret; + } + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -6151,6 +6256,232 @@ static const struct bpf_func_proto bpf_sk_assign_proto = { .arg3_type = ARG_ANYTHING, }; +static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, + u8 search_kind, const u8 *magic, + u8 magic_len, bool *eol) +{ + u8 kind, kind_len; + + *eol = false; + + while (op < opend) { + kind = op[0]; + + if (kind == TCPOPT_EOL) { + *eol = true; + return ERR_PTR(-ENOMSG); + } else if (kind == TCPOPT_NOP) { + op++; + continue; + } + + if (opend - op < 2 || opend - op < op[1] || op[1] < 2) + /* Something is wrong in the received header. + * Follow the TCP stack's tcp_parse_options() + * and just bail here. + */ + return ERR_PTR(-EFAULT); + + kind_len = op[1]; + if (search_kind == kind) { + if (!magic_len) + return op; + + if (magic_len > kind_len - 2) + return ERR_PTR(-ENOMSG); + + if (!memcmp(&op[2], magic, magic_len)) + return op; + } + + op += kind_len; + } + + return ERR_PTR(-ENOMSG); +} + +BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + void *, search_res, u32, len, u64, flags) +{ + bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; + const u8 *op, *opend, *magic, *search = search_res; + u8 search_kind, search_len, copy_len, magic_len; + int ret; + + /* 2 byte is the minimal option len except TCPOPT_NOP and + * TCPOPT_EOL which are useless for the bpf prog to learn + * and this helper disallow loading them also. + */ + if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) + return -EINVAL; + + search_kind = search[0]; + search_len = search[1]; + + if (search_len > len || search_kind == TCPOPT_NOP || + search_kind == TCPOPT_EOL) + return -EINVAL; + + if (search_kind == TCPOPT_EXP || search_kind == 253) { + /* 16 or 32 bit magic. +2 for kind and kind length */ + if (search_len != 4 && search_len != 6) + return -EINVAL; + magic = &search[2]; + magic_len = search_len - 2; + } else { + if (search_len) + return -EINVAL; + magic = NULL; + magic_len = 0; + } + + if (load_syn) { + ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); + if (ret < 0) + return ret; + + opend = op + ret; + op += sizeof(struct tcphdr); + } else { + if (!bpf_sock->skb || + bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) + /* This bpf_sock->op cannot call this helper */ + return -EPERM; + + opend = bpf_sock->skb_data_end; + op = bpf_sock->skb->data + sizeof(struct tcphdr); + } + + op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, + &eol); + if (IS_ERR(op)) + return PTR_ERR(op); + + copy_len = op[1]; + ret = copy_len; + if (copy_len > len) { + ret = -ENOSPC; + copy_len = len; + } + + memcpy(search_res, op, copy_len); + return ret; +} + +static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { + .func = bpf_sock_ops_load_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + const void *, from, u32, len, u64, flags) +{ + u8 new_kind, new_kind_len, magic_len = 0, *opend; + const u8 *op, *new_op, *magic = NULL; + struct sk_buff *skb; + bool eol; + + if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) + return -EPERM; + + if (len < 2 || flags) + return -EINVAL; + + new_op = from; + new_kind = new_op[0]; + new_kind_len = new_op[1]; + + if (new_kind_len > len || new_kind == TCPOPT_NOP || + new_kind == TCPOPT_EOL) + return -EINVAL; + + if (new_kind_len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + /* 253 is another experimental kind */ + if (new_kind == TCPOPT_EXP || new_kind == 253) { + if (new_kind_len < 4) + return -EINVAL; + /* Match for the 2 byte magic also. + * RFC 6994: the magic could be 2 or 4 bytes. + * Hence, matching by 2 byte only is on the + * conservative side but it is the right + * thing to do for the 'search-for-duplication' + * purpose. + */ + magic = &new_op[2]; + magic_len = 2; + } + + /* Check for duplication */ + skb = bpf_sock->skb; + op = skb->data + sizeof(struct tcphdr); + opend = bpf_sock->skb_data_end; + + op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, + &eol); + if (!IS_ERR(op)) + return -EEXIST; + + if (PTR_ERR(op) != -ENOMSG) + return PTR_ERR(op); + + if (eol) + /* The option has been ended. Treat it as no more + * header option can be written. + */ + return -ENOSPC; + + /* No duplication found. Store the header option. */ + memcpy(opend, from, new_kind_len); + + bpf_sock->remaining_opt_len -= new_kind_len; + bpf_sock->skb_data_end += new_kind_len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { + .func = bpf_sock_ops_store_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + u32, len, u64, flags) +{ + if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) + return -EPERM; + + if (flags || len < 2) + return -EINVAL; + + if (len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + bpf_sock->remaining_opt_len -= len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { + .func = bpf_sock_ops_reserve_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -6180,6 +6511,9 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || #endif +#ifdef CONFIG_INET + func == bpf_sock_ops_store_hdr_opt || +#endif func == bpf_lwt_in_push_encap || func == bpf_lwt_xmit_push_encap) return true; @@ -6551,6 +6885,12 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_INET + case BPF_FUNC_load_hdr_opt: + return &bpf_sock_ops_load_hdr_opt_proto; + case BPF_FUNC_store_hdr_opt: + return &bpf_sock_ops_store_hdr_opt_proto; + case BPF_FUNC_reserve_hdr_opt: + return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ @@ -7350,6 +7690,20 @@ static bool sock_ops_is_valid_access(int off, int size, return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; + case offsetof(struct bpf_sock_ops, skb_data): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sock_ops, skb_data_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, + size_default); default: if (size != size_default) return false; @@ -8451,17 +8805,22 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; switch (si->off) { - case offsetof(struct bpf_sock_ops, op) ... + case offsetof(struct bpf_sock_ops, op): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + op), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, op)); + break; + + case offsetof(struct bpf_sock_ops, replylong[0]) ... offsetof(struct bpf_sock_ops, replylong[3]): - BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, op) != - sizeof_field(struct bpf_sock_ops_kern, op)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != sizeof_field(struct bpf_sock_ops_kern, reply)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != sizeof_field(struct bpf_sock_ops_kern, replylong)); off = si->off; - off -= offsetof(struct bpf_sock_ops, op); - off += offsetof(struct bpf_sock_ops_kern, op); + off -= offsetof(struct bpf_sock_ops, replylong[0]); + off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, off); @@ -8682,6 +9041,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; + case offsetof(struct bpf_sock_ops, skb_data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb_data_end), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb_data_end)); + break; + case offsetof(struct bpf_sock_ops, skb_data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, data)); + break; + case offsetof(struct bpf_sock_ops, skb_len): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, len)); + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + off = offsetof(struct sk_buff, cb); + off += offsetof(struct tcp_skb_cb, tcp_flags); + *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, + tcp_flags), + si->dst_reg, si->dst_reg, off); + break; } return insn - insn_buf; } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 6bbd06f7dc7d..c714e6a9dad4 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -116,6 +116,12 @@ static int dev_seq_show(struct seq_file *seq, void *v) return 0; } +static u32 softnet_backlog_len(struct softnet_data *sd) +{ + return skb_queue_len_lockless(&sd->input_pkt_queue) + + skb_queue_len_lockless(&sd->process_queue); +} + static struct softnet_data *softnet_get_online(loff_t *pos) { struct softnet_data *sd = NULL; @@ -159,12 +165,17 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_unlock(); #endif + /* the index is the CPU id owing this sd. Since offline CPUs are not + * displayed, it would be othrwise not trivial for the user-space + * mapping the data a specific CPU + */ seq_printf(seq, - "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ - sd->received_rps, flow_limit_count); + sd->received_rps, flow_limit_count, + softnet_backlog_len(sd), (int)seq->index); return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2338753e936b..c310c7c1cef7 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -297,7 +297,7 @@ static int netpoll_owner_active(struct net_device *dev) { struct napi_struct *napi; - list_for_each_entry(napi, &dev->napi_list, dev_list) { + list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (napi->poll_owner == smp_processor_id()) return 1; } diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index d964a5147f22..e33fde06d528 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -107,6 +107,36 @@ unsigned int ptp_classify_raw(const struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ptp_classify_raw); +struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type) +{ + u8 *ptr = skb_mac_header(skb); + + if (type & PTP_CLASS_VLAN) + ptr += VLAN_HLEN; + + switch (type & PTP_CLASS_PMASK) { + case PTP_CLASS_IPV4: + ptr += IPV4_HLEN(ptr) + UDP_HLEN; + break; + case PTP_CLASS_IPV6: + ptr += IP6_HLEN + UDP_HLEN; + break; + case PTP_CLASS_L2: + break; + default: + return NULL; + } + + ptr += ETH_HLEN; + + /* Ensure that the entire header is present in this packet. */ + if (ptr + sizeof(struct ptp_header) > skb->data + skb->len) + return NULL; + + return (struct ptp_header *)ptr; +} +EXPORT_SYMBOL_GPL(ptp_parse_header); + void __init ptp_classifier_init(void) { static struct sock_filter ptp_filter[] __initdata = { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6faf73d6a0f7..e0774471f56d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -895,9 +895,6 @@ void __kfree_skb_defer(struct sk_buff *skb) void napi_consume_skb(struct sk_buff *skb, int budget) { - if (unlikely(!skb)) - return; - /* Zero budget indicate non-NAPI context called us, like netpoll */ if (unlikely(!budget)) { dev_consume_skb_any(skb); @@ -5955,8 +5952,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, size = SKB_WITH_OVERHEAD(ksize(data)); memcpy((struct skb_shared_info *)(data + size), - skb_shinfo(skb), offsetof(struct skb_shared_info, - frags[skb_shinfo(skb)->nr_frags])); + skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { kfree(data); return -ENOMEM; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 649583158983..4b5f7c8fecd1 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -494,14 +494,34 @@ end: struct sk_psock *sk_psock_init(struct sock *sk, int node) { - struct sk_psock *psock = kzalloc_node(sizeof(*psock), - GFP_ATOMIC | __GFP_NOWARN, - node); - if (!psock) - return NULL; + struct sk_psock *psock; + struct proto *prot; + + write_lock_bh(&sk->sk_callback_lock); + + if (inet_csk_has_ulp(sk)) { + psock = ERR_PTR(-EINVAL); + goto out; + } + + if (sk->sk_user_data) { + psock = ERR_PTR(-EBUSY); + goto out; + } + psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node); + if (!psock) { + psock = ERR_PTR(-ENOMEM); + goto out; + } + + prot = READ_ONCE(sk->sk_prot); psock->sk = sk; - psock->eval = __SK_NONE; + psock->eval = __SK_NONE; + psock->sk_proto = prot; + psock->saved_unhash = prot->unhash; + psock->saved_close = prot->close; + psock->saved_write_space = sk->sk_write_space; INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); @@ -516,6 +536,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) rcu_assign_sk_user_data_nocopy(sk, psock); sock_hold(sk); +out: + write_unlock_bh(&sk->sk_callback_lock); return psock; } EXPORT_SYMBOL_GPL(sk_psock_init); diff --git a/net/core/sock.c b/net/core/sock.c index 6c5c6b18eff4..ba9e7d91e2ef 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -413,18 +413,6 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, return 0; } -static void sock_warn_obsolete_bsdism(const char *name) -{ - static int warned; - static char warncomm[TASK_COMM_LEN]; - if (strcmp(warncomm, current->comm) && warned < 5) { - strcpy(warncomm, current->comm); - pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", - warncomm, name); - warned++; - } -} - static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { @@ -984,7 +972,6 @@ set_sndbuf: break; case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("setsockopt"); break; case SO_PASSCRED: @@ -1387,7 +1374,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("getsockopt"); break; case SO_TIMESTAMP_OLD: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 119f52a99dc1..078386d7d9a2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -184,8 +184,6 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { struct proto *prot; - sock_owned_by_me(sk); - switch (sk->sk_type) { case SOCK_STREAM: prot = tcp_bpf_get_proto(sk, psock); @@ -272,8 +270,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, } } else { psock = sk_psock_init(sk, map->numa_node); - if (!psock) { - ret = -ENOMEM; + if (IS_ERR(psock)) { + ret = PTR_ERR(psock); goto out_progs; } } @@ -322,8 +320,8 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) if (!psock) { psock = sk_psock_init(sk, map->numa_node); - if (!psock) - return -ENOMEM; + if (IS_ERR(psock)) + return PTR_ERR(psock); } ret = sock_map_init_proto(sk, psock); @@ -478,8 +476,6 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; - if (inet_csk_has_ulp(sk)) - return -EINVAL; link = sk_psock_init_link(); if (!link) @@ -563,10 +559,12 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) return false; } -static int sock_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags); + +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, + u64 flags) { - u32 idx = *(u32 *)key; struct socket *sock; struct sock *sk; int ret; @@ -595,14 +593,38 @@ static int sock_map_update_elem(struct bpf_map *map, void *key, sock_map_sk_acquire(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; + else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) + ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else - ret = sock_map_update_common(map, idx, sk, flags); + ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: fput(sock->file); return ret; } +static int sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + struct sock *sk = (struct sock *)value; + int ret; + + if (!sock_map_sk_is_suitable(sk)) + return -EOPNOTSUPP; + + local_bh_disable(); + bh_lock_sock(sk); + if (!sock_map_sk_state_allowed(sk)) + ret = -EOPNOTSUPP; + else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) + ret = sock_map_update_common(map, *(u32 *)key, sk, flags); + else + ret = sock_hash_update_common(map, key, sk, flags); + bh_unlock_sock(sk); + local_bh_enable(); + return ret; +} + BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { @@ -683,6 +705,7 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = { static int sock_map_btf_id; const struct bpf_map_ops sock_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, @@ -855,8 +878,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (inet_csk_has_ulp(sk)) - return -EINVAL; link = sk_psock_init_link(); if (!link) @@ -915,45 +936,6 @@ out_free: return ret; } -static int sock_hash_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) -{ - struct socket *sock; - struct sock *sk; - int ret; - u64 ufd; - - if (map->value_size == sizeof(u64)) - ufd = *(u64 *)value; - else - ufd = *(u32 *)value; - if (ufd > S32_MAX) - return -EINVAL; - - sock = sockfd_lookup(ufd, &ret); - if (!sock) - return ret; - sk = sock->sk; - if (!sk) { - ret = -EINVAL; - goto out; - } - if (!sock_map_sk_is_suitable(sk)) { - ret = -EOPNOTSUPP; - goto out; - } - - sock_map_sk_acquire(sk); - if (!sock_map_sk_state_allowed(sk)) - ret = -EOPNOTSUPP; - else - ret = sock_hash_update_common(map, key, sk, flags); - sock_map_sk_release(sk); -out: - fput(sock->file); - return ret; -} - static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { @@ -1219,10 +1201,11 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = { static int sock_hash_map_btf_id; const struct bpf_map_ops sock_hash_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, - .map_update_elem = sock_hash_update_elem, + .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6ada114bbcca..d86d8d11cfe4 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -22,7 +22,7 @@ #include <net/busy_poll.h> #include <net/pkt_sched.h> -static int two __maybe_unused = 2; +static int two = 2; static int three = 3; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; @@ -546,7 +546,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = &two, }, { .procname = "devconf_inherit_init_net", @@ -587,6 +587,19 @@ static struct ctl_table netns_core_table[] = { { } }; +static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) +{ + /* fallback tunnels for initns only */ + if (!strncmp(str, "initns", 6)) + sysctl_fb_tunnels_only_for_init_net = 1; + /* no fallback tunnels anywhere */ + else if (!strncmp(str, "none", 4)) + sysctl_fb_tunnels_only_for_init_net = 2; + + return 1; +} +__setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); + static __net_init int sysctl_core_net_init(struct net *net) { struct ctl_table *tbl; diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 0a72510d5de1..8f3dd3b1d2d0 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -274,7 +274,7 @@ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) /** * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection * This routine is called when the peer acknowledges the receipt of Ack Vectors - * up to and including @ackno. While based on on section A.3 of RFC 4340, here + * up to and including @ackno. While based on section A.3 of RFC 4340, here * are additional precautions to prevent corrupted buffer state. In particular, * we use tail_ackno to identify outdated records; it always marks the earliest * packet of group (2) in 11.4.2. diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 9c28c8251125..bb3d70664dde 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -495,7 +495,8 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + rcu_dereference(ireq->ireq_opt), + inet_sk(sk)->tos); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -537,7 +538,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) local_bh_disable(); bh_lock_sock(ctl_sk); err = ip_build_and_send_pkt(skb, ctl_sk, - rxiph->daddr, rxiph->saddr, NULL); + rxiph->daddr, rxiph->saddr, NULL, + inet_sk(ctl_sk)->tos); bh_unlock_sock(ctl_sk); if (net_xmit_eval(err) == 0) { @@ -731,7 +733,7 @@ int dccp_invalid_packet(struct sk_buff *skb) return 1; } /* - * If P.Data Offset is too too large for packet, drop packet and return + * If P.Data Offset is too large for packet, drop packet and return */ if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) { DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff); diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 0e06dfc32273..927c796d7682 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -85,7 +85,7 @@ static void dccp_retransmit_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); /* - * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was + * More than 4MSL (8 minutes) has passed, a RESET(aborted) was * sent, no need to retransmit, this sock is dead. */ if (dccp_write_timeout(sk)) diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 1ce9ba8cf545..5c18c0214aac 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -330,11 +330,7 @@ EXPORT_SYMBOL_GPL(call_dsa_notifiers); int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx) { - struct dsa_devlink_priv *dl_priv; - struct dsa_switch *ds; - - dl_priv = devlink_priv(dl); - ds = dl_priv->ds; + struct dsa_switch *ds = dsa_devlink_to_ds(dl); if (!ds->ops->devlink_param_get) return -EOPNOTSUPP; @@ -346,11 +342,7 @@ EXPORT_SYMBOL_GPL(dsa_devlink_param_get); int dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx) { - struct dsa_devlink_priv *dl_priv; - struct dsa_switch *ds; - - dl_priv = devlink_priv(dl); - ds = dl_priv->ds; + struct dsa_switch *ds = dsa_devlink_to_ds(dl); if (!ds->ops->devlink_param_set) return -EOPNOTSUPP; @@ -412,6 +404,22 @@ void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, } EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); +struct devlink_region * +dsa_devlink_region_create(struct dsa_switch *ds, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, u64 region_size) +{ + return devlink_region_create(ds->devlink, ops, region_max_snapshots, + region_size); +} +EXPORT_SYMBOL_GPL(dsa_devlink_region_create); + +void dsa_devlink_region_destroy(struct devlink_region *region) +{ + devlink_region_destroy(region); +} +EXPORT_SYMBOL_GPL(dsa_devlink_region_destroy); + struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) { if (!netdev || !dsa_slave_dev_check(netdev)) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c0ffc7a2b65f..3cf67f5fe54a 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -21,9 +21,6 @@ static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); -static const struct devlink_ops dsa_devlink_ops = { -}; - struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) { struct dsa_switch_tree *dst; @@ -382,6 +379,22 @@ static void dsa_port_teardown(struct dsa_port *dp) dp->setup = false; } +static int dsa_devlink_info_get(struct devlink *dl, + struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (ds->ops->devlink_info_get) + return ds->ops->devlink_info_get(ds, req, extack); + + return -EOPNOTSUPP; +} + +static const struct devlink_ops dsa_devlink_ops = { + .info_get = dsa_devlink_info_get, +}; + static int dsa_switch_setup(struct dsa_switch *ds) { struct dsa_devlink_priv *dl_priv; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 1653e3377cb3..2da656d984ef 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -164,8 +164,6 @@ int dsa_port_vlan_add(struct dsa_port *dp, struct switchdev_trans *trans); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); -int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags); -int dsa_port_vid_del(struct dsa_port *dp, u16 vid); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index e23ece229c7e..9a4fb80d2731 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -193,11 +193,44 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) dsa_port_set_state_now(dp, BR_STATE_FORWARDING); } +/* Must be called under rcu_read_lock() */ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, bool vlan_filtering) { struct dsa_switch *ds = dp->ds; - int i; + int err, i; + + /* VLAN awareness was off, so the question is "can we turn it on". + * We may have had 8021q uppers, those need to go. Make sure we don't + * enter an inconsistent state: deny changing the VLAN awareness state + * as long as we have 8021q uppers. + */ + if (vlan_filtering && dsa_is_user_port(ds, dp->index)) { + struct net_device *upper_dev, *slave = dp->slave; + struct net_device *br = dp->bridge_dev; + struct list_head *iter; + + netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) { + struct bridge_vlan_info br_info; + u16 vid; + + if (!is_vlan_dev(upper_dev)) + continue; + + vid = vlan_dev_vlan_id(upper_dev); + + /* br_vlan_get_info() returns -EINVAL or -ENOENT if the + * device, respectively the VID is not found, returning + * 0 means success, which is a failure for us here. + */ + err = br_vlan_get_info(br, vid, &br_info); + if (err == 0) { + dev_err(ds->dev, "Must remove upper %s first\n", + upper_dev->name); + return false; + } + } + } if (!ds->vlan_filtering_is_global) return true; @@ -232,15 +265,24 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct dsa_switch *ds = dp->ds; int err; - /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ - if (switchdev_trans_ph_prepare(trans)) - return 0; + if (switchdev_trans_ph_prepare(trans)) { + bool apply; - if (!ds->ops->port_vlan_filtering) - return 0; + if (!ds->ops->port_vlan_filtering) + return -EOPNOTSUPP; - if (!dsa_port_can_apply_vlan_filtering(dp, vlan_filtering)) - return -EINVAL; + /* We are called from dsa_slave_switchdev_blocking_event(), + * which is not under rcu_read_lock(), unlike + * dsa_slave_switchdev_event(). + */ + rcu_read_lock(); + apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering); + rcu_read_unlock(); + if (!apply) + return -EINVAL; + + return 0; + } if (dsa_port_is_vlan_filtering(dp) == vlan_filtering) return 0; @@ -433,39 +475,6 @@ int dsa_port_vlan_del(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } -int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags) -{ - struct switchdev_obj_port_vlan vlan = { - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .flags = flags, - .vid_begin = vid, - .vid_end = vid, - }; - struct switchdev_trans trans; - int err; - - trans.ph_prepare = true; - err = dsa_port_vlan_add(dp, &vlan, &trans); - if (err) - return err; - - trans.ph_prepare = false; - return dsa_port_vlan_add(dp, &vlan, &trans); -} -EXPORT_SYMBOL(dsa_port_vid_add); - -int dsa_port_vid_del(struct dsa_port *dp, u16 vid) -{ - struct switchdev_obj_port_vlan vlan = { - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .vid_begin = vid, - .vid_end = vid, - }; - - return dsa_port_vlan_del(dp, &vlan); -} -EXPORT_SYMBOL(dsa_port_vid_del); - static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) { struct device_node *phy_dn; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 16e5f98d4882..e7c1d62fde99 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -303,13 +303,36 @@ static int dsa_slave_port_attr_set(struct net_device *dev, return ret; } +/* Must be called under rcu_read_lock() */ +static int +dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave, + const struct switchdev_obj_port_vlan *vlan) +{ + struct net_device *upper_dev; + struct list_head *iter; + + netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) { + u16 vid; + + if (!is_vlan_dev(upper_dev)) + continue; + + vid = vlan_dev_vlan_id(upper_dev); + if (vid >= vlan->vid_begin && vid <= vlan->vid_end) + return -EBUSY; + } + + return 0; +} + static int dsa_slave_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct switchdev_trans *trans) { + struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan; - int err; + int vid, err; if (obj->orig_dev != dev) return -EOPNOTSUPP; @@ -319,6 +342,17 @@ static int dsa_slave_vlan_add(struct net_device *dev, vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); + /* Deny adding a bridge VLAN when there is already an 802.1Q upper with + * the same VID. + */ + if (trans->ph_prepare && br_vlan_enabled(dp->bridge_dev)) { + rcu_read_lock(); + err = dsa_slave_vlan_check_for_8021q_uppers(dev, &vlan); + rcu_read_unlock(); + if (err) + return err; + } + err = dsa_port_vlan_add(dp, &vlan, trans); if (err) return err; @@ -333,6 +367,12 @@ static int dsa_slave_vlan_add(struct net_device *dev, if (err) return err; + for (vid = vlan.vid_begin; vid <= vlan.vid_end; vid++) { + err = vlan_vid_add(master, htons(ETH_P_8021Q), vid); + if (err) + return err; + } + return 0; } @@ -376,7 +416,10 @@ static int dsa_slave_port_obj_add(struct net_device *dev, static int dsa_slave_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { + struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); + struct switchdev_obj_port_vlan *vlan; + int vid, err; if (obj->orig_dev != dev) return -EOPNOTSUPP; @@ -384,10 +427,19 @@ static int dsa_slave_vlan_del(struct net_device *dev, if (dsa_port_skip_vlan_configuration(dp)) return 0; + vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); + /* Do not deprogram the CPU port as it may be shared with other user * ports which can be members of this VLAN as well. */ - return dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); + err = dsa_port_vlan_del(dp, vlan); + if (err) + return err; + + for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) + vlan_vid_del(master, htons(ETH_P_8021Q), vid); + + return 0; } static int dsa_slave_port_obj_del(struct net_device *dev, @@ -1232,64 +1284,66 @@ static int dsa_slave_get_ts_info(struct net_device *dev, static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { + struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - struct bridge_vlan_info info; + struct switchdev_obj_port_vlan vlan = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .vid_begin = vid, + .vid_end = vid, + /* This API only allows programming tagged, non-PVID VIDs */ + .flags = 0, + }; + struct switchdev_trans trans; int ret; - /* Check for a possible bridge VLAN entry now since there is no - * need to emulate the switchdev prepare + commit phase. - */ - if (dp->bridge_dev) { - if (dsa_port_skip_vlan_configuration(dp)) - return 0; + /* User port... */ + trans.ph_prepare = true; + ret = dsa_port_vlan_add(dp, &vlan, &trans); + if (ret) + return ret; - /* br_vlan_get_info() returns -EINVAL or -ENOENT if the - * device, respectively the VID is not found, returning - * 0 means success, which is a failure for us here. - */ - ret = br_vlan_get_info(dp->bridge_dev, vid, &info); - if (ret == 0) - return -EBUSY; - } + trans.ph_prepare = false; + ret = dsa_port_vlan_add(dp, &vlan, &trans); + if (ret) + return ret; - ret = dsa_port_vid_add(dp, vid, 0); + /* And CPU port... */ + trans.ph_prepare = true; + ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans); if (ret) return ret; - ret = dsa_port_vid_add(dp->cpu_dp, vid, 0); + trans.ph_prepare = false; + ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans); if (ret) return ret; - return 0; + return vlan_vid_add(master, proto, vid); } static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { + struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - struct bridge_vlan_info info; - int ret; - - /* Check for a possible bridge VLAN entry now since there is no - * need to emulate the switchdev prepare + commit phase. - */ - if (dp->bridge_dev) { - if (dsa_port_skip_vlan_configuration(dp)) - return 0; - - /* br_vlan_get_info() returns -EINVAL or -ENOENT if the - * device, respectively the VID is not found, returning - * 0 means success, which is a failure for us here. - */ - ret = br_vlan_get_info(dp->bridge_dev, vid, &info); - if (ret == 0) - return -EBUSY; - } + struct switchdev_obj_port_vlan vlan = { + .vid_begin = vid, + .vid_end = vid, + /* This API only allows programming tagged, non-PVID VIDs */ + .flags = 0, + }; + int err; /* Do not deprogram the CPU port as it may be shared with other user * ports which can be members of this VLAN as well. */ - return dsa_port_vid_del(dp, vid); + err = dsa_port_vlan_del(dp, &vlan); + if (err) + return err; + + vlan_vid_del(master, proto, vid); + + return 0; } struct dsa_hw_port { @@ -1784,7 +1838,7 @@ int dsa_slave_create(struct dsa_port *port) rtnl_lock(); ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN); rtnl_unlock(); - if (ret) + if (ret && ret != -EOPNOTSUPP) dev_warn(ds->dev, "nonfatal error %d setting MTU on port %d\n", ret, port->index); @@ -1792,8 +1846,9 @@ int dsa_slave_create(struct dsa_port *port) ret = dsa_slave_phy_setup(slave_dev); if (ret) { - netdev_err(master, "error %d setting up slave PHY for %s\n", - ret, slave_dev->name); + netdev_err(slave_dev, + "error %d setting up PHY for tree %d, switch %d, port %d\n", + ret, ds->dst->index, ds->index, port->index); goto out_gcells; } @@ -1880,9 +1935,9 @@ static int dsa_slave_changeupper(struct net_device *dev, return err; } -static int dsa_slave_upper_vlan_check(struct net_device *dev, - struct netdev_notifier_changeupper_info * - info) +static int +dsa_prevent_bridging_8021q_upper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *ext_ack; struct net_device *slave; @@ -1912,14 +1967,56 @@ static int dsa_slave_upper_vlan_check(struct net_device *dev, return NOTIFY_DONE; } +static int +dsa_slave_check_8021q_upper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct net_device *br = dp->bridge_dev; + struct bridge_vlan_info br_info; + struct netlink_ext_ack *extack; + int err = NOTIFY_DONE; + u16 vid; + + if (!br || !br_vlan_enabled(br)) + return NOTIFY_DONE; + + extack = netdev_notifier_info_to_extack(&info->info); + vid = vlan_dev_vlan_id(info->upper_dev); + + /* br_vlan_get_info() returns -EINVAL or -ENOENT if the + * device, respectively the VID is not found, returning + * 0 means success, which is a failure for us here. + */ + err = br_vlan_get_info(br, vid, &br_info); + if (err == 0) { + NL_SET_ERR_MSG_MOD(extack, + "This VLAN is already configured by the bridge"); + return notifier_from_errno(-EBUSY); + } + + return NOTIFY_DONE; +} + static int dsa_slave_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (event == NETDEV_CHANGEUPPER) { + switch (event) { + case NETDEV_PRECHANGEUPPER: { + struct netdev_notifier_changeupper_info *info = ptr; + + if (!dsa_slave_dev_check(dev)) + return dsa_prevent_bridging_8021q_upper(dev, ptr); + + if (is_vlan_dev(info->upper_dev)) + return dsa_slave_check_8021q_upper(dev, ptr); + break; + } + case NETDEV_CHANGEUPPER: if (!dsa_slave_dev_check(dev)) - return dsa_slave_upper_vlan_check(dev, ptr); + return NOTIFY_DONE; return dsa_slave_changeupper(dev, ptr); } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 86c8dc5c32a0..9afef6f0f9df 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -232,43 +232,6 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds, return 0; } -static int dsa_port_vlan_device_check(struct net_device *vlan_dev, - int vlan_dev_vid, - void *arg) -{ - struct switchdev_obj_port_vlan *vlan = arg; - u16 vid; - - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - if (vid == vlan_dev_vid) - return -EBUSY; - } - - return 0; -} - -static int dsa_port_vlan_check(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) -{ - const struct dsa_port *dp = dsa_to_port(ds, port); - int err = 0; - - /* Device is not bridged, let it proceed with the VLAN device - * creation. - */ - if (!dp->bridge_dev) - return err; - - /* dsa_slave_vlan_rx_{add,kill}_vid() cannot use the prepare phase and - * already checks whether there is an overlapping bridge VLAN entry - * with the same VID, so here we only need to check that if we are - * adding a bridge VLAN entry there is not an overlapping VLAN device - * claiming that VID. - */ - return vlan_for_each(dp->slave, dsa_port_vlan_device_check, - (void *)vlan); -} - static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, struct dsa_notifier_vlan_info *info) { @@ -291,10 +254,6 @@ static int dsa_switch_vlan_prepare(struct dsa_switch *ds, for (port = 0; port < ds->num_ports; port++) { if (dsa_switch_vlan_match(ds, port, info)) { - err = dsa_port_vlan_check(ds, port, info->vlan); - if (err) - return err; - err = ds->ops->port_vlan_prepare(ds, port, info->vlan); if (err) return err; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 780b2a15ac9b..8e3e8a5b8559 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -146,15 +146,15 @@ EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); * user explicitly configured this @vid through the bridge core, then the @vid * is installed again, but this time with the flags from the bridge layer. */ -static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, +static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, u16 flags, bool enabled) { - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_port *dp = dsa_to_port(ctx->ds, port); if (enabled) - return dsa_port_vid_add(dp, vid, flags); + return ctx->ops->vlan_add(ctx->ds, dp->index, vid, flags); - return dsa_port_vid_del(dp, vid); + return ctx->ops->vlan_del(ctx->ds, dp->index, vid); } /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single @@ -209,25 +209,29 @@ static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, * +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+ * swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3 */ -int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) +static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port, + bool enabled) { - int upstream = dsa_upstream_port(ds, port); - u16 rx_vid = dsa_8021q_rx_vid(ds, port); - u16 tx_vid = dsa_8021q_tx_vid(ds, port); - int i, err; + int upstream = dsa_upstream_port(ctx->ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port); + u16 tx_vid = dsa_8021q_tx_vid(ctx->ds, port); + struct net_device *master; + int i, err, subvlan; /* The CPU port is implicitly configured by * configuring the front-panel ports */ - if (!dsa_is_user_port(ds, port)) + if (!dsa_is_user_port(ctx->ds, port)) return 0; + master = dsa_to_port(ctx->ds, port)->cpu_dp->master; + /* Add this user port's RX VID to the membership list of all others * (including itself). This is so that bridging will not be hindered. * L2 forwarding rules still take precedence when there are no VLAN * restrictions, so there are no concerns about leaking traffic. */ - for (i = 0; i < ds->num_ports; i++) { + for (i = 0; i < ctx->ds->num_ports; i++) { u16 flags; if (i == upstream) @@ -240,9 +244,10 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) /* The RX VID is a regular VLAN on all others */ flags = BRIDGE_VLAN_INFO_UNTAGGED; - err = dsa_8021q_vid_apply(ds, i, rx_vid, flags, enabled); + err = dsa_8021q_vid_apply(ctx, i, rx_vid, flags, enabled); if (err) { - dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n", + dev_err(ctx->ds->dev, + "Failed to apply RX VID %d to port %d: %d\n", rx_vid, port, err); return err; } @@ -251,80 +256,115 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) /* CPU port needs to see this port's RX VID * as tagged egress. */ - err = dsa_8021q_vid_apply(ds, upstream, rx_vid, 0, enabled); + err = dsa_8021q_vid_apply(ctx, upstream, rx_vid, 0, enabled); if (err) { - dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n", + dev_err(ctx->ds->dev, + "Failed to apply RX VID %d to port %d: %d\n", rx_vid, port, err); return err; } + /* Add to the master's RX filter not only @rx_vid, but in fact + * the entire subvlan range, just in case this DSA switch might + * want to use sub-VLANs. + */ + for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) { + u16 vid = dsa_8021q_rx_vid_subvlan(ctx->ds, port, subvlan); + + if (enabled) + vlan_vid_add(master, ctx->proto, vid); + else + vlan_vid_del(master, ctx->proto, vid); + } + /* Finally apply the TX VID on this port and on the CPU port */ - err = dsa_8021q_vid_apply(ds, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED, + err = dsa_8021q_vid_apply(ctx, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED, enabled); if (err) { - dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n", + dev_err(ctx->ds->dev, + "Failed to apply TX VID %d on port %d: %d\n", tx_vid, port, err); return err; } - err = dsa_8021q_vid_apply(ds, upstream, tx_vid, 0, enabled); + err = dsa_8021q_vid_apply(ctx, upstream, tx_vid, 0, enabled); if (err) { - dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n", + dev_err(ctx->ds->dev, + "Failed to apply TX VID %d on port %d: %d\n", tx_vid, upstream, err); return err; } return err; } -EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); -static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, +int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled) +{ + int rc, port; + + ASSERT_RTNL(); + + for (port = 0; port < ctx->ds->num_ports; port++) { + rc = dsa_8021q_setup_port(ctx, port, enabled); + if (rc < 0) { + dev_err(ctx->ds->dev, + "Failed to setup VLAN tagging for port %d: %d\n", + port, rc); + return rc; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_8021q_setup); + +static int dsa_8021q_crosschip_link_apply(struct dsa_8021q_context *ctx, + int port, + struct dsa_8021q_context *other_ctx, int other_port, bool enabled) { - u16 rx_vid = dsa_8021q_rx_vid(ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port); /* @rx_vid of local @ds port @port goes to @other_port of * @other_ds */ - return dsa_8021q_vid_apply(other_ds, other_port, rx_vid, + return dsa_8021q_vid_apply(other_ctx, other_port, rx_vid, BRIDGE_VLAN_INFO_UNTAGGED, enabled); } -static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links) +static int dsa_8021q_crosschip_link_add(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port) { struct dsa_8021q_crosschip_link *c; - list_for_each_entry(c, crosschip_links, list) { - if (c->port == port && c->other_ds == other_ds && + list_for_each_entry(c, &ctx->crosschip_links, list) { + if (c->port == port && c->other_ctx == other_ctx && c->other_port == other_port) { refcount_inc(&c->refcount); return 0; } } - dev_dbg(ds->dev, "adding crosschip link from port %d to %s port %d\n", - port, dev_name(other_ds->dev), other_port); + dev_dbg(ctx->ds->dev, + "adding crosschip link from port %d to %s port %d\n", + port, dev_name(other_ctx->ds->dev), other_port); c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return -ENOMEM; c->port = port; - c->other_ds = other_ds; + c->other_ctx = other_ctx; c->other_port = other_port; refcount_set(&c->refcount, 1); - list_add(&c->list, crosschip_links); + list_add(&c->list, &ctx->crosschip_links); return 0; } -static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, +static void dsa_8021q_crosschip_link_del(struct dsa_8021q_context *ctx, struct dsa_8021q_crosschip_link *c, - struct list_head *crosschip_links, bool *keep) { *keep = !refcount_dec_and_test(&c->refcount); @@ -332,9 +372,9 @@ static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, if (*keep) return; - dev_dbg(ds->dev, + dev_dbg(ctx->ds->dev, "deleting crosschip link from port %d to %s port %d\n", - c->port, dev_name(c->other_ds->dev), c->other_port); + c->port, dev_name(c->other_ctx->ds->dev), c->other_port); list_del(&c->list); kfree(c); @@ -347,64 +387,58 @@ static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, * or untagged: it doesn't matter, since it should never egress a frame having * our @rx_vid. */ -int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links) +int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port) { /* @other_upstream is how @other_ds reaches us. If we are part * of disjoint trees, then we are probably connected through * our CPU ports. If we're part of the same tree though, we should * probably use dsa_towards_port. */ - int other_upstream = dsa_upstream_port(other_ds, other_port); + int other_upstream = dsa_upstream_port(other_ctx->ds, other_port); int rc; - rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, - other_port, crosschip_links); + rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_port); if (rc) return rc; - rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + rc = dsa_8021q_crosschip_link_apply(ctx, port, other_ctx, other_port, true); if (rc) return rc; - rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, - other_upstream, - crosschip_links); + rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_upstream); if (rc) return rc; - return dsa_8021q_crosschip_link_apply(ds, port, other_ds, + return dsa_8021q_crosschip_link_apply(ctx, port, other_ctx, other_upstream, true); } EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); -int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links) +int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port) { - int other_upstream = dsa_upstream_port(other_ds, other_port); + int other_upstream = dsa_upstream_port(other_ctx->ds, other_port); struct dsa_8021q_crosschip_link *c, *n; - list_for_each_entry_safe(c, n, crosschip_links, list) { - if (c->port == port && c->other_ds == other_ds && + list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) { + if (c->port == port && c->other_ctx == other_ctx && (c->other_port == other_port || c->other_port == other_upstream)) { - struct dsa_switch *other_ds = c->other_ds; + struct dsa_8021q_context *other_ctx = c->other_ctx; int other_port = c->other_port; bool keep; int rc; - dsa_8021q_crosschip_link_del(ds, c, crosschip_links, - &keep); + dsa_8021q_crosschip_link_del(ctx, c, &keep); if (keep) continue; - rc = dsa_8021q_crosschip_link_apply(ds, port, - other_ds, + rc = dsa_8021q_crosschip_link_apply(ctx, port, + other_ctx, other_port, false); if (rc) diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 9b4a4d719291..3710f9daa46d 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -72,14 +72,21 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) { struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + u16 vlan_tci; if (hdr->h_vlan_proto == htons(ETH_P_SJA1105)) return true; - if (hdr->h_vlan_proto != htons(ETH_P_8021Q)) + if (hdr->h_vlan_proto != htons(ETH_P_8021Q) && + !skb_vlan_tag_present(skb)) return false; - return vid_is_dsa_8021q(ntohs(hdr->h_vlan_TCI) & VLAN_VID_MASK); + if (skb_vlan_tag_present(skb)) + vlan_tci = skb_vlan_tag_get(skb); + else + vlan_tci = ntohs(hdr->h_vlan_TCI); + + return vid_is_dsa_8021q(vlan_tci & VLAN_VID_MASK); } /* This is the first time the tagger sees the frame on RX. @@ -283,7 +290,8 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, hdr = eth_hdr(skb); tpid = ntohs(hdr->h_proto); - is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q); + is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q || + skb_vlan_tag_present(skb)); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); @@ -292,7 +300,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, if (is_tagged) { /* Normal traffic path. */ skb_push_rcsum(skb, ETH_HLEN); - __skb_vlan_pop(skb, &tci); + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + __vlan_hwaccel_clear_tag(skb); + } else { + __skb_vlan_pop(skb, &tci); + } skb_pull_rcsum(skb, ETH_HLEN); skb_reset_network_header(skb); skb_reset_transport_header(skb); diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 9ef54cdcf662..9ecda09ecb11 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -223,7 +223,7 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); for (i = from_channel; i < old_total; i++) - if (xdp_get_umem_from_qid(dev, i)) { + if (xsk_get_pool_from_qid(dev, i)) { GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); return -EINVAL; } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index ed19573fccd7..24036e3055a1 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -192,6 +192,8 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full), __DEFINE_LINK_MODE_NAME(400000, DR4, Full), __DEFINE_LINK_MODE_NAME(400000, CR4, Full), + __DEFINE_LINK_MODE_NAME(100, FX, Half), + __DEFINE_LINK_MODE_NAME(100, FX, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 441794e0034f..328d15cd4006 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1706,7 +1706,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, min(channels.rx_count, channels.tx_count); to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); for (i = from_channel; i < to_channel; i++) - if (xdp_get_umem_from_qid(dev, i)) + if (xsk_get_pool_from_qid(dev, i)) return -EINVAL; ret = dev->ethtool_ops->set_channels(dev, &channels); @@ -1861,23 +1861,18 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); } else { /* Driver expects to be called at twice the frequency in rc */ - int n = rc * 2, i, interval = HZ / n; + int n = rc * 2, interval = HZ / n; + u64 count = n * id.data, i = 0; - /* Count down seconds */ do { - /* Count down iterations per second */ - i = n; - do { - rtnl_lock(); - rc = ops->set_phys_id(dev, - (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); - rtnl_unlock(); - if (rc) - break; - schedule_timeout_interruptible(interval); - } while (!signal_pending(current) && --i != 0); - } while (!signal_pending(current) && - (id.data == 0 || --id.data != 0)); + rtnl_lock(); + rc = ops->set_phys_id(dev, + (i++ & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); + rtnl_unlock(); + if (rc) + break; + schedule_timeout_interruptible(interval); + } while (!signal_pending(current) && (!id.data || i < count)); } rtnl_lock(); @@ -3025,13 +3020,14 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) case TCP_V4_FLOW: case TCP_V6_FLOW: match->key.basic.ip_proto = IPPROTO_TCP; + match->mask.basic.ip_proto = 0xff; break; case UDP_V4_FLOW: case UDP_V6_FLOW: match->key.basic.ip_proto = IPPROTO_UDP; + match->mask.basic.ip_proto = 0xff; break; } - match->mask.basic.ip_proto = 0xff; match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 7044a2853886..29dcd675b65a 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -272,6 +272,8 @@ static const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, FX, Half), + __DEFINE_LINK_MODE_PARAMS(100, FX, Full), }; static const struct nla_policy diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 7aea35d1e8a5..1980aa7eb2b6 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -10,6 +10,7 @@ struct pause_req_info { struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; + struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ @@ -22,8 +23,15 @@ pause_get_policy[ETHTOOL_A_PAUSE_MAX + 1] = { [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_REJECT }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_REJECT }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_REJECT }, + [ETHTOOL_A_PAUSE_STATS] = { .type = NLA_REJECT }, }; +static void ethtool_stats_init(u64 *stats, unsigned int n) +{ + while (n--) + stats[n] = ETHTOOL_STAT_NOT_SET; +} + static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -34,10 +42,17 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); + if (req_base->flags & ETHTOOL_FLAG_STATS && + dev->ethtool_ops->get_pause_stats) { + ethtool_stats_init((u64 *)&data->pausestat, + sizeof(data->pausestat) / 8); + dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); + } ethnl_ops_complete(dev); return 0; @@ -46,9 +61,50 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { - return nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ + int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ + + if (req_base->flags & ETHTOOL_FLAG_STATS) + n += nla_total_size(0) + /* _PAUSE_STATS */ + nla_total_size_64bit(sizeof(u64)) * + (ETHTOOL_A_PAUSE_STAT_MAX - 2); + return n; +} + +static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, + u16 padtype) +{ + if (val == ETHTOOL_STAT_NOT_SET) + return 0; + if (nla_put_u64_64bit(skb, attrtype, val, padtype)) + return -EMSGSIZE; + + return 0; +} + +static int pause_put_stats(struct sk_buff *skb, + const struct ethtool_pause_stats *pause_stats) +{ + const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); + if (!nest) + return -EMSGSIZE; + + if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, + ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || + ethtool_put_stat(skb, pause_stats->rx_pause_frames, + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, @@ -63,6 +119,10 @@ static int pause_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; + if (req_base->flags & ETHTOOL_FLAG_STATS && + pause_put_stats(skb, &data->pausestat)) + return -EMSGSIZE; + return 0; } @@ -89,6 +149,7 @@ pause_set_policy[ETHTOOL_A_PAUSE_MAX + 1] = { [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, + [ETHTOOL_A_PAUSE_STATS] = { .type = NLA_REJECT }, }; int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info) diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 7e11a6c35bc3..4cfd9e829c7b 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -60,17 +60,7 @@ hsr_node_table_show(struct seq_file *sfp, void *data) return 0; } -/* hsr_node_table_open - Open the node_table file - * - * Description: - * This routine opens a debugfs file node_table of specific hsr - * or prp device - */ -static int -hsr_node_table_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, hsr_node_table_show, inode->i_private); -} +DEFINE_SHOW_ATTRIBUTE(hsr_node_table); void hsr_debugfs_rename(struct net_device *dev) { @@ -85,13 +75,6 @@ void hsr_debugfs_rename(struct net_device *dev) priv->node_tbl_root = d; } -static const struct file_operations hsr_fops = { - .open = hsr_node_table_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* hsr_debugfs_init - create hsr node_table file for dumping * the node table * @@ -113,7 +96,7 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, - &hsr_fops); + &hsr_node_table_fops); if (IS_ERR(de)) { pr_err("Cannot create hsr node_table file\n"); debugfs_remove(priv->node_tbl_root); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4307503a6f0b..b7260c8cef2e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1017,6 +1017,7 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon const struct proto_ops inet_stream_ops = { .family = PF_INET, + .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2eb71579f4d2..471d33a0d095 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -498,7 +498,7 @@ static void cipso_v4_doi_free_rcu(struct rcu_head *entry) /** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value - * @audit_secid: the LSM secid to use in the audit message + * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CIPSO engine. The NetLabel routines will diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index abd083415f89..5308cfa3de62 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -237,7 +237,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel - * header to the outer L3 tunnel header, or we are are simply + * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ @@ -429,7 +429,7 @@ next_proto: /* We can clear the encap_mark for GUE as we are essentially doing * one of two possible things. We are either adding an L4 tunnel - * header to the outer L3 tunnel header, or we are are simply + * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index cf36f955bfe6..8f2e974a1e4d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -690,9 +690,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, rcu_read_unlock(); } - tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | + tos = icmp_pointers[type].error ? (RT_TOS(iph->tos) | IPTOS_PREC_INTERNETCONTROL) : - iph->tos; + iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) @@ -784,7 +784,7 @@ EXPORT_SYMBOL(icmp_ndo_send); static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; const struct net_protocol *ipprot; int protocol = iph->protocol; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f1bd95f243b3..366a4507b5a3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -125,6 +125,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, bool net_admin) { const struct inet_sock *inet = inet_sk(sk); + struct inet_diag_sockopt inet_sockopt; if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; @@ -180,6 +181,22 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); + memset(&inet_sockopt, 0, sizeof(inet_sockopt)); + inet_sockopt.recverr = inet->recverr; + inet_sockopt.is_icsk = inet->is_icsk; + inet_sockopt.freebind = inet->freebind; + inet_sockopt.hdrincl = inet->hdrincl; + inet_sockopt.mc_loop = inet->mc_loop; + inet_sockopt.transparent = inet->transparent; + inet_sockopt.mc_all = inet->mc_all; + inet_sockopt.nodefrag = inet->nodefrag; + inet_sockopt.bind_address_no_port = inet->bind_address_no_port; + inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884; + inet_sockopt.defer_connect = inet->defer_connect; + if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), + &inet_sockopt)) + goto errout; + return 0; errout: return 1; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 239e54474b65..8cbe74313f38 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -228,7 +228,7 @@ static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif, const int sdif, bool exact_dif) + const int dif, const int sdif) { int score = -1; @@ -277,15 +277,13 @@ static struct sock *inet_lhash2_lookup(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - bool exact_dif = inet_exact_dif_match(net, skb); struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; int score, hiscore = 0; inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { sk = (struct sock *)icsk; - score = compute_score(sk, net, hnum, daddr, - dif, sdif, exact_dif); + score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 948747aac4e2..da1b5038bdfd 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -47,32 +47,32 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); - memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); + memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen); opt = &(IPCB(skb)->opt); if (opt->srr) - memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4); + memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4); if (!is_frag) { if (opt->rr_needaddr) - ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt); + ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt); if (opt->ts_needaddr) - ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); + ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt); if (opt->ts_needtime) { __be32 midtime; midtime = inet_current_timestamp(); - memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); + memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4); } return; } if (opt->rr) { - memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]); + memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]); opt->rr = 0; opt->rr_needaddr = 0; } if (opt->ts) { - memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]); + memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]); opt->ts = 0; opt->ts_needaddr = opt->ts_needtime = 0; } @@ -495,26 +495,29 @@ EXPORT_SYMBOL(ip_options_compile); void ip_options_undo(struct ip_options *opt) { if (opt->srr) { - unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr); - memmove(optptr+7, optptr+3, optptr[1]-7); - memcpy(optptr+3, &opt->faddr, 4); + unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr); + + memmove(optptr + 7, optptr + 3, optptr[1] - 7); + memcpy(optptr + 3, &opt->faddr, 4); } if (opt->rr_needaddr) { - unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr); + unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr); + optptr[2] -= 4; - memset(&optptr[optptr[2]-1], 0, 4); + memset(&optptr[optptr[2] - 1], 0, 4); } if (opt->ts) { - unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr); + unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr); + if (opt->ts_needtime) { optptr[2] -= 4; - memset(&optptr[optptr[2]-1], 0, 4); - if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC) + memset(&optptr[optptr[2] - 1], 0, 4); + if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC) optptr[2] -= 4; } if (opt->ts_needaddr) { optptr[2] -= 4; - memset(&optptr[optptr[2]-1], 0, 4); + memset(&optptr[optptr[2] - 1], 0, 4); } } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e6f2ada9e7d5..8b200252c655 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -143,7 +143,8 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, - __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) + __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, + u8 tos) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); @@ -156,7 +157,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; - iph->tos = inet->tos; + iph->tos = tos; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; @@ -997,7 +998,7 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; + maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -1352,7 +1353,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (cork->flags & IPCORK_OPT) opt = cork->opt; - if (!(rt->dst.dev->features&NETIF_F_SG)) + if (!(rt->dst.dev->features & NETIF_F_SG)) return -EOPNOTSUPP; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -1537,7 +1538,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_select_ident(net, skb, sk); if (opt) { - iph->ihl += opt->optlen>>2; + iph->ihl += opt->optlen >> 2; ip_options_build(skb, opt, cork->addr, rt, 0); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d2c223554ff7..ec6036713e2c 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1124,8 +1124,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, dev_put(dev); err = -EINVAL; - if (sk->sk_bound_dev_if && - (!midx || midx != sk->sk_bound_dev_if)) + if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; inet->uc_index = ifindex; @@ -1189,7 +1188,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, err = -EINVAL; if (sk->sk_bound_dev_if && mreq.imr_ifindex != sk->sk_bound_dev_if && - (!midx || midx != sk->sk_bound_dev_if)) + midx != sk->sk_bound_dev_if) break; inet->mc_index = mreq.imr_ifindex; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 876fd6ff1ff9..939792a38814 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1038,10 +1038,13 @@ static int ipmr_cache_report(struct mr_table *mrt, memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = assert; msg->im_mbz = 0; - if (assert == IGMPMSG_WRVIFWHOLE) + if (assert == IGMPMSG_WRVIFWHOLE) { msg->im_vif = vifi; - else + msg->im_vif_hi = vifi >> 8; + } else { msg->im_vif = mrt->mroute_reg_vif_num; + msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8; + } ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1054,6 +1057,7 @@ static int ipmr_cache_report(struct mr_table *mrt, ip_hdr(skb)->protocol = 0; msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; + msg->im_vif_hi = vifi >> 8; skb_dst_set(skb, dst_clone(skb_dst(pkt))); /* Add our header */ igmp = skb_put(skb, sizeof(struct igmphdr)); @@ -2396,6 +2400,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen) + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */ + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */ + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */ + + nla_total_size(4) /* IPMRA_CREPORT_TABLE */ /* IPMRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; @@ -2427,11 +2432,12 @@ static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IPMR; if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) || - nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) || + nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) || nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, msg->im_src.s_addr) || nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR, - msg->im_dst.s_addr)) + msg->im_dst.s_addr) || + nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id)) goto nla_put_failure; nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 134e92382275..8c0f17c6863c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -42,8 +42,8 @@ static int call_nexthop_notifiers(struct net *net, { int err; - err = atomic_notifier_call_chain(&net->nexthop.notifier_chain, - event_type, nh); + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + event_type, nh); return notifier_to_errno(err); } @@ -133,12 +133,9 @@ static struct nexthop *nexthop_alloc(void) static struct nh_group *nexthop_grp_alloc(u16 num_nh) { - size_t sz = offsetof(struct nexthop, nh_grp) - + sizeof(struct nh_group) - + sizeof(struct nh_grp_entry) * num_nh; struct nh_group *nhg; - nhg = kzalloc(sz, GFP_KERNEL); + nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; @@ -279,7 +276,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && - nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) + nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; @@ -800,7 +797,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, return; } - newg->has_v4 = nhg->has_v4; + newg->has_v4 = false; newg->mpath = nhg->mpath; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; @@ -809,12 +806,18 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { + struct nh_info *nhi; + /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } + nhi = rtnl_dereference(nhges[i].nh->nh_info); + if (nhi->family == AF_INET) + newg->has_v4 = true; + list_del(&nhges[i].nh_list); new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; @@ -867,8 +870,6 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) bool do_flush = false; struct fib_info *fi; - call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); - list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; @@ -906,6 +907,8 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh, static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); @@ -961,6 +964,23 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, return 0; } +static void nh_group_v4_update(struct nh_group *nhg) +{ + struct nh_grp_entry *nhges; + bool has_v4 = false; + int i; + + nhges = nhg->nh_entries; + for (i = 0; i < nhg->num_nh; i++) { + struct nh_info *nhi; + + nhi = rtnl_dereference(nhges[i].nh->nh_info); + if (nhi->family == AF_INET) + has_v4 = true; + } + nhg->has_v4 = has_v4; +} + static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) @@ -984,6 +1004,21 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); + /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially + * update IPv4 indication in all the groups using the nexthop. + */ + if (oldi->family == AF_INET && newi->family == AF_INET6) { + struct nh_grp_entry *nhge; + + list_for_each_entry(nhge, &old->grp_list, nh_list) { + struct nexthop *nhp = nhge->nh_parent; + struct nh_group *nhg; + + nhg = rtnl_dereference(nhp->nh_grp); + nh_group_v4_update(nhg); + } + } + return 0; } @@ -1101,7 +1136,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, while (1) { struct nexthop *nh; - next = rtnl_dereference(*pp); + next = *pp; if (!next) break; @@ -1924,14 +1959,15 @@ static struct notifier_block nh_netdev_notifier = { int register_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb); + return blocking_notifier_chain_register(&net->nexthop.notifier_chain, + nb); } EXPORT_SYMBOL(register_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain, - nb); + return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); } EXPORT_SYMBOL(unregister_nexthop_notifier); @@ -1951,7 +1987,7 @@ static int __net_init nexthop_net_init(struct net *net) net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; - ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); + BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index df6fbefe44d4..248856b301c4 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -293,7 +293,8 @@ EXPORT_SYMBOL_GPL(ping_close); /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, - struct sockaddr *uaddr, int addr_len) { + struct sockaddr *uaddr, int addr_len) +{ struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; @@ -310,10 +311,10 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); - chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; + else + chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); if ((!inet_can_nonlocal_bind(net, isk) && chk_addr_ret != RTN_LOCAL) || @@ -383,20 +384,6 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) } } -static void ping_clear_saddr(struct sock *sk, int dif) -{ - sk->sk_bound_dev_if = dif; - if (sk->sk_family == AF_INET) { - struct inet_sock *isk = inet_sk(sk); - isk->inet_rcv_saddr = isk->inet_saddr = 0; -#if IS_ENABLED(CONFIG_IPV6) - } else if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr)); - memset(&np->saddr, 0, sizeof(np->saddr)); -#endif - } -} /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. @@ -420,12 +407,13 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto out; err = -EADDRINUSE; - ping_set_saddr(sk, uaddr); snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); if (ping_get_port(sk, snum) != 0) { - ping_clear_saddr(sk, dif); + /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). */ + sk->sk_bound_dev_if = dif; goto out; } + ping_set_saddr(sk, uaddr); pr_debug("after bind(): num = %hu, dif = %d\n", isk->inet_num, @@ -647,7 +635,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, } int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, - void *user_icmph, size_t icmph_len) { + void *user_icmph, size_t icmph_len) +{ u8 type, code; if (len > 0xFFFF) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 407956be7deb..1170653a89cd 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -260,11 +260,12 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) err = EHOSTUNREACH; if (code > NR_ICMP_UNREACH) break; - err = icmp_err_convert[code].errno; - harderr = icmp_err_convert[code].fatal; if (code == ICMP_FRAG_NEEDED) { harderr = inet->pmtudisc != IP_PMTUDISC_DONT; err = EMSGSIZE; + } else { + err = icmp_err_convert[code].errno; + harderr = icmp_err_convert[code].fatal; } } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 58642b29a499..d15a78b26dfa 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -623,7 +623,7 @@ static inline u32 fnhe_hashfun(__be32 daddr) u32 hval; net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); - hval = jhash_1word((__force u32) daddr, fnhe_hashrnd); + hval = jhash_1word((__force u32)daddr, fnhe_hashrnd); return hash_32(hval, FNHE_HASH_SHIFT); } @@ -1016,13 +1016,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct net *net = dev_net(dst->dev); - u32 old_mtu = ipv4_mtu(dst); struct fib_result res; bool lock = false; + u32 old_mtu; if (ip_mtu_locked(dst)) return; + old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; @@ -1066,7 +1067,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; u32 mark = IP4_REPLY_MARK(net, skb->mark); @@ -1083,7 +1084,7 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu); static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; @@ -1101,7 +1102,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; @@ -1131,7 +1132,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu); + __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -1156,7 +1157,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; @@ -1172,7 +1173,7 @@ EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); @@ -1312,7 +1313,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) static unsigned int ipv4_mtu(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *) dst; + const struct rtable *rt = (const struct rtable *)dst; unsigned int mtu = rt->rt_pmtu; if (!mtu || time_after_eq(jiffies, rt->dst.expires)) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index f0794f0232ba..c375c126f436 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -286,11 +286,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb) { + struct tcp_request_sock *treq; struct request_sock *req; #ifdef CONFIG_MPTCP - struct tcp_request_sock *treq; - if (sk_is_mptcp(sk)) ops = &mptcp_subflow_request_sock_ops; #endif @@ -299,8 +298,9 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, if (!req) return NULL; -#if IS_ENABLED(CONFIG_MPTCP) treq = tcp_rsk(req); + treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; +#if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) { int err = mptcp_subflow_init_cookie_req(req, sk, skb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 54023a46db04..3e5f4f2e705e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1330,6 +1330,15 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = &comp_sack_nr_max, }, { + .procname = "tcp_reflect_tos", + .data = &init_net.ipv4.sysctl_tcp_reflect_tos, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 31f3b858db81..65057744fac8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -418,6 +418,8 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); @@ -562,7 +564,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { + if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -574,7 +576,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * pairs with the input side. */ smp_mb__after_atomic(); - if (sk_stream_is_writeable(sk)) + if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else @@ -1002,12 +1004,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + goto wait_for_space; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, tcp_rtx_and_write_queues_empty(sk)); if (!skb) - goto wait_for_memory; + goto wait_for_space; #ifdef CONFIG_TLS_DEVICE skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); @@ -1026,7 +1028,7 @@ new_segment: goto new_segment; } if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + goto wait_for_space; if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); @@ -1067,9 +1069,8 @@ new_segment: tcp_push_one(sk, mss_now); continue; -wait_for_sndbuf: +wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); @@ -1280,7 +1281,7 @@ restart: new_segment: if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; @@ -1291,7 +1292,7 @@ new_segment: skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, first_skb); if (!skb) - goto wait_for_memory; + goto wait_for_space; process_backlog++; skb->ip_summed = CHECKSUM_PARTIAL; @@ -1324,7 +1325,7 @@ new_segment: struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) - goto wait_for_memory; + goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { @@ -1338,7 +1339,7 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, @@ -1391,9 +1392,8 @@ new_segment: tcp_push_one(sk, mss_now); continue; -wait_for_sndbuf: +wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); @@ -1525,7 +1525,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -static void tcp_cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -2685,6 +2685,8 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd = TCP_INIT_CWND; tp->snd_cwnd_cnt = 0; @@ -3207,7 +3209,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; case TCP_SAVE_SYN: - if (val < 0 || val > 1) + /* 0: disable, 1: enable, 2: start from ether_header */ + if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; @@ -3788,20 +3791,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level, lock_sock(sk); if (tp->saved_syn) { - if (len < tp->saved_syn[0]) { - if (put_user(tp->saved_syn[0], optlen)) { + if (len < tcp_saved_syn_len(tp->saved_syn)) { + if (put_user(tcp_saved_syn_len(tp->saved_syn), + optlen)) { release_sock(sk); return -EFAULT; } release_sock(sk); return -EINVAL; } - len = tp->saved_syn[0]; + len = tcp_saved_syn_len(tp->saved_syn); if (put_user(len, optlen)) { release_sock(sk); return -EFAULT; } - if (copy_to_user(optval, tp->saved_syn + 1, len)) { + if (copy_to_user(optval, tp->saved_syn->data, len)) { release_sock(sk); return -EFAULT; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 7aa68f4aae6c..37f4cb2bba5c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -567,10 +567,9 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; } -static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) { - if (sk->sk_family == AF_INET6 && - unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { + if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { spin_lock_bh(&tcpv6_prot_lock); if (likely(ops != tcpv6_prot_saved)) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); @@ -603,13 +602,11 @@ struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - if (!psock->sk_proto) { - struct proto *ops = READ_ONCE(sk->sk_prot); - - if (tcp_bpf_assert_proto_ops(ops)) + if (sk->sk_family == AF_INET6) { + if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return ERR_PTR(-EINVAL); - tcp_bpf_check_v6_needs_rebuild(sk, ops); + tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } return &tcp_bpf_prots[family][config]; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 09b62de04eea..af2814c9342a 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -295,7 +295,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ - tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 184ea556f50e..50834e7f958e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -138,6 +138,69 @@ void clean_acked_data_flush(void) EXPORT_SYMBOL_GPL(clean_acked_data_flush); #endif +#ifdef CONFIG_CGROUP_BPF +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); + struct bpf_sock_ops_kern sock_ops; + + if (likely(!unknown_opt && !parse_all_opt)) + return; + + /* The skb will be handled in the + * bpf_skops_established() or + * bpf_skops_write_hdr_opt(). + */ + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_SYN_SENT: + case TCP_LISTEN: + return; + } + + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); +} + +static void bpf_skops_established(struct sock *sk, int bpf_op, + struct sk_buff *skb) +{ + struct bpf_sock_ops_kern sock_ops; + + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = bpf_op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); +} +#else +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ +} + +static void bpf_skops_established(struct sock *sk, int bpf_op, + struct sk_buff *skb) +{ +} +#endif + static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, unsigned int len) { @@ -3801,7 +3864,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } -static void smc_parse_options(const struct tcphdr *th, +static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, int opsize) @@ -3810,10 +3873,13 @@ static void smc_parse_options(const struct tcphdr *th, if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && opsize >= TCPOLEN_EXP_SMC_BASE && - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { opt_rx->smc_ok = 1; + return true; + } } #endif + return false; } /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped @@ -3874,6 +3940,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->saw_unknown = 0; while (length > 0) { int opcode = *ptr++; @@ -3964,15 +4031,21 @@ void tcp_parse_options(const struct net *net, */ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && get_unaligned_be16(ptr) == - TCPOPT_FASTOPEN_MAGIC) + TCPOPT_FASTOPEN_MAGIC) { tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); - else - smc_parse_options(th, opt_rx, ptr, - opsize); + break; + } + + if (smc_parse_options(th, opt_rx, ptr, opsize)) + break; + + opt_rx->saw_unknown = 1; break; + default: + opt_rx->saw_unknown = 1; } ptr += opsize-2; length -= opsize; @@ -5259,12 +5332,6 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return true; } -/* When incoming ACK allowed to free some skb from write_queue, - * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket - * on the exit from tcp input handler. - * - * PROBLEM: sndbuf expansion does not work well with largesend. - */ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -5279,16 +5346,13 @@ static void tcp_new_space(struct sock *sk) static void tcp_check_space(struct sock *sk) { - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); - /* pairs with tcp_poll() */ - smp_mb(); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - tcp_new_space(sk); - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); - } + /* pairs with tcp_poll() */ + smp_mb(); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } } @@ -5590,6 +5654,8 @@ syn_challenge: goto discard; } + bpf_skops_parse_hdr(sk, skb); + return true; discard: @@ -5798,7 +5864,7 @@ discard: } EXPORT_SYMBOL(tcp_rcv_established); -void tcp_init_transfer(struct sock *sk, int bpf_op) +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -5819,7 +5885,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; - tcp_call_bpf(sk, bpf_op, 0, NULL); + bpf_skops_established(sk, bpf_op, skb); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -5838,7 +5904,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); } - tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); /* Prevent spurious tcp_cwnd_restart() on first data * packet. @@ -6310,7 +6376,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else { tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; - tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, + skb); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } smp_mb(); @@ -6599,13 +6666,27 @@ static void tcp_reqsk_record_syn(const struct sock *sk, { if (tcp_sk(sk)->save_syn) { u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); - u32 *copy; + struct saved_syn *saved_syn; + u32 mac_hdrlen; + void *base; + + if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ + base = skb_mac_header(skb); + mac_hdrlen = skb_mac_header_len(skb); + len += mac_hdrlen; + } else { + base = skb_network_header(skb); + mac_hdrlen = 0; + } - copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); - if (copy) { - copy[0] = len; - memcpy(©[1], skb_network_header(skb), len); - req->saved_syn = copy; + saved_syn = kmalloc(struct_size(saved_syn, data, len), + GFP_ATOMIC); + if (saved_syn) { + saved_syn->mac_hdrlen = mac_hdrlen; + saved_syn->network_hdrlen = skb_network_header_len(skb); + saved_syn->tcp_hdrlen = tcp_hdrlen(skb); + memcpy(saved_syn->data, base, len); + req->saved_syn = saved_syn; } } } @@ -6744,6 +6825,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); + tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; tcp_openreq_init_rwin(req, sk, dst); sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { @@ -6752,7 +6834,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, - &foc, TCP_SYNACK_FASTOPEN); + &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { reqsk_fastopen_remove(fastopen_sk, req, false); @@ -6770,7 +6852,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_timeout_init((struct sock *)req)); af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : - TCP_SYNACK_COOKIE); + TCP_SYNACK_COOKIE, + skb); if (want_cookie) { reqsk_free(req); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5084333b5ab6..ace48b2790ff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -575,7 +575,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is - * is already accepted it is treated as a connected one below. + * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; @@ -965,18 +965,23 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; + u8 tos; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc, synack_type); + skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); + + tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos : inet_sk(sk)->tos; if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -984,7 +989,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + rcu_dereference(ireq->ireq_opt), + tos & ~INET_ECN_MASK); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -1529,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = prandom_u32(); + /* Set ToS of the new socket based upon the value of incoming SYN. */ + if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; + if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); if (!dst) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 495dda2449fe..56c306e3cd2f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -548,6 +548,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); + bpf_skops_init_child(sk, newsk); tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 85ff417bda7f..386978dcd318 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -438,6 +438,7 @@ struct tcp_out_options { u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ + u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ @@ -452,6 +453,145 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) #endif } +#ifdef CONFIG_CGROUP_BPF +static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, + enum tcp_synack_type synack_type) +{ + if (unlikely(!skb)) + return BPF_WRITE_HDR_TCP_CURRENT_MSS; + + if (unlikely(synack_type == TCP_SYNACK_COOKIE)) + return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; + + return 0; +} + +/* req, syn_skb and synack_type are used when writing synack */ +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || + !*remaining) + return; + + /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ + + /* init sock_ops */ + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; + + if (req) { + /* The listen "sk" cannot be passed here because + * it is not locked. It would not make too much + * sense to do bpf_setsockopt(listen_sk) based + * on individual connection request also. + * + * Thus, "req" is passed here and the cgroup-bpf-progs + * of the listen "sk" will be run. + * + * "req" is also used here for fastopen even the "sk" here is + * a fullsock "child" sk. It is to keep the behavior + * consistent between fastopen and non-fastopen on + * the bpf programming side. + */ + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = *remaining; + /* tcp_current_mss() does not pass a skb */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, 0); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err || sock_ops.remaining_opt_len == *remaining) + return; + + opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; + /* round up to 4 bytes */ + opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; + + *remaining -= opts->bpf_opt_len; +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ + u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!max_opt_len)) + return; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; + + if (req) { + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = max_opt_len; + first_opt_off = tcp_hdrlen(skb) - max_opt_len; + bpf_skops_init_skb(&sock_ops, skb, first_opt_off); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err) + nr_written = 0; + else + nr_written = max_opt_len - sock_ops.remaining_opt_len; + + if (nr_written < max_opt_len) + memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, + max_opt_len - nr_written); +} +#else +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ +} +#endif + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -691,6 +831,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -701,7 +843,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -758,6 +901,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, + synack_type, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -826,6 +972,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + + size = MAX_TCP_OPTION_SPACE - remaining; + } + return size; } @@ -1213,6 +1368,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif + /* BPF prog is the last one writing header option */ + bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); @@ -1524,7 +1682,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); sk_mem_uncharge(sk, delta_truesize); - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -3336,20 +3493,20 @@ int tcp_send_synack(struct sock *sk) } /** - * tcp_make_synack - Prepare a SYN-ACK. - * sk: listener socket - * dst: dst entry attached to the SYNACK - * req: request_sock pointer - * foc: cookie for tcp fast open - * synack_type: Type of synback to prepare - * - * Allocate one skb and build a SYNACK packet. - * @dst is consumed : Caller should not use it again. + * tcp_make_synack - Allocate one skb and build a SYNACK packet. + * @sk: listener socket + * @dst: dst entry attached to the SYNACK. It is consumed and caller + * should not use it again. + * @req: request_sock pointer + * @foc: cookie for tcp fast open + * @synack_type: Type of synack to prepare + * @syn_skb: SYN packet just received. It could be NULL for rtx case. */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -3408,8 +3565,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + /* bpf program will be interested in the tcp_flags */ + TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc, synack_type) + sizeof(*th); + foc, synack_type, + syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -3441,6 +3601,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif + bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, + synack_type, &opts); + skb->skb_mstamp_ns = now; tcp_add_tx_delay(skb, tp); @@ -3741,6 +3904,8 @@ void tcp_send_delayed_ack(struct sock *sk) ato = min(ato, max_ato); } + ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max); + /* Stay within the limit we were given */ timeout = jiffies + ato; @@ -3934,7 +4099,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, + NULL); if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 6cebf412d590..5842081bc8a2 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -10,7 +10,7 @@ #include <net/tcp.h> /* These factors derived from the recommended values in the aer: - * .01 and and 7/8. + * .01 and 7/8. */ #define TCP_SCALABLE_AI_CNT 100U #define TCP_SCALABLE_MD_SCALE 3 diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e88efba07551..09f0a23d1a01 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1170,7 +1170,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = inet->uc_index; } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { /* oif is set, packet is to local broadcast and - * and uc_index is set. oif is most likely set + * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index eddd973e6575..7a94791efc1a 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -22,10 +22,9 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) prot->close = sock_map_close; } -static void udp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) { - if (sk->sk_family == AF_INET6 && - unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { + if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { spin_lock_bh(&udpv6_prot_lock); if (likely(ops != udpv6_prot_saved)) { udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops); @@ -46,8 +45,8 @@ struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; - if (!psock->sk_proto) - udp_bpf_check_v6_needs_rebuild(sk, READ_ONCE(sk->sk_prot)); + if (sk->sk_family == AF_INET6) + udp_bpf_check_v6_needs_rebuild(psock->sk_proto); return &udp_bpf_prots[family]; } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 9ebf3fe0d2b1..c70c192bc91b 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -191,6 +191,13 @@ static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt, return -EAFNOSUPPORT; } +static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) +{ + kfree_skb(skb); + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -201,6 +208,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, .fib6_nh_init = eafnosupport_fib6_nh_init, .ip6_del_rt = eafnosupport_ip6_del_rt, + .ipv6_fragment = eafnosupport_ipv6_fragment, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0306509ab063..e648fbebb167 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -661,6 +661,7 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, const struct proto_ops inet6_stream_ops = { .family = PF_INET6, + .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, @@ -1026,6 +1027,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .xfrm6_rcv_encap = xfrm6_rcv_encap, #endif .nd_tbl = &nd_tbl, + .ipv6_fragment = ip6_fragment, }; static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 2d3add9e6116..55c290d55605 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -94,7 +94,7 @@ EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, - const int dif, const int sdif, bool exact_dif) + const int dif, const int sdif) { int score = -1; @@ -138,15 +138,13 @@ static struct sock *inet6_lhash2_lookup(struct net *net, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { - bool exact_dif = inet6_exact_dif_match(net, skb); struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; int score, hiscore = 0; inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { sk = (struct sock *)icsk; - score = compute_score(sk, net, hnum, daddr, dif, sdif, - exact_dif); + score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4a664ad4f4d4..141c0a4c569a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1812,10 +1812,14 @@ static struct fib6_node *fib6_repair_tree(struct net *net, children = 0; child = NULL; - if (fn_r) - child = fn_r, children |= 1; - if (fn_l) - child = fn_l, children |= 2; + if (fn_r) { + child = fn_r; + children |= 1; + } + if (fn_l) { + child = fn_l; + children |= 2; + } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c78e67d7747f..a2a65e327f49 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1492,7 +1492,7 @@ emsgsize: * Otherwise, we need to reserve fragment header and * fragment alignment (= 8-15 octects, in total). * - * Note that we may need to "move" the data from the tail of + * Note that we may need to "move" the data from the tail * of the buffer to the new fragment when we split * the message. * diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c index 9ee077bf4f49..787c74aa85e3 100644 --- a/net/ipv6/netfilter/ip6t_NPT.c +++ b/net/ipv6/netfilter/ip6t_NPT.c @@ -77,16 +77,43 @@ static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt, return true; } +static struct ipv6hdr *icmpv6_bounced_ipv6hdr(struct sk_buff *skb, + struct ipv6hdr *_bounced_hdr) +{ + if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + return NULL; + + if (!icmpv6_is_err(icmp6_hdr(skb)->icmp6_type)) + return NULL; + + return skb_header_pointer(skb, + skb_transport_offset(skb) + sizeof(struct icmp6hdr), + sizeof(struct ipv6hdr), + _bounced_hdr); +} + static unsigned int ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_npt_tginfo *npt = par->targinfo; + struct ipv6hdr _bounced_hdr; + struct ipv6hdr *bounced_hdr; + struct in6_addr bounced_pfx; if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) { icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, saddr)); return NF_DROP; } + + /* rewrite dst addr of bounced packet which was sent to dst range */ + bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr); + if (bounced_hdr) { + ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->daddr, npt->src_pfx_len); + if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0) + ip6t_npt_map_pfx(npt, &bounced_hdr->daddr); + } + return XT_CONTINUE; } @@ -94,12 +121,24 @@ static unsigned int ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_npt_tginfo *npt = par->targinfo; + struct ipv6hdr _bounced_hdr; + struct ipv6hdr *bounced_hdr; + struct in6_addr bounced_pfx; if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) { icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, daddr)); return NF_DROP; } + + /* rewrite src addr of bounced packet which was sent from dst range */ + bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr); + if (bounced_hdr) { + ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->saddr, npt->src_pfx_len); + if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0) + ip6t_npt_map_pfx(npt, &bounced_hdr->saddr); + } + return XT_CONTINUE; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fb075d9545b9..bde6d48a9942 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5284,9 +5284,10 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, { struct fib6_config r_cfg; struct rtnexthop *rtnh; + int last_err = 0; int remaining; int attrlen; - int err = 1, last_err = 0; + int err; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 305870a72352..8db59f4e5f13 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -458,7 +458,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is - * is already accepted it is treated as a connected one below. + * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; @@ -501,7 +501,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = tcp_inet6_sk(sk); @@ -509,13 +510,14 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; + u8 tclass; /* First, grab a route. */ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc, synack_type); + skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, @@ -527,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_lock(); opt = ireq->ipv6_opt; + tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos : np->tclass; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass, + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, + tclass & ~INET_ECN_MASK, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); @@ -958,8 +963,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); - ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, - priority); + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, + tclass & ~INET_ECN_MASK, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -1067,8 +1072,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) label = ip6_flowlabel(ipv6h); } - tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, - label, priority); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, + ipv6_get_dsfield(ipv6h), label, priority); #ifdef CONFIG_TCP_MD5SIG out: @@ -1121,7 +1126,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), - 0, 0, sk->sk_priority); + ipv6_get_dsfield(ipv6_hdr(skb)), 0, sk->sk_priority); } @@ -1309,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (np->repflow) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); + /* Set ToS of the new socket based upon the value of incoming SYN. */ + if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; + /* Clone native IPv6 options from listening socket (if any) Yes, keeping reference count would be much more clever, diff --git a/net/l2tp/Makefile b/net/l2tp/Makefile index 399a7e5db2f4..cf8f27071d3f 100644 --- a/net/l2tp/Makefile +++ b/net/l2tp/Makefile @@ -5,6 +5,8 @@ obj-$(CONFIG_L2TP) += l2tp_core.o +CFLAGS_l2tp_core.o += -I$(src) + # Build l2tp as modules if L2TP is M obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_PPPOL2TP)) += l2tp_ppp.o obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_IP)) += l2tp_ip.o diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 701fc72ad9f4..7be5103ff2a8 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -61,6 +61,10 @@ #include <linux/atomic.h> #include "l2tp_core.h" +#include "trace.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" #define L2TP_DRV_VERSION "V2.0" @@ -116,11 +120,6 @@ static bool l2tp_sk_is_v6(struct sock *sk) } #endif -static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) -{ - return sk->sk_user_data; -} - static inline struct l2tp_net *l2tp_pernet(const struct net *net) { return net_generic(net, l2tp_net_id); @@ -151,23 +150,30 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { + trace_free_tunnel(tunnel); sock_put(tunnel->sock); /* the tunnel is freed in the socket destructor */ } static void l2tp_session_free(struct l2tp_session *session) { - struct l2tp_tunnel *tunnel = session->tunnel; + trace_free_session(session); + if (session->tunnel) + l2tp_tunnel_dec_refcount(session->tunnel); + kfree(session); +} - if (tunnel) { +struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk) +{ + struct l2tp_tunnel *tunnel = sk->sk_user_data; + + if (tunnel) if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC)) - goto out; - l2tp_tunnel_dec_refcount(tunnel); - } + return NULL; -out: - kfree(session); + return tunnel; } +EXPORT_SYMBOL_GPL(l2tp_sk_to_tunnel); void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) { @@ -381,6 +387,8 @@ int l2tp_session_register(struct l2tp_session *session, hlist_add_head(&session->hlist, head); write_unlock_bh(&tunnel->hlist_lock); + trace_register_session(session); + return 0; err_tlock_pnlock: @@ -409,10 +417,6 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk skb_queue_walk_safe(&session->reorder_q, skbp, tmp) { if (L2TP_SKB_CB(skbp)->ns > ns) { __skb_queue_before(&session->reorder_q, skbp, skb); - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: pkt %hu, inserted before %hu, reorder_q len=%d\n", - session->name, ns, L2TP_SKB_CB(skbp)->ns, - skb_queue_len(&session->reorder_q)); atomic_long_inc(&session->stats.rx_oos_packets); goto out; } @@ -445,9 +449,7 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff * /* Bump our Nr */ session->nr++; session->nr &= session->nr_max; - - l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated nr to %hu\n", - session->name, session->nr); + trace_session_seqnum_update(session); } /* call private receive handler */ @@ -472,37 +474,27 @@ static void l2tp_recv_dequeue(struct l2tp_session *session) start: spin_lock_bh(&session->reorder_q.lock); skb_queue_walk_safe(&session->reorder_q, skb, tmp) { - if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) { + struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb); + + /* If the packet has been pending on the queue for too long, discard it */ + if (time_after(jiffies, cb->expires)) { atomic_long_inc(&session->stats.rx_seq_discards); atomic_long_inc(&session->stats.rx_errors); - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: oos pkt %u len %d discarded (too old), waiting for %u, reorder_q_len=%d\n", - session->name, L2TP_SKB_CB(skb)->ns, - L2TP_SKB_CB(skb)->length, session->nr, - skb_queue_len(&session->reorder_q)); + trace_session_pkt_expired(session, cb->ns); session->reorder_skip = 1; __skb_unlink(skb, &session->reorder_q); kfree_skb(skb); continue; } - if (L2TP_SKB_CB(skb)->has_seq) { + if (cb->has_seq) { if (session->reorder_skip) { - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: advancing nr to next pkt: %u -> %u", - session->name, session->nr, - L2TP_SKB_CB(skb)->ns); session->reorder_skip = 0; - session->nr = L2TP_SKB_CB(skb)->ns; + session->nr = cb->ns; + trace_session_seqnum_reset(session); } - if (L2TP_SKB_CB(skb)->ns != session->nr) { - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: holding oos pkt %u len %d, waiting for %u, reorder_q_len=%d\n", - session->name, L2TP_SKB_CB(skb)->ns, - L2TP_SKB_CB(skb)->length, session->nr, - skb_queue_len(&session->reorder_q)); + if (cb->ns != session->nr) goto out; - } } __skb_unlink(skb, &session->reorder_q); @@ -535,14 +527,13 @@ static int l2tp_seq_check_rx_window(struct l2tp_session *session, u32 nr) */ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb) { - if (!l2tp_seq_check_rx_window(session, L2TP_SKB_CB(skb)->ns)) { + struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb); + + if (!l2tp_seq_check_rx_window(session, cb->ns)) { /* Packet sequence number is outside allowed window. * Discard it. */ - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: pkt %u len %d discarded, outside window, nr=%u\n", - session->name, L2TP_SKB_CB(skb)->ns, - L2TP_SKB_CB(skb)->length, session->nr); + trace_session_pkt_outside_rx_window(session, cb->ns); goto discard; } @@ -559,10 +550,10 @@ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb) * is seen. After nr_oos_count_max in-sequence packets, reset the * sequence number to re-enable packet reception. */ - if (L2TP_SKB_CB(skb)->ns == session->nr) { + if (cb->ns == session->nr) { skb_queue_tail(&session->reorder_q, skb); } else { - u32 nr_oos = L2TP_SKB_CB(skb)->ns; + u32 nr_oos = cb->ns; u32 nr_next = (session->nr_oos + 1) & session->nr_max; if (nr_oos == nr_next) @@ -573,17 +564,10 @@ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb) session->nr_oos = nr_oos; if (session->nr_oos_count > session->nr_oos_count_max) { session->reorder_skip = 1; - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: %d oos packets received. Resetting sequence numbers\n", - session->name, session->nr_oos_count); } if (!session->reorder_skip) { atomic_long_inc(&session->stats.rx_seq_discards); - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: oos pkt %u len %d discarded, waiting for %u, reorder_q_len=%d\n", - session->name, L2TP_SKB_CB(skb)->ns, - L2TP_SKB_CB(skb)->length, session->nr, - skb_queue_len(&session->reorder_q)); + trace_session_pkt_oos(session, cb->ns); goto discard; } skb_queue_tail(&session->reorder_q, skb); @@ -660,16 +644,14 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, int length) { struct l2tp_tunnel *tunnel = session->tunnel; - u32 ns = 0, nr = 0; int offset; /* Parse and check optional cookie */ if (session->peer_cookie_len > 0) { if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) { - l2tp_info(tunnel, L2TP_MSG_DATA, - "%s: cookie mismatch (%u/%u). Discarding.\n", - tunnel->name, tunnel->tunnel_id, - session->session_id); + pr_warn_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n", + tunnel->name, tunnel->tunnel_id, + session->session_id); atomic_long_inc(&session->stats.rx_cookie_discards); goto discard; } @@ -686,32 +668,21 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, L2TP_SKB_CB(skb)->has_seq = 0; if (tunnel->version == L2TP_HDR_VER_2) { if (hdrflags & L2TP_HDRFLAG_S) { - ns = ntohs(*(__be16 *)ptr); - ptr += 2; - nr = ntohs(*(__be16 *)ptr); - ptr += 2; - /* Store L2TP info in the skb */ - L2TP_SKB_CB(skb)->ns = ns; + L2TP_SKB_CB(skb)->ns = ntohs(*(__be16 *)ptr); L2TP_SKB_CB(skb)->has_seq = 1; + ptr += 2; + /* Skip past nr in the header */ + ptr += 2; - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: recv data ns=%u, nr=%u, session nr=%u\n", - session->name, ns, nr, session->nr); } } else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { u32 l2h = ntohl(*(__be32 *)ptr); if (l2h & 0x40000000) { - ns = l2h & 0x00ffffff; - /* Store L2TP info in the skb */ - L2TP_SKB_CB(skb)->ns = ns; + L2TP_SKB_CB(skb)->ns = l2h & 0x00ffffff; L2TP_SKB_CB(skb)->has_seq = 1; - - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: recv data ns=%u, session nr=%u\n", - session->name, ns, session->nr); } ptr += 4; } @@ -722,9 +693,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * configure it so. */ if (!session->lns_mode && !session->send_seq) { - l2tp_info(session, L2TP_MSG_SEQ, - "%s: requested to enable seq numbers by LNS\n", - session->name); + trace_session_seqnum_lns_enable(session); session->send_seq = 1; l2tp_session_set_header_len(session, tunnel->version); } @@ -733,9 +702,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * If user has configured mandatory sequence numbers, discard. */ if (session->recv_seq) { - l2tp_warn(session, L2TP_MSG_SEQ, - "%s: recv data has no seq numbers when required. Discarding.\n", - session->name); + pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", + session->name); atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } @@ -746,15 +714,12 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * LAC is broken. Discard the frame. */ if (!session->lns_mode && session->send_seq) { - l2tp_info(session, L2TP_MSG_SEQ, - "%s: requested to disable seq numbers by LNS\n", - session->name); + trace_session_seqnum_lns_disable(session); session->send_seq = 0; l2tp_session_set_header_len(session, tunnel->version); } else if (session->send_seq) { - l2tp_warn(session, L2TP_MSG_SEQ, - "%s: recv data has no seq numbers when required. Discarding.\n", - session->name); + pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", + session->name); atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } @@ -816,9 +781,6 @@ static void l2tp_session_queue_purge(struct l2tp_session *session) { struct sk_buff *skb = NULL; - if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) - return; - while ((skb = skb_dequeue(&session->reorder_q))) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); @@ -847,22 +809,11 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* Short packet? */ if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) { - l2tp_info(tunnel, L2TP_MSG_DATA, - "%s: recv short packet (len=%d)\n", - tunnel->name, skb->len); + pr_warn_ratelimited("%s: recv short packet (len=%d)\n", + tunnel->name, skb->len); goto error; } - /* Trace packet contents, if enabled */ - if (tunnel->debug & L2TP_MSG_DATA) { - length = min(32u, skb->len); - if (!pskb_may_pull(skb, length)) - goto error; - - pr_debug("%s: recv\n", tunnel->name); - print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length); - } - /* Point to L2TP header */ optr = skb->data; ptr = skb->data; @@ -873,9 +824,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* Check protocol version */ version = hdrflags & L2TP_HDR_VER_MASK; if (version != tunnel->version) { - l2tp_info(tunnel, L2TP_MSG_DATA, - "%s: recv protocol version mismatch: got %d expected %d\n", - tunnel->name, version, tunnel->version); + pr_warn_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n", + tunnel->name, version, tunnel->version); goto error; } @@ -883,12 +833,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) length = skb->len; /* If type is control packet, it is handled by userspace. */ - if (hdrflags & L2TP_HDRFLAG_T) { - l2tp_dbg(tunnel, L2TP_MSG_DATA, - "%s: recv control packet, len=%d\n", - tunnel->name, length); + if (hdrflags & L2TP_HDRFLAG_T) goto error; - } /* Skip flags */ ptr += 2; @@ -917,9 +863,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) l2tp_session_dec_refcount(session); /* Not found? Pass to userspace to deal with */ - l2tp_info(tunnel, L2TP_MSG_DATA, - "%s: no session found (%u/%u). Passing up.\n", - tunnel->name, tunnel_id, session_id); + pr_warn_ratelimited("%s: no session found (%u/%u). Passing up.\n", + tunnel->name, tunnel_id, session_id); goto error; } @@ -949,12 +894,17 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct l2tp_tunnel *tunnel; + /* Note that this is called from the encap_rcv hook inside an + * RCU-protected region, but without the socket being locked. + * Hence we use rcu_dereference_sk_user_data to access the + * tunnel data structure rather the usual l2tp_sk_to_tunnel + * accessor function. + */ tunnel = rcu_dereference_sk_user_data(sk); if (!tunnel) goto pass_up; - - l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n", - tunnel->name, skb->len); + if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC)) + goto pass_up; if (l2tp_udp_recv_core(tunnel, skb)) goto pass_up; @@ -993,8 +943,7 @@ static int l2tp_build_l2tpv2_header(struct l2tp_session *session, void *buf) *bufp++ = 0; session->ns++; session->ns &= 0xffff; - l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated ns to %u\n", - session->name, session->ns); + trace_session_seqnum_update(session); } return bufp - optr; @@ -1030,9 +979,7 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) l2h = 0x40000000 | session->ns; session->ns++; session->ns &= 0xffffff; - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: updated ns to %u\n", - session->name, session->ns); + trace_session_seqnum_update(session); } *((__be32 *)bufp) = htonl(l2h); @@ -1042,74 +989,39 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) return bufp - optr; } -static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, - struct flowi *fl, size_t data_len) +/* Queue the packet to IP for output: tunnel socket lock must be held */ +static int l2tp_xmit_queue(struct l2tp_tunnel *tunnel, struct sk_buff *skb, struct flowi *fl) { - struct l2tp_tunnel *tunnel = session->tunnel; - unsigned int len = skb->len; - int error; - - /* Debug */ - if (session->send_seq) - l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n", - session->name, data_len, session->ns - 1); - else - l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n", - session->name, data_len); - - if (session->debug & L2TP_MSG_DATA) { - int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; - unsigned char *datap = skb->data + uhlen; - - pr_debug("%s: xmit\n", session->name); - print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, - datap, min_t(size_t, 32, len - uhlen)); - } + int err; - /* Queue the packet to IP for output */ skb->ignore_df = 1; skb_dst_drop(skb); #if IS_ENABLED(CONFIG_IPV6) if (l2tp_sk_is_v6(tunnel->sock)) - error = inet6_csk_xmit(tunnel->sock, skb, NULL); + err = inet6_csk_xmit(tunnel->sock, skb, NULL); else #endif - error = ip_queue_xmit(tunnel->sock, skb, fl); + err = ip_queue_xmit(tunnel->sock, skb, fl); - /* Update stats */ - if (error >= 0) { - atomic_long_inc(&tunnel->stats.tx_packets); - atomic_long_add(len, &tunnel->stats.tx_bytes); - atomic_long_inc(&session->stats.tx_packets); - atomic_long_add(len, &session->stats.tx_bytes); - } else { - atomic_long_inc(&tunnel->stats.tx_errors); - atomic_long_inc(&session->stats.tx_errors); - } + return err >= 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } -/* If caller requires the skb to have a ppp header, the header must be - * inserted in the skb data before calling this function. - */ -int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len) +static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, unsigned int *len) { - int data_len = skb->len; struct l2tp_tunnel *tunnel = session->tunnel; + unsigned int data_len = skb->len; struct sock *sk = tunnel->sock; - struct flowi *fl; - struct udphdr *uh; - struct inet_sock *inet; - int headroom; - int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; - int udp_len; + int headroom, uhlen, udp_len; int ret = NET_XMIT_SUCCESS; + struct inet_sock *inet; + struct udphdr *uh; /* Check that there's enough headroom in the skb to insert IP, * UDP and L2TP headers. If not enough, expand it to * make room. Adjust truesize. */ - headroom = NET_SKB_PAD + sizeof(struct iphdr) + - uhlen + hdr_len; + uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(*uh) : 0; + headroom = NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len; if (skb_cow_head(skb, headroom)) { kfree_skb(skb); return NET_XMIT_DROP; @@ -1117,14 +1029,13 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len /* Setup L2TP header */ if (tunnel->version == L2TP_HDR_VER_2) - l2tp_build_l2tpv2_header(session, __skb_push(skb, hdr_len)); + l2tp_build_l2tpv2_header(session, __skb_push(skb, session->hdr_len)); else - l2tp_build_l2tpv3_header(session, __skb_push(skb, hdr_len)); + l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len)); /* Reset skb netfilter state */ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | - IPSKB_REROUTED); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); nf_reset_ct(skb); bh_lock_sock(sk); @@ -1143,8 +1054,12 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len goto out_unlock; } + /* Report transmitted length before we add encap header, which keeps + * statistics consistent for both UDP and IP encap tx/rx paths. + */ + *len = skb->len; + inet = inet_sk(sk); - fl = &inet->cork.fl; switch (tunnel->encap) { case L2TP_ENCAPTYPE_UDP: /* Setup UDP header */ @@ -1153,7 +1068,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len uh = udp_hdr(skb); uh->source = inet->inet_sport; uh->dest = inet->inet_dport; - udp_len = uhlen + hdr_len + data_len; + udp_len = uhlen + session->hdr_len + data_len; uh->len = htons(udp_len); /* Calculate UDP checksum if configured to do so */ @@ -1172,12 +1087,34 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len break; } - l2tp_xmit_core(session, skb, fl, data_len); + ret = l2tp_xmit_queue(tunnel, skb, &inet->cork.fl); + out_unlock: bh_unlock_sock(sk); return ret; } + +/* If caller requires the skb to have a ppp header, the header must be + * inserted in the skb data before calling this function. + */ +int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb) +{ + unsigned int len = 0; + int ret; + + ret = l2tp_xmit_core(session, skb, &len); + if (ret == NET_XMIT_SUCCESS) { + atomic_long_inc(&session->tunnel->stats.tx_packets); + atomic_long_add(len, &session->tunnel->stats.tx_bytes); + atomic_long_inc(&session->stats.tx_packets); + atomic_long_add(len, &session->stats.tx_bytes); + } else { + atomic_long_inc(&session->tunnel->stats.tx_errors); + atomic_long_inc(&session->stats.tx_errors); + } + return ret; +} EXPORT_SYMBOL_GPL(l2tp_xmit_skb); /***************************************************************************** @@ -1190,13 +1127,11 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb); */ static void l2tp_tunnel_destruct(struct sock *sk) { - struct l2tp_tunnel *tunnel = l2tp_tunnel(sk); + struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); if (!tunnel) goto end; - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name); - /* Disable udp encapsulation */ switch (tunnel->encap) { case L2TP_ENCAPTYPE_UDP: @@ -1255,34 +1190,16 @@ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) struct hlist_node *tmp; struct l2tp_session *session; - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing all sessions...\n", - tunnel->name); - write_lock_bh(&tunnel->hlist_lock); tunnel->acpt_newsess = false; for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { again: hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) { session = hlist_entry(walk, struct l2tp_session, hlist); - - l2tp_info(session, L2TP_MSG_CONTROL, - "%s: closing session\n", session->name); - hlist_del_init(&session->hlist); - if (test_and_set_bit(0, &session->dead)) - goto again; - write_unlock_bh(&tunnel->hlist_lock); - - l2tp_session_unhash(session); - l2tp_session_queue_purge(session); - - if (session->session_close) - (*session->session_close)(session); - - l2tp_session_dec_refcount(session); - + l2tp_session_delete(session); write_lock_bh(&tunnel->hlist_lock); /* Now restart from the beginning of this hash @@ -1299,7 +1216,7 @@ again: /* Tunnel socket destroy hook for UDP encapsulation */ static void l2tp_udp_encap_destroy(struct sock *sk) { - struct l2tp_tunnel *tunnel = l2tp_tunnel(sk); + struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) l2tp_tunnel_delete(tunnel); @@ -1464,7 +1381,7 @@ out: static struct lock_class_key l2tp_socket_class; -int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, +int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) { struct l2tp_tunnel *tunnel = NULL; @@ -1483,16 +1400,12 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 tunnel->version = version; tunnel->tunnel_id = tunnel_id; tunnel->peer_tunnel_id = peer_tunnel_id; - tunnel->debug = L2TP_DEFAULT_DEBUG_FLAGS; tunnel->magic = L2TP_TUNNEL_MAGIC; sprintf(&tunnel->name[0], "tunl %u", tunnel_id); rwlock_init(&tunnel->hlist_lock); tunnel->acpt_newsess = true; - if (cfg) - tunnel->debug = cfg->debug; - tunnel->encap = encap; refcount_set(&tunnel->ref_count, 1); @@ -1597,6 +1510,8 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, "l2tp_sock"); sk->sk_allocation = GFP_ATOMIC; + trace_register_tunnel(tunnel); + if (tunnel->fd >= 0) sockfd_put(sock); @@ -1617,6 +1532,7 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_register); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { if (!test_and_set_bit(0, &tunnel->dead)) { + trace_delete_tunnel(tunnel); l2tp_tunnel_inc_refcount(tunnel); queue_work(l2tp_wq, &tunnel->del_work); } @@ -1628,6 +1544,7 @@ void l2tp_session_delete(struct l2tp_session *session) if (test_and_set_bit(0, &session->dead)) return; + trace_delete_session(session); l2tp_session_unhash(session); l2tp_session_queue_purge(session); if (session->session_close) @@ -1686,12 +1603,8 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn INIT_HLIST_NODE(&session->hlist); INIT_HLIST_NODE(&session->global_hlist); - /* Inherit debug options from tunnel */ - session->debug = tunnel->debug; - if (cfg) { session->pwtype = cfg->pw_type; - session->debug = cfg->debug; session->send_seq = cfg->send_seq; session->recv_seq = cfg->recv_seq; session->lns_mode = cfg->lns_mode; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 3468d6b177a0..cb21d906343e 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -51,7 +51,6 @@ struct l2tp_session_cfg { unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ - int debug; /* bitmask of debug message categories */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ @@ -66,6 +65,7 @@ struct l2tp_session_cfg { * Is linked into a per-tunnel session hashlist; and in the case of an L2TPv3 session into * an additional per-net ("global") hashlist. */ +#define L2TP_SESSION_NAME_MAX 32 struct l2tp_session { int magic; /* should be L2TP_SESSION_MAGIC */ long dead; @@ -90,14 +90,13 @@ struct l2tp_session { struct hlist_node hlist; /* hash list node */ refcount_t ref_count; - char name[32]; /* for logging */ + char name[L2TP_SESSION_NAME_MAX]; /* for logging */ char ifname[IFNAMSIZ]; unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence numbers? */ unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ - int debug; /* bitmask of debug message categories */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; @@ -131,7 +130,6 @@ struct l2tp_session { /* L2TP tunnel configuration */ struct l2tp_tunnel_cfg { - int debug; /* bitmask of debug message categories */ enum l2tp_encap_type encap; /* Used only for kernel-created sockets */ @@ -154,6 +152,7 @@ struct l2tp_tunnel_cfg { * Maintains a hashlist of sessions belonging to the tunnel instance. * Is linked into a per-net list of tunnels. */ +#define L2TP_TUNNEL_NAME_MAX 20 struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ @@ -170,8 +169,7 @@ struct l2tp_tunnel { u32 peer_tunnel_id; int version; /* 2=>L2TPv2, 3=>L2TPv3 */ - char name[20]; /* for logging */ - int debug; /* bitmask of debug message categories */ + char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */ enum l2tp_encap_type encap; struct l2tp_stats stats; @@ -237,7 +235,7 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, * Creation of a new instance is a two-step process: create, then register. * Destruction is triggered using the *_delete functions, and completes asynchronously. */ -int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, +int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, @@ -263,8 +261,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); /* Transmit path helpers for sending packets over the tunnel socket. */ void l2tp_session_set_header_len(struct l2tp_session *session, int version); -int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, - int hdr_len); +int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb); /* Pseudowire management. * Pseudowires should register with l2tp core on module init, and unregister @@ -276,6 +273,11 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); /* IOCTL helper for IP encap modules. */ int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); +/* Extract the tunnel structure from a socket's sk_user_data pointer, + * validating the tunnel magic feather. + */ +struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk); + static inline int l2tp_get_l2specific_len(struct l2tp_session *session) { switch (session->l2specific_type) { @@ -337,19 +339,6 @@ static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, str return 0; } -#define l2tp_printk(ptr, type, func, fmt, ...) \ -do { \ - if (((ptr)->debug) & (type)) \ - func(fmt, ##__VA_ARGS__); \ -} while (0) - -#define l2tp_warn(ptr, type, fmt, ...) \ - l2tp_printk(ptr, type, pr_warn, fmt, ##__VA_ARGS__) -#define l2tp_info(ptr, type, fmt, ...) \ - l2tp_printk(ptr, type, pr_info, fmt, ##__VA_ARGS__) -#define l2tp_dbg(ptr, type, fmt, ...) \ - l2tp_printk(ptr, type, pr_debug, fmt, ##__VA_ARGS__) - #define MODULE_ALIAS_L2TP_PWTYPE(type) \ MODULE_ALIAS("net-l2tp-type-" __stringify(type)) diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 96cb9601c21b..bca75bef8282 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -167,7 +167,7 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) tunnel->sock ? refcount_read(&tunnel->sock->sk_refcnt) : 0, refcount_read(&tunnel->ref_count)); seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n", - tunnel->debug, + 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), atomic_long_read(&tunnel->stats.tx_errors), @@ -192,7 +192,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", - session->debug, + 0, jiffies_to_msecs(session->reorder_timeout)); seq_printf(m, " offset 0 l2specific %hu/%hu\n", session->l2specific_type, l2tp_get_l2specific_len(session)); diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 7ed2b4eced94..6cd97c75445c 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -76,7 +76,7 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev struct l2tp_eth *priv = netdev_priv(dev); struct l2tp_session *session = priv->session; unsigned int len = skb->len; - int ret = l2tp_xmit_skb(session, skb, session->hdr_len); + int ret = l2tp_xmit_skb(session, skb); if (likely(ret == NET_XMIT_SUCCESS)) { atomic_long_add(len, &priv->tx_bytes); @@ -128,17 +128,6 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, struct net_device *dev; struct l2tp_eth *priv; - if (session->debug & L2TP_MSG_DATA) { - unsigned int length; - - length = min(32u, skb->len); - if (!pskb_may_pull(skb, length)) - goto error; - - pr_debug("%s: eth recv\n", session->name); - print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length); - } - if (!pskb_may_pull(skb, ETH_HLEN)) goto error; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index df2a35b5714a..97ae1255fcb6 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -118,7 +118,6 @@ static int l2tp_ip_recv(struct sk_buff *skb) struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct iphdr *iph; - int length; if (!pskb_may_pull(skb, 4)) goto discard; @@ -147,20 +146,6 @@ static int l2tp_ip_recv(struct sk_buff *skb) if (!tunnel) goto discard_sess; - /* Trace packet contents, if enabled */ - if (tunnel->debug & L2TP_MSG_DATA) { - length = min(32u, skb->len); - if (!pskb_may_pull(skb, length)) - goto discard_sess; - - /* Point to L2TP header */ - optr = skb->data; - ptr = skb->data; - ptr += 4; - pr_debug("%s: ip recv\n", tunnel->name); - print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); - } - if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; @@ -248,8 +233,8 @@ static void l2tp_ip_close(struct sock *sk, long timeout) static void l2tp_ip_destroy_sock(struct sock *sk) { + struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); struct sk_buff *skb; - struct l2tp_tunnel *tunnel = sk->sk_user_data; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index bc757bc7e264..e5e5036257b0 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -131,7 +131,6 @@ static int l2tp_ip6_recv(struct sk_buff *skb) struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct ipv6hdr *iph; - int length; if (!pskb_may_pull(skb, 4)) goto discard; @@ -160,20 +159,6 @@ static int l2tp_ip6_recv(struct sk_buff *skb) if (!tunnel) goto discard_sess; - /* Trace packet contents, if enabled */ - if (tunnel->debug & L2TP_MSG_DATA) { - length = min(32u, skb->len); - if (!pskb_may_pull(skb, length)) - goto discard_sess; - - /* Point to L2TP header */ - optr = skb->data; - ptr = skb->data; - ptr += 4; - pr_debug("%s: ip recv\n", tunnel->name); - print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); - } - if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; @@ -262,7 +247,7 @@ static void l2tp_ip6_close(struct sock *sk, long timeout) static void l2tp_ip6_destroy_sock(struct sock *sk) { - struct l2tp_tunnel *tunnel = sk->sk_user_data; + struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); lock_sock(sk); ip6_flush_pending_frames(sk); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index def78eebca4c..83c015f7f20d 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -229,14 +229,11 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info goto out; } - if (attrs[L2TP_ATTR_DEBUG]) - cfg.debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]); - ret = -EINVAL; switch (cfg.encap) { case L2TP_ENCAPTYPE_UDP: case L2TP_ENCAPTYPE_IP: - ret = l2tp_tunnel_create(net, fd, proto_version, tunnel_id, + ret = l2tp_tunnel_create(fd, proto_version, tunnel_id, peer_tunnel_id, &cfg, &tunnel); break; } @@ -307,9 +304,6 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info goto out; } - if (info->attrs[L2TP_ATTR_DEBUG]) - tunnel->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); @@ -400,7 +394,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) || nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || - nla_put_u32(skb, L2TP_ATTR_DEBUG, tunnel->debug) || + nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap)) goto nla_put_failure; @@ -605,9 +599,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); } - if (info->attrs[L2TP_ATTR_DEBUG]) - cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - if (info->attrs[L2TP_ATTR_RECV_SEQ]) cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); @@ -689,9 +680,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf goto out; } - if (info->attrs[L2TP_ATTR_DEBUG]) - session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - if (info->attrs[L2TP_ATTR_RECV_SEQ]) session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); @@ -730,7 +718,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) || - nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) || + nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype)) goto nla_put_failure; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 13c3153b40d6..aea85f91f059 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -237,17 +237,9 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int if (sk->sk_state & PPPOX_BOUND) { struct pppox_sock *po; - l2tp_dbg(session, L2TP_MSG_DATA, - "%s: recv %d byte data frame, passing to ppp\n", - session->name, data_len); - po = pppox_sk(sk); ppp_input(&po->chan, skb); } else { - l2tp_dbg(session, L2TP_MSG_DATA, - "%s: recv %d byte data frame, passing to L2TP socket\n", - session->name, data_len); - if (sock_queue_rcv_skb(sk, skb) < 0) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); @@ -259,7 +251,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int no_sock: rcu_read_unlock(); - l2tp_info(session, L2TP_MSG_DATA, "%s: no socket\n", session->name); + pr_warn_ratelimited("%s: no socket in recv\n", session->name); kfree_skb(skb); } @@ -324,7 +316,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, } local_bh_disable(); - l2tp_xmit_skb(session, skb, session->hdr_len); + l2tp_xmit_skb(session, skb); local_bh_enable(); sock_put(sk); @@ -383,7 +375,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) skb->data[1] = PPP_UI; local_bh_disable(); - l2tp_xmit_skb(session, skb, session->hdr_len); + l2tp_xmit_skb(session, skb); local_bh_enable(); sock_put(sk); @@ -710,7 +702,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (!tunnel) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, - .debug = 0, }; /* Prevent l2tp_tunnel_register() from trying to set up @@ -721,7 +712,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } - error = l2tp_tunnel_create(sock_net(sk), info.fd, + error = l2tp_tunnel_create(info.fd, info.version, info.tunnel_id, info.peer_tunnel_id, &tcfg, @@ -840,8 +831,6 @@ out_no_ppp: drop_refcnt = false; sk->sk_state = PPPOX_CONNECTED; - l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n", - session->name); end: if (error) { @@ -1076,6 +1065,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, if (!session) return -ENOTCONN; + if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) + return -EBADF; + /* Not defined for tunnels */ if (!session->session_id && !session->peer_session_id) return -ENOSYS; @@ -1090,6 +1082,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, if (!session) return -ENOTCONN; + if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) + return -EBADF; + /* Not defined for tunnels */ if (!session->session_id && !session->peer_session_id) return -ENOSYS; @@ -1103,6 +1098,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, if (!session) return -ENOTCONN; + if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) + return -EBADF; + /* Session 0 represents the parent tunnel */ if (!session->session_id && !session->peer_session_id) { u32 session_id; @@ -1157,9 +1155,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_DEBUG: - tunnel->debug = val; - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: set debug=%x\n", - tunnel->name, tunnel->debug); + /* Tunnel debug flags option is deprecated */ break; default: @@ -1185,9 +1181,6 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; } session->recv_seq = !!val; - l2tp_info(session, L2TP_MSG_CONTROL, - "%s: set recv_seq=%d\n", - session->name, session->recv_seq); break; case PPPOL2TP_SO_SENDSEQ: @@ -1203,9 +1196,6 @@ static int pppol2tp_session_setsockopt(struct sock *sk, PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; } l2tp_session_set_header_len(session, session->tunnel->version); - l2tp_info(session, L2TP_MSG_CONTROL, - "%s: set send_seq=%d\n", - session->name, session->send_seq); break; case PPPOL2TP_SO_LNSMODE: @@ -1214,22 +1204,14 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; } session->lns_mode = !!val; - l2tp_info(session, L2TP_MSG_CONTROL, - "%s: set lns_mode=%d\n", - session->name, session->lns_mode); break; case PPPOL2TP_SO_DEBUG: - session->debug = val; - l2tp_info(session, L2TP_MSG_CONTROL, "%s: set debug=%x\n", - session->name, session->debug); + /* Session debug flags option is deprecated */ break; case PPPOL2TP_SO_REORDERTO: session->reorder_timeout = msecs_to_jiffies(val); - l2tp_info(session, L2TP_MSG_CONTROL, - "%s: set reorder_timeout=%d\n", - session->name, session->reorder_timeout); break; default: @@ -1297,9 +1279,8 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_DEBUG: - *val = tunnel->debug; - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get debug=%x\n", - tunnel->name, tunnel->debug); + /* Tunnel debug flags option is deprecated */ + *val = 0; break; default: @@ -1321,32 +1302,23 @@ static int pppol2tp_session_getsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_RECVSEQ: *val = session->recv_seq; - l2tp_info(session, L2TP_MSG_CONTROL, - "%s: get recv_seq=%d\n", session->name, *val); break; case PPPOL2TP_SO_SENDSEQ: *val = session->send_seq; - l2tp_info(session, L2TP_MSG_CONTROL, - "%s: get send_seq=%d\n", session->name, *val); break; case PPPOL2TP_SO_LNSMODE: *val = session->lns_mode; - l2tp_info(session, L2TP_MSG_CONTROL, - "%s: get lns_mode=%d\n", session->name, *val); break; case PPPOL2TP_SO_DEBUG: - *val = session->debug; - l2tp_info(session, L2TP_MSG_CONTROL, "%s: get debug=%d\n", - session->name, *val); + /* Session debug flags option is deprecated */ + *val = 0; break; case PPPOL2TP_SO_REORDERTO: *val = (int)jiffies_to_msecs(session->reorder_timeout); - l2tp_info(session, L2TP_MSG_CONTROL, - "%s: get reorder_timeout=%d\n", session->name, *val); break; default: @@ -1534,7 +1506,7 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N', refcount_read(&tunnel->ref_count) - 1); seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n", - tunnel->debug, + 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), atomic_long_read(&tunnel->stats.tx_errors), @@ -1580,7 +1552,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", - session->debug, + 0, jiffies_to_msecs(session->reorder_timeout)); seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n", session->nr, session->ns, diff --git a/net/l2tp/trace.h b/net/l2tp/trace.h new file mode 100644 index 000000000000..8596eaa12a2e --- /dev/null +++ b/net/l2tp/trace.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM l2tp + +#if !defined(_TRACE_L2TP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_L2TP_H + +#include <linux/tracepoint.h> +#include <linux/l2tp.h> +#include "l2tp_core.h" + +#define encap_type_name(e) { L2TP_ENCAPTYPE_##e, #e } +#define show_encap_type_name(val) \ + __print_symbolic(val, \ + encap_type_name(UDP), \ + encap_type_name(IP)) + +#define pw_type_name(p) { L2TP_PWTYPE_##p, #p } +#define show_pw_type_name(val) \ + __print_symbolic(val, \ + pw_type_name(ETH_VLAN), \ + pw_type_name(ETH), \ + pw_type_name(PPP), \ + pw_type_name(PPP_AC), \ + pw_type_name(IP)) + +DECLARE_EVENT_CLASS(tunnel_only_evt, + TP_PROTO(struct l2tp_tunnel *tunnel), + TP_ARGS(tunnel), + TP_STRUCT__entry( + __array(char, name, L2TP_TUNNEL_NAME_MAX) + ), + TP_fast_assign( + memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); + ), + TP_printk("%s", __entry->name) +); + +DECLARE_EVENT_CLASS(session_only_evt, + TP_PROTO(struct l2tp_session *session), + TP_ARGS(session), + TP_STRUCT__entry( + __array(char, name, L2TP_SESSION_NAME_MAX) + ), + TP_fast_assign( + memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); + ), + TP_printk("%s", __entry->name) +); + +TRACE_EVENT(register_tunnel, + TP_PROTO(struct l2tp_tunnel *tunnel), + TP_ARGS(tunnel), + TP_STRUCT__entry( + __array(char, name, L2TP_TUNNEL_NAME_MAX) + __field(int, fd) + __field(u32, tid) + __field(u32, ptid) + __field(int, version) + __field(enum l2tp_encap_type, encap) + ), + TP_fast_assign( + memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); + __entry->fd = tunnel->fd; + __entry->tid = tunnel->tunnel_id; + __entry->ptid = tunnel->peer_tunnel_id; + __entry->version = tunnel->version; + __entry->encap = tunnel->encap; + ), + TP_printk("%s: type=%s encap=%s version=L2TPv%d tid=%u ptid=%u fd=%d", + __entry->name, + __entry->fd > 0 ? "managed" : "unmanaged", + show_encap_type_name(__entry->encap), + __entry->version, + __entry->tid, + __entry->ptid, + __entry->fd) +); + +DEFINE_EVENT(tunnel_only_evt, delete_tunnel, + TP_PROTO(struct l2tp_tunnel *tunnel), + TP_ARGS(tunnel) +); + +DEFINE_EVENT(tunnel_only_evt, free_tunnel, + TP_PROTO(struct l2tp_tunnel *tunnel), + TP_ARGS(tunnel) +); + +TRACE_EVENT(register_session, + TP_PROTO(struct l2tp_session *session), + TP_ARGS(session), + TP_STRUCT__entry( + __array(char, name, L2TP_SESSION_NAME_MAX) + __field(u32, tid) + __field(u32, ptid) + __field(u32, sid) + __field(u32, psid) + __field(enum l2tp_pwtype, pwtype) + ), + TP_fast_assign( + memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); + __entry->tid = session->tunnel ? session->tunnel->tunnel_id : 0; + __entry->ptid = session->tunnel ? session->tunnel->peer_tunnel_id : 0; + __entry->sid = session->session_id; + __entry->psid = session->peer_session_id; + __entry->pwtype = session->pwtype; + ), + TP_printk("%s: pseudowire=%s sid=%u psid=%u tid=%u ptid=%u", + __entry->name, + show_pw_type_name(__entry->pwtype), + __entry->sid, + __entry->psid, + __entry->sid, + __entry->psid) +); + +DEFINE_EVENT(session_only_evt, delete_session, + TP_PROTO(struct l2tp_session *session), + TP_ARGS(session) +); + +DEFINE_EVENT(session_only_evt, free_session, + TP_PROTO(struct l2tp_session *session), + TP_ARGS(session) +); + +DEFINE_EVENT(session_only_evt, session_seqnum_lns_enable, + TP_PROTO(struct l2tp_session *session), + TP_ARGS(session) +); + +DEFINE_EVENT(session_only_evt, session_seqnum_lns_disable, + TP_PROTO(struct l2tp_session *session), + TP_ARGS(session) +); + +DECLARE_EVENT_CLASS(session_seqnum_evt, + TP_PROTO(struct l2tp_session *session), + TP_ARGS(session), + TP_STRUCT__entry( + __array(char, name, L2TP_SESSION_NAME_MAX) + __field(u32, ns) + __field(u32, nr) + ), + TP_fast_assign( + memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); + __entry->ns = session->ns; + __entry->nr = session->nr; + ), + TP_printk("%s: ns=%u nr=%u", + __entry->name, + __entry->ns, + __entry->nr) +); + +DEFINE_EVENT(session_seqnum_evt, session_seqnum_update, + TP_PROTO(struct l2tp_session *session), + TP_ARGS(session) +); + +DEFINE_EVENT(session_seqnum_evt, session_seqnum_reset, + TP_PROTO(struct l2tp_session *session), + TP_ARGS(session) +); + +DECLARE_EVENT_CLASS(session_pkt_discard_evt, + TP_PROTO(struct l2tp_session *session, u32 pkt_ns), + TP_ARGS(session, pkt_ns), + TP_STRUCT__entry( + __array(char, name, L2TP_SESSION_NAME_MAX) + __field(u32, pkt_ns) + __field(u32, my_nr) + __field(u32, reorder_q_len) + ), + TP_fast_assign( + memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); + __entry->pkt_ns = pkt_ns, + __entry->my_nr = session->nr; + __entry->reorder_q_len = skb_queue_len(&session->reorder_q); + ), + TP_printk("%s: pkt_ns=%u my_nr=%u reorder_q_len=%u", + __entry->name, + __entry->pkt_ns, + __entry->my_nr, + __entry->reorder_q_len) +); + +DEFINE_EVENT(session_pkt_discard_evt, session_pkt_expired, + TP_PROTO(struct l2tp_session *session, u32 pkt_ns), + TP_ARGS(session, pkt_ns) +); + +DEFINE_EVENT(session_pkt_discard_evt, session_pkt_outside_rx_window, + TP_PROTO(struct l2tp_session *session, u32 pkt_ns), + TP_ARGS(session, pkt_ns) +); + +DEFINE_EVENT(session_pkt_discard_evt, session_pkt_oos, + TP_PROTO(struct l2tp_session *session, u32 pkt_ns), + TP_ARGS(session, pkt_ns) +); + +#endif /* _TRACE_L2TP_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 313ba97acae3..cd4cf84a7f99 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -350,7 +350,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, sta->sta.addr, tid); /* We have no API to update the timeout value in the * driver so reject the timeout update if the timeout - * changed. If if did not change, i.e., no real update, + * changed. If it did not change, i.e., no real update, * just reply with success. */ rcu_read_lock(); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 87fddd84c621..8d75a4045d6e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -826,9 +826,9 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, memcpy(new->data, resp, resp_len); if (csa) - memcpy(new->csa_counter_offsets, csa->counter_offsets_presp, + memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp, csa->n_counter_offsets_presp * - sizeof(new->csa_counter_offsets[0])); + sizeof(new->cntdwn_counter_offsets[0])); rcu_assign_pointer(sdata->u.ap.probe_resp, new); if (old) @@ -837,6 +837,59 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, return 0; } +static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata, + struct cfg80211_fils_discovery *params) +{ + struct fils_discovery_data *new, *old = NULL; + struct ieee80211_fils_discovery *fd; + + if (!params->tmpl || !params->tmpl_len) + return -EINVAL; + + fd = &sdata->vif.bss_conf.fils_discovery; + fd->min_interval = params->min_interval; + fd->max_interval = params->max_interval; + + old = sdata_dereference(sdata->u.ap.fils_discovery, sdata); + new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL); + if (!new) + return -ENOMEM; + new->len = params->tmpl_len; + memcpy(new->data, params->tmpl, params->tmpl_len); + rcu_assign_pointer(sdata->u.ap.fils_discovery, new); + + if (old) + kfree_rcu(old, rcu_head); + + return 0; +} + +static int +ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata, + struct cfg80211_unsol_bcast_probe_resp *params) +{ + struct unsol_bcast_probe_resp_data *new, *old = NULL; + + if (!params->tmpl || !params->tmpl_len) + return -EINVAL; + + old = sdata_dereference(sdata->u.ap.unsol_bcast_probe_resp, sdata); + new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL); + if (!new) + return -ENOMEM; + new->len = params->tmpl_len; + memcpy(new->data, params->tmpl, params->tmpl_len); + rcu_assign_pointer(sdata->u.ap.unsol_bcast_probe_resp, new); + + if (old) + kfree_rcu(old, rcu_head); + + sdata->vif.bss_conf.unsol_bcast_probe_resp_interval = + params->interval; + + return 0; +} + static int ieee80211_set_ftm_responder_params( struct ieee80211_sub_if_data *sdata, const u8 *lci, size_t lci_len, @@ -926,10 +979,10 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, new->tail_len = new_tail_len; if (csa) { - new->csa_current_counter = csa->count; - memcpy(new->csa_counter_offsets, csa->counter_offsets_beacon, + new->cntdwn_current_counter = csa->count; + memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon, csa->n_counter_offsets_beacon * - sizeof(new->csa_counter_offsets[0])); + sizeof(new->cntdwn_counter_offsets[0])); } /* copy in head */ @@ -1099,12 +1152,26 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, } err = ieee80211_assign_beacon(sdata, ¶ms->beacon, NULL); - if (err < 0) { - ieee80211_vif_release_channel(sdata); - return err; - } + if (err < 0) + goto error; changed |= err; + if (params->fils_discovery.max_interval) { + err = ieee80211_set_fils_discovery(sdata, + ¶ms->fils_discovery); + if (err < 0) + goto error; + changed |= BSS_CHANGED_FILS_DISCOVERY; + } + + if (params->unsol_bcast_probe_resp.interval) { + err = ieee80211_set_unsol_bcast_probe_resp(sdata, + ¶ms->unsol_bcast_probe_resp); + if (err < 0) + goto error; + changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP; + } + err = drv_start_ap(sdata->local, sdata); if (err) { old = sdata_dereference(sdata->u.ap.beacon, sdata); @@ -1112,8 +1179,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(sdata->u.ap.beacon, NULL); - ieee80211_vif_release_channel(sdata); - return err; + goto error; } ieee80211_recalc_dtim(local, sdata); @@ -1124,6 +1190,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, netif_carrier_on(vlan->dev); return 0; + +error: + ieee80211_vif_release_channel(sdata); + return err; } static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, @@ -1160,6 +1230,8 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) struct ieee80211_local *local = sdata->local; struct beacon_data *old_beacon; struct probe_resp *old_probe_resp; + struct fils_discovery_data *old_fils_discovery; + struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp; struct cfg80211_chan_def chandef; sdata_assert_lock(sdata); @@ -1168,6 +1240,11 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) if (!old_beacon) return -ENOENT; old_probe_resp = sdata_dereference(sdata->u.ap.probe_resp, sdata); + old_fils_discovery = sdata_dereference(sdata->u.ap.fils_discovery, + sdata); + old_unsol_bcast_probe_resp = + sdata_dereference(sdata->u.ap.unsol_bcast_probe_resp, + sdata); /* abort any running channel switch */ mutex_lock(&local->mtx); @@ -1191,9 +1268,15 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) /* remove beacon and probe response */ RCU_INIT_POINTER(sdata->u.ap.beacon, NULL); RCU_INIT_POINTER(sdata->u.ap.probe_resp, NULL); + RCU_INIT_POINTER(sdata->u.ap.fils_discovery, NULL); + RCU_INIT_POINTER(sdata->u.ap.unsol_bcast_probe_resp, NULL); kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); + if (old_fils_discovery) + kfree_rcu(old_fils_discovery, rcu_head); + if (old_unsol_bcast_probe_resp) + kfree_rcu(old_unsol_bcast_probe_resp, rcu_head); kfree(sdata->vif.bss_conf.ftmr_params); sdata->vif.bss_conf.ftmr_params = NULL; @@ -1696,6 +1779,7 @@ static int ieee80211_change_station(struct wiphy *wiphy, rcu_assign_pointer(vlansdata->u.vlan.sta, sta); __ieee80211_check_fast_rx_iface(vlansdata); + drv_sta_set_4addr(local, sta->sdata, &sta->sta, true); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && @@ -3186,9 +3270,9 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, break; if ((params->n_counter_offsets_beacon > - IEEE80211_MAX_CSA_COUNTERS_NUM) || + IEEE80211_MAX_CNTDWN_COUNTERS_NUM) || (params->n_counter_offsets_presp > - IEEE80211_MAX_CSA_COUNTERS_NUM)) + IEEE80211_MAX_CNTDWN_COUNTERS_NUM)) return -EINVAL; csa.counter_offsets_beacon = params->counter_offsets_beacon; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 54080290d6e2..90470392fdaa 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -408,6 +408,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), FLAG(AMPDU_KEYBORDER_SUPPORT), + FLAG(SUPPORTS_TX_ENCAP_OFFLOAD), #undef FLAG }; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 41d495d73d3a..bcdfd19a596b 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1384,4 +1384,33 @@ static inline int drv_reset_tid_config(struct ieee80211_local *local, return ret; } + +static inline void drv_update_vif_offload(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + might_sleep(); + check_sdata_in_driver(sdata); + + if (!local->ops->update_vif_offload) + return; + + trace_drv_update_vif_offload(local, sdata); + local->ops->update_vif_offload(&local->hw, &sdata->vif); + trace_drv_return_void(local); +} + +static inline void drv_sta_set_4addr(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, bool enabled) +{ + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_sta_set_4addr(local, sdata, sta, enabled); + if (local->ops->sta_set_4addr) + local->ops->sta_set_4addr(&local->hw, &sdata->vif, sta, enabled); + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 53632c2f5217..c0963969a465 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -145,9 +145,9 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, *pos++ = csa_settings->block_tx ? 1 : 0; *pos++ = ieee80211_frequency_to_channel( csa_settings->chandef.chan->center_freq); - presp->csa_counter_offsets[0] = (pos - presp->head); + presp->cntdwn_counter_offsets[0] = (pos - presp->head); *pos++ = csa_settings->count; - presp->csa_current_counter = csa_settings->count; + presp->cntdwn_current_counter = csa_settings->count; } /* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 0b1eaec6649f..ecd9229012bf 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -259,15 +259,27 @@ struct beacon_data { u8 *head, *tail; int head_len, tail_len; struct ieee80211_meshconf_ie *meshconf; - u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM]; - u8 csa_current_counter; + u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; + u8 cntdwn_current_counter; struct rcu_head rcu_head; }; struct probe_resp { struct rcu_head rcu_head; int len; - u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM]; + u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; + u8 data[]; +}; + +struct fils_discovery_data { + struct rcu_head rcu_head; + int len; + u8 data[]; +}; + +struct unsol_bcast_probe_resp_data { + struct rcu_head rcu_head; + int len; u8 data[]; }; @@ -286,6 +298,8 @@ struct ps_data { struct ieee80211_if_ap { struct beacon_data __rcu *beacon; struct probe_resp __rcu *probe_resp; + struct fils_discovery_data __rcu *fils_discovery; + struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp; /* to be used after channel switch. */ struct cfg80211_beacon_data *next_beacon; @@ -989,8 +1003,6 @@ struct ieee80211_sub_if_data { } debugfs; #endif - bool hw_80211_encap; - /* must be last, dynamically sized area in this! */ struct ieee80211_vif vif; }; @@ -1767,6 +1779,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata, bool update_bss); +void ieee80211_recalc_offload(struct ieee80211_local *local); static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) { @@ -2040,8 +2053,6 @@ void ieee80211_dynamic_ps_timer(struct timer_list *t); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave); -void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, - struct ieee80211_hdr *hdr); void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack, u16 tx_time); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9740ae8fa697..7ac9af66f545 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -348,439 +348,6 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata, return 0; } -void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, - const int offset) -{ - struct ieee80211_local *local = sdata->local; - u32 flags = sdata->u.mntr.flags; - -#define ADJUST(_f, _s) do { \ - if (flags & MONITOR_FLAG_##_f) \ - local->fif_##_s += offset; \ - } while (0) - - ADJUST(FCSFAIL, fcsfail); - ADJUST(PLCPFAIL, plcpfail); - ADJUST(CONTROL, control); - ADJUST(CONTROL, pspoll); - ADJUST(OTHER_BSS, other_bss); - -#undef ADJUST -} - -static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_local *local = sdata->local; - int i; - - for (i = 0; i < IEEE80211_NUM_ACS; i++) { - if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) - sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE; - else if (local->hw.queues >= IEEE80211_NUM_ACS) - sdata->vif.hw_queue[i] = i; - else - sdata->vif.hw_queue[i] = 0; - } - sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE; -} - -int ieee80211_add_virtual_monitor(struct ieee80211_local *local) -{ - struct ieee80211_sub_if_data *sdata; - int ret; - - if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) - return 0; - - ASSERT_RTNL(); - - if (local->monitor_sdata) - return 0; - - sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL); - if (!sdata) - return -ENOMEM; - - /* set up data */ - sdata->local = local; - sdata->vif.type = NL80211_IFTYPE_MONITOR; - snprintf(sdata->name, IFNAMSIZ, "%s-monitor", - wiphy_name(local->hw.wiphy)); - sdata->wdev.iftype = NL80211_IFTYPE_MONITOR; - - sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM; - - ieee80211_set_default_queues(sdata); - - ret = drv_add_interface(local, sdata); - if (WARN_ON(ret)) { - /* ok .. stupid driver, it asked for this! */ - kfree(sdata); - return ret; - } - - ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR); - if (ret) { - kfree(sdata); - return ret; - } - - mutex_lock(&local->iflist_mtx); - rcu_assign_pointer(local->monitor_sdata, sdata); - mutex_unlock(&local->iflist_mtx); - - mutex_lock(&local->mtx); - ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef, - IEEE80211_CHANCTX_EXCLUSIVE); - mutex_unlock(&local->mtx); - if (ret) { - mutex_lock(&local->iflist_mtx); - RCU_INIT_POINTER(local->monitor_sdata, NULL); - mutex_unlock(&local->iflist_mtx); - synchronize_net(); - drv_remove_interface(local, sdata); - kfree(sdata); - return ret; - } - - skb_queue_head_init(&sdata->skb_queue); - INIT_WORK(&sdata->work, ieee80211_iface_work); - - return 0; -} - -void ieee80211_del_virtual_monitor(struct ieee80211_local *local) -{ - struct ieee80211_sub_if_data *sdata; - - if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) - return; - - ASSERT_RTNL(); - - mutex_lock(&local->iflist_mtx); - - sdata = rcu_dereference_protected(local->monitor_sdata, - lockdep_is_held(&local->iflist_mtx)); - if (!sdata) { - mutex_unlock(&local->iflist_mtx); - return; - } - - RCU_INIT_POINTER(local->monitor_sdata, NULL); - mutex_unlock(&local->iflist_mtx); - - synchronize_net(); - - mutex_lock(&local->mtx); - ieee80211_vif_release_channel(sdata); - mutex_unlock(&local->mtx); - - drv_remove_interface(local, sdata); - - kfree(sdata); -} - -/* - * NOTE: Be very careful when changing this function, it must NOT return - * an error on interface type changes that have been pre-checked, so most - * checks should be in ieee80211_check_concurrent_iface. - */ -int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - struct net_device *dev = wdev->netdev; - struct ieee80211_local *local = sdata->local; - struct sta_info *sta; - u32 changed = 0; - int res; - u32 hw_reconf_flags = 0; - - switch (sdata->vif.type) { - case NL80211_IFTYPE_WDS: - if (!is_valid_ether_addr(sdata->u.wds.remote_addr)) - return -ENOLINK; - break; - case NL80211_IFTYPE_AP_VLAN: { - struct ieee80211_sub_if_data *master; - - if (!sdata->bss) - return -ENOLINK; - - mutex_lock(&local->mtx); - list_add(&sdata->u.vlan.list, &sdata->bss->vlans); - mutex_unlock(&local->mtx); - - master = container_of(sdata->bss, - struct ieee80211_sub_if_data, u.ap); - sdata->control_port_protocol = - master->control_port_protocol; - sdata->control_port_no_encrypt = - master->control_port_no_encrypt; - sdata->control_port_over_nl80211 = - master->control_port_over_nl80211; - sdata->control_port_no_preauth = - master->control_port_no_preauth; - sdata->vif.cab_queue = master->vif.cab_queue; - memcpy(sdata->vif.hw_queue, master->vif.hw_queue, - sizeof(sdata->vif.hw_queue)); - sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef; - - mutex_lock(&local->key_mtx); - sdata->crypto_tx_tailroom_needed_cnt += - master->crypto_tx_tailroom_needed_cnt; - mutex_unlock(&local->key_mtx); - - break; - } - case NL80211_IFTYPE_AP: - sdata->bss = &sdata->u.ap; - break; - case NL80211_IFTYPE_MESH_POINT: - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_MONITOR: - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_P2P_DEVICE: - case NL80211_IFTYPE_OCB: - case NL80211_IFTYPE_NAN: - /* no special treatment */ - break; - case NL80211_IFTYPE_UNSPECIFIED: - case NUM_NL80211_IFTYPES: - case NL80211_IFTYPE_P2P_CLIENT: - case NL80211_IFTYPE_P2P_GO: - /* cannot happen */ - WARN_ON(1); - break; - } - - if (local->open_count == 0) { - res = drv_start(local); - if (res) - goto err_del_bss; - /* we're brought up, everything changes */ - hw_reconf_flags = ~0; - ieee80211_led_radio(local, true); - ieee80211_mod_tpt_led_trig(local, - IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); - } - - /* - * Copy the hopefully now-present MAC address to - * this interface, if it has the special null one. - */ - if (dev && is_zero_ether_addr(dev->dev_addr)) { - memcpy(dev->dev_addr, - local->hw.wiphy->perm_addr, - ETH_ALEN); - memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN); - - if (!is_valid_ether_addr(dev->dev_addr)) { - res = -EADDRNOTAVAIL; - goto err_stop; - } - } - - switch (sdata->vif.type) { - case NL80211_IFTYPE_AP_VLAN: - /* no need to tell driver, but set carrier and chanctx */ - if (rtnl_dereference(sdata->bss->beacon)) { - ieee80211_vif_vlan_copy_chanctx(sdata); - netif_carrier_on(dev); - } else { - netif_carrier_off(dev); - } - break; - case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { - local->cooked_mntrs++; - break; - } - - if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { - res = drv_add_interface(local, sdata); - if (res) - goto err_stop; - } else if (local->monitors == 0 && local->open_count == 0) { - res = ieee80211_add_virtual_monitor(local); - if (res) - goto err_stop; - } - - /* must be before the call to ieee80211_configure_filter */ - local->monitors++; - if (local->monitors == 1) { - local->hw.conf.flags |= IEEE80211_CONF_MONITOR; - hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; - } - - ieee80211_adjust_monitor_flags(sdata, 1); - ieee80211_configure_filter(local); - mutex_lock(&local->mtx); - ieee80211_recalc_idle(local); - mutex_unlock(&local->mtx); - - netif_carrier_on(dev); - break; - default: - if (coming_up) { - ieee80211_del_virtual_monitor(local); - - res = drv_add_interface(local, sdata); - if (res) - goto err_stop; - res = ieee80211_check_queues(sdata, - ieee80211_vif_type_p2p(&sdata->vif)); - if (res) - goto err_del_interface; - } - - if (sdata->vif.type == NL80211_IFTYPE_AP) { - local->fif_pspoll++; - local->fif_probe_req++; - - ieee80211_configure_filter(local); - } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { - local->fif_probe_req++; - } - - if (sdata->vif.probe_req_reg) - drv_config_iface_filter(local, sdata, - FIF_PROBE_REQ, - FIF_PROBE_REQ); - - if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && - sdata->vif.type != NL80211_IFTYPE_NAN) - changed |= ieee80211_reset_erp_info(sdata); - ieee80211_bss_info_change_notify(sdata, changed); - - switch (sdata->vif.type) { - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_MESH_POINT: - case NL80211_IFTYPE_OCB: - netif_carrier_off(dev); - break; - case NL80211_IFTYPE_WDS: - case NL80211_IFTYPE_P2P_DEVICE: - case NL80211_IFTYPE_NAN: - break; - default: - /* not reached */ - WARN_ON(1); - } - - /* - * Set default queue parameters so drivers don't - * need to initialise the hardware if the hardware - * doesn't start up with sane defaults. - * Enable QoS for anything but station interfaces. - */ - ieee80211_set_wmm_default(sdata, true, - sdata->vif.type != NL80211_IFTYPE_STATION); - } - - set_bit(SDATA_STATE_RUNNING, &sdata->state); - - switch (sdata->vif.type) { - case NL80211_IFTYPE_WDS: - /* Create STA entry for the WDS peer */ - sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr, - GFP_KERNEL); - if (!sta) { - res = -ENOMEM; - goto err_del_interface; - } - - sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); - sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); - sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); - - res = sta_info_insert(sta); - if (res) { - /* STA has been freed */ - goto err_del_interface; - } - - rate_control_rate_init(sta); - netif_carrier_on(dev); - break; - case NL80211_IFTYPE_P2P_DEVICE: - rcu_assign_pointer(local->p2p_sdata, sdata); - break; - case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) - break; - list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); - break; - default: - break; - } - - /* - * set_multicast_list will be invoked by the networking core - * which will check whether any increments here were done in - * error and sync them down to the hardware as filter flags. - */ - if (sdata->flags & IEEE80211_SDATA_ALLMULTI) - atomic_inc(&local->iff_allmultis); - - if (coming_up) - local->open_count++; - - if (hw_reconf_flags) - ieee80211_hw_config(local, hw_reconf_flags); - - ieee80211_recalc_ps(local); - - if (sdata->vif.type == NL80211_IFTYPE_MONITOR || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - local->ops->wake_tx_queue) { - /* XXX: for AP_VLAN, actually track AP queues */ - if (dev) - netif_tx_start_all_queues(dev); - } else if (dev) { - unsigned long flags; - int n_acs = IEEE80211_NUM_ACS; - int ac; - - if (local->hw.queues < IEEE80211_NUM_ACS) - n_acs = 1; - - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE || - (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 && - skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) { - for (ac = 0; ac < n_acs; ac++) { - int ac_queue = sdata->vif.hw_queue[ac]; - - if (local->queue_stop_reasons[ac_queue] == 0 && - skb_queue_empty(&local->pending[ac_queue])) - netif_start_subqueue(dev, ac); - } - } - spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - } - - return 0; - err_del_interface: - drv_remove_interface(local, sdata); - err_stop: - if (!local->open_count) - drv_stop(local); - err_del_bss: - sdata->bss = NULL; - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { - mutex_lock(&local->mtx); - list_del(&sdata->u.vlan.list); - mutex_unlock(&local->mtx); - } - /* might already be clear but that doesn't matter */ - clear_bit(SDATA_STATE_RUNNING, &sdata->state); - return res; -} - static int ieee80211_open(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -1227,60 +794,547 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = { .ndo_get_stats64 = ieee80211_get_stats64, }; -static void __ieee80211_set_hw_80211_encap(struct ieee80211_sub_if_data *sdata, - bool enable) +static bool ieee80211_iftype_supports_encap_offload(enum nl80211_iftype iftype) { - sdata->dev->netdev_ops = enable ? &ieee80211_dataif_8023_ops : - &ieee80211_dataif_ops; - sdata->hw_80211_encap = enable; + switch (iftype) { + /* P2P GO and client are mapped to AP/STATION types */ + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_STATION: + return true; + default: + return false; + } } -bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable) +static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; - struct ieee80211_sub_if_data *iter; - struct ieee80211_key *key; + u32 flags; + + flags = sdata->vif.offload_flags; + + if (ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) && + ieee80211_iftype_supports_encap_offload(sdata->vif.type)) { + flags |= IEEE80211_OFFLOAD_ENCAP_ENABLED; + + if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) && + local->hw.wiphy->frag_threshold != (u32)-1) + flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; + + if (local->monitors) + flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; + } else { + flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; + } + + if (sdata->vif.offload_flags == flags) + return false; + + sdata->vif.offload_flags = flags; + return true; +} + +static void ieee80211_set_vif_encap_ops(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *bss = sdata; + bool enabled; + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + if (!sdata->bss) + return; + + bss = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); + } + + if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) || + !ieee80211_iftype_supports_encap_offload(bss->vif.type)) + return; + + enabled = bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED; + if (sdata->wdev.use_4addr && + !(bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_4ADDR)) + enabled = false; + + sdata->dev->netdev_ops = enabled ? &ieee80211_dataif_8023_ops : + &ieee80211_dataif_ops; +} + +static void ieee80211_recalc_sdata_offload(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *vsdata; + + if (ieee80211_set_sdata_offload_flags(sdata)) { + drv_update_vif_offload(local, sdata); + ieee80211_set_vif_encap_ops(sdata); + } + + list_for_each_entry(vsdata, &local->interfaces, list) { + if (vsdata->vif.type != NL80211_IFTYPE_AP_VLAN || + vsdata->bss != &sdata->u.ap) + continue; + + ieee80211_set_vif_encap_ops(vsdata); + } +} + +void ieee80211_recalc_offload(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + + if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD)) + return; mutex_lock(&local->iflist_mtx); - list_for_each_entry(iter, &local->interfaces, list) { - struct ieee80211_sub_if_data *disable = NULL; - - if (vif->type == NL80211_IFTYPE_MONITOR) { - disable = iter; - __ieee80211_set_hw_80211_encap(iter, false); - } else if (iter->vif.type == NL80211_IFTYPE_MONITOR) { - disable = sdata; - enable = false; - } - if (disable) - sdata_dbg(disable, - "disable hw 80211 encap due to mon co-exist\n"); + + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + + ieee80211_recalc_sdata_offload(sdata); } + mutex_unlock(&local->iflist_mtx); +} - if (enable == sdata->hw_80211_encap) - return enable; +void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, + const int offset) +{ + struct ieee80211_local *local = sdata->local; + u32 flags = sdata->u.mntr.flags; - if (!sdata->dev) - return false; +#define ADJUST(_f, _s) do { \ + if (flags & MONITOR_FLAG_##_f) \ + local->fif_##_s += offset; \ + } while (0) + + ADJUST(FCSFAIL, fcsfail); + ADJUST(PLCPFAIL, plcpfail); + ADJUST(CONTROL, control); + ADJUST(CONTROL, pspoll); + ADJUST(OTHER_BSS, other_bss); + +#undef ADJUST +} + +static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + int i; + + for (i = 0; i < IEEE80211_NUM_ACS; i++) { + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) + sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE; + else if (local->hw.queues >= IEEE80211_NUM_ACS) + sdata->vif.hw_queue[i] = i; + else + sdata->vif.hw_queue[i] = 0; + } + sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE; +} + +int ieee80211_add_virtual_monitor(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + int ret; + + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return 0; + + ASSERT_RTNL(); + + if (local->monitor_sdata) + return 0; + + sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL); + if (!sdata) + return -ENOMEM; + + /* set up data */ + sdata->local = local; + sdata->vif.type = NL80211_IFTYPE_MONITOR; + snprintf(sdata->name, IFNAMSIZ, "%s-monitor", + wiphy_name(local->hw.wiphy)); + sdata->wdev.iftype = NL80211_IFTYPE_MONITOR; + + sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM; + + ieee80211_set_default_queues(sdata); + + ret = drv_add_interface(local, sdata); + if (WARN_ON(ret)) { + /* ok .. stupid driver, it asked for this! */ + kfree(sdata); + return ret; + } + + ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR); + if (ret) { + kfree(sdata); + return ret; + } + + mutex_lock(&local->iflist_mtx); + rcu_assign_pointer(local->monitor_sdata, sdata); + mutex_unlock(&local->iflist_mtx); + + mutex_lock(&local->mtx); + ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef, + IEEE80211_CHANCTX_EXCLUSIVE); + mutex_unlock(&local->mtx); + if (ret) { + mutex_lock(&local->iflist_mtx); + RCU_INIT_POINTER(local->monitor_sdata, NULL); + mutex_unlock(&local->iflist_mtx); + synchronize_net(); + drv_remove_interface(local, sdata); + kfree(sdata); + return ret; + } + + skb_queue_head_init(&sdata->skb_queue); + INIT_WORK(&sdata->work, ieee80211_iface_work); + + return 0; +} + +void ieee80211_del_virtual_monitor(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return; + + ASSERT_RTNL(); + + mutex_lock(&local->iflist_mtx); + + sdata = rcu_dereference_protected(local->monitor_sdata, + lockdep_is_held(&local->iflist_mtx)); + if (!sdata) { + mutex_unlock(&local->iflist_mtx); + return; + } + + RCU_INIT_POINTER(local->monitor_sdata, NULL); + mutex_unlock(&local->iflist_mtx); + + synchronize_net(); + + mutex_lock(&local->mtx); + ieee80211_vif_release_channel(sdata); + mutex_unlock(&local->mtx); + + drv_remove_interface(local, sdata); + + kfree(sdata); +} + +/* + * NOTE: Be very careful when changing this function, it must NOT return + * an error on interface type changes that have been pre-checked, so most + * checks should be in ieee80211_check_concurrent_iface. + */ +int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct net_device *dev = wdev->netdev; + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + u32 changed = 0; + int res; + u32 hw_reconf_flags = 0; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_WDS: + if (!is_valid_ether_addr(sdata->u.wds.remote_addr)) + return -ENOLINK; + break; + case NL80211_IFTYPE_AP_VLAN: { + struct ieee80211_sub_if_data *master; + + if (!sdata->bss) + return -ENOLINK; + + mutex_lock(&local->mtx); + list_add(&sdata->u.vlan.list, &sdata->bss->vlans); + mutex_unlock(&local->mtx); + + master = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + sdata->control_port_protocol = + master->control_port_protocol; + sdata->control_port_no_encrypt = + master->control_port_no_encrypt; + sdata->control_port_over_nl80211 = + master->control_port_over_nl80211; + sdata->control_port_no_preauth = + master->control_port_no_preauth; + sdata->vif.cab_queue = master->vif.cab_queue; + memcpy(sdata->vif.hw_queue, master->vif.hw_queue, + sizeof(sdata->vif.hw_queue)); + sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef; + + mutex_lock(&local->key_mtx); + sdata->crypto_tx_tailroom_needed_cnt += + master->crypto_tx_tailroom_needed_cnt; + mutex_unlock(&local->key_mtx); + + break; + } + case NL80211_IFTYPE_AP: + sdata->bss = &sdata->u.ap; + break; + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_OCB: + case NL80211_IFTYPE_NAN: + /* no special treatment */ + break; + case NL80211_IFTYPE_UNSPECIFIED: + case NUM_NL80211_IFTYPES: + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_P2P_GO: + /* cannot happen */ + WARN_ON(1); + break; + } + + if (local->open_count == 0) { + res = drv_start(local); + if (res) + goto err_del_bss; + /* we're brought up, everything changes */ + hw_reconf_flags = ~0; + ieee80211_led_radio(local, true); + ieee80211_mod_tpt_led_trig(local, + IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); + } + + /* + * Copy the hopefully now-present MAC address to + * this interface, if it has the special null one. + */ + if (dev && is_zero_ether_addr(dev->dev_addr)) { + memcpy(dev->dev_addr, + local->hw.wiphy->perm_addr, + ETH_ALEN); + memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN); + + if (!is_valid_ether_addr(dev->dev_addr)) { + res = -EADDRNOTAVAIL; + goto err_stop; + } + } + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + /* no need to tell driver, but set carrier and chanctx */ + if (rtnl_dereference(sdata->bss->beacon)) { + ieee80211_vif_vlan_copy_chanctx(sdata); + netif_carrier_on(dev); + ieee80211_set_vif_encap_ops(sdata); + } else { + netif_carrier_off(dev); + } + break; + case NL80211_IFTYPE_MONITOR: + if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { + local->cooked_mntrs++; + break; + } + + if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + res = drv_add_interface(local, sdata); + if (res) + goto err_stop; + } else if (local->monitors == 0 && local->open_count == 0) { + res = ieee80211_add_virtual_monitor(local); + if (res) + goto err_stop; + } + + /* must be before the call to ieee80211_configure_filter */ + local->monitors++; + if (local->monitors == 1) { + local->hw.conf.flags |= IEEE80211_CONF_MONITOR; + hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; + } + + ieee80211_adjust_monitor_flags(sdata, 1); + ieee80211_configure_filter(local); + ieee80211_recalc_offload(local); + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); + + netif_carrier_on(dev); + break; + default: + if (coming_up) { + ieee80211_del_virtual_monitor(local); + ieee80211_set_sdata_offload_flags(sdata); + + res = drv_add_interface(local, sdata); + if (res) + goto err_stop; + + ieee80211_set_vif_encap_ops(sdata); + res = ieee80211_check_queues(sdata, + ieee80211_vif_type_p2p(&sdata->vif)); + if (res) + goto err_del_interface; + } + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + local->fif_pspoll++; + local->fif_probe_req++; + + ieee80211_configure_filter(local); + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + local->fif_probe_req++; + } + + if (sdata->vif.probe_req_reg) + drv_config_iface_filter(local, sdata, + FIF_PROBE_REQ, + FIF_PROBE_REQ); + + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_NAN) + changed |= ieee80211_reset_erp_info(sdata); + ieee80211_bss_info_change_notify(sdata, changed); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: + netif_carrier_off(dev); + break; + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: + break; + default: + /* not reached */ + WARN_ON(1); + } - if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) && - (local->hw.wiphy->frag_threshold != (u32)-1)) - enable = false; + /* + * Set default queue parameters so drivers don't + * need to initialise the hardware if the hardware + * doesn't start up with sane defaults. + * Enable QoS for anything but station interfaces. + */ + ieee80211_set_wmm_default(sdata, true, + sdata->vif.type != NL80211_IFTYPE_STATION); + } - mutex_lock(&sdata->local->key_mtx); - list_for_each_entry(key, &sdata->key_list, list) { - if (key->conf.cipher == WLAN_CIPHER_SUITE_TKIP) - enable = false; + set_bit(SDATA_STATE_RUNNING, &sdata->state); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_WDS: + /* Create STA entry for the WDS peer */ + sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr, + GFP_KERNEL); + if (!sta) { + res = -ENOMEM; + goto err_del_interface; + } + + sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); + sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); + sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); + + res = sta_info_insert(sta); + if (res) { + /* STA has been freed */ + goto err_del_interface; + } + + rate_control_rate_init(sta); + netif_carrier_on(dev); + break; + case NL80211_IFTYPE_P2P_DEVICE: + rcu_assign_pointer(local->p2p_sdata, sdata); + break; + case NL80211_IFTYPE_MONITOR: + if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) + break; + list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); + break; + default: + break; } - mutex_unlock(&sdata->local->key_mtx); - __ieee80211_set_hw_80211_encap(sdata, enable); + /* + * set_multicast_list will be invoked by the networking core + * which will check whether any increments here were done in + * error and sync them down to the hardware as filter flags. + */ + if (sdata->flags & IEEE80211_SDATA_ALLMULTI) + atomic_inc(&local->iff_allmultis); + + if (coming_up) + local->open_count++; - return enable; + if (hw_reconf_flags) + ieee80211_hw_config(local, hw_reconf_flags); + + ieee80211_recalc_ps(local); + + if (sdata->vif.type == NL80211_IFTYPE_MONITOR || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN || + local->ops->wake_tx_queue) { + /* XXX: for AP_VLAN, actually track AP queues */ + if (dev) + netif_tx_start_all_queues(dev); + } else if (dev) { + unsigned long flags; + int n_acs = IEEE80211_NUM_ACS; + int ac; + + if (local->hw.queues < IEEE80211_NUM_ACS) + n_acs = 1; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE || + (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 && + skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) { + for (ac = 0; ac < n_acs; ac++) { + int ac_queue = sdata->vif.hw_queue[ac]; + + if (local->queue_stop_reasons[ac_queue] == 0 && + skb_queue_empty(&local->pending[ac_queue])) + netif_start_subqueue(dev, ac); + } + } + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + } + + return 0; + err_del_interface: + drv_remove_interface(local, sdata); + err_stop: + if (!local->open_count) + drv_stop(local); + err_del_bss: + sdata->bss = NULL; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + mutex_lock(&local->mtx); + list_del(&sdata->u.vlan.list); + mutex_unlock(&local->mtx); + } + /* might already be clear but that doesn't matter */ + clear_bit(SDATA_STATE_RUNNING, &sdata->state); + return res; } -EXPORT_SYMBOL(ieee80211_set_hw_80211_encap); static void ieee80211_if_free(struct net_device *dev) { @@ -1484,7 +1538,6 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.txpower = INT_MIN; /* unset */ sdata->noack_map = 0; - sdata->hw_80211_encap = false; /* only monitor/p2p-device differ */ if (sdata->dev) { @@ -1619,6 +1672,7 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, ieee80211_teardown_sdata(sdata); + ieee80211_set_sdata_offload_flags(sdata); ret = drv_change_interface(local, sdata, internal_type, p2p); if (ret) type = ieee80211_vif_type_p2p(&sdata->vif); @@ -1631,6 +1685,7 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, ieee80211_check_queues(sdata, type); ieee80211_setup_sdata(sdata, type); + ieee80211_set_vif_encap_ops(sdata); err = ieee80211_do_open(&sdata->wdev, false); WARN(err, "type change: do_open returned %d", err); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 2df636c32432..8c5f829ff6d7 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -177,13 +177,6 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) } } - /* TKIP countermeasures don't work in encap offload mode */ - if (key->conf.cipher == WLAN_CIPHER_SUITE_TKIP && - sdata->hw_80211_encap) { - sdata_dbg(sdata, "TKIP is not allowed in hw 80211 encap mode\n"); - return -EINVAL; - } - ret = drv_set_key(key->local, SET_KEY, sdata, sta ? &sta->sta : NULL, &key->conf); @@ -219,14 +212,6 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - /* We cannot do software crypto of data frames with - * encapsulation offload enabled. However for 802.11w to - * function properly we need cmac/gmac keys. - */ - if (sdata->hw_80211_encap) - return -EINVAL; - fallthrough; - case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: diff --git a/net/mac80211/main.c b/net/mac80211/main.c index b4a2efe8e83a..523380aed92e 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1168,7 +1168,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT; } - local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM; + local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CNTDWN_COUNTERS_NUM; /* * We use the number of queues for feature tests (QoS, HT) internally diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 7ecd801a943b..ce5825d6f1d1 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -672,7 +672,7 @@ void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh) * @hdr: 802.11 frame header * @fc: frame control field * @meshda: destination address in the mesh - * @meshsa: source address address in the mesh. Same as TA, as frame is + * @meshsa: source address in the mesh. Same as TA, as frame is * locally originated. * * Return the length of the 802.11 (does not include a mesh control header) @@ -864,8 +864,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) *pos++ = 0x0; *pos++ = ieee80211_frequency_to_channel( csa->settings.chandef.chan->center_freq); - bcn->csa_current_counter = csa->settings.count; - bcn->csa_counter_offsets[0] = hdr_len + 6; + bcn->cntdwn_current_counter = csa->settings.count; + bcn->cntdwn_counter_offsets[0] = hdr_len + 6; *pos++ = csa->settings.count; *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; *pos++ = 6; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index bec23d2eee7a..313eee12410e 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -212,7 +212,7 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata, skb->priority = 7; info->control.vif = &sdata->vif; - info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; ieee80211_set_qos_hdr(sdata, skb); ieee80211_mps_set_frame_flags(sdata, NULL, hdr); } @@ -1163,7 +1163,7 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, if (skb_queue_len(&mpath->frame_queue) >= MESH_FRAME_QUEUE_LEN) skb_to_free = skb_dequeue(&mpath->frame_queue); - info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; ieee80211_set_qos_hdr(sdata, skb); skb_queue_tail(&mpath->frame_queue, skb); if (skb_to_free) diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 031e905f684a..76d19c09d26e 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -432,7 +432,7 @@ static void mpsp_qos_null_append(struct sta_info *sta, info = IEEE80211_SKB_CB(new_skb); info->control.vif = &sdata->vif; - info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; __skb_queue_tail(frames, new_skb); } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2e400b0ff696..2489c6c64c2d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2432,23 +2432,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM; } -void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, - struct ieee80211_hdr *hdr) -{ - /* - * We can postpone the mgd.timer whenever receiving unicast frames - * from AP because we know that the connection is working both ways - * at that time. But multicast frames (and hence also beacons) must - * be ignored here, because we need to trigger the timer during - * data idle periods for sending the periodic probe request to the - * AP we're connected to. - */ - if (is_multicast_ether_addr(hdr->addr1)) - return; - - ieee80211_sta_reset_conn_monitor(sdata); -} - static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -2521,21 +2504,13 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, { ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time); - if (!ieee80211_is_data(hdr->frame_control)) - return; - - if (ieee80211_is_any_nullfunc(hdr->frame_control) && - sdata->u.mgd.probe_send_count > 0) { - if (ack) - ieee80211_sta_reset_conn_monitor(sdata); - else - sdata->u.mgd.nullfunc_failed = true; - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + if (!ieee80211_is_any_nullfunc(hdr->frame_control) || + !sdata->u.mgd.probe_send_count) return; - } - if (ack) - ieee80211_sta_reset_conn_monitor(sdata); + if (!ack) + sdata->u.mgd.nullfunc_failed = true; + ieee80211_queue_work(&sdata->local->hw, &sdata->work); } static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata, @@ -3548,6 +3523,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } + if (sdata->wdev.use_4addr) + drv_sta_set_4addr(local, sdata, &sta->sta, true); + mutex_unlock(&sdata->local->sta_mtx); /* @@ -3605,8 +3583,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * Start timer to probe the connection to the AP now. * Also start the timer that will detect beacon loss. */ - ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt); ieee80211_sta_reset_beacon_monitor(sdata); + ieee80211_sta_reset_conn_monitor(sdata); ret = true; out: @@ -4577,10 +4555,26 @@ static void ieee80211_sta_conn_mon_timer(struct timer_list *t) from_timer(sdata, t, u.mgd.conn_mon_timer); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + unsigned long timeout; if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn) return; + sta = sta_info_get(sdata, ifmgd->bssid); + if (!sta) + return; + + timeout = sta->status_stats.last_ack; + if (time_before(sta->status_stats.last_ack, sta->rx_stats.last_rx)) + timeout = sta->rx_stats.last_rx; + timeout += IEEE80211_CONNECTION_IDLE_TIME; + + if (time_is_before_jiffies(timeout)) { + mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(timeout)); + return; + } + ieee80211_queue_work(&local->hw, &ifmgd->monitor_work); } diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index f470d1a7ce9b..1ac7b8c374c9 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -916,7 +916,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, if (beacon) for (i = 0; i < params->n_csa_offsets; i++) data[params->csa_offsets[i]] = - beacon->csa_current_counter; + beacon->cntdwn_current_counter; rcu_read_unlock(); } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a959ebf56852..7f88a1b2215c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1812,9 +1812,6 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) sta->rx_stats.last_rate = sta_stats_encode_rate(status); } - if (rx->sdata->vif.type == NL80211_IFTYPE_STATION) - ieee80211_sta_rx_notify(rx->sdata, hdr); - sta->rx_stats.fragments++; u64_stats_update_begin(&rx->sta->rx_stats.syncp); @@ -2900,7 +2897,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY); info = IEEE80211_SKB_CB(fwd_skb); memset(info, 0, sizeof(*info)); - info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->control.vif = &rx->sdata->vif; info->control.jiffies = jiffies; if (is_multicast_ether_addr(fwd_hdr->addr1)) { @@ -4149,7 +4146,6 @@ void ieee80211_check_fast_rx(struct sta_info *sta) fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2); fastrx.expected_ds_bits = 0; } else { - fastrx.sta_notify = sdata->u.mgd.probe_send_count > 0; fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1); fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr3); fastrx.expected_ds_bits = @@ -4379,11 +4375,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, pskb_trim(skb, skb->len - fast_rx->icv_len)) goto drop; - if (unlikely(fast_rx->sta_notify)) { - ieee80211_sta_rx_notify(rx->sdata, hdr); - fast_rx->sta_notify = false; - } - /* statistics part of ieee80211_rx_h_sta_process() */ if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { stats->last_signal = status->signal; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index d5010116cf4d..91a61b44b4e0 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -336,7 +336,6 @@ struct ieee80211_fast_tx { * @expected_ds_bits: from/to DS bits expected * @icv_len: length of the MIC if present * @key: bool indicating encryption is expected (key is set) - * @sta_notify: notify the MLME code (once) * @internal_forward: forward froms internally on AP/VLAN type interfaces * @uses_rss: copy of USES_RSS hw flag * @da_offs: offset of the DA in the header (for header conversion) @@ -352,7 +351,6 @@ struct ieee80211_fast_rx { __le16 expected_ds_bits; u8 icv_len; u8 key:1, - sta_notify:1, internal_forward:1, uses_rss:1; u8 da_offs, sa_offs; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 0794396a7988..7fe5bececfd9 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -66,8 +66,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, info->control.jiffies = jiffies; info->control.vif = &sta->sdata->vif; - info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING | - IEEE80211_TX_INTFL_RETRANSMISSION; + info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; + info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; sta->status_stats.filtered++; @@ -184,18 +184,6 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) struct ieee80211_mgmt *mgmt = (void *) skb->data; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb); - - if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { - sta->status_stats.last_ack = jiffies; - if (txinfo->status.is_valid_ack_signal) { - sta->status_stats.last_ack_signal = - (s8)txinfo->status.ack_signal; - sta->status_stats.ack_signal_filled = true; - ewma_avg_signal_add(&sta->status_stats.avg_ack_signal, - -txinfo->status.ack_signal); - } - } if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; @@ -890,7 +878,8 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, } static void __ieee80211_tx_status(struct ieee80211_hw *hw, - struct ieee80211_tx_status *status) + struct ieee80211_tx_status *status, + int rates_idx, int retry_count) { struct sk_buff *skb = status->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; @@ -899,17 +888,12 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct sta_info *sta; __le16 fc; struct ieee80211_supported_band *sband; - int retry_count; - int rates_idx; bool send_to_cooked; bool acked; bool noack_success; struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; - u16 tx_time_est; - - rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); sband = local->hw.wiphy->bands[info->band]; fc = hdr->frame_control; @@ -987,24 +971,14 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) { ieee80211_handle_filtered_frame(local, sta, skb); return; - } else { + } else if (ieee80211_is_data_present(fc)) { if (!acked && !noack_success) - sta->status_stats.retry_failed++; - sta->status_stats.retry_count += retry_count; + sta->status_stats.msdu_failed[tid]++; - if (ieee80211_is_data_present(fc)) { - if (!acked && !noack_success) - sta->status_stats.msdu_failed[tid]++; - - sta->status_stats.msdu_retries[tid] += - retry_count; - } + sta->status_stats.msdu_retries[tid] += + retry_count; } - rate_control_tx_status(local, sband, status); - if (ieee80211_vif_is_mesh(&sta->sdata->vif)) - ieee80211s_update_metric(local, sta, status); - if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); @@ -1012,37 +986,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data, acked, info->status.tx_time); - - if (info->status.tx_time && - wiphy_ext_feature_isset(local->hw.wiphy, - NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) - ieee80211_sta_register_airtime(&sta->sta, tid, - info->status.tx_time, 0); - - if ((tx_time_est = ieee80211_info_get_tx_time_est(info)) > 0) { - /* Do this here to avoid the expensive lookup of the sta - * in ieee80211_report_used_skb(). - */ - ieee80211_sta_update_pending_airtime(local, sta, - skb_get_queue_mapping(skb), - tx_time_est, - true); - ieee80211_info_set_tx_time_est(info, 0); - } - - if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { - if (acked) { - if (sta->status_stats.lost_packets) - sta->status_stats.lost_packets = 0; - - /* Track when last TDLS packet was ACKed */ - sta->status_stats.last_pkt_time = jiffies; - } else if (noack_success) { - /* nothing to do here, do not account as lost */ - } else { - ieee80211_lost_packet(sta, info); - } - } } /* SNMP counters @@ -1101,7 +1044,10 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, * with this test... */ if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) { - dev_kfree_skb(skb); + if (status->free_list) + list_add_tail(&skb->list, status->free_list); + else + dev_kfree_skb(skb); return; } @@ -1126,7 +1072,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if (sta) status.sta = &sta->sta; - __ieee80211_tx_status(hw, &status); + ieee80211_tx_status_ext(hw, &status); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_tx_status); @@ -1137,10 +1083,12 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct ieee80211_sta *pubsta = status->sta; + struct sk_buff *skb = status->skb; struct ieee80211_supported_band *sband; - struct sta_info *sta; - int retry_count; + struct sta_info *sta = NULL; + int rates_idx, retry_count; bool acked, noack_success; + u16 tx_time_est; if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); @@ -1149,13 +1097,22 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, sta->tx_stats.last_rate_info = *status->rate; } - if (status->skb) - return __ieee80211_tx_status(hw, status); + if (skb && (tx_time_est = + ieee80211_info_get_tx_time_est(IEEE80211_SKB_CB(skb))) > 0) { + /* Do this here to avoid the expensive lookup of the sta + * in ieee80211_report_used_skb(). + */ + ieee80211_sta_update_pending_airtime(local, sta, + skb_get_queue_mapping(skb), + tx_time_est, + true); + ieee80211_info_set_tx_time_est(IEEE80211_SKB_CB(skb), 0); + } - if (!status->sta) - return; + if (!status->info) + goto free; - ieee80211_tx_get_rates(hw, info, &retry_count); + rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); sband = hw->wiphy->bands[info->band]; @@ -1167,20 +1124,30 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, sta->status_stats.retry_failed++; sta->status_stats.retry_count += retry_count; - if (acked) { - sta->status_stats.last_ack = jiffies; + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { + if (acked) { + sta->status_stats.last_ack = jiffies; - if (sta->status_stats.lost_packets) - sta->status_stats.lost_packets = 0; + if (sta->status_stats.lost_packets) + sta->status_stats.lost_packets = 0; - /* Track when last packet was ACKed */ - sta->status_stats.last_pkt_time = jiffies; - } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { - return; - } else if (noack_success) { - /* nothing to do here, do not account as lost */ - } else { - ieee80211_lost_packet(sta, info); + /* Track when last packet was ACKed */ + sta->status_stats.last_pkt_time = jiffies; + + if (info->status.is_valid_ack_signal) { + sta->status_stats.last_ack_signal = + (s8)info->status.ack_signal; + sta->status_stats.ack_signal_filled = true; + ewma_avg_signal_add(&sta->status_stats.avg_ack_signal, + -info->status.ack_signal); + } + } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { + return; + } else if (noack_success) { + /* nothing to do here, do not account as lost */ + } else { + ieee80211_lost_packet(sta, info); + } } rate_control_tx_status(local, sband, status); @@ -1188,6 +1155,10 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, ieee80211s_update_metric(local, sta, status); } + if (skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) + return __ieee80211_tx_status(hw, status, rates_idx, + retry_count); + if (acked || noack_success) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (!pubsta) @@ -1199,6 +1170,16 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, } else { I802_DEBUG_INC(local->dot11FailedCount); } + +free: + if (!skb) + return; + + ieee80211_report_used_skb(local, skb, false); + if (status->free_list) + list_add_tail(&skb->list, status->free_list); + else + dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_tx_status_ext); @@ -1225,69 +1206,23 @@ void ieee80211_tx_status_8023(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb) { - struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_status status = { + .skb = skb, + .info = IEEE80211_SKB_CB(skb), + }; struct sta_info *sta; - int retry_count; - int rates_idx; - bool acked; sdata = vif_to_sdata(vif); - acked = info->flags & IEEE80211_TX_STAT_ACK; - rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); - rcu_read_lock(); - if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) - goto counters_update; - - if (IS_ERR(sta)) - goto counters_update; - - if (!acked) - sta->status_stats.retry_failed++; - - if (rates_idx != -1) - sta->tx_stats.last_rate = info->status.rates[rates_idx]; - - sta->status_stats.retry_count += retry_count; - - if (ieee80211_hw_check(hw, REPORTS_TX_ACK_STATUS)) { - if (acked && vif->type == NL80211_IFTYPE_STATION) - ieee80211_sta_reset_conn_monitor(sdata); - - sta->status_stats.last_ack = jiffies; - if (info->flags & IEEE80211_TX_STAT_ACK) { - if (sta->status_stats.lost_packets) - sta->status_stats.lost_packets = 0; + if (!ieee80211_lookup_ra_sta(sdata, skb, &sta) && !IS_ERR(sta)) + status.sta = &sta->sta; - sta->status_stats.last_pkt_time = jiffies; - } else { - ieee80211_lost_packet(sta, info); - } - } + ieee80211_tx_status_ext(hw, &status); -counters_update: rcu_read_unlock(); - ieee80211_led_tx(local); - - if (!(info->flags & IEEE80211_TX_STAT_ACK) && - !(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) - goto skip_stats_update; - - I802_DEBUG_INC(local->dot11TransmittedFrameCount); - if (is_multicast_ether_addr(skb->data)) - I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); - if (retry_count > 0) - I802_DEBUG_INC(local->dot11RetryCount); - if (retry_count > 1) - I802_DEBUG_INC(local->dot11MultipleRetryCount); - -skip_stats_update: - ieee80211_report_used_skb(local, skb, false); - dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_tx_status_8023); diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 50ab5b9d8eab..89723907a094 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2734,6 +2734,39 @@ TRACE_EVENT(drv_get_ftm_responder_stats, ) ); +DEFINE_EVENT(local_sdata_addr_evt, drv_update_vif_offload, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); + +TRACE_EVENT(drv_sta_set_4addr, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, bool enabled), + + TP_ARGS(local, sdata, sta, enabled), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(bool, enabled) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_ASSIGN; + __entry->enabled = enabled; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " enabled:%d", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->enabled + ) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index dca01d7e6e3e..adc83d830691 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -531,7 +531,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) info->control.jiffies = jiffies; info->control.vif = &tx->sdata->vif; - info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; skb_queue_tail(&sta->ps_tx_buf[ac], tx->skb); spin_unlock(&sta->ps_lock); @@ -1134,7 +1134,7 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, tx->sta->sta.addr, tx->sta->sta.aid); } info->control.vif = &tx->sdata->vif; - info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; __skb_queue_tail(&tid_tx->pending, skb); if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER) @@ -1179,7 +1179,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, * we are doing the needed processing, so remove the flag * now. */ - info->flags &= ~IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->control.flags &= ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING; hdr = (struct ieee80211_hdr *) skb->data; @@ -1258,7 +1258,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) return NULL; - if (!(info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) && + if (!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) && unlikely(!ieee80211_is_data_present(hdr->frame_control))) { if ((!ieee80211_is_mgmt(hdr->frame_control) || ieee80211_is_bufferable_mmpdu(hdr->frame_control) || @@ -3649,7 +3649,7 @@ begin: else info->flags &= ~IEEE80211_TX_CTL_AMPDU; - if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) + if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) goto encap_out; if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) { @@ -4190,38 +4190,18 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct net_device *dev, struct sta_info *sta, - struct sk_buff *skb) + struct ieee80211_key *key, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ethhdr *ehdr = (struct ethhdr *)skb->data; struct ieee80211_local *local = sdata->local; - bool authorized = false; - bool multicast; - unsigned char *ra = ehdr->h_dest; - - if (IS_ERR(sta) || (sta && !sta->uploaded)) - sta = NULL; - - if (sdata->vif.type == NL80211_IFTYPE_STATION && - (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER))) - ra = sdata->u.mgd.bssid; + struct tid_ampdu_tx *tid_tx; + u8 tid; - if (is_zero_ether_addr(ra)) - goto out_free; - - multicast = is_multicast_ether_addr(ra); - - if (sta) - authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - - if (!multicast && !authorized && - (ehdr->h_proto != sdata->control_port_protocol || - !ether_addr_equal(sdata->vif.addr, ehdr->h_source))) - goto out_free; - - if (multicast && sdata->vif.type == NL80211_IFTYPE_AP && - !atomic_read(&sdata->u.ap.num_mcast_sta)) - goto out_free; + if (local->ops->wake_tx_queue) { + u16 queue = __ieee80211_select_queue(sdata, sta, skb); + skb_set_queue_mapping(skb, queue); + skb_get_hash(skb); + } if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) @@ -4229,36 +4209,42 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, memset(info, 0, sizeof(*info)); - if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - info->ack_frame_id = ieee80211_store_ack_skb(local, skb, - &info->flags, NULL); + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); + if (tid_tx) { + if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { + /* fall back to non-offload slow path */ + __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); + return; + } - if (unlikely(sdata->control_port_protocol == ehdr->h_proto)) { - if (sdata->control_port_no_encrypt) - info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; - info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; + info->flags |= IEEE80211_TX_CTL_AMPDU; + if (tid_tx->timeout) + tid_tx->last_tx = jiffies; } - if (multicast) - info->flags |= IEEE80211_TX_CTL_NO_ACK; + if (unlikely(skb->sk && + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) + info->ack_frame_id = ieee80211_store_ack_skb(local, skb, + &info->flags, NULL); info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; ieee80211_tx_stats(dev, skb->len); - if (sta) { - sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; - sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; - } + sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; + sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); - info->control.flags |= IEEE80211_TX_CTRL_HW_80211_ENCAP; + info->flags |= IEEE80211_TX_CTL_HW_80211_ENCAP; info->control.vif = &sdata->vif; + if (key) + info->control.hw_key = &key->conf; + ieee80211_tx_8023(sdata, skb, skb->len, sta, false); return; @@ -4271,12 +4257,10 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ethhdr *ehdr = (struct ethhdr *)skb->data; + struct ieee80211_key *key; struct sta_info *sta; - - if (WARN_ON(!sdata->hw_80211_encap)) { - kfree_skb(skb); - return NETDEV_TX_OK; - } + bool offload = true; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -4285,11 +4269,26 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, rcu_read_lock(); - if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) + if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { kfree_skb(skb); + goto out; + } + + if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || + sdata->control_port_protocol == ehdr->h_proto)) + offload = false; + else if ((key = rcu_dereference(sta->ptk[sta->ptk_idx])) && + (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || + key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) + offload = false; + + if (offload) + ieee80211_8023_xmit(sdata, dev, sta, key, skb); else - ieee80211_8023_xmit(sdata, dev, sta, skb); + ieee80211_subif_start_xmit(skb, dev); +out: rcu_read_unlock(); return NETDEV_TX_OK; @@ -4365,7 +4364,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, sdata = vif_to_sdata(info->control.vif); - if (info->flags & IEEE80211_TX_INTFL_NEED_TXPROCESSING) { + if (info->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING) { chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (unlikely(!chanctx_conf)) { dev_kfree_skb(skb); @@ -4373,7 +4372,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, } info->band = chanctx_conf->def.chan->band; result = ieee80211_tx(sdata, NULL, skb, true); - } else if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) { + } else if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { dev_kfree_skb(skb); return true; @@ -4538,14 +4537,14 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, return 0; } -static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata, - struct beacon_data *beacon) +static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, + struct beacon_data *beacon) { struct probe_resp *resp; u8 *beacon_data; size_t beacon_data_len; int i; - u8 count = beacon->csa_current_counter; + u8 count = beacon->cntdwn_current_counter; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: @@ -4565,36 +4564,36 @@ static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata, } rcu_read_lock(); - for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; ++i) { + for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; ++i) { resp = rcu_dereference(sdata->u.ap.probe_resp); - if (beacon->csa_counter_offsets[i]) { - if (WARN_ON_ONCE(beacon->csa_counter_offsets[i] >= + if (beacon->cntdwn_counter_offsets[i]) { + if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[i] >= beacon_data_len)) { rcu_read_unlock(); return; } - beacon_data[beacon->csa_counter_offsets[i]] = count; + beacon_data[beacon->cntdwn_counter_offsets[i]] = count; } if (sdata->vif.type == NL80211_IFTYPE_AP && resp) - resp->data[resp->csa_counter_offsets[i]] = count; + resp->data[resp->cntdwn_counter_offsets[i]] = count; } rcu_read_unlock(); } -static u8 __ieee80211_csa_update_counter(struct beacon_data *beacon) +static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon) { - beacon->csa_current_counter--; + beacon->cntdwn_current_counter--; /* the counter should never reach 0 */ - WARN_ON_ONCE(!beacon->csa_current_counter); + WARN_ON_ONCE(!beacon->cntdwn_current_counter); - return beacon->csa_current_counter; + return beacon->cntdwn_current_counter; } -u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif) +u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct beacon_data *beacon = NULL; @@ -4612,15 +4611,15 @@ u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif) if (!beacon) goto unlock; - count = __ieee80211_csa_update_counter(beacon); + count = __ieee80211_beacon_update_cntdwn(beacon); unlock: rcu_read_unlock(); return count; } -EXPORT_SYMBOL(ieee80211_csa_update_counter); +EXPORT_SYMBOL(ieee80211_beacon_update_cntdwn); -void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter) +void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct beacon_data *beacon = NULL; @@ -4637,15 +4636,15 @@ void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter) if (!beacon) goto unlock; - if (counter < beacon->csa_current_counter) - beacon->csa_current_counter = counter; + if (counter < beacon->cntdwn_current_counter) + beacon->cntdwn_current_counter = counter; unlock: rcu_read_unlock(); } -EXPORT_SYMBOL(ieee80211_csa_set_counter); +EXPORT_SYMBOL(ieee80211_beacon_set_cntdwn); -bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) +bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct beacon_data *beacon = NULL; @@ -4688,20 +4687,21 @@ bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) goto out; } - if (!beacon->csa_counter_offsets[0]) + if (!beacon->cntdwn_counter_offsets[0]) goto out; - if (WARN_ON_ONCE(beacon->csa_counter_offsets[0] > beacon_data_len)) + if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[0] > beacon_data_len)) goto out; - if (beacon_data[beacon->csa_counter_offsets[0]] == 1) + if (beacon_data[beacon->cntdwn_counter_offsets[0]] == 1) ret = true; + out: rcu_read_unlock(); return ret; } -EXPORT_SYMBOL(ieee80211_csa_is_complete); +EXPORT_SYMBOL(ieee80211_beacon_cntdwn_is_complete); static int ieee80211_beacon_protect(struct sk_buff *skb, struct ieee80211_local *local, @@ -4761,11 +4761,11 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, beacon = rcu_dereference(ap->beacon); if (beacon) { - if (beacon->csa_counter_offsets[0]) { + if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) - __ieee80211_csa_update_counter(beacon); + ieee80211_beacon_update_cntdwn(vif); - ieee80211_set_csa(sdata, beacon); + ieee80211_set_beacon_cntdwn(sdata, beacon); } /* @@ -4809,11 +4809,11 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (!beacon) goto out; - if (beacon->csa_counter_offsets[0]) { + if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) - __ieee80211_csa_update_counter(beacon); + __ieee80211_beacon_update_cntdwn(beacon); - ieee80211_set_csa(sdata, beacon); + ieee80211_set_beacon_cntdwn(sdata, beacon); } skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + @@ -4833,16 +4833,16 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (!beacon) goto out; - if (beacon->csa_counter_offsets[0]) { + if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) /* TODO: For mesh csa_counter is in TU, so * decrementing it by one isn't correct, but * for now we leave it consistent with overall * mac80211's behavior. */ - __ieee80211_csa_update_counter(beacon); + __ieee80211_beacon_update_cntdwn(beacon); - ieee80211_set_csa(sdata, beacon); + ieee80211_set_beacon_cntdwn(sdata, beacon); } if (ifmsh->sync_ops) @@ -4874,13 +4874,13 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (offs && beacon) { int i; - for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; i++) { - u16 csa_off = beacon->csa_counter_offsets[i]; + for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) { + u16 csa_off = beacon->cntdwn_counter_offsets[i]; if (!csa_off) continue; - offs->csa_counter_offs[i] = csa_off_base + csa_off; + offs->cntdwn_counter_offs[i] = csa_off_base + csa_off; } } @@ -4999,6 +4999,63 @@ out: } EXPORT_SYMBOL(ieee80211_proberesp_get); +struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct sk_buff *skb = NULL; + struct fils_discovery_data *tmpl = NULL; + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (sdata->vif.type != NL80211_IFTYPE_AP) + return NULL; + + rcu_read_lock(); + tmpl = rcu_dereference(sdata->u.ap.fils_discovery); + if (!tmpl) { + rcu_read_unlock(); + return NULL; + } + + skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); + if (skb) { + skb_reserve(skb, sdata->local->hw.extra_tx_headroom); + skb_put_data(skb, tmpl->data, tmpl->len); + } + + rcu_read_unlock(); + return skb; +} +EXPORT_SYMBOL(ieee80211_get_fils_discovery_tmpl); + +struct sk_buff * +ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct sk_buff *skb = NULL; + struct unsol_bcast_probe_resp_data *tmpl = NULL; + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (sdata->vif.type != NL80211_IFTYPE_AP) + return NULL; + + rcu_read_lock(); + tmpl = rcu_dereference(sdata->u.ap.unsol_bcast_probe_resp); + if (!tmpl) { + rcu_read_unlock(); + return NULL; + } + + skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); + if (skb) { + skb_reserve(skb, sdata->local->hw.extra_tx_headroom); + skb_put_data(skb, tmpl->data, tmpl->len); + } + + rcu_read_unlock(); + return skb; +} +EXPORT_SYMBOL(ieee80211_get_unsol_bcast_probe_resp_tmpl); + struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index d1b64d0751f2..fb0e3a657d2d 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -315,10 +315,6 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); - /* If HT IE reported 3839 bytes only, stay with that size. */ - if (sta->sta.max_amsdu_len == IEEE80211_MAX_MPDU_LEN_HT_3839) - return; - switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 0a6a15f3456d..056986c7a228 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -22,6 +22,11 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), + SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), + SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), + SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW), + SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA), SNMP_MIB_SENTINEL }; diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index d7de340fc997..937a177729f1 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -15,6 +15,11 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */ + MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */ + MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */ + MPTCP_MIB_NODSSWINDOW, /* Segments not in MPTCP windows */ + MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 770da3627848..b4a9624d7bf2 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -23,8 +23,6 @@ static int pm_nl_pernet_id; struct mptcp_pm_addr_entry { struct list_head list; - unsigned int flags; - int ifindex; struct mptcp_addr_info addr; struct rcu_head rcu; }; @@ -129,7 +127,7 @@ select_local_address(const struct pm_nl_pernet *pernet, rcu_read_lock(); spin_lock_bh(&msk->join_list_lock); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; /* avoid any address already in use by subflows and @@ -160,7 +158,7 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) * can lead to additional addresses not being announced. */ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) continue; if (i++ == pos) { ret = entry; @@ -220,8 +218,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) msk->pm.subflows++; check_work_pending(msk); spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, local->ifindex, - &local->addr, &remote); + __mptcp_subflow_connect(sk, &local->addr, &remote); spin_lock_bh(&msk->pm.lock); return; } @@ -267,13 +264,13 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) local.family = remote.family; spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote); + __mptcp_subflow_connect((struct sock *)msk, &local, &remote); spin_lock_bh(&msk->pm.lock); } static bool address_use_port(struct mptcp_pm_addr_entry *entry) { - return (entry->flags & + return (entry->addr.flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == MPTCP_PM_ADDR_FLAG_SIGNAL; } @@ -303,9 +300,9 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, goto out; } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) + if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) pernet->add_addr_signal_max++; - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) pernet->local_addr_max++; entry->addr.id = pernet->next_id++; @@ -358,8 +355,9 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) if (!entry) return -ENOMEM; - entry->flags = 0; entry->addr = skc_local; + entry->addr.ifindex = 0; + entry->addr.flags = 0; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) kfree(entry); @@ -397,8 +395,8 @@ mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = { [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, }, [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, }, [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, }, - [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct in6_addr), }, + [MPTCP_PM_ADDR_ATTR_ADDR6] = + NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 }, [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 }, [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 }, @@ -473,14 +471,17 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]); skip_family: - if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) - entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); + if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { + u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); + + entry->addr.ifindex = val; + } if (tb[MPTCP_PM_ADDR_ATTR_ID]) entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) - entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); + entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); return 0; } @@ -548,9 +549,9 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) ret = -EINVAL; goto out; } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) + if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) pernet->add_addr_signal_max--; - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) pernet->local_addr_max--; pernet->addrs--; @@ -606,10 +607,10 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb, goto nla_put_failure; if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) goto nla_put_failure; - if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) + if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags)) goto nla_put_failure; - if (entry->ifindex && - nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) + if (entry->addr.ifindex && + nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->addr.ifindex)) goto nla_put_failure; if (addr->family == AF_INET && diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 365ba96c84b0..386cd4e60250 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -24,8 +24,6 @@ #include "protocol.h" #include "mib.h" -#define MPTCP_SAME_STATE TCP_MAX_STATES - #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct mptcp6_sock { struct mptcp_sock msk; @@ -34,6 +32,8 @@ struct mptcp6_sock { #endif struct mptcp_skb_cb { + u64 map_seq; + u64 end_seq; u32 offset; }; @@ -112,64 +112,205 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) return 0; } -static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, - struct sk_buff *skb, - unsigned int offset, size_t copy_len) +static void mptcp_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + +static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, + struct sk_buff *from) +{ + bool fragstolen; + int delta; + + if (MPTCP_SKB_CB(from)->offset || + !skb_try_coalesce(to, from, &fragstolen, &delta)) + return false; + + pr_debug("colesced seq %llx into %llx new len %d new end seq %llx", + MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, + to->len, MPTCP_SKB_CB(from)->end_seq); + MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; + kfree_skb_partial(from, fragstolen); + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + return true; +} + +static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, + struct sk_buff *from) +{ + if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq) + return false; + + return mptcp_try_coalesce((struct sock *)msk, to, from); +} + +/* "inspired" by tcp_data_queue_ofo(), main differences: + * - use mptcp seqs + * - don't cope with sacks + */ +static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) { struct sock *sk = (struct sock *)msk; - struct sk_buff *tail; + struct rb_node **p, *parent; + u64 seq, end_seq, max_seq; + struct sk_buff *skb1; + int space; + + seq = MPTCP_SKB_CB(skb)->map_seq; + end_seq = MPTCP_SKB_CB(skb)->end_seq; + space = tcp_space(sk); + max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq; + + pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, + RB_EMPTY_ROOT(&msk->out_of_order_queue)); + if (after64(seq, max_seq)) { + /* out of window */ + mptcp_drop(sk, skb); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); + return; + } - __skb_unlink(skb, &ssk->sk_receive_queue); + p = &msk->out_of_order_queue.rb_node; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE); + if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) { + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); + msk->ooo_last_skb = skb; + goto end; + } - skb_ext_reset(skb); - skb_orphan(skb); - msk->ack_seq += copy_len; + /* with 2 subflows, adding at end of ooo queue is quite likely + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); + return; + } - tail = skb_peek_tail(&sk->sk_receive_queue); - if (offset == 0 && tail) { - bool fragstolen; - int delta; + /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ + if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); + parent = &msk->ooo_last_skb->rbnode; + p = &parent->rb_right; + goto insert; + } - if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { - kfree_skb_partial(skb, fragstolen); - atomic_add(delta, &sk->sk_rmem_alloc); - sk_mem_charge(sk, delta); + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_to_skb(parent); + if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { + p = &parent->rb_left; + continue; + } + if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { + if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + mptcp_drop(sk, skb); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + return; + } + if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { + /* partial overlap: + * | skb | + * | skb1 | + * continue traversing + */ + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. + */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &msk->out_of_order_queue); + mptcp_drop(sk, skb1); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + goto merge_right; + } + } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); return; } + p = &parent->rb_right; } - skb_set_owner_r(skb, sk); - __skb_queue_tail(&sk->sk_receive_queue, skb); - MPTCP_SKB_CB(skb)->offset = offset; -} +insert: + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); -static void mptcp_stop_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); +merge_right: + /* Remove other segments covered by skb. */ + while ((skb1 = skb_rb_next(skb)) != NULL) { + if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) + break; + rb_erase(&skb1->rbnode, &msk->out_of_order_queue); + mptcp_drop(sk, skb1); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + } + /* If there is no skb after us, we are the last_skb ! */ + if (!skb1) + msk->ooo_last_skb = skb; - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); - mptcp_sk(sk)->timer_ival = 0; +end: + skb_condense(skb); + skb_set_owner_r(skb, sk); } -/* both sockets must be locked */ -static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, - struct sock *ssk) +static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, + struct sk_buff *skb, unsigned int offset, + size_t copy_len) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - u64 dsn = mptcp_subflow_get_mapped_dsn(subflow); + struct sock *sk = (struct sock *)msk; + struct sk_buff *tail; - /* revalidate data sequence number. - * - * mptcp_subflow_data_available() is usually called - * without msk lock. Its unlikely (but possible) - * that msk->ack_seq has been advanced since the last - * call found in-sequence data. + __skb_unlink(skb, &ssk->sk_receive_queue); + + skb_ext_reset(skb); + skb_orphan(skb); + + /* the skb map_seq accounts for the skb offset: + * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq + * value */ - if (likely(dsn == msk->ack_seq)) + MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); + MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; + MPTCP_SKB_CB(skb)->offset = offset; + + if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { + /* in sequence */ + msk->ack_seq += copy_len; + tail = skb_peek_tail(&sk->sk_receive_queue); + if (tail && mptcp_try_coalesce(sk, tail, skb)) + return true; + + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); return true; + } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { + mptcp_data_queue_ofo(msk, skb); + return false; + } - subflow->data_avail = 0; - return mptcp_subflow_data_available(ssk); + /* old data, keep it simple and drop the whole pkt, sender + * will retransmit as needed, if needed. + */ + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + mptcp_drop(sk, skb); + return false; +} + +static void mptcp_stop_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + mptcp_sk(sk)->timer_ival = 0; } static void mptcp_check_data_fin_ack(struct sock *sk) @@ -315,11 +456,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct tcp_sock *tp; bool done = false; - if (!mptcp_subflow_dsn_valid(msk, ssk)) { - *bytes = 0; - return false; - } - + pr_debug("msk=%p ssk=%p", msk, ssk); tp = tcp_sk(ssk); do { u32 map_remaining, offset; @@ -357,9 +494,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, if (tp->urg_data) done = true; - __mptcp_move_skb(msk, ssk, skb, offset, len); + if (__mptcp_move_skb(msk, ssk, skb, offset, len)) + moved += len; seq += len; - moved += len; if (WARN_ON_ONCE(map_remaining < len)) break; @@ -378,20 +515,56 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } } while (more_data_avail); - *bytes = moved; - - /* If the moves have caught up with the DATA_FIN sequence number - * it's time to ack the DATA_FIN and change socket state, but - * this is not a good place to change state. Let the workqueue - * do it. - */ - if (mptcp_pending_data_fin(sk, NULL) && - schedule_work(&msk->work)) - sock_hold(sk); + *bytes += moved; + if (moved) + tcp_cleanup_rbuf(ssk, moved); return done; } +static bool mptcp_ofo_queue(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + struct sk_buff *skb, *tail; + bool moved = false; + struct rb_node *p; + u64 end_seq; + + p = rb_first(&msk->out_of_order_queue); + pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); + while (p) { + skb = rb_to_skb(p); + if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) + break; + + p = rb_next(p); + rb_erase(&skb->rbnode, &msk->out_of_order_queue); + + if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, + msk->ack_seq))) { + mptcp_drop(sk, skb); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + continue; + } + + end_seq = MPTCP_SKB_CB(skb)->end_seq; + tail = skb_peek_tail(&sk->sk_receive_queue); + if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) { + int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; + + /* skip overlapping data, if any */ + pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d", + MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, + delta); + MPTCP_SKB_CB(skb)->offset += delta; + __skb_queue_tail(&sk->sk_receive_queue, skb); + } + msk->ack_seq = end_seq; + moved = true; + } + return moved; +} + /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ @@ -407,8 +580,19 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) return false; /* must re-check after taking the lock */ - if (!READ_ONCE(sk->sk_lock.owned)) + if (!READ_ONCE(sk->sk_lock.owned)) { __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + mptcp_ofo_queue(msk); + + /* If the moves have caught up with the DATA_FIN sequence number + * it's time to ack the DATA_FIN and change socket state, but + * this is not a good place to change state. Let the workqueue + * do it. + */ + if (mptcp_pending_data_fin(sk, NULL) && + schedule_work(&msk->work)) + sock_hold(sk); + } spin_unlock_bh(&sk->sk_lock.slock); @@ -417,9 +601,17 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) void mptcp_data_ready(struct sock *sk, struct sock *ssk) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); + bool wake; - set_bit(MPTCP_DATA_READY, &msk->flags); + /* move_skbs_to_msk below can legitly clear the data_avail flag, + * but we will need later to properly woke the reader, cache its + * value + */ + wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL; + if (wake) + set_bit(MPTCP_DATA_READY, &msk->flags); if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) && move_skbs_to_msk(msk, ssk)) @@ -440,7 +632,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) move_skbs_to_msk(msk, ssk); } wake: - sk->sk_data_ready(sk); + if (wake) + sk->sk_data_ready(sk); } static void __mptcp_flush_join_list(struct mptcp_sock *msk) @@ -474,7 +667,7 @@ void mptcp_data_acked(struct sock *sk) { mptcp_reset_timer(sk); - if ((!sk_stream_is_writeable(sk) || + if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && schedule_work(&mptcp_sk(sk)->work)) sock_hold(sk); @@ -569,6 +762,20 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } +static bool mptcp_is_writeable(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + if (!sk_stream_is_writeable((struct sock *)msk)) + return false; + + mptcp_for_each_subflow(msk, subflow) { + if (sk_stream_is_writeable(subflow->tcp_sock)) + return true; + } + return false; +} + static void mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -611,8 +818,15 @@ out: sk_mem_reclaim_partial(sk); /* Only wake up writers if a subflow is ready */ - if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) + if (mptcp_is_writeable(msk)) { + set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags); + smp_mb__after_atomic(); + + /* set SEND_SPACE before sk_stream_write_space clears + * NOSPACE + */ sk_stream_write_space(sk); + } } } @@ -803,60 +1017,128 @@ out: return ret; } -static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock) +static void mptcp_nospace(struct mptcp_sock *msk) { + struct mptcp_subflow_context *subflow; + clear_bit(MPTCP_SEND_SPACE, &msk->flags); smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ - /* enables sk->write_space() callbacks */ - set_bit(SOCK_NOSPACE, &sock->flags); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct socket *sock = READ_ONCE(ssk->sk_socket); + + /* enables ssk->write_space() callbacks */ + if (sock) + set_bit(SOCK_NOSPACE, &sock->flags); + } } -static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ + if (subflow->request_join && !subflow->fully_established) + return false; + + /* only send if our side has not closed yet */ + return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); +} + +#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ + sizeof(struct tcphdr) - \ + MAX_TCP_OPTION_SPACE - \ + sizeof(struct ipv6hdr) - \ + sizeof(struct frag_hdr)) + +struct subflow_send_info { + struct sock *ssk; + u64 ratio; +}; + +static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, + u32 *sndbuf) +{ + struct subflow_send_info send_info[2]; struct mptcp_subflow_context *subflow; - struct sock *backup = NULL; + int i, nr_active = 0; + struct sock *ssk; + u64 ratio; + u32 pace; - sock_owned_by_me((const struct sock *)msk); + sock_owned_by_me((struct sock *)msk); + *sndbuf = 0; if (!mptcp_ext_cache_refill(msk)) return NULL; + if (__mptcp_check_fallback(msk)) { + if (!msk->first) + return NULL; + *sndbuf = msk->first->sk_sndbuf; + return sk_stream_memory_free(msk->first) ? msk->first : NULL; + } + + /* re-use last subflow, if the burst allow that */ + if (msk->last_snd && msk->snd_burst > 0 && + sk_stream_memory_free(msk->last_snd) && + mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); + } + return msk->last_snd; + } + + /* pick the subflow with the lower wmem/wspace ratio */ + for (i = 0; i < 2; ++i) { + send_info[i].ssk = NULL; + send_info[i].ratio = -1; + } mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + ssk = mptcp_subflow_tcp_sock(subflow); + if (!mptcp_subflow_active(subflow)) + continue; - if (!sk_stream_memory_free(ssk)) { - struct socket *sock = ssk->sk_socket; + nr_active += !subflow->backup; + *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); + if (!sk_stream_memory_free(subflow->tcp_sock)) + continue; - if (sock) - mptcp_nospace(msk, sock); + pace = READ_ONCE(ssk->sk_pacing_rate); + if (!pace) + continue; - return NULL; + ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, + pace); + if (ratio < send_info[subflow->backup].ratio) { + send_info[subflow->backup].ssk = ssk; + send_info[subflow->backup].ratio = ratio; } + } - if (subflow->backup) { - if (!backup) - backup = ssk; + pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld", + msk, nr_active, send_info[0].ssk, send_info[0].ratio, + send_info[1].ssk, send_info[1].ratio); - continue; - } + /* pick the best backup if no other subflow is active */ + if (!nr_active) + send_info[0].ssk = send_info[1].ssk; - return ssk; + if (send_info[0].ssk) { + msk->last_snd = send_info[0].ssk; + msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, + sk_stream_wspace(msk->last_snd)); + return msk->last_snd; } - - return backup; + return NULL; } -static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) +static void ssk_check_wmem(struct mptcp_sock *msk) { - struct socket *sock; - - if (likely(sk_stream_is_writeable(ssk))) - return; - - sock = READ_ONCE(ssk->sk_socket); - if (sock) - mptcp_nospace(msk, sock); + if (unlikely(!mptcp_is_writeable(msk))) + mptcp_nospace(msk); } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) @@ -866,6 +1148,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct page_frag *pfrag; size_t copied = 0; struct sock *ssk; + u32 sndbuf; bool tx_ok; long timeo; @@ -892,7 +1175,7 @@ restart: } __mptcp_flush_join_list(msk); - ssk = mptcp_subflow_get_send(msk); + ssk = mptcp_subflow_get_send(msk, &sndbuf); while (!sk_stream_memory_free(sk) || !ssk || !mptcp_page_frag_refill(ssk, pfrag)) { @@ -909,19 +1192,25 @@ restart: mptcp_reset_timer(sk); } + mptcp_nospace(msk); ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto out; mptcp_clean_una(sk); - ssk = mptcp_subflow_get_send(msk); + ssk = mptcp_subflow_get_send(msk, &sndbuf); if (list_empty(&msk->conn_list)) { ret = -ENOTCONN; goto out; } } + /* do auto tuning */ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + sndbuf > READ_ONCE(sk->sk_sndbuf)) + WRITE_ONCE(sk->sk_sndbuf, sndbuf); + pr_debug("conn_list->subflow=%p", ssk); lock_sock(ssk); @@ -938,6 +1227,10 @@ restart: break; } + /* burst can be negative, we will try move to the next subflow + * at selection time, if possible. + */ + msk->snd_burst -= ret; copied += ret; tx_ok = msg_data_left(msg); @@ -947,7 +1240,6 @@ restart: if (!sk_stream_memory_free(ssk) || !mptcp_page_frag_refill(ssk, pfrag) || !mptcp_ext_cache_refill(msk)) { - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); mptcp_set_timeout(sk, ssk); @@ -995,9 +1287,9 @@ restart: mptcp_reset_timer(sk); } - ssk_check_wmem(msk, ssk); release_sock(ssk); out: + ssk_check_wmem(msk); release_sock(sk); return copied ? : ret; } @@ -1135,10 +1427,14 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk; + bool slow; ssk = mptcp_subflow_tcp_sock(subflow); + slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); tcp_sk(ssk)->window_clamp = window_clamp; + tcp_cleanup_rbuf(ssk, 1); + unlock_sock_fast(ssk, slow); } } } @@ -1154,6 +1450,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) unsigned int moved = 0; bool done; + /* avoid looping forever below on racing close */ + if (((struct sock *)msk)->sk_state == TCP_CLOSE) + return false; + + __mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); @@ -1165,7 +1466,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) release_sock(ssk); } while (!done); - return moved > 0; + if (mptcp_ofo_queue(msk) || moved > 0) { + mptcp_check_data_fin((struct sock *)msk); + return true; + } + return false; } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -1261,6 +1566,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, set_bit(MPTCP_DATA_READY, &msk->flags); } out_err: + pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", + msk, test_bit(MPTCP_DATA_READY, &msk->flags), + skb_queue_empty(&sk->sk_receive_queue), copied); mptcp_rcv_space_adjust(msk, copied); release_sock(sk); @@ -1311,9 +1619,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); + if (__mptcp_check_fallback(msk)) + return msk->first; + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + if (!mptcp_subflow_active(subflow)) + continue; + /* still data outstanding at TCP level? Don't retransmit. */ if (!tcp_write_queue_empty(ssk)) return NULL; @@ -1474,6 +1788,7 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->rtx_queue); __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); + msk->out_of_order_queue = RB_ROOT; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; @@ -1507,7 +1822,7 @@ static int mptcp_init_sock(struct sock *sk) sk_sockets_allocated_inc(sk); sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; return 0; } @@ -1813,6 +2128,7 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + skb_rbtree_purge(&msk->out_of_order_queue); mptcp_token_destroy(msk); if (msk->cached_ext) __skb_ext_put(msk->cached_ext); @@ -2288,13 +2604,13 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); + pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) return mptcp_check_readable(msk); if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - if (sk_stream_is_writeable(sk) && - test_bit(MPTCP_SEND_SPACE, &msk->flags)) + if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) mask |= EPOLLOUT | EPOLLWRNORM; } if (sk->sk_shutdown & RCV_SHUTDOWN) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 60b27d44c184..493bd2c13bc6 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -140,6 +140,8 @@ struct mptcp_addr_info { sa_family_t family; __be16 port; u8 id; + u8 flags; + int ifindex; union { struct in_addr addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -194,6 +196,8 @@ struct mptcp_sock { u64 write_seq; u64 ack_seq; u64 rcv_data_fin_seq; + struct sock *last_snd; + int snd_burst; atomic64_t snd_una; unsigned long timer_ival; u32 token; @@ -204,6 +208,8 @@ struct mptcp_sock { bool snd_data_fin_enable; spinlock_t join_list_lock; struct work_struct work; + struct sk_buff *ooo_last_skb; + struct rb_root out_of_order_queue; struct list_head conn_list; struct list_head rtx_queue; struct list_head join_list; @@ -268,6 +274,12 @@ mptcp_subflow_rsk(const struct request_sock *rsk) return (struct mptcp_subflow_request_sock *)rsk; } +enum mptcp_data_avail { + MPTCP_SUBFLOW_NODATA, + MPTCP_SUBFLOW_DATA_AVAIL, + MPTCP_SUBFLOW_OOO_DATA +}; + /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ @@ -292,10 +304,10 @@ struct mptcp_subflow_context { map_valid : 1, mpc_map : 1, backup : 1, - data_avail : 1, rx_eof : 1, use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack : 1; /* only after processing the remote a key */ + enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -350,8 +362,7 @@ bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); /* called with sk socket lock held */ -int __mptcp_subflow_connect(struct sock *sk, int ifindex, - const struct mptcp_addr_info *loc, +int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); @@ -464,12 +475,12 @@ static inline bool before64(__u64 seq1, __u64 seq2) void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); -static inline bool __mptcp_check_fallback(struct mptcp_sock *msk) +static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk) { return test_bit(MPTCP_FALLBACK_DONE, &msk->flags); } -static inline bool mptcp_check_fallback(struct sock *sk) +static inline bool mptcp_check_fallback(const struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9ead43f79023..141d555b7bd2 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -20,6 +20,7 @@ #include <net/ip6_route.h> #endif #include <net/mptcp.h> +#include <uapi/linux/mptcp.h> #include "protocol.h" #include "mib.h" @@ -434,6 +435,7 @@ static void mptcp_sock_destruct(struct sock *sk) sock_orphan(sk); } + skb_rbtree_purge(&mptcp_sk(sk)->out_of_order_queue); mptcp_token_destroy(mptcp_sk(sk)); inet_sock_destruct(sk); } @@ -804,16 +806,25 @@ validate_seq: return MAPPING_OK; } -static int subflow_read_actor(read_descriptor_t *desc, - struct sk_buff *skb, - unsigned int offset, size_t len) +static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, + u64 limit) { - size_t copy_len = min(desc->count, len); - - desc->count -= copy_len; - - pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count); - return copy_len; + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; + u32 incr; + + incr = limit >= skb->len ? skb->len + fin : limit; + + pr_debug("discarding=%d len=%d seq=%d", incr, skb->len, + subflow->map_subflow_seq); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); + tcp_sk(ssk)->copied_seq += incr; + if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) + sk_eat_skb(ssk, skb); + if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) + subflow->map_valid = 0; + if (incr) + tcp_cleanup_rbuf(ssk, incr); } static bool subflow_check_data_avail(struct sock *ssk) @@ -825,13 +836,13 @@ static bool subflow_check_data_avail(struct sock *ssk) pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk, subflow->data_avail, skb_peek(&ssk->sk_receive_queue)); + if (!skb_peek(&ssk->sk_receive_queue)) + subflow->data_avail = 0; if (subflow->data_avail) return true; msk = mptcp_sk(subflow->conn); for (;;) { - u32 map_remaining; - size_t delta; u64 ack_seq; u64 old_ack; @@ -849,6 +860,7 @@ static bool subflow_check_data_avail(struct sock *ssk) subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; + subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL; return true; } @@ -876,42 +888,18 @@ static bool subflow_check_data_avail(struct sock *ssk) ack_seq = mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, ack_seq); - if (ack_seq == old_ack) + if (ack_seq == old_ack) { + subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL; break; + } else if (after64(ack_seq, old_ack)) { + subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA; + break; + } /* only accept in-sequence mapping. Old values are spurious - * retransmission; we can hit "future" values on active backup - * subflow switch, we relay on retransmissions to get - * in-sequence data. - * Cuncurrent subflows support will require subflow data - * reordering + * retransmission */ - map_remaining = subflow->map_data_len - - mptcp_subflow_get_map_offset(subflow); - if (before64(ack_seq, old_ack)) - delta = min_t(size_t, old_ack - ack_seq, map_remaining); - else - delta = min_t(size_t, ack_seq - old_ack, map_remaining); - - /* discard mapped data */ - pr_debug("discarding %zu bytes, current map len=%d", delta, - map_remaining); - if (delta) { - read_descriptor_t desc = { - .count = delta, - }; - int ret; - - ret = tcp_read_sock(ssk, &desc, subflow_read_actor); - if (ret < 0) { - ssk->sk_err = -ret; - goto fatal; - } - if (ret < delta) - return false; - if (delta == map_remaining) - subflow->map_valid = 0; - } + mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); } return true; @@ -922,13 +910,13 @@ fatal: ssk->sk_error_report(ssk); tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); + subflow->data_avail = 0; return false; } bool mptcp_subflow_data_available(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sk_buff *skb; /* check if current mapping is still valid */ if (subflow->map_valid && @@ -941,15 +929,7 @@ bool mptcp_subflow_data_available(struct sock *sk) subflow->map_data_len); } - if (!subflow_check_data_avail(sk)) { - subflow->data_avail = 0; - return false; - } - - skb = skb_peek(&sk->sk_receive_queue); - subflow->data_avail = skb && - before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq); - return subflow->data_avail; + return subflow_check_data_avail(sk); } /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, @@ -996,8 +976,10 @@ static void subflow_write_space(struct sock *sk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; - sk_stream_write_space(sk); - if (sk_stream_is_writeable(sk)) { + if (!sk_stream_is_writeable(sk)) + return; + + if (sk_stream_is_writeable(parent)) { set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); smp_mb__after_atomic(); /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ @@ -1056,8 +1038,7 @@ static void mptcp_info2sockaddr(const struct mptcp_addr_info *info, #endif } -int __mptcp_subflow_connect(struct sock *sk, int ifindex, - const struct mptcp_addr_info *loc, +int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1102,7 +1083,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, if (loc->family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - ssk->sk_bound_dev_if = ifindex; + ssk->sk_bound_dev_if = loc->ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) goto failed; @@ -1114,7 +1095,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, subflow->local_id = local_id; subflow->remote_id = remote_id; subflow->request_join = 1; - subflow->request_bkup = 1; + subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP); mptcp_info2sockaddr(remote, &addr); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 2c1593089ede..eb0e329f9b8d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -29,7 +29,6 @@ if IP_VS config IP_VS_IPV6 bool "IPv6 support for IPVS" depends on IPV6 = y || IP_VS = IPV6 - select IP6_NF_IPTABLES select NF_DEFRAG_IPV6 help Add IPv6 support to IPVS. diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 678c5b14841c..8dbfd84322a8 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2508,6 +2508,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) /* Set timeout values for (tcp tcpfin udp) */ ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); goto out_unlock; + } else if (!len) { + /* No more commands with len == 0 below */ + ret = -EINVAL; + goto out_unlock; } usvc_compat = (struct ip_vs_service_user *)arg; @@ -2584,9 +2588,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) break; case IP_VS_SO_SET_DELDEST: ret = ip_vs_del_dest(svc, &udest); - break; - default: - ret = -EINVAL; } out_unlock: diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5b97d233f89b..234b7cab37c3 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -859,7 +859,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) out: nf_conntrack_double_unlock(hash, reply_hash); - NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); return -EEXIST; } @@ -909,6 +908,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) tstamp->start = ktime_get_real_ns(); } +/* caller must hold locks to prevent concurrent changes */ static int __nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) { @@ -922,23 +922,21 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, if (nf_ct_is_dying(ct)) return NF_DROP; - if (!atomic_inc_not_zero(&ct->ct_general.use)) - return NF_DROP; - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || nf_ct_match(ct, loser_ct)) { struct net *net = nf_ct_net(ct); + nf_conntrack_get(&ct->ct_general); + nf_ct_acct_merge(ct, ctinfo, loser_ct); nf_ct_add_to_dying_list(loser_ct); nf_conntrack_put(&loser_ct->ct_general); nf_ct_set(skb, ct, ctinfo); - NF_CT_STAT_INC(net, insert_failed); + NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } - nf_ct_put(ct); return NF_DROP; } @@ -998,6 +996,8 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[repl_idx]); + + NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } @@ -1027,10 +1027,10 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) * * Failing that, the new, unconfirmed conntrack is still added to the table * provided that the collision only occurs in the ORIGINAL direction. - * The new entry will be added after the existing one in the hash list, + * The new entry will be added only in the non-clashing REPLY direction, * so packets in the ORIGINAL direction will continue to match the existing * entry. The new entry will also have a fixed timeout so it expires -- - * due to the collision, it will not see bidirectional traffic. + * due to the collision, it will only see reply traffic. * * Returns NF_DROP if the clash could not be resolved. */ @@ -1725,10 +1725,8 @@ nf_conntrack_handle_icmp(struct nf_conn *tmpl, else return NF_ACCEPT; - if (ret <= 0) { + if (ret <= 0) NF_CT_STAT_INC_ATOMIC(state->net, error); - NF_CT_STAT_INC_ATOMIC(state->net, invalid); - } return ret; } @@ -1802,10 +1800,8 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) if (tmpl || ctinfo == IP_CT_UNTRACKED) { /* Previously seen (loopback or untracked)? Ignore. */ if ((tmpl && !nf_ct_is_template(tmpl)) || - ctinfo == IP_CT_UNTRACKED) { - NF_CT_STAT_INC_ATOMIC(state->net, ignore); + ctinfo == IP_CT_UNTRACKED) return NF_ACCEPT; - } skb->_nfct = 0; } @@ -1813,7 +1809,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { pr_debug("not prepared to track yet or error occurred\n"); - NF_CT_STAT_INC_ATOMIC(state->net, error); NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c3a4214dc958..3d0fd33be018 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2497,7 +2497,6 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || - nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || nla_put_be32(skb, CTA_STATS_INSERT_FAILED, htonl(st->insert_failed)) || @@ -2505,7 +2504,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) || nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) || nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, - htonl(st->search_restart))) + htonl(st->search_restart)) || + nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, + htonl(st->clash_resolve))) goto nla_put_failure; nlmsg_end(skb, nlh); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index a604f43e3e6b..0ff39740797d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -435,11 +435,11 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, - 0, + st->clash_resolve, /* was: searched */ st->found, 0, st->invalid, - st->ignore, + 0, 0, 0, st->insert, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4603b667973a..97fb6f776114 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -650,6 +650,8 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, [NFTA_TABLE_HANDLE] = { .type = NLA_U64 }, + [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN } }; static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, @@ -676,6 +678,11 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, NFTA_TABLE_PAD)) goto nla_put_failure; + if (table->udata) { + if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata)) + goto nla_put_failure; + } + nlmsg_end(skb, nlh); return 0; @@ -988,8 +995,9 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, int family = nfmsg->nfgen_family; const struct nlattr *attr; struct nft_table *table; - u32 flags = 0; struct nft_ctx ctx; + u32 flags = 0; + u16 udlen = 0; int err; lockdep_assert_held(&net->nft.commit_mutex); @@ -1025,6 +1033,16 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table->name == NULL) goto err_strdup; + if (nla[NFTA_TABLE_USERDATA]) { + udlen = nla_len(nla[NFTA_TABLE_USERDATA]); + table->udata = kzalloc(udlen, GFP_KERNEL); + if (table->udata == NULL) + goto err_table_udata; + + nla_memcpy(table->udata, nla[NFTA_TABLE_USERDATA], udlen); + table->udlen = udlen; + } + err = rhltable_init(&table->chains_ht, &nft_chain_ht_params); if (err) goto err_chain_ht; @@ -1047,6 +1065,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, err_trans: rhltable_destroy(&table->chains_ht); err_chain_ht: + kfree(table->udata); +err_table_udata: kfree(table->name); err_strdup: kfree(table); @@ -5737,6 +5757,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, [NFTA_OBJ_HANDLE] = { .type = NLA_U64}, + [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, @@ -5884,6 +5906,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, struct nft_object *obj; struct nft_ctx ctx; u32 objtype; + u16 udlen; int err; if (!nla[NFTA_OBJ_TYPE] || @@ -5928,7 +5951,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { err = PTR_ERR(obj); - goto err1; + goto err_init; } obj->key.table = table; obj->handle = nf_tables_alloc_handle(table); @@ -5936,32 +5959,44 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); if (!obj->key.name) { err = -ENOMEM; - goto err2; + goto err_strdup; + } + + if (nla[NFTA_OBJ_USERDATA]) { + udlen = nla_len(nla[NFTA_OBJ_USERDATA]); + obj->udata = kzalloc(udlen, GFP_KERNEL); + if (obj->udata == NULL) + goto err_userdata; + + nla_memcpy(obj->udata, nla[NFTA_OBJ_USERDATA], udlen); + obj->udlen = udlen; } err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); if (err < 0) - goto err3; + goto err_trans; err = rhltable_insert(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params); if (err < 0) - goto err4; + goto err_obj_ht; list_add_tail_rcu(&obj->list, &table->objects); table->use++; return 0; -err4: +err_obj_ht: /* queued in transaction log */ INIT_LIST_HEAD(&obj->list); return err; -err3: +err_trans: kfree(obj->key.name); -err2: +err_userdata: + kfree(obj->udata); +err_strdup: if (obj->ops->destroy) obj->ops->destroy(&ctx, obj); kfree(obj); -err1: +err_init: module_put(type->owner); return err; } @@ -5993,6 +6028,10 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, NFTA_OBJ_PAD)) goto nla_put_failure; + if (obj->udata && + nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 637ce3e8c575..a28aca5124ce 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -14,6 +14,25 @@ struct nft_socket { }; }; +static void nft_socket_wildcard(const struct nft_pktinfo *pkt, + struct nft_regs *regs, struct sock *sk, + u32 *dest) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr)); + break; +#endif + default: + regs->verdict.code = NFT_BREAK; + return; + } +} + static void nft_socket_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -59,6 +78,13 @@ static void nft_socket_eval(const struct nft_expr *expr, return; } break; + case NFT_SOCKET_WILDCARD: + if (!sk_fullsock(sk)) { + regs->verdict.code = NFT_BREAK; + return; + } + nft_socket_wildcard(pkt, regs, sk, dest); + break; default: WARN_ON(1); regs->verdict.code = NFT_BREAK; @@ -97,6 +123,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY])); switch(priv->key) { case NFT_SOCKET_TRANSPARENT: + case NFT_SOCKET_WILDCARD: len = sizeof(u8); break; case NFT_SOCKET_MARK: diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index 713fb38541df..8928ec56c388 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -276,7 +276,7 @@ hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t, return 0; /* follow-up fragments don't contain ports, skip all fragments */ - if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + if (ip_is_fragment(ip)) return 0; hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info); diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 249da67d50a2..1a98247ab148 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -426,7 +426,7 @@ void calipso_doi_free(struct calipso_doi *doi_def) /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value - * @audit_secid: the LSM secid to use in the audit message + * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will @@ -595,7 +595,7 @@ int calipso_req_setattr(struct request_sock *req, /** * calipso_req_delattr - Delete the CALIPSO option from a request socket - * @reg: the request socket + * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index f73a8382c275..dc8c39f51f7d 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -612,9 +612,8 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, - " nlbl_domain=%s res=%u", - entry->domain ? entry->domain : "(default)", - ret_val == 0 ? 1 : 0); + " nlbl_domain=%s res=1", + entry->domain ? entry->domain : "(default)"); audit_log_end(audit_buf); } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d2d1448274f5..4ec4c0f72f61 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -848,7 +848,7 @@ retry: * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the - * message has has the capability @cap in the user namespace @user_ns. + * message has the capability @cap in the user namespace @user_ns. */ bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *user_ns, int cap) @@ -867,7 +867,7 @@ EXPORT_SYMBOL(__netlink_ns_capable); * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the - * message has has the capability @cap in the user namespace @user_ns. + * message has the capability @cap in the user namespace @user_ns. */ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *user_ns, int cap) @@ -883,7 +883,7 @@ EXPORT_SYMBOL(netlink_ns_capable); * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the - * message has has the capability @cap in all user namespaces. + * message has the capability @cap in all user namespaces. */ bool netlink_capable(const struct sk_buff *skb, int cap) { @@ -898,7 +898,7 @@ EXPORT_SYMBOL(netlink_capable); * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the - * message has has the capability @cap over the network namespace of + * message has the capability @cap over the network namespace of * the socket we received the message from. */ bool netlink_net_capable(const struct sk_buff *skb, int cap) @@ -1853,7 +1853,7 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct scm_cookie scm; u32 netlink_skb_flags = 0; - if (msg->msg_flags&MSG_OOB) + if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; err = scm_send(sock, msg, &scm, true); @@ -1916,7 +1916,7 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) refcount_inc(&skb->users); netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL); } - err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT); + err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT); out: scm_destroy(&scm); @@ -1929,12 +1929,12 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - int noblock = flags&MSG_DONTWAIT; + int noblock = flags & MSG_DONTWAIT; size_t copied; struct sk_buff *skb, *data_skb; int err, ret; - if (flags&MSG_OOB) + if (flags & MSG_OOB) return -EOPNOTSUPP; copied = 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 1eb65a7a27fd..3a718e327515 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -222,7 +222,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family) family->mcgrp_offset = first_id; - /* if still initializing, can't and don't need to to realloc bitmaps */ + /* if still initializing, can't and don't need to realloc bitmaps */ if (!init_net.genl_sock) return 0; diff --git a/net/netlink/policy.c b/net/netlink/policy.c index 641ffbdd977a..62f977fa645a 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -254,12 +254,6 @@ send_attribute: pt->bitfield32_valid)) goto nla_put_failure; break; - case NLA_EXACT_LEN: - type = NL_ATTR_TYPE_BINARY; - if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len) || - nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, pt->len)) - goto nla_put_failure; - break; case NLA_STRING: case NLA_NUL_STRING: case NLA_BINARY: @@ -269,14 +263,27 @@ send_attribute: type = NL_ATTR_TYPE_NUL_STRING; else type = NL_ATTR_TYPE_BINARY; - if (pt->len && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, - pt->len)) - goto nla_put_failure; - break; - case NLA_MIN_LEN: - type = NL_ATTR_TYPE_BINARY; - if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len)) + + if (pt->validation_type == NLA_VALIDATE_RANGE || + pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG) { + struct netlink_range_validation range; + + nla_get_range_unsigned(pt, &range); + + if (range.min && + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, + range.min)) + goto nla_put_failure; + + if (range.max < U16_MAX && + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, + range.max)) + goto nla_put_failure; + } else if (pt->len && + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, + pt->len)) { goto nla_put_failure; + } break; case NLA_FLAG: type = NL_ATTR_TYPE_FLAG; diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 304b1a9bb18a..5971fb6f51cc 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -38,9 +38,6 @@ #define DIGITAL_GB_BIT 0x02 -#define DIGITAL_NFC_DEP_REQ_RES_HEADROOM 2 /* SoD: [SB (NFC-A)] + LEN */ -#define DIGITAL_NFC_DEP_REQ_RES_TAILROOM 2 /* EoD: 2-byte CRC */ - #define DIGITAL_NFC_DEP_PFB_TYPE(pfb) ((pfb) & 0xE0) #define DIGITAL_NFC_DEP_PFB_TIMEOUT_BIT 0x10 diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2611657f40ca..855f2c155956 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -9,7 +9,6 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/openvswitch.h> -#include <linux/netfilter_ipv6.h> #include <linux/sctp.h> #include <linux/tcp.h> #include <linux/udp.h> @@ -742,7 +741,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int ovs_vport_output(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); struct vport *vport = data->vport; @@ -848,13 +848,9 @@ static void ovs_fragment(struct net *net, struct vport *vport, ip_do_fragment(net, skb->sk, skb, ovs_vport_output); refdst_drop(orig_dst); } else if (key->eth.type == htons(ETH_P_IPV6)) { - const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); unsigned long orig_dst; struct rt6_info ovs_rt; - if (!v6ops) - goto err; - prepare_frag(vport, skb, orig_network_offset, ovs_key_mac_proto(key)); memset(&ovs_rt, 0, sizeof(ovs_rt)); @@ -866,7 +862,7 @@ static void ovs_fragment(struct net *net, struct vport *vport, skb_dst_set_noref(skb, &ovs_rt.dst); IP6CB(skb)->frag_max_size = mru; - v6ops->fragment(net, skb->sk, skb, ovs_vport_output); + ipv6_stub->ipv6_fragment(net, skb->sk, skb, ovs_vport_output); refdst_drop(orig_dst); } else { WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", @@ -925,7 +921,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, upcall.mru = OVS_CB(skb)->mru; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { + a = nla_next(a, &rem)) { switch (nla_type(a)) { case OVS_USERSPACE_ATTR_USERDATA: upcall.userdata = a; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a3f1204f1ed2..e86b9601f5b1 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1901,8 +1901,8 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net) lockdep_ovsl_is_held()) kfree_rcu(ct_limit, rcu); } - kfree(ovs_net->ct_limit_info->limits); - kfree(ovs_net->ct_limit_info); + kfree(info->limits); + kfree(info); } static struct sk_buff * diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6e47ef7ef036..00df39b736ed 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -182,7 +182,7 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) head = vport_hash_bucket(dp, port_no); hlist_for_each_entry_rcu(vport, head, dp_hash_node, - lockdep_ovsl_is_held()) { + lockdep_ovsl_is_held()) { if (vport->port_no == port_no) return vport; } @@ -254,7 +254,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) error = ovs_execute_actions(dp, skb, sf_acts, key); if (unlikely(error)) net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", - ovs_dp_name(dp), error); + ovs_dp_name(dp), error); stats_counter = &stats->n_hit; @@ -302,7 +302,7 @@ err: static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, - uint32_t cutlen) + uint32_t cutlen) { unsigned int gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; @@ -1080,11 +1080,12 @@ error: } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ -static noinline_for_stack struct sw_flow_actions *get_flow_actions(struct net *net, - const struct nlattr *a, - const struct sw_flow_key *key, - const struct sw_flow_mask *mask, - bool log) +static noinline_for_stack +struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, + const struct sw_flow_key *key, + const struct sw_flow_mask *mask, + bool log) { struct sw_flow_actions *acts; struct sw_flow_key masked_key; @@ -1383,7 +1384,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_flow_genl_family, reply, info); } else { - netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, + PTR_ERR(reply)); } } @@ -1513,7 +1515,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, - flags, cmd); + flags, cmd); if (!ovs_header) goto error; @@ -1572,11 +1574,13 @@ static struct datapath *lookup_datapath(struct net *net, return dp ? dp : ERR_PTR(-ENODEV); } -static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) +static void ovs_dp_reset_user_features(struct sk_buff *skb, + struct genl_info *info) { struct datapath *dp; - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, + info->attrs); if (IS_ERR(dp)) return; @@ -2075,7 +2079,7 @@ static unsigned int ovs_get_max_headroom(struct datapath *dp) for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, - lockdep_ovsl_is_held()) { + lockdep_ovsl_is_held()) { dev = vport->dev; dev_headroom = netdev_get_fwd_headroom(dev); if (dev_headroom > max_headroom) @@ -2093,10 +2097,11 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) int i; dp->max_headroom = new_headroom; - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, - lockdep_ovsl_is_held()) + lockdep_ovsl_is_held()) netdev_set_rx_headroom(vport->dev, new_headroom); + } } static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) @@ -2476,13 +2481,19 @@ error: static int __net_init ovs_init_net(struct net *net) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + int err; INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance); + + err = ovs_ct_init(net); + if (err) + return err; + schedule_delayed_work(&ovs_net->masks_rebalance, msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); - return ovs_ct_init(net); + return 0; } static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, @@ -2551,7 +2562,8 @@ static int __init dp_init(void) { int err; - BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb)); + BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > + sizeof_field(struct sk_buff, cb)); pr_info("Open vSwitch switching datapath\n"); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index e2235849a57e..87c286ad660e 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -111,12 +111,16 @@ static void flow_free(struct sw_flow *flow) if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); if (flow->sf_acts) - ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); + ovs_nla_free_flow_actions((struct sw_flow_actions __force *) + flow->sf_acts); /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) + for (cpu = 0; cpu < nr_cpu_ids; + cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, (struct sw_flow_stats __force *)flow->stats[cpu]); + } + kmem_cache_free(flow_cache, flow); } @@ -164,7 +168,6 @@ static struct table_instance *table_instance_alloc(int new_size) ti->n_buckets = new_size; ti->node_ver = 0; - ti->keep_flows = false; get_random_bytes(&ti->hash_seed, sizeof(u32)); return ti; @@ -192,7 +195,7 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma) * zero based counter we store the value at reset, and subtract it * later when processing. */ - for (i = 0; i < ma->max; i++) { + for (i = 0; i < ma->max; i++) { ma->masks_usage_zero_cntr[i] = 0; for_each_possible_cpu(cpu) { @@ -273,7 +276,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl, if (ma_count >= ma->max) { err = tbl_mask_array_realloc(tbl, ma->max + - MASK_ARRAY_SIZE_MIN); + MASK_ARRAY_SIZE_MIN); if (err) return err; @@ -288,7 +291,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl, BUG_ON(ovsl_dereference(ma->masks[ma_count])); rcu_assign_pointer(ma->masks[ma_count], new); - WRITE_ONCE(ma->count, ma_count +1); + WRITE_ONCE(ma->count, ma_count + 1); return 0; } @@ -309,10 +312,10 @@ static void tbl_mask_array_del_mask(struct flow_table *tbl, return; found: - WRITE_ONCE(ma->count, ma_count -1); + WRITE_ONCE(ma->count, ma_count - 1); - rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]); - RCU_INIT_POINTER(ma->masks[ma_count -1], NULL); + rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]); + RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL); kfree_rcu(mask, rcu); @@ -448,26 +451,23 @@ free_mask_cache: static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) { - struct table_instance *ti = container_of(rcu, struct table_instance, rcu); + struct table_instance *ti; + ti = container_of(rcu, struct table_instance, rcu); __table_instance_destroy(ti); } static void table_instance_flow_free(struct flow_table *table, - struct table_instance *ti, - struct table_instance *ufid_ti, - struct sw_flow *flow, - bool count) + struct table_instance *ti, + struct table_instance *ufid_ti, + struct sw_flow *flow) { hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); - if (count) - table->count--; + table->count--; if (ovs_identifier_is_ufid(&flow->id)) { hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); - - if (count) - table->ufid_count--; + table->ufid_count--; } flow_mask_remove(table, flow->mask); @@ -480,22 +480,25 @@ void table_instance_flow_flush(struct flow_table *table, { int i; - if (ti->keep_flows) - return; - for (i = 0; i < ti->n_buckets; i++) { - struct sw_flow *flow; struct hlist_head *head = &ti->buckets[i]; struct hlist_node *n; + struct sw_flow *flow; hlist_for_each_entry_safe(flow, n, head, flow_table.node[ti->node_ver]) { table_instance_flow_free(table, ti, ufid_ti, - flow, false); + flow); ovs_flow_free(flow, true); } } + + if (WARN_ON(table->count != 0 || + table->ufid_count != 0)) { + table->count = 0; + table->ufid_count = 0; + } } static void table_instance_destroy(struct table_instance *ti, @@ -596,8 +599,6 @@ static void flow_table_copy_flows(struct table_instance *old, lockdep_ovsl_is_held()) table_instance_insert(new, flow); } - - old->keep_flows = true; } static struct table_instance *table_instance_rehash(struct table_instance *ti, @@ -632,8 +633,6 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table) rcu_assign_pointer(flow_table->ti, new_ti); rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti); flow_table->last_rehash = jiffies; - flow_table->count = 0; - flow_table->ufid_count = 0; table_instance_flow_flush(flow_table, old_ti, old_ufid_ti); table_instance_destroy(old_ti, old_ufid_ti); @@ -661,7 +660,7 @@ static int flow_key_start(const struct sw_flow_key *key) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), - sizeof(long)); + sizeof(long)); } static bool cmp_key(const struct sw_flow_key *key1, @@ -673,7 +672,7 @@ static bool cmp_key(const struct sw_flow_key *key1, long diffs = 0; int i; - for (i = key_start; i < key_end; i += sizeof(long)) + for (i = key_start; i < key_end; i += sizeof(long)) diffs |= *cp1++ ^ *cp2++; return diffs == 0; @@ -713,7 +712,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, (*n_mask_hit)++; hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver], - lockdep_ovsl_is_held()) { + lockdep_ovsl_is_held()) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) return flow; @@ -897,7 +896,8 @@ static bool ovs_flow_cmp_ufid(const struct sw_flow *flow, return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len); } -bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match) +bool ovs_flow_cmp(const struct sw_flow *flow, + const struct sw_flow_match *match) { if (ovs_identifier_is_ufid(&flow->id)) return flow_cmp_masked_key(flow, match->key, &match->range); @@ -916,7 +916,7 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, hash = ufid_hash(ufid); head = find_bucket(ti, hash); hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver], - lockdep_ovsl_is_held()) { + lockdep_ovsl_is_held()) { if (flow->ufid_table.hash == hash && ovs_flow_cmp_ufid(flow, ufid)) return flow; @@ -950,7 +950,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); BUG_ON(table->count == 0); - table_instance_flow_free(table, ti, ufid_ti, flow, true); + table_instance_flow_free(table, ti, ufid_ti, flow); } static struct sw_flow_mask *mask_alloc(void) @@ -1107,7 +1107,7 @@ void ovs_flow_masks_rebalance(struct flow_table *table) if (!masks_and_count) return; - for (i = 0; i < ma->max; i++) { + for (i = 0; i < ma->max; i++) { struct sw_flow_mask *mask; unsigned int start; int cpu; diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 6e7d4ac59353..d8fb7a3a3dfd 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -53,7 +53,6 @@ struct table_instance { struct rcu_head rcu; int node_ver; u32 hash_seed; - bool keep_flows; }; struct flow_table { diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 0d44c5c013fa..82d801f063b7 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -98,7 +98,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node, - lockdep_ovsl_is_held()) + lockdep_ovsl_is_held()) if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -118,7 +118,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) * vport_free(). */ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, - const struct vport_parms *parms) + const struct vport_parms *parms) { struct vport *vport; size_t alloc_size; @@ -397,7 +397,8 @@ int ovs_vport_get_upcall_portids(const struct vport *vport, * * Returns the portid of the target socket. Must be called with rcu_read_lock. */ -u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) +u32 ovs_vport_find_upcall_portid(const struct vport *vport, + struct sk_buff *skb) { struct vport_portids *ids; u32 ids_index; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2b33e977a905..cefbd50c1090 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -93,52 +93,56 @@ /* Assumptions: - - if device has no dev->hard_header routine, it adds and removes ll header - inside itself. In this case ll header is invisible outside of device, - but higher levels still should reserve dev->hard_header_len. - Some devices are enough clever to reallocate skb, when header - will not fit to reserved space (tunnel), another ones are silly - (PPP). + - If the device has no dev->header_ops, there is no LL header visible + above the device. In this case, its hard_header_len should be 0. + The device may prepend its own header internally. In this case, its + needed_headroom should be set to the space needed for it to add its + internal header. + For example, a WiFi driver pretending to be an Ethernet driver should + set its hard_header_len to be the Ethernet header length, and set its + needed_headroom to be (the real WiFi header length - the fake Ethernet + header length). - packet socket receives packets with pulled ll header, so that SOCK_RAW should push it back. On receive: ----------- -Incoming, dev->hard_header!=NULL +Incoming, dev->header_ops != NULL mac_header -> ll header data -> data -Outgoing, dev->hard_header!=NULL +Outgoing, dev->header_ops != NULL mac_header -> ll header data -> ll header -Incoming, dev->hard_header==NULL - mac_header -> UNKNOWN position. It is very likely, that it points to ll - header. PPP makes it, that is wrong, because introduce - assymetry between rx and tx paths. +Incoming, dev->header_ops == NULL + mac_header -> data + However drivers often make it point to the ll header. + This is incorrect because the ll header should be invisible to us. data -> data -Outgoing, dev->hard_header==NULL - mac_header -> data. ll header is still not built! +Outgoing, dev->header_ops == NULL + mac_header -> data. ll header is invisible to us. data -> data Resume - If dev->hard_header==NULL we are unlikely to restore sensible ll header. + If dev->header_ops == NULL we are unable to restore the ll header, + because it is invisible to us. On transmit: ------------ -dev->hard_header != NULL +dev->header_ops != NULL mac_header -> ll header data -> ll header -dev->hard_header == NULL (ll header is added by device, we cannot control it) +dev->header_ops == NULL (ll header is invisible to us) mac_header -> data data -> data - We should set nh.raw on output to correct posistion, + We should set network_header on output to the correct position, packet classifier depends on it. */ @@ -177,7 +181,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) -#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x))) struct packet_sock; static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/rds/cong.c b/net/rds/cong.c index ccdff09a79c8..8b689ebbd5b5 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -236,7 +236,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) * tcp_setsockopt and/or tcp_sendmsg will deadlock * when it tries to get the sock_lock()) * 2. Interrupts are masked so that we can mark the - * the port congested from both send and recv paths. + * port congested from both send and recv paths. * (See comment around declaration of rdc_cong_lock). * An attempt to get the sock_lock() here will * therefore trigger warnings. diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index c3319ff3ee11..06603dd1c8aa 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -711,7 +711,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) * original size. The only way to tell the difference is by looking at * the contents, which are initialized to zero. * If the protocol version fields aren't set, this is a connection attempt - * from an older version. This could could be 3.0 or 2.0 - we can't tell. + * from an older version. This could be 3.0 or 2.0 - we can't tell. * We really should have changed this for OFED 1.3 :-( */ diff --git a/net/rds/rdma.c b/net/rds/rdma.c index ccdd304eae0a..1d0afb1dd77b 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -269,7 +269,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } else { nents = ret; - sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL); if (!sg) { ret = -ENOMEM; goto out; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 186c8a889b16..0a2f4817ec6c 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -308,9 +308,10 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, key = NULL; /* a no-security key */ memset(&p, 0, sizeof(p)); - p.user_call_ID = user_call_ID; - p.tx_total_len = tx_total_len; - p.interruptibility = interruptibility; + p.user_call_ID = user_call_ID; + p.tx_total_len = tx_total_len; + p.interruptibility = interruptibility; + p.kernel = true; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 884cff7bb169..19f714386654 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -76,14 +76,12 @@ struct rxrpc_net { struct work_struct service_conn_reaper; struct timer_list service_conn_reap_timer; - unsigned int nr_client_conns; - unsigned int nr_active_client_conns; - bool kill_all_client_conns; bool live; + + bool kill_all_client_conns; + atomic_t nr_client_conns; spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */ - struct list_head waiting_client_conns; - struct list_head active_client_conns; struct list_head idle_client_conns; struct work_struct client_conn_reaper; struct timer_list client_conn_reap_timer; @@ -275,8 +273,8 @@ struct rxrpc_local { struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ - struct rb_root client_conns; /* Client connections by socket params */ - spinlock_t client_conns_lock; /* Lock for client_conns */ + struct rb_root client_bundles; /* Client connection bundles by socket params */ + spinlock_t client_bundles_lock; /* Lock for client_bundles */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ @@ -353,10 +351,7 @@ struct rxrpc_conn_parameters { enum rxrpc_conn_flag { RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ - RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */ - RXRPC_CONN_EXPOSED, /* Conn has extra ref for exposure */ RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ - RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */ RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */ RXRPC_CONN_FINAL_ACK_0, /* Need final ACK for channel 0 */ RXRPC_CONN_FINAL_ACK_1, /* Need final ACK for channel 1 */ @@ -377,19 +372,6 @@ enum rxrpc_conn_event { }; /* - * The connection cache state. - */ -enum rxrpc_conn_cache_state { - RXRPC_CONN_CLIENT_INACTIVE, /* Conn is not yet listed */ - RXRPC_CONN_CLIENT_WAITING, /* Conn is on wait list, waiting for capacity */ - RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */ - RXRPC_CONN_CLIENT_UPGRADE, /* Conn is on active list, probing for upgrade */ - RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */ - RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */ - RXRPC_CONN__NR_CACHE_STATES -}; - -/* * The connection protocol state. */ enum rxrpc_conn_proto_state { @@ -405,6 +387,23 @@ enum rxrpc_conn_proto_state { }; /* + * RxRPC client connection bundle. + */ +struct rxrpc_bundle { + struct rxrpc_conn_parameters params; + atomic_t usage; + unsigned int debug_id; + bool try_upgrade; /* True if the bundle is attempting upgrade */ + bool alloc_conn; /* True if someone's getting a conn */ + short alloc_error; /* Error from last conn allocation */ + spinlock_t channel_lock; + struct rb_node local_node; /* Node in local->client_conns */ + struct list_head waiting_calls; /* Calls waiting for channels */ + unsigned long avail_chans; /* Mask of available channels */ + struct rxrpc_connection *conns[4]; /* The connections in the bundle (max 4) */ +}; + +/* * RxRPC connection definition * - matched by { local, peer, epoch, conn_id, direction } * - each connection can only handle four simultaneous calls @@ -417,10 +416,7 @@ struct rxrpc_connection { struct rcu_head rcu; struct list_head cache_link; - spinlock_t channel_lock; - unsigned char active_chans; /* Mask of active channels */ -#define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1) - struct list_head waiting_calls; /* Calls waiting for channels */ + unsigned char act_chans; /* Mask of active channels */ struct rxrpc_channel { unsigned long final_ack_at; /* Time at which to issue final ACK */ struct rxrpc_call __rcu *call; /* Active call */ @@ -437,10 +433,8 @@ struct rxrpc_connection { struct timer_list timer; /* Conn event timer */ struct work_struct processor; /* connection event processor */ - union { - struct rb_node client_node; /* Node in local->client_conns */ - struct rb_node service_node; /* Node in peer->service_conns */ - }; + struct rxrpc_bundle *bundle; /* Client connection bundle */ + struct rb_node service_node; /* Node in peer->service_conns */ struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ @@ -452,7 +446,6 @@ struct rxrpc_connection { unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ spinlock_t state_lock; /* state-change lock */ - enum rxrpc_conn_cache_state cache_state; enum rxrpc_conn_proto_state state; /* current state of connection */ u32 abort_code; /* Abort code of connection abort */ int debug_id; /* debug ID for printks */ @@ -464,6 +457,7 @@ struct rxrpc_connection { u8 security_size; /* security header size */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ + u8 bundle_shift; /* Index into bundle->avail_chans */ short error; /* Local error code */ }; @@ -493,6 +487,8 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ + RXRPC_CALL_KERNEL, /* The call was made by the kernel */ + RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */ }; /* @@ -577,7 +573,7 @@ struct rxrpc_call { struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ - struct list_head chan_wait_link; /* Link in conn->waiting_calls */ + struct list_head chan_wait_link; /* Link in conn->bundle->waiting_calls */ struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* Link in rx->acceptq */ struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ @@ -727,6 +723,7 @@ struct rxrpc_call_params { u32 normal; /* Max time since last call packet (msec) */ } timeouts; u8 nr_timeouts; /* Number of timeouts specified */ + bool kernel; /* T if kernel is making the call */ enum rxrpc_interruptibility interruptibility; /* How is interruptible is the call? */ }; @@ -815,18 +812,19 @@ static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) /* * conn_client.c */ -extern unsigned int rxrpc_max_client_connections; extern unsigned int rxrpc_reap_client_connections; extern unsigned long rxrpc_conn_idle_client_expiry; extern unsigned long rxrpc_conn_idle_client_fast_expiry; extern struct idr rxrpc_client_conn_ids; void rxrpc_destroy_client_conn_ids(void); +struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *); +void rxrpc_put_bundle(struct rxrpc_bundle *); int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, gfp_t); void rxrpc_expose_client_call(struct rxrpc_call *); -void rxrpc_disconnect_client_call(struct rxrpc_call *); +void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); void rxrpc_put_client_conn(struct rxrpc_connection *); void rxrpc_discard_expired_client_conns(struct work_struct *); void rxrpc_destroy_all_client_connections(struct rxrpc_net *); @@ -852,7 +850,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); bool rxrpc_queue_conn(struct rxrpc_connection *); void rxrpc_see_connection(struct rxrpc_connection *); -void rxrpc_get_connection(struct rxrpc_connection *); +struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *); struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *); void rxrpc_put_service_conn(struct rxrpc_connection *); void rxrpc_service_connection_reaper(struct work_struct *); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index a40fae013942..c8015c76a81c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -41,6 +41,11 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { struct kmem_cache *rxrpc_call_jar; +static struct semaphore rxrpc_call_limiter = + __SEMAPHORE_INITIALIZER(rxrpc_call_limiter, 1000); +static struct semaphore rxrpc_kernel_call_limiter = + __SEMAPHORE_INITIALIZER(rxrpc_kernel_call_limiter, 1000); + static void rxrpc_call_timer_expired(struct timer_list *t) { struct rxrpc_call *call = from_timer(call, t, timer); @@ -210,6 +215,34 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) } /* + * Wait for a call slot to become available. + */ +static struct semaphore *rxrpc_get_call_slot(struct rxrpc_call_params *p, gfp_t gfp) +{ + struct semaphore *limiter = &rxrpc_call_limiter; + + if (p->kernel) + limiter = &rxrpc_kernel_call_limiter; + if (p->interruptibility == RXRPC_UNINTERRUPTIBLE) { + down(limiter); + return limiter; + } + return down_interruptible(limiter) < 0 ? NULL : limiter; +} + +/* + * Release a call slot. + */ +static void rxrpc_put_call_slot(struct rxrpc_call *call) +{ + struct semaphore *limiter = &rxrpc_call_limiter; + + if (test_bit(RXRPC_CALL_KERNEL, &call->flags)) + limiter = &rxrpc_kernel_call_limiter; + up(limiter); +} + +/* * Set up a call for the given parameters. * - Called with the socket lock held, which it must release. * - If it returns a call, the call's lock will need releasing by the caller. @@ -225,15 +258,21 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, { struct rxrpc_call *call, *xcall; struct rxrpc_net *rxnet; + struct semaphore *limiter; struct rb_node *parent, **pp; const void *here = __builtin_return_address(0); int ret; _enter("%p,%lx", rx, p->user_call_ID); + limiter = rxrpc_get_call_slot(p, gfp); + if (!limiter) + return ERR_PTR(-ERESTARTSYS); + call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id); if (IS_ERR(call)) { release_sock(&rx->sk); + up(limiter); _leave(" = %ld", PTR_ERR(call)); return call; } @@ -243,6 +282,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, trace_rxrpc_call(call->debug_id, rxrpc_call_new_client, atomic_read(&call->usage), here, (const void *)p->user_call_ID); + if (p->kernel) + __set_bit(RXRPC_CALL_KERNEL, &call->flags); /* We need to protect a partially set up call against the user as we * will be acting outside the socket lock. @@ -471,6 +512,8 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) BUG(); spin_unlock_bh(&call->lock); + rxrpc_put_call_slot(call); + del_timer_sync(&call->timer); /* Make sure we don't get any more notifications */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 159e3eda7914..78c845a4f1ad 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -1,63 +1,15 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* Client connection-specific management code. * - * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2016, 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * Client connections need to be cached for a little while after they've made a * call so as to handle retransmitted DATA packets in case the server didn't * receive the final ACK or terminating ABORT we sent it. * - * Client connections can be in one of a number of cache states: - * - * (1) INACTIVE - The connection is not held in any list and may not have been - * exposed to the world. If it has been previously exposed, it was - * discarded from the idle list after expiring. - * - * (2) WAITING - The connection is waiting for the number of client conns to - * drop below the maximum capacity. Calls may be in progress upon it from - * when it was active and got culled. - * - * The connection is on the rxrpc_waiting_client_conns list which is kept - * in to-be-granted order. Culled conns with waiters go to the back of - * the queue just like new conns. - * - * (3) ACTIVE - The connection has at least one call in progress upon it, it - * may freely grant available channels to new calls and calls may be - * waiting on it for channels to become available. - * - * The connection is on the rxnet->active_client_conns list which is kept - * in activation order for culling purposes. - * - * rxrpc_nr_active_client_conns is held incremented also. - * - * (4) UPGRADE - As for ACTIVE, but only one call may be in progress and is - * being used to probe for service upgrade. - * - * (5) CULLED - The connection got summarily culled to try and free up - * capacity. Calls currently in progress on the connection are allowed to - * continue, but new calls will have to wait. There can be no waiters in - * this state - the conn would have to go to the WAITING state instead. - * - * (6) IDLE - The connection has no calls in progress upon it and must have - * been exposed to the world (ie. the EXPOSED flag must be set). When it - * expires, the EXPOSED flag is cleared and the connection transitions to - * the INACTIVE state. - * - * The connection is on the rxnet->idle_client_conns list which is kept in - * order of how soon they'll expire. - * * There are flags of relevance to the cache: * - * (1) EXPOSED - The connection ID got exposed to the world. If this flag is - * set, an extra ref is added to the connection preventing it from being - * reaped when it has no calls outstanding. This flag is cleared and the - * ref dropped when a conn is discarded from the idle list. - * - * This allows us to move terminal call state retransmission to the - * connection and to discard the call immediately we think it is done - * with. It also give us a chance to reuse the connection. - * * (2) DONT_REUSE - The connection should be discarded as soon as possible and * should not be reused. This is set when an exclusive connection is used * or a call ID counter overflows. @@ -78,7 +30,6 @@ #include "ar-internal.h" -__read_mostly unsigned int rxrpc_max_client_connections = 1000; __read_mostly unsigned int rxrpc_reap_client_connections = 900; __read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; @@ -89,8 +40,6 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; DEFINE_IDR(rxrpc_client_conn_ids); static DEFINE_SPINLOCK(rxrpc_conn_id_lock); -static void rxrpc_cull_active_client_conns(struct rxrpc_net *); - /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The @@ -162,13 +111,50 @@ void rxrpc_destroy_client_conn_ids(void) } /* + * Allocate a connection bundle. + */ +static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, + gfp_t gfp) +{ + struct rxrpc_bundle *bundle; + + bundle = kzalloc(sizeof(*bundle), gfp); + if (bundle) { + bundle->params = *cp; + rxrpc_get_peer(bundle->params.peer); + atomic_set(&bundle->usage, 1); + spin_lock_init(&bundle->channel_lock); + INIT_LIST_HEAD(&bundle->waiting_calls); + } + return bundle; +} + +struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle) +{ + atomic_inc(&bundle->usage); + return bundle; +} + +void rxrpc_put_bundle(struct rxrpc_bundle *bundle) +{ + unsigned int d = bundle->debug_id; + unsigned int u = atomic_dec_return(&bundle->usage); + + _debug("PUT B=%x %u", d, u); + if (u == 0) { + rxrpc_put_peer(bundle->params.peer); + kfree(bundle); + } +} + +/* * Allocate a client connection. */ static struct rxrpc_connection * -rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) +rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) { struct rxrpc_connection *conn; - struct rxrpc_net *rxnet = cp->local->rxnet; + struct rxrpc_net *rxnet = bundle->params.local->rxnet; int ret; _enter(""); @@ -180,15 +166,11 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) } atomic_set(&conn->usage, 1); - if (cp->exclusive) - __set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - if (cp->upgrade) - __set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags); - - conn->params = *cp; + conn->bundle = bundle; + conn->params = bundle->params; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->state = RXRPC_CONN_CLIENT; - conn->service_id = cp->service_id; + conn->service_id = conn->params.service_id; ret = rxrpc_get_client_connection_id(conn, gfp); if (ret < 0) @@ -207,14 +189,16 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); - /* We steal the caller's peer ref. */ - cp->peer = NULL; + rxrpc_get_bundle(bundle); + rxrpc_get_peer(conn->params.peer); rxrpc_get_local(conn->params.local); key_get(conn->params.key); trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client, atomic_read(&conn->usage), __builtin_return_address(0)); + + atomic_inc(&rxnet->nr_client_conns); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); return conn; @@ -234,13 +218,18 @@ error_0: */ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) { - struct rxrpc_net *rxnet = conn->params.local->rxnet; + struct rxrpc_net *rxnet; int id_cursor, id, distance, limit; + if (!conn) + goto dont_reuse; + + rxnet = conn->params.local->rxnet; if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) goto dont_reuse; - if (conn->proto.epoch != rxnet->epoch) + if (conn->state != RXRPC_CONN_CLIENT || + conn->proto.epoch != rxnet->epoch) goto mark_dont_reuse; /* The IDR tree gets very expensive on memory if the connection IDs are @@ -254,7 +243,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) distance = id - id_cursor; if (distance < 0) distance = -distance; - limit = max(rxrpc_max_client_connections * 4, 1024U); + limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024); if (distance > limit) goto mark_dont_reuse; @@ -267,277 +256,247 @@ dont_reuse: } /* - * Create or find a client connection to use for a call. - * - * If we return with a connection, the call will be on its waiting list. It's - * left to the caller to assign a channel and wake up the call. + * Look up the conn bundle that matches the connection parameters, adding it if + * it doesn't yet exist. */ -static int rxrpc_get_client_conn(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *cp, + gfp_t gfp) { - struct rxrpc_connection *conn, *candidate = NULL; + static atomic_t rxrpc_bundle_id; + struct rxrpc_bundle *bundle, *candidate; struct rxrpc_local *local = cp->local; struct rb_node *p, **pp, *parent; long diff; - int ret = -ENOMEM; - _enter("{%d,%lx},", call->debug_id, call->user_call_ID); + _enter("{%px,%x,%u,%u}", + cp->peer, key_serial(cp->key), cp->security_level, cp->upgrade); - cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp); - if (!cp->peer) - goto error; + if (cp->exclusive) + return rxrpc_alloc_bundle(cp, gfp); - call->cong_cwnd = cp->peer->cong_cwnd; - if (call->cong_cwnd >= call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - else - call->cong_mode = RXRPC_CALL_SLOW_START; + /* First, see if the bundle is already there. */ + _debug("search 1"); + spin_lock(&local->client_bundles_lock); + p = local->client_bundles.rb_node; + while (p) { + bundle = rb_entry(p, struct rxrpc_bundle, local_node); - /* If the connection is not meant to be exclusive, search the available - * connections to see if the connection we want to use already exists. - */ - if (!cp->exclusive) { - _debug("search 1"); - spin_lock(&local->client_conns_lock); - p = local->client_conns.rb_node; - while (p) { - conn = rb_entry(p, struct rxrpc_connection, client_node); - -#define cmp(X) ((long)conn->params.X - (long)cp->X) - diff = (cmp(peer) ?: - cmp(key) ?: - cmp(security_level) ?: - cmp(upgrade)); +#define cmp(X) ((long)bundle->params.X - (long)cp->X) + diff = (cmp(peer) ?: + cmp(key) ?: + cmp(security_level) ?: + cmp(upgrade)); #undef cmp - if (diff < 0) { - p = p->rb_left; - } else if (diff > 0) { - p = p->rb_right; - } else { - if (rxrpc_may_reuse_conn(conn) && - rxrpc_get_connection_maybe(conn)) - goto found_extant_conn; - /* The connection needs replacing. It's better - * to effect that when we have something to - * replace it with so that we don't have to - * rebalance the tree twice. - */ - break; - } - } - spin_unlock(&local->client_conns_lock); - } - - /* There wasn't a connection yet or we need an exclusive connection. - * We need to create a candidate and then potentially redo the search - * in case we're racing with another thread also trying to connect on a - * shareable connection. - */ - _debug("new conn"); - candidate = rxrpc_alloc_client_connection(cp, gfp); - if (IS_ERR(candidate)) { - ret = PTR_ERR(candidate); - goto error_peer; + if (diff < 0) + p = p->rb_left; + else if (diff > 0) + p = p->rb_right; + else + goto found_bundle; } + spin_unlock(&local->client_bundles_lock); + _debug("not found"); - /* Add the call to the new connection's waiting list in case we're - * going to have to wait for the connection to come live. It's our - * connection, so we want first dibs on the channel slots. We would - * normally have to take channel_lock but we do this before anyone else - * can see the connection. - */ - list_add(&call->chan_wait_link, &candidate->waiting_calls); - - if (cp->exclusive) { - call->conn = candidate; - call->security = candidate->security; - call->security_ix = candidate->security_ix; - call->service_id = candidate->service_id; - _leave(" = 0 [exclusive %d]", candidate->debug_id); - return 0; - } + /* It wasn't. We need to add one. */ + candidate = rxrpc_alloc_bundle(cp, gfp); + if (!candidate) + return NULL; - /* Publish the new connection for userspace to find. We need to redo - * the search before doing this lest we race with someone else adding a - * conflicting instance. - */ _debug("search 2"); - spin_lock(&local->client_conns_lock); - - pp = &local->client_conns.rb_node; + spin_lock(&local->client_bundles_lock); + pp = &local->client_bundles.rb_node; parent = NULL; while (*pp) { parent = *pp; - conn = rb_entry(parent, struct rxrpc_connection, client_node); + bundle = rb_entry(parent, struct rxrpc_bundle, local_node); -#define cmp(X) ((long)conn->params.X - (long)candidate->params.X) +#define cmp(X) ((long)bundle->params.X - (long)cp->X) diff = (cmp(peer) ?: cmp(key) ?: cmp(security_level) ?: cmp(upgrade)); #undef cmp - if (diff < 0) { + if (diff < 0) pp = &(*pp)->rb_left; - } else if (diff > 0) { + else if (diff > 0) pp = &(*pp)->rb_right; - } else { - if (rxrpc_may_reuse_conn(conn) && - rxrpc_get_connection_maybe(conn)) - goto found_extant_conn; - /* The old connection is from an outdated epoch. */ - _debug("replace conn"); - clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags); - rb_replace_node(&conn->client_node, - &candidate->client_node, - &local->client_conns); - trace_rxrpc_client(conn, -1, rxrpc_client_replace); - goto candidate_published; - } + else + goto found_bundle_free; } - _debug("new conn"); - rb_link_node(&candidate->client_node, parent, pp); - rb_insert_color(&candidate->client_node, &local->client_conns); - -candidate_published: - set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); - call->conn = candidate; - call->security = candidate->security; - call->security_ix = candidate->security_ix; - call->service_id = candidate->service_id; - spin_unlock(&local->client_conns_lock); - _leave(" = 0 [new %d]", candidate->debug_id); - return 0; + _debug("new bundle"); + candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id); + rb_link_node(&candidate->local_node, parent, pp); + rb_insert_color(&candidate->local_node, &local->client_bundles); + rxrpc_get_bundle(candidate); + spin_unlock(&local->client_bundles_lock); + _leave(" = %u [new]", candidate->debug_id); + return candidate; + +found_bundle_free: + kfree(candidate); +found_bundle: + rxrpc_get_bundle(bundle); + spin_unlock(&local->client_bundles_lock); + _leave(" = %u [found]", bundle->debug_id); + return bundle; +} - /* We come here if we found a suitable connection already in existence. - * Discard any candidate we may have allocated, and try to get a - * channel on this one. - */ -found_extant_conn: - _debug("found conn"); - spin_unlock(&local->client_conns_lock); +/* + * Create or find a client bundle to use for a call. + * + * If we return with a connection, the call will be on its waiting list. It's + * left to the caller to assign a channel and wake up the call. + */ +static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + struct rxrpc_bundle *bundle; - if (candidate) { - trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); - rxrpc_put_connection(candidate); - candidate = NULL; - } + _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - spin_lock(&conn->channel_lock); - call->conn = conn; - call->security = conn->security; - call->security_ix = conn->security_ix; - call->service_id = conn->service_id; - list_add_tail(&call->chan_wait_link, &conn->waiting_calls); - spin_unlock(&conn->channel_lock); - _leave(" = 0 [extant %d]", conn->debug_id); - return 0; + cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp); + if (!cp->peer) + goto error; + + call->cong_cwnd = cp->peer->cong_cwnd; + if (call->cong_cwnd >= call->cong_ssthresh) + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + else + call->cong_mode = RXRPC_CALL_SLOW_START; + if (cp->upgrade) + __set_bit(RXRPC_CALL_UPGRADE, &call->flags); + + /* Find the client connection bundle. */ + bundle = rxrpc_look_up_bundle(cp, gfp); + if (!bundle) + goto error; + + /* Get this call queued. Someone else may activate it whilst we're + * lining up a new connection, but that's fine. + */ + spin_lock(&bundle->channel_lock); + list_add_tail(&call->chan_wait_link, &bundle->waiting_calls); + spin_unlock(&bundle->channel_lock); + + _leave(" = [B=%x]", bundle->debug_id); + return bundle; -error_peer: - rxrpc_put_peer(cp->peer); - cp->peer = NULL; error: - _leave(" = %d", ret); - return ret; + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); } /* - * Activate a connection. + * Allocate a new connection and add it into a bundle. */ -static void rxrpc_activate_conn(struct rxrpc_net *rxnet, - struct rxrpc_connection *conn) +static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) + __releases(bundle->channel_lock) { - if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) { - trace_rxrpc_client(conn, -1, rxrpc_client_to_upgrade); - conn->cache_state = RXRPC_CONN_CLIENT_UPGRADE; - } else { - trace_rxrpc_client(conn, -1, rxrpc_client_to_active); - conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; - } - rxnet->nr_active_client_conns++; - list_move_tail(&conn->cache_link, &rxnet->active_client_conns); -} + struct rxrpc_connection *candidate = NULL, *old = NULL; + bool conflict; + int i; -/* - * Attempt to animate a connection for a new call. - * - * If it's not exclusive, the connection is in the endpoint tree, and we're in - * the conn's list of those waiting to grab a channel. There is, however, a - * limit on the number of live connections allowed at any one time, so we may - * have to wait for capacity to become available. - * - * Note that a connection on the waiting queue might *also* have active - * channels if it has been culled to make space and then re-requested by a new - * call. - */ -static void rxrpc_animate_client_conn(struct rxrpc_net *rxnet, - struct rxrpc_connection *conn) -{ - unsigned int nr_conns; + _enter(""); - _enter("%d,%d", conn->debug_id, conn->cache_state); + conflict = bundle->alloc_conn; + if (!conflict) + bundle->alloc_conn = true; + spin_unlock(&bundle->channel_lock); + if (conflict) { + _leave(" [conf]"); + return; + } - if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE || - conn->cache_state == RXRPC_CONN_CLIENT_UPGRADE) - goto out; + candidate = rxrpc_alloc_client_connection(bundle, gfp); - spin_lock(&rxnet->client_conn_cache_lock); + spin_lock(&bundle->channel_lock); + bundle->alloc_conn = false; - nr_conns = rxnet->nr_client_conns; - if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) { - trace_rxrpc_client(conn, -1, rxrpc_client_count); - rxnet->nr_client_conns = nr_conns + 1; + if (IS_ERR(candidate)) { + bundle->alloc_error = PTR_ERR(candidate); + spin_unlock(&bundle->channel_lock); + _leave(" [err %ld]", PTR_ERR(candidate)); + return; } - switch (conn->cache_state) { - case RXRPC_CONN_CLIENT_ACTIVE: - case RXRPC_CONN_CLIENT_UPGRADE: - case RXRPC_CONN_CLIENT_WAITING: - break; - - case RXRPC_CONN_CLIENT_INACTIVE: - case RXRPC_CONN_CLIENT_CULLED: - case RXRPC_CONN_CLIENT_IDLE: - if (nr_conns >= rxrpc_max_client_connections) - goto wait_for_capacity; - goto activate_conn; + bundle->alloc_error = 0; + + for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) { + unsigned int shift = i * RXRPC_MAXCALLS; + int j; + + old = bundle->conns[i]; + if (!rxrpc_may_reuse_conn(old)) { + if (old) + trace_rxrpc_client(old, -1, rxrpc_client_replace); + candidate->bundle_shift = shift; + bundle->conns[i] = candidate; + for (j = 0; j < RXRPC_MAXCALLS; j++) + set_bit(shift + j, &bundle->avail_chans); + candidate = NULL; + break; + } - default: - BUG(); + old = NULL; } -out_unlock: - spin_unlock(&rxnet->client_conn_cache_lock); -out: - _leave(" [%d]", conn->cache_state); - return; + spin_unlock(&bundle->channel_lock); -activate_conn: - _debug("activate"); - rxrpc_activate_conn(rxnet, conn); - goto out_unlock; - -wait_for_capacity: - _debug("wait"); - trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); - conn->cache_state = RXRPC_CONN_CLIENT_WAITING; - list_move_tail(&conn->cache_link, &rxnet->waiting_client_conns); - goto out_unlock; + if (candidate) { + _debug("discard C=%x", candidate->debug_id); + trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); + rxrpc_put_connection(candidate); + } + + rxrpc_put_connection(old); + _leave(""); } /* - * Deactivate a channel. + * Add a connection to a bundle if there are no usable connections or we have + * connections waiting for extra capacity. */ -static void rxrpc_deactivate_one_channel(struct rxrpc_connection *conn, - unsigned int channel) +static void rxrpc_maybe_add_conn(struct rxrpc_bundle *bundle, gfp_t gfp) { - struct rxrpc_channel *chan = &conn->channels[channel]; + struct rxrpc_call *call; + int i, usable; - rcu_assign_pointer(chan->call, NULL); - conn->active_chans &= ~(1 << channel); + _enter(""); + + spin_lock(&bundle->channel_lock); + + /* See if there are any usable connections. */ + usable = 0; + for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) + if (rxrpc_may_reuse_conn(bundle->conns[i])) + usable++; + + if (!usable && !list_empty(&bundle->waiting_calls)) { + call = list_first_entry(&bundle->waiting_calls, + struct rxrpc_call, chan_wait_link); + if (test_bit(RXRPC_CALL_UPGRADE, &call->flags)) + bundle->try_upgrade = true; + } + + if (!usable) + goto alloc_conn; + + if (!bundle->avail_chans && + !bundle->try_upgrade && + !list_empty(&bundle->waiting_calls) && + usable < ARRAY_SIZE(bundle->conns)) + goto alloc_conn; + + spin_unlock(&bundle->channel_lock); + _leave(""); + return; + +alloc_conn: + return rxrpc_add_conn_to_bundle(bundle, gfp); } /* @@ -549,35 +508,42 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, unsigned int channel) { struct rxrpc_channel *chan = &conn->channels[channel]; - struct rxrpc_call *call = list_entry(conn->waiting_calls.next, + struct rxrpc_bundle *bundle = conn->bundle; + struct rxrpc_call *call = list_entry(bundle->waiting_calls.next, struct rxrpc_call, chan_wait_link); u32 call_id = chan->call_counter + 1; + _enter("C=%x,%u", conn->debug_id, channel); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); /* Cancel the final ACK on the previous call if it hasn't been sent yet * as the DATA packet will implicitly ACK it. */ clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); - - write_lock_bh(&call->state_lock); - call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; - write_unlock_bh(&call->state_lock); + clear_bit(conn->bundle_shift + channel, &bundle->avail_chans); rxrpc_see_call(call); list_del_init(&call->chan_wait_link); - conn->active_chans |= 1 << channel; call->peer = rxrpc_get_peer(conn->params.peer); + call->conn = rxrpc_get_connection(conn); call->cid = conn->proto.cid | channel; call->call_id = call_id; + call->security = conn->security; + call->security_ix = conn->security_ix; + call->service_id = conn->service_id; trace_rxrpc_connect_call(call); _net("CONNECT call %08x:%08x as call %d on conn %d", call->cid, call->call_id, call->debug_id, conn->debug_id); - /* Paired with the read barrier in rxrpc_wait_for_channel(). This - * orders cid and epoch in the connection wrt to call_id without the - * need to take the channel_lock. + write_lock_bh(&call->state_lock); + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + write_unlock_bh(&call->state_lock); + + /* Paired with the read barrier in rxrpc_connect_call(). This orders + * cid and epoch in the connection wrt to call_id without the need to + * take the channel_lock. * * We provisionally assign a callNumber at this point, but we don't * confirm it until the call is about to be exposed. @@ -586,101 +552,137 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, * at the call ID through a connection channel. */ smp_wmb(); - chan->call_id = call_id; - chan->call_debug_id = call->debug_id; + + chan->call_id = call_id; + chan->call_debug_id = call->debug_id; rcu_assign_pointer(chan->call, call); wake_up(&call->waitq); } /* + * Remove a connection from the idle list if it's on it. + */ +static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connection *conn) +{ + struct rxrpc_net *rxnet = bundle->params.local->rxnet; + bool drop_ref; + + if (!list_empty(&conn->cache_link)) { + drop_ref = false; + spin_lock(&rxnet->client_conn_cache_lock); + if (!list_empty(&conn->cache_link)) { + list_del_init(&conn->cache_link); + drop_ref = true; + } + spin_unlock(&rxnet->client_conn_cache_lock); + if (drop_ref) + rxrpc_put_connection(conn); + } +} + +/* * Assign channels and callNumbers to waiting calls with channel_lock * held by caller. */ -static void rxrpc_activate_channels_locked(struct rxrpc_connection *conn) +static void rxrpc_activate_channels_locked(struct rxrpc_bundle *bundle) { - u8 avail, mask; - - switch (conn->cache_state) { - case RXRPC_CONN_CLIENT_ACTIVE: - mask = RXRPC_ACTIVE_CHANS_MASK; - break; - case RXRPC_CONN_CLIENT_UPGRADE: - mask = 0x01; - break; - default: - return; - } + struct rxrpc_connection *conn; + unsigned long avail, mask; + unsigned int channel, slot; - while (!list_empty(&conn->waiting_calls) && - (avail = ~conn->active_chans, - avail &= mask, - avail != 0)) - rxrpc_activate_one_channel(conn, __ffs(avail)); + if (bundle->try_upgrade) + mask = 1; + else + mask = ULONG_MAX; + + while (!list_empty(&bundle->waiting_calls)) { + avail = bundle->avail_chans & mask; + if (!avail) + break; + channel = __ffs(avail); + clear_bit(channel, &bundle->avail_chans); + + slot = channel / RXRPC_MAXCALLS; + conn = bundle->conns[slot]; + if (!conn) + break; + + if (bundle->try_upgrade) + set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags); + rxrpc_unidle_conn(bundle, conn); + + channel &= (RXRPC_MAXCALLS - 1); + conn->act_chans |= 1 << channel; + rxrpc_activate_one_channel(conn, channel); + } } /* * Assign channels and callNumbers to waiting calls. */ -static void rxrpc_activate_channels(struct rxrpc_connection *conn) +static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) { - _enter("%d", conn->debug_id); + _enter("B=%x", bundle->debug_id); - trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans); + trace_rxrpc_client(NULL, -1, rxrpc_client_activate_chans); - if (conn->active_chans == RXRPC_ACTIVE_CHANS_MASK) + if (!bundle->avail_chans) return; - spin_lock(&conn->channel_lock); - rxrpc_activate_channels_locked(conn); - spin_unlock(&conn->channel_lock); + spin_lock(&bundle->channel_lock); + rxrpc_activate_channels_locked(bundle); + spin_unlock(&bundle->channel_lock); _leave(""); } /* * Wait for a callNumber and a channel to be granted to a call. */ -static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp) +static int rxrpc_wait_for_channel(struct rxrpc_bundle *bundle, + struct rxrpc_call *call, gfp_t gfp) { + DECLARE_WAITQUEUE(myself, current); int ret = 0; _enter("%d", call->debug_id); - if (!call->call_id) { - DECLARE_WAITQUEUE(myself, current); + if (!gfpflags_allow_blocking(gfp)) { + rxrpc_maybe_add_conn(bundle, gfp); + rxrpc_activate_channels(bundle); + ret = bundle->alloc_error ?: -EAGAIN; + goto out; + } - if (!gfpflags_allow_blocking(gfp)) { - ret = -EAGAIN; - goto out; + add_wait_queue_exclusive(&call->waitq, &myself); + for (;;) { + rxrpc_maybe_add_conn(bundle, gfp); + rxrpc_activate_channels(bundle); + ret = bundle->alloc_error; + if (ret < 0) + break; + + switch (call->interruptibility) { + case RXRPC_INTERRUPTIBLE: + case RXRPC_PREINTERRUPTIBLE: + set_current_state(TASK_INTERRUPTIBLE); + break; + case RXRPC_UNINTERRUPTIBLE: + default: + set_current_state(TASK_UNINTERRUPTIBLE); + break; } - - add_wait_queue_exclusive(&call->waitq, &myself); - for (;;) { - switch (call->interruptibility) { - case RXRPC_INTERRUPTIBLE: - case RXRPC_PREINTERRUPTIBLE: - set_current_state(TASK_INTERRUPTIBLE); - break; - case RXRPC_UNINTERRUPTIBLE: - default: - set_current_state(TASK_UNINTERRUPTIBLE); - break; - } - if (call->call_id) - break; - if ((call->interruptibility == RXRPC_INTERRUPTIBLE || - call->interruptibility == RXRPC_PREINTERRUPTIBLE) && - signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - schedule(); + if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_AWAIT_CONN) + break; + if ((call->interruptibility == RXRPC_INTERRUPTIBLE || + call->interruptibility == RXRPC_PREINTERRUPTIBLE) && + signal_pending(current)) { + ret = -ERESTARTSYS; + break; } - remove_wait_queue(&call->waitq, &myself); - __set_current_state(TASK_RUNNING); + schedule(); } - - /* Paired with the write barrier in rxrpc_activate_one_channel(). */ - smp_rmb(); + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); out: _leave(" = %d", ret); @@ -697,52 +699,50 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, struct sockaddr_rxrpc *srx, gfp_t gfp) { + struct rxrpc_bundle *bundle; struct rxrpc_net *rxnet = cp->local->rxnet; - int ret; + int ret = 0; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper); - rxrpc_cull_active_client_conns(rxnet); - ret = rxrpc_get_client_conn(rx, call, cp, srx, gfp); - if (ret < 0) + bundle = rxrpc_prep_call(rx, call, cp, srx, gfp); + if (IS_ERR(bundle)) { + ret = PTR_ERR(bundle); goto out; + } - rxrpc_animate_client_conn(rxnet, call->conn); - rxrpc_activate_channels(call->conn); - - ret = rxrpc_wait_for_channel(call, gfp); - if (ret < 0) { - trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed); - rxrpc_disconnect_client_call(call); - goto out; + if (call->state == RXRPC_CALL_CLIENT_AWAIT_CONN) { + ret = rxrpc_wait_for_channel(bundle, call, gfp); + if (ret < 0) + goto wait_failed; } - spin_lock_bh(&call->conn->params.peer->lock); - hlist_add_head_rcu(&call->error_link, - &call->conn->params.peer->error_targets); - spin_unlock_bh(&call->conn->params.peer->lock); +granted_channel: + /* Paired with the write barrier in rxrpc_activate_one_channel(). */ + smp_rmb(); +out_put_bundle: + rxrpc_put_bundle(bundle); out: _leave(" = %d", ret); return ret; -} -/* - * Note that a connection is about to be exposed to the world. Once it is - * exposed, we maintain an extra ref on it that stops it from being summarily - * discarded before it's (a) had a chance to deal with retransmission and (b) - * had a chance at re-use (the per-connection security negotiation is - * expensive). - */ -static void rxrpc_expose_client_conn(struct rxrpc_connection *conn, - unsigned int channel) -{ - if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { - trace_rxrpc_client(conn, channel, rxrpc_client_exposed); - rxrpc_get_connection(conn); +wait_failed: + spin_lock(&bundle->channel_lock); + list_del_init(&call->chan_wait_link); + spin_unlock(&bundle->channel_lock); + + if (call->state != RXRPC_CALL_CLIENT_AWAIT_CONN) { + ret = 0; + goto granted_channel; } + + trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed); + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); + rxrpc_disconnect_client_call(bundle, call); + goto out_put_bundle; } /* @@ -764,7 +764,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) chan->call_counter++; if (chan->call_counter >= INT_MAX) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - rxrpc_expose_client_conn(conn, channel); + trace_rxrpc_client(conn, channel, rxrpc_client_exposed); } } @@ -773,62 +773,56 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) */ static void rxrpc_set_client_reap_timer(struct rxrpc_net *rxnet) { - unsigned long now = jiffies; - unsigned long reap_at = now + rxrpc_conn_idle_client_expiry; + if (!rxnet->kill_all_client_conns) { + unsigned long now = jiffies; + unsigned long reap_at = now + rxrpc_conn_idle_client_expiry; - if (rxnet->live) - timer_reduce(&rxnet->client_conn_reap_timer, reap_at); + if (rxnet->live) + timer_reduce(&rxnet->client_conn_reap_timer, reap_at); + } } /* * Disconnect a client call. */ -void rxrpc_disconnect_client_call(struct rxrpc_call *call) +void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call *call) { - struct rxrpc_connection *conn = call->conn; + struct rxrpc_connection *conn; struct rxrpc_channel *chan = NULL; - struct rxrpc_net *rxnet = conn->params.local->rxnet; - unsigned int channel = -1; + struct rxrpc_net *rxnet = bundle->params.local->rxnet; + unsigned int channel; + bool may_reuse; u32 cid; - spin_lock(&conn->channel_lock); - set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); + _enter("c=%x", call->debug_id); - cid = call->cid; - if (cid) { - channel = cid & RXRPC_CHANNELMASK; - chan = &conn->channels[channel]; - } - trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); + spin_lock(&bundle->channel_lock); + set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); /* Calls that have never actually been assigned a channel can simply be - * discarded. If the conn didn't get used either, it will follow - * immediately unless someone else grabs it in the meantime. + * discarded. */ - if (!list_empty(&call->chan_wait_link)) { + conn = call->conn; + if (!conn) { _debug("call is waiting"); ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); list_del_init(&call->chan_wait_link); - - trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted); - - /* We must deactivate or idle the connection if it's now - * waiting for nothing. - */ - spin_lock(&rxnet->client_conn_cache_lock); - if (conn->cache_state == RXRPC_CONN_CLIENT_WAITING && - list_empty(&conn->waiting_calls) && - !conn->active_chans) - goto idle_connection; goto out; } + cid = call->cid; + channel = cid & RXRPC_CHANNELMASK; + chan = &conn->channels[channel]; + trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); + if (rcu_access_pointer(chan->call) != call) { - spin_unlock(&conn->channel_lock); + spin_unlock(&bundle->channel_lock); BUG(); } + may_reuse = rxrpc_may_reuse_conn(conn); + /* If a client call was exposed to the world, we save the result for * retransmission. * @@ -841,14 +835,21 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { _debug("exposed %u,%u", call->call_id, call->abort_code); __rxrpc_disconnect_call(conn, call); + + if (test_and_clear_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) { + trace_rxrpc_client(conn, channel, rxrpc_client_to_active); + bundle->try_upgrade = false; + if (may_reuse) + rxrpc_activate_channels_locked(bundle); + } + } /* See if we can pass the channel directly to another call. */ - if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE && - !list_empty(&conn->waiting_calls)) { + if (may_reuse && !list_empty(&bundle->waiting_calls)) { trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); - goto out_2; + goto out; } /* Schedule the final ACK to be transmitted in a short while so that it @@ -865,128 +866,95 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) rxrpc_reduce_conn_timer(conn, final_ack_at); } - /* Things are more complex and we need the cache lock. We might be - * able to simply idle the conn or it might now be lurking on the wait - * list. It might even get moved back to the active list whilst we're - * waiting for the lock. - */ - spin_lock(&rxnet->client_conn_cache_lock); - - switch (conn->cache_state) { - case RXRPC_CONN_CLIENT_UPGRADE: - /* Deal with termination of a service upgrade probe. */ - if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { - clear_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags); - trace_rxrpc_client(conn, channel, rxrpc_client_to_active); - conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; - rxrpc_activate_channels_locked(conn); - } - fallthrough; - case RXRPC_CONN_CLIENT_ACTIVE: - if (list_empty(&conn->waiting_calls)) { - rxrpc_deactivate_one_channel(conn, channel); - if (!conn->active_chans) { - rxnet->nr_active_client_conns--; - goto idle_connection; - } - goto out; - } - - trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); - rxrpc_activate_one_channel(conn, channel); - goto out; + /* Deactivate the channel. */ + rcu_assign_pointer(chan->call, NULL); + set_bit(conn->bundle_shift + channel, &conn->bundle->avail_chans); + conn->act_chans &= ~(1 << channel); - case RXRPC_CONN_CLIENT_CULLED: - rxrpc_deactivate_one_channel(conn, channel); - ASSERT(list_empty(&conn->waiting_calls)); - if (!conn->active_chans) - goto idle_connection; - goto out; + /* If no channels remain active, then put the connection on the idle + * list for a short while. Give it a ref to stop it going away if it + * becomes unbundled. + */ + if (!conn->act_chans) { + trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); + conn->idle_timestamp = jiffies; - case RXRPC_CONN_CLIENT_WAITING: - rxrpc_deactivate_one_channel(conn, channel); - goto out; + rxrpc_get_connection(conn); + spin_lock(&rxnet->client_conn_cache_lock); + list_move_tail(&conn->cache_link, &rxnet->idle_client_conns); + spin_unlock(&rxnet->client_conn_cache_lock); - default: - BUG(); + rxrpc_set_client_reap_timer(rxnet); } out: - spin_unlock(&rxnet->client_conn_cache_lock); -out_2: - spin_unlock(&conn->channel_lock); + spin_unlock(&bundle->channel_lock); _leave(""); return; +} -idle_connection: - /* As no channels remain active, the connection gets deactivated - * immediately or moved to the idle list for a short while. - */ - if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { - trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); - conn->idle_timestamp = jiffies; - conn->cache_state = RXRPC_CONN_CLIENT_IDLE; - list_move_tail(&conn->cache_link, &rxnet->idle_client_conns); - if (rxnet->idle_client_conns.next == &conn->cache_link && - !rxnet->kill_all_client_conns) - rxrpc_set_client_reap_timer(rxnet); - } else { - trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive); - conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; - list_del_init(&conn->cache_link); +/* + * Remove a connection from a bundle. + */ +static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) +{ + struct rxrpc_bundle *bundle = conn->bundle; + struct rxrpc_local *local = bundle->params.local; + unsigned int bindex; + bool need_drop = false; + int i; + + _enter("C=%x", conn->debug_id); + + spin_lock(&bundle->channel_lock); + bindex = conn->bundle_shift / RXRPC_MAXCALLS; + if (bundle->conns[bindex] == conn) { + _debug("clear slot %u", bindex); + bundle->conns[bindex] = NULL; + for (i = 0; i < RXRPC_MAXCALLS; i++) + clear_bit(conn->bundle_shift + i, &bundle->avail_chans); + need_drop = true; + } + spin_unlock(&bundle->channel_lock); + + /* If there are no more connections, remove the bundle */ + if (!bundle->avail_chans) { + _debug("maybe unbundle"); + spin_lock(&local->client_bundles_lock); + + for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) + if (bundle->conns[i]) + break; + if (i == ARRAY_SIZE(bundle->conns) && !bundle->params.exclusive) { + _debug("erase bundle"); + rb_erase(&bundle->local_node, &local->client_bundles); + } + + spin_unlock(&local->client_bundles_lock); + if (i == ARRAY_SIZE(bundle->conns)) + rxrpc_put_bundle(bundle); } - goto out; + + if (need_drop) + rxrpc_put_connection(conn); + _leave(""); } /* * Clean up a dead client connection. */ -static struct rxrpc_connection * -rxrpc_put_one_client_conn(struct rxrpc_connection *conn) +static void rxrpc_kill_client_conn(struct rxrpc_connection *conn) { - struct rxrpc_connection *next = NULL; struct rxrpc_local *local = conn->params.local; struct rxrpc_net *rxnet = local->rxnet; - unsigned int nr_conns; - trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); + _enter("C=%x", conn->debug_id); - if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) { - spin_lock(&local->client_conns_lock); - if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, - &conn->flags)) - rb_erase(&conn->client_node, &local->client_conns); - spin_unlock(&local->client_conns_lock); - } + trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); + atomic_dec(&rxnet->nr_client_conns); rxrpc_put_client_connection_id(conn); - - ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE); - - if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) { - trace_rxrpc_client(conn, -1, rxrpc_client_uncount); - spin_lock(&rxnet->client_conn_cache_lock); - nr_conns = --rxnet->nr_client_conns; - - if (nr_conns < rxrpc_max_client_connections && - !list_empty(&rxnet->waiting_client_conns)) { - next = list_entry(rxnet->waiting_client_conns.next, - struct rxrpc_connection, cache_link); - rxrpc_get_connection(next); - rxrpc_activate_conn(rxnet, next); - } - - spin_unlock(&rxnet->client_conn_cache_lock); - } - rxrpc_kill_connection(conn); - if (next) - rxrpc_activate_channels(next); - - /* We need to get rid of the temporary ref we took upon next, but we - * can't call rxrpc_put_connection() recursively. - */ - return next; } /* @@ -998,63 +966,12 @@ void rxrpc_put_client_conn(struct rxrpc_connection *conn) unsigned int debug_id = conn->debug_id; int n; - do { - n = atomic_dec_return(&conn->usage); - trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here); - if (n > 0) - return; + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here); + if (n <= 0) { ASSERTCMP(n, >=, 0); - - conn = rxrpc_put_one_client_conn(conn); - } while (conn); -} - -/* - * Kill the longest-active client connections to make room for new ones. - */ -static void rxrpc_cull_active_client_conns(struct rxrpc_net *rxnet) -{ - struct rxrpc_connection *conn; - unsigned int nr_conns = rxnet->nr_client_conns; - unsigned int nr_active, limit; - - _enter(""); - - ASSERTCMP(nr_conns, >=, 0); - if (nr_conns < rxrpc_max_client_connections) { - _leave(" [ok]"); - return; - } - limit = rxrpc_reap_client_connections; - - spin_lock(&rxnet->client_conn_cache_lock); - nr_active = rxnet->nr_active_client_conns; - - while (nr_active > limit) { - ASSERT(!list_empty(&rxnet->active_client_conns)); - conn = list_entry(rxnet->active_client_conns.next, - struct rxrpc_connection, cache_link); - ASSERTIFCMP(conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE, - conn->cache_state, ==, RXRPC_CONN_CLIENT_UPGRADE); - - if (list_empty(&conn->waiting_calls)) { - trace_rxrpc_client(conn, -1, rxrpc_client_to_culled); - conn->cache_state = RXRPC_CONN_CLIENT_CULLED; - list_del_init(&conn->cache_link); - } else { - trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); - conn->cache_state = RXRPC_CONN_CLIENT_WAITING; - list_move_tail(&conn->cache_link, - &rxnet->waiting_client_conns); - } - - nr_active--; + rxrpc_kill_client_conn(conn); } - - rxnet->nr_active_client_conns = nr_active; - spin_unlock(&rxnet->client_conn_cache_lock); - ASSERTCMP(nr_active, >=, 0); - _leave(" [culled]"); } /* @@ -1088,7 +1005,7 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work) /* We keep an estimate of what the number of conns ought to be after * we've discarded some so that we don't overdo the discarding. */ - nr_conns = rxnet->nr_client_conns; + nr_conns = atomic_read(&rxnet->nr_client_conns); next: spin_lock(&rxnet->client_conn_cache_lock); @@ -1098,7 +1015,6 @@ next: conn = list_entry(rxnet->idle_client_conns.next, struct rxrpc_connection, cache_link); - ASSERT(test_bit(RXRPC_CONN_EXPOSED, &conn->flags)); if (!rxnet->kill_all_client_conns) { /* If the number of connections is over the reap limit, we @@ -1120,18 +1036,13 @@ next: } trace_rxrpc_client(conn, -1, rxrpc_client_discard); - if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags)) - BUG(); - conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; list_del_init(&conn->cache_link); spin_unlock(&rxnet->client_conn_cache_lock); - /* When we cleared the EXPOSED flag, we took on responsibility for the - * reference that that had on the usage count. We deal with that here. - * If someone re-sets the flag and re-gets the ref, that's fine. - */ - rxrpc_put_connection(conn); + rxrpc_unbundle_conn(conn); + rxrpc_put_connection(conn); /* Drop the ->cache_link ref */ + nr_conns--; goto next; @@ -1145,8 +1056,7 @@ not_yet_expired: */ _debug("not yet"); if (!rxnet->kill_all_client_conns) - timer_reduce(&rxnet->client_conn_reap_timer, - conn_expires_at); + timer_reduce(&rxnet->client_conn_reap_timer, conn_expires_at); out: spin_unlock(&rxnet->client_conn_cache_lock); @@ -1181,37 +1091,27 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) { struct rxrpc_connection *conn, *tmp; struct rxrpc_net *rxnet = local->rxnet; - unsigned int nr_active; LIST_HEAD(graveyard); _enter(""); spin_lock(&rxnet->client_conn_cache_lock); - nr_active = rxnet->nr_active_client_conns; list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns, cache_link) { if (conn->params.local == local) { - ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_IDLE); - trace_rxrpc_client(conn, -1, rxrpc_client_discard); - if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags)) - BUG(); - conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; list_move(&conn->cache_link, &graveyard); - nr_active--; } } - rxnet->nr_active_client_conns = nr_active; spin_unlock(&rxnet->client_conn_cache_lock); - ASSERTCMP(nr_active, >=, 0); while (!list_empty(&graveyard)) { conn = list_entry(graveyard.next, struct rxrpc_connection, cache_link); list_del_init(&conn->cache_link); - + rxrpc_unbundle_conn(conn); rxrpc_put_connection(conn); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 447f55ca6886..0628dad2bdea 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -157,12 +157,12 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, _enter("{%d},%x", conn->debug_id, conn->abort_code); - spin_lock(&conn->channel_lock); + spin_lock(&conn->bundle->channel_lock); for (i = 0; i < RXRPC_MAXCALLS; i++) { call = rcu_dereference_protected( conn->channels[i].call, - lockdep_is_held(&conn->channel_lock)); + lockdep_is_held(&conn->bundle->channel_lock)); if (call) { if (compl == RXRPC_CALL_LOCALLY_ABORTED) trace_rxrpc_abort(call->debug_id, @@ -179,7 +179,7 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, } } - spin_unlock(&conn->channel_lock); + spin_unlock(&conn->bundle->channel_lock); _leave(""); } @@ -210,6 +210,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, conn->error = error; conn->abort_code = abort_code; conn->state = RXRPC_CONN_LOCALLY_ABORTED; + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); spin_unlock_bh(&conn->state_lock); msg.msg_name = &conn->params.peer->srx.transport; @@ -319,6 +320,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, conn->error = -ECONNABORTED; conn->abort_code = abort_code; conn->state = RXRPC_CONN_REMOTELY_ABORTED; + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); return -ECONNABORTED; @@ -339,7 +341,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - spin_lock(&conn->channel_lock); + spin_lock(&conn->bundle->channel_lock); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { @@ -349,12 +351,12 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, rxrpc_call_is_secure( rcu_dereference_protected( conn->channels[loop].call, - lockdep_is_held(&conn->channel_lock))); + lockdep_is_held(&conn->bundle->channel_lock))); } else { spin_unlock(&conn->state_lock); } - spin_unlock(&conn->channel_lock); + spin_unlock(&conn->bundle->channel_lock); return 0; default: diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8cbe0bf20ed5..3bcbe0665f91 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -41,8 +41,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) conn = kzalloc(sizeof(struct rxrpc_connection), gfp); if (conn) { INIT_LIST_HEAD(&conn->cache_link); - spin_lock_init(&conn->channel_lock); - INIT_LIST_HEAD(&conn->waiting_calls); timer_setup(&conn->timer, &rxrpc_connection_timer, 0); INIT_WORK(&conn->processor, &rxrpc_process_connection); INIT_LIST_HEAD(&conn->proc_link); @@ -219,11 +217,11 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) } if (rxrpc_is_client_call(call)) - return rxrpc_disconnect_client_call(call); + return rxrpc_disconnect_client_call(conn->bundle, call); - spin_lock(&conn->channel_lock); + spin_lock(&conn->bundle->channel_lock); __rxrpc_disconnect_call(conn, call); - spin_unlock(&conn->channel_lock); + spin_unlock(&conn->bundle->channel_lock); set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); conn->idle_timestamp = jiffies; @@ -292,12 +290,13 @@ void rxrpc_see_connection(struct rxrpc_connection *conn) /* * Get a ref on a connection. */ -void rxrpc_get_connection(struct rxrpc_connection *conn) +struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&conn->usage); trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n, here); + return conn; } /* @@ -365,6 +364,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) conn->security->clear(conn); key_put(conn->params.key); key_put(conn->server_key); + rxrpc_put_bundle(conn->bundle); rxrpc_put_peer(conn->params.peer); if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns)) diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 21da48e3d2e5..6c847720494f 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -8,6 +8,12 @@ #include <linux/slab.h> #include "ar-internal.h" +static struct rxrpc_bundle rxrpc_service_dummy_bundle = { + .usage = ATOMIC_INIT(1), + .debug_id = UINT_MAX, + .channel_lock = __SPIN_LOCK_UNLOCKED(&rxrpc_service_dummy_bundle.channel_lock), +}; + /* * Find a service connection under RCU conditions. * @@ -127,6 +133,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn */ conn->state = RXRPC_CONN_SERVICE_PREALLOC; atomic_set(&conn->usage, 2); + conn->bundle = rxrpc_get_bundle(&rxrpc_service_dummy_bundle); atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index ede058f9cc15..8c2881054266 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -86,8 +86,8 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); - local->client_conns = RB_ROOT; - spin_lock_init(&local->client_conns_lock); + local->client_bundles = RB_ROOT; + spin_lock_init(&local->client_bundles_lock); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); local->debug_id = atomic_inc_return(&rxrpc_debug_id); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index b312aab80fed..25bbc4cc8b13 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -62,13 +62,10 @@ static __net_init int rxrpc_init_net(struct net *net) timer_setup(&rxnet->service_conn_reap_timer, rxrpc_service_conn_reap_timeout, 0); - rxnet->nr_client_conns = 0; - rxnet->nr_active_client_conns = 0; + atomic_set(&rxnet->nr_client_conns, 0); rxnet->kill_all_client_conns = false; spin_lock_init(&rxnet->client_conn_cache_lock); spin_lock_init(&rxnet->client_conn_discard_lock); - INIT_LIST_HEAD(&rxnet->waiting_client_conns); - INIT_LIST_HEAD(&rxnet->active_client_conns); INIT_LIST_HEAD(&rxnet->idle_client_conns); INIT_WORK(&rxnet->client_conn_reaper, rxrpc_discard_expired_client_conns); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 3cfff7922ba8..10f2bf2e9068 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -357,6 +357,12 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, _enter(",{%d}", skb->len); + if (hlist_unhashed(&call->error_link)) { + spin_lock_bh(&call->peer->lock); + hlist_add_head_rcu(&call->error_link, &call->peer->error_targets); + spin_unlock_bh(&call->peer->lock); + } + /* Each transmission of a Tx packet needs a new serial number */ serial = atomic_inc_return(&conn->serial); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 543afd9bd664..e2f990754f88 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -165,7 +165,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) "Proto Local " " Remote " " SvID ConnID End Use State Key " - " Serial ISerial\n" + " Serial ISerial CallId0 CallId1 CallId2 CallId3\n" ); return 0; } diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index 1221b0637a7e..4e565eeab426 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -14,7 +14,6 @@ #define RXRPC_RTO_MAX ((unsigned)(120 * HZ)) #define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ -#define rxrpc_min_rtt_wlen 300 /* As sysctl_tcp_min_rtt_wlen */ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) { diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index e08130e5746b..f114dc2af5cf 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -1169,7 +1169,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (response->encrypted.checksum != csum) goto protocol_error_free; - spin_lock(&conn->channel_lock); + spin_lock(&conn->bundle->channel_lock); for (i = 0; i < RXRPC_MAXCALLS; i++) { struct rxrpc_call *call; u32 call_id = ntohl(response->encrypted.call_id[i]); @@ -1186,13 +1186,13 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (call_id > conn->channels[i].call_counter) { call = rcu_dereference_protected( conn->channels[i].call, - lockdep_is_held(&conn->channel_lock)); + lockdep_is_held(&conn->bundle->channel_lock)); if (call && call->state < RXRPC_CALL_COMPLETE) goto protocol_error_unlock; conn->channels[i].call_counter = call_id; } } - spin_unlock(&conn->channel_lock); + spin_unlock(&conn->bundle->channel_lock); eproto = tracepoint_string("rxkad_rsp_seq"); abort_code = RXKADOUTOFSEQUENCE; @@ -1219,7 +1219,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, return 0; protocol_error_unlock: - spin_unlock(&conn->channel_lock); + spin_unlock(&conn->bundle->channel_lock); protocol_error_free: kfree(ticket); protocol_error: diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index e91acc95ff28..540351d6a5f4 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -74,21 +74,13 @@ static struct ctl_table rxrpc_sysctl_table[] = { /* Non-time values */ { - .procname = "max_client_conns", - .data = &rxrpc_max_client_connections, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&rxrpc_reap_client_connections, - }, - { .procname = "reap_client_conns", .data = &rxrpc_reap_client_connections, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)SYSCTL_ONE, - .extra2 = (void *)&rxrpc_max_client_connections, + .extra2 = (void *)&n_65535, }, { .procname = "max_backlog", diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 063d8aaf2900..f64af9d9dfee 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -976,7 +976,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, #endif NL_SET_ERR_MSG(extack, "Failed to load TC action module"); err = -ENOENT; - goto err_out; + goto err_free; } /* backward compatibility for policer */ @@ -1013,11 +1013,12 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, err_mod: module_put(a_o->owner); -err_out: +err_free: if (cookie) { kfree(cookie->data); kfree(cookie); } +err_out: return ERR_PTR(err); } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 2c3619165680..47f9128ecb8f 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1039,7 +1039,7 @@ drop: static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { [TCA_CT_ACTION] = { .type = NLA_U16 }, - [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) }, + [TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)), [TCA_CT_ZONE] = { .type = NLA_U16 }, [TCA_CT_MARK] = { .type = NLA_U32 }, [TCA_CT_MARK_MASK] = { .type = NLA_U32 }, @@ -1049,10 +1049,8 @@ static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { .len = 128 / BITS_PER_BYTE }, [TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 }, [TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 }, - [TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct in6_addr) }, - [TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct in6_addr) }, + [TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), + [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 }, [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 }, }; diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index b5042f3ea079..e88fa19ea8a9 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -144,9 +144,8 @@ out: } static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { - [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct - tc_ctinfo) }, + [TCA_CTINFO_ACT] = + NLA_POLICY_EXACT_LEN(sizeof(struct tc_ctinfo)), [TCA_CTINFO_ZONE] = { .type = NLA_U16 }, [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 }, [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 }, diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 1fb8d428d2c1..f5dd4d1d274e 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -159,8 +159,8 @@ static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { }; static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { - [TCA_GATE_PARMS] = { .len = sizeof(struct tc_gate), - .type = NLA_EXACT_LEN }, + [TCA_GATE_PARMS] = + NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)), [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 8d735461fa19..fdb69d46276d 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1351,7 +1351,7 @@ static void sctp_select_active_and_retran_path(struct sctp_association *asoc) } /* We did not find anything useful for a possible retransmission - * path; either primary path that we found is the the same as + * path; either primary path that we found is the same as * the current one, or we didn't generally find an active one. */ if (trans_sec == NULL) @@ -1537,7 +1537,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) /* If we've reached or overflowed our receive buffer, announce * a 0 rwnd if rwnd would still be positive. Store the - * the potential pressure overflow so that the window can be restored + * potential pressure overflow so that the window can be restored * back to original value. */ if (rx_count >= asoc->base.sk->sk_rcvbuf) diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 9e289c770574..589868d96e3f 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -445,7 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey( } /* - * Initialize all the possible digest transforms that we can use. Right now + * Initialize all the possible digest transforms that we can use. Right * now, the supported digests are SHA1 and SHA256. We do this here once * because of the restrictiong that transforms may only be allocated in * user context. This forces us to pre-allocated all possible transforms @@ -810,7 +810,7 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, } /* Set a new shared key on either endpoint or association. If the - * the key with a same ID already exists, replace the key (remove the + * key with a same ID already exists, replace the key (remove the * old key and add a new one). */ int sctp_auth_set_key(struct sctp_endpoint *ep, diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 701c5a4e441d..53e5ed79f63f 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -505,7 +505,7 @@ int sctp_in_scope(struct net *net, const union sctp_addr *addr, return 0; /* * For INIT and INIT-ACK address list, let L be the level of - * of requested destination address, sender and receiver + * requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index ab6a997e222f..fd4f8243cc35 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -179,7 +179,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, __func__, asoc, max_data); } - /* If the the peer requested that we authenticate DATA chunks + /* If the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with * DATA. */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d19db22262fd..25833238fe93 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -372,7 +372,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) * Level 3 - private addresses. * Level 4 - global addresses * For INIT and INIT-ACK address list, let L be the level of - * of requested destination address, sender and receiver + * requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * @@ -1483,10 +1483,10 @@ static __init int sctp_init(void) num_entries = (1UL << order) * PAGE_SIZE / sizeof(struct sctp_bind_hashbucket); - /* And finish by rounding it down to the nearest power of two - * this wastes some memory of course, but its needed because + /* And finish by rounding it down to the nearest power of two. + * This wastes some memory of course, but it's needed because * the hash function operates based on the assumption that - * that the number of entries is a power of two + * the number of entries is a power of two. */ sctp_port_hashsize = rounddown_pow_of_two(num_entries); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index c11c24524652..9a56ae2f3651 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1235,7 +1235,7 @@ nodata: /* Create an Operation Error chunk of a fixed size, specifically, * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads. - * This is a helper function to allocate an error chunk for for those + * This is a helper function to allocate an error chunk for those * invalid parameter codes in which we may not want to report all the * errors, if the incoming chunk is large. If it can't fit in a single * packet, we ignore it. @@ -1780,7 +1780,7 @@ no_hmac: * for init collision case of lost COOKIE ACK. * If skb has been timestamped, then use the stamp, otherwise * use current time. This introduces a small possibility that - * that a cookie may be considered expired, but his would only slow + * a cookie may be considered expired, but this would only slow * down the new association establishment instead of every packet. */ if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) @@ -2319,7 +2319,7 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, /* This implementation defaults to making the first transport * added as the primary transport. The source address seems to - * be a a better choice than any of the embedded addresses. + * be a better choice than any of the embedded addresses. */ if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) goto nomem; diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 1c6c640607c5..407fed46931b 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -740,7 +740,7 @@ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq) /* Helper function to gather skbs that have possibly become - * ordered by an an incoming chunk. + * ordered by an incoming chunk. */ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e7649bbc2b87..ed8f97166be9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -55,6 +55,9 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ +struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ +struct workqueue_struct *smc_close_wq; /* wq for close work */ + static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); @@ -436,10 +439,10 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) static void smcr_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - int bufsize = smc_uncompress_bufsize(clc->rmbe_size); + int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size); - smc->conn.peer_rmbe_idx = clc->rmbe_idx; - smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token); + smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx; + smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token); smc->conn.peer_rmbe_size = bufsize; atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); @@ -448,10 +451,10 @@ static void smcr_conn_save_peer_info(struct smc_sock *smc, static void smcd_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - int bufsize = smc_uncompress_bufsize(clc->dmbe_size); + int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size); - smc->conn.peer_rmbe_idx = clc->dmbe_idx; - smc->conn.peer_token = clc->token; + smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx; + smc->conn.peer_token = clc->d0.token; /* msg header takes up space in the buffer */ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); @@ -470,11 +473,11 @@ static void smc_conn_save_peer_info(struct smc_sock *smc, static void smc_link_save_peer_info(struct smc_link *link, struct smc_clc_msg_accept_confirm *clc) { - link->peer_qpn = ntoh24(clc->qpn); - memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE); - memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac)); - link->peer_psn = ntoh24(clc->psn); - link->peer_mtu = clc->qp_mtu; + link->peer_qpn = ntoh24(clc->r0.qpn); + memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE); + memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac)); + link->peer_psn = ntoh24(clc->r0.psn); + link->peer_mtu = clc->r0.qp_mtu; } static void smc_switch_to_fallback(struct smc_sock *smc) @@ -523,11 +526,11 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) /* abort connecting */ static int smc_connect_abort(struct smc_sock *smc, int reason_code, - int local_contact) + int local_first) { bool is_smcd = smc->conn.lgr->is_smcd; - if (local_contact == SMC_FIRST_CONTACT) + if (local_first) smc_lgr_cleanup_early(&smc->conn); else smc_conn_free(&smc->conn); @@ -613,9 +616,9 @@ static int smc_connect_rdma(struct smc_sock *smc, struct smc_link *link; ini->is_smcd = false; - ini->ib_lcl = &aclc->lcl; - ini->ib_clcqpn = ntoh24(aclc->qpn); - ini->srv_first_contact = aclc->hdr.flag; + ini->ib_lcl = &aclc->r0.lcl; + ini->ib_clcqpn = ntoh24(aclc->r0.qpn); + ini->first_contact_peer = aclc->hdr.flag; mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); @@ -626,7 +629,7 @@ static int smc_connect_rdma(struct smc_sock *smc, smc_conn_save_peer_info(smc, aclc); - if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + if (ini->first_contact_local) { link = smc->conn.lnk; } else { /* set link that was assigned by server */ @@ -634,60 +637,62 @@ static int smc_connect_rdma(struct smc_sock *smc, for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *l = &smc->conn.lgr->lnk[i]; - if (l->peer_qpn == ntoh24(aclc->qpn) && - !memcmp(l->peer_gid, &aclc->lcl.gid, SMC_GID_SIZE) && - !memcmp(l->peer_mac, &aclc->lcl.mac, sizeof(l->peer_mac))) { + if (l->peer_qpn == ntoh24(aclc->r0.qpn) && + !memcmp(l->peer_gid, &aclc->r0.lcl.gid, + SMC_GID_SIZE) && + !memcmp(l->peer_mac, &aclc->r0.lcl.mac, + sizeof(l->peer_mac))) { link = l; break; } } if (!link) return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK, - ini->cln_first_contact); + ini->first_contact_local); smc->conn.lnk = link; } /* create send buffer and rmb */ if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, - ini->cln_first_contact); + ini->first_contact_local); - if (ini->cln_first_contact == SMC_FIRST_CONTACT) + if (ini->first_contact_local) smc_link_save_peer_info(link, aclc); if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, - ini->cln_first_contact); + ini->first_contact_local); smc_close_init(smc); smc_rx_init(smc); - if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + if (ini->first_contact_local) { if (smc_ib_ready_link(link)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, - ini->cln_first_contact); + ini->first_contact_local); } else { if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, - ini->cln_first_contact); + ini->first_contact_local); } smc_rmb_sync_sg_for_device(&smc->conn); reason_code = smc_clc_send_confirm(smc); if (reason_code) return smc_connect_abort(smc, reason_code, - ini->cln_first_contact); + ini->first_contact_local); smc_tx_init(smc); - if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + if (ini->first_contact_local) { /* QP confirmation over RoCE fabric */ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_clnt_conf_first_link(smc); smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) return smc_connect_abort(smc, reason_code, - ini->cln_first_contact); + ini->first_contact_local); } mutex_unlock(&smc_client_lgr_pending); @@ -707,8 +712,8 @@ static int smc_connect_ism(struct smc_sock *smc, int rc = 0; ini->is_smcd = true; - ini->ism_gid = aclc->gid; - ini->srv_first_contact = aclc->hdr.flag; + ini->ism_peer_gid = aclc->d0.gid; + ini->first_contact_peer = aclc->hdr.flag; /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); @@ -724,7 +729,7 @@ static int smc_connect_ism(struct smc_sock *smc, return smc_connect_abort(smc, (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM, - ini->cln_first_contact); + ini->first_contact_local); smc_conn_save_peer_info(smc, aclc); smc_close_init(smc); @@ -733,7 +738,7 @@ static int smc_connect_ism(struct smc_sock *smc, rc = smc_clc_send_confirm(smc); if (rc) - return smc_connect_abort(smc, rc, ini->cln_first_contact); + return smc_connect_abort(smc, rc, ini->first_contact_local); mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); @@ -903,7 +908,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, if (smc->use_fallback) goto out; if (flags & O_NONBLOCK) { - if (schedule_work(&smc->connect_work)) + if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; rc = -EINPROGRESS; } else { @@ -940,10 +945,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) mutex_lock(&lsmc->clcsock_release_lock); if (lsmc->clcsock) - rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); + rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); lock_sock(lsk); - if (rc < 0) + if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); @@ -956,6 +961,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) goto out; } + /* new clcsock has inherited the smc listen-specific sk_data_ready + * function; switch it back to the original sk_data_ready function + */ + new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; (*new_smc)->clcsock = new_clcsock; out: return rc; @@ -1123,10 +1132,10 @@ static void smc_listen_out_err(struct smc_sock *new_smc) /* listen worker: decline and fall back if possible */ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, - int local_contact) + bool local_first) { /* RDMA setup failed, switch back to TCP */ - if (local_contact == SMC_FIRST_CONTACT) + if (local_first) smc_lgr_cleanup_early(&new_smc->conn); else smc_conn_free(&new_smc->conn); @@ -1182,30 +1191,16 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { - struct smc_clc_msg_smcd *pclc_smcd; int rc; - pclc_smcd = smc_get_clc_msg_smcd(pclc); - ini->ism_gid = pclc_smcd->gid; rc = smc_conn_create(new_smc, ini); if (rc) return rc; - /* Check if peer can be reached via ISM device */ - if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, - new_smc->conn.lgr->vlan_id, - new_smc->conn.lgr->smcd)) { - if (ini->cln_first_contact == SMC_FIRST_CONTACT) - smc_lgr_cleanup_early(&new_smc->conn); - else - smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_SMCDNOTALK; - } - /* Create send and receive buffers */ rc = smc_buf_create(new_smc, true); if (rc) { - if (ini->cln_first_contact == SMC_FIRST_CONTACT) + if (ini->first_contact_local) smc_lgr_cleanup_early(&new_smc->conn); else smc_conn_free(&new_smc->conn); @@ -1217,11 +1212,11 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, } /* listen worker: register buffers */ -static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) +static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) { struct smc_connection *conn = &new_smc->conn; - if (local_contact != SMC_FIRST_CONTACT) { + if (!local_first) { if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } @@ -1233,35 +1228,25 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) /* listen worker: finish RDMA setup */ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, - int local_contact) + bool local_first) { struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; - if (local_contact == SMC_FIRST_CONTACT) + if (local_first) smc_link_save_peer_info(link, cclc); - if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) { - reason_code = SMC_CLC_DECL_ERR_RTOK; - goto decline; - } + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) + return SMC_CLC_DECL_ERR_RTOK; - if (local_contact == SMC_FIRST_CONTACT) { - if (smc_ib_ready_link(link)) { - reason_code = SMC_CLC_DECL_ERR_RDYLNK; - goto decline; - } + if (local_first) { + if (smc_ib_ready_link(link)) + return SMC_CLC_DECL_ERR_RDYLNK; /* QP confirmation over RoCE fabric */ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_serv_conf_first_link(new_smc); smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); - if (reason_code) - goto decline; } - return 0; - -decline: - smc_listen_decline(new_smc, reason_code, local_contact); return reason_code; } @@ -1272,10 +1257,10 @@ static void smc_listen_work(struct work_struct *work) smc_listen_work); struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm cclc; + struct smc_clc_msg_proposal_area *buf; struct smc_clc_msg_proposal *pclc; struct smc_init_info ini = {0}; bool ism_supported = false; - u8 buf[SMC_CLC_MAX_LEN]; int rc = 0; if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) @@ -1297,8 +1282,13 @@ static void smc_listen_work(struct work_struct *work) /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ - pclc = (struct smc_clc_msg_proposal *)&buf; - rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) { + rc = SMC_CLC_DECL_MEM; + goto out_decl; + } + pclc = (struct smc_clc_msg_proposal *)buf; + rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf), SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (rc) goto out_decl; @@ -1327,7 +1317,10 @@ static void smc_listen_work(struct work_struct *work) /* check if ISM is available */ if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) { + struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); + ini.is_smcd = true; /* prepare ISM check */ + ini.ism_peer_gid = pclc_smcd->gid; rc = smc_find_ism_device(new_smc, &ini); if (!rc) rc = smc_listen_ism_init(new_smc, pclc, &ini); @@ -1354,13 +1347,13 @@ static void smc_listen_work(struct work_struct *work) rc = smc_listen_rdma_init(new_smc, &ini); if (rc) goto out_unlock; - rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact); + rc = smc_listen_rdma_reg(new_smc, ini.first_contact_local); if (rc) goto out_unlock; } /* send SMC Accept CLC message */ - rc = smc_clc_send_accept(new_smc, ini.cln_first_contact); + rc = smc_clc_send_accept(new_smc, ini.first_contact_local); if (rc) goto out_unlock; @@ -1380,19 +1373,21 @@ static void smc_listen_work(struct work_struct *work) /* finish worker */ if (!ism_supported) { rc = smc_listen_rdma_finish(new_smc, &cclc, - ini.cln_first_contact); - mutex_unlock(&smc_server_lgr_pending); + ini.first_contact_local); if (rc) - return; + goto out_unlock; + mutex_unlock(&smc_server_lgr_pending); } smc_conn_save_peer_info(new_smc, &cclc); smc_listen_out_connected(new_smc); - return; + goto out_free; out_unlock: mutex_unlock(&smc_server_lgr_pending); out_decl: - smc_listen_decline(new_smc, rc, ini.cln_first_contact); + smc_listen_decline(new_smc, rc, ini.first_contact_local); +out_free: + kfree(buf); } static void smc_tcp_listen_work(struct work_struct *work) @@ -1406,7 +1401,7 @@ static void smc_tcp_listen_work(struct work_struct *work) lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); - if (rc) + if (rc) /* clcsock accept queue empty or error */ goto out; if (!new_smc) continue; @@ -1420,13 +1415,29 @@ static void smc_tcp_listen_work(struct work_struct *work) new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; sock_hold(&new_smc->sk); /* sock_put in passive closing */ - if (!schedule_work(&new_smc->smc_listen_work)) + if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); } out: release_sock(lsk); - sock_put(&lsmc->sk); /* sock_hold in smc_listen */ + sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ +} + +static void smc_clcsock_data_ready(struct sock *listen_clcsock) +{ + struct smc_sock *lsmc; + + lsmc = (struct smc_sock *) + ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!lsmc) + return; + lsmc->clcsk_data_ready(listen_clcsock); + if (lsmc->sk.sk_state == SMC_LISTEN) { + sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ + if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) + sock_put(&lsmc->sk); + } } static int smc_listen(struct socket *sock, int backlog) @@ -1455,15 +1466,19 @@ static int smc_listen(struct socket *sock, int backlog) if (!smc->use_fallback) tcp_sk(smc->clcsock->sk)->syn_smc = 1; + /* save original sk_data_ready function and establish + * smc-specific sk_data_ready function + */ + smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready; + smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); rc = kernel_listen(smc->clcsock, backlog); if (rc) goto out; sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; - sock_hold(sk); /* sock_hold in tcp_listen_worker */ - if (!schedule_work(&smc->tcp_listen_work)) - sock_put(sk); out: release_sock(sk); @@ -1788,8 +1803,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (val) - mod_delayed_work(system_wq, &smc->conn.tx_work, - 0); + mod_delayed_work(smc->conn.lgr->tx_wq, + &smc->conn.tx_work, 0); } break; case TCP_CORK: @@ -1797,8 +1812,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (!val) - mod_delayed_work(system_wq, &smc->conn.tx_work, - 0); + mod_delayed_work(smc->conn.lgr->tx_wq, + &smc->conn.tx_work, 0); } break; case TCP_DEFER_ACCEPT: @@ -2081,10 +2096,19 @@ static int __init smc_init(void) if (rc) goto out_pernet_subsys; + rc = -ENOMEM; + smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); + if (!smc_hs_wq) + goto out_pnet; + + smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); + if (!smc_close_wq) + goto out_alloc_hs_wq; + rc = smc_core_init(); if (rc) { pr_err("%s: smc_core_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_alloc_wqs; } rc = smc_llc_init(); @@ -2136,6 +2160,10 @@ out_proto: proto_unregister(&smc_proto); out_core: smc_core_exit(); +out_alloc_wqs: + destroy_workqueue(smc_close_wq); +out_alloc_hs_wq: + destroy_workqueue(smc_hs_wq); out_pnet: smc_pnet_exit(); out_pernet_subsys: @@ -2150,6 +2178,8 @@ static void __exit smc_exit(void) sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); + destroy_workqueue(smc_close_wq); + destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); diff --git a/net/smc/smc.h b/net/smc/smc.h index 6f1c42da7a4c..2bd57e57b7e7 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -18,6 +18,8 @@ #include "smc_ib.h" +#define SMC_V1 1 /* SMC version V1 */ + #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ @@ -201,6 +203,8 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_data_ready)(struct sock *sk); + /* original data_ready fct. **/ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ @@ -235,6 +239,9 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ +extern struct workqueue_struct *smc_close_wq; /* wq for close work */ + #define SMC_SYSTEMID_LEN 8 extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index ce468ff62a19..b1ce6ccbfaec 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -299,7 +299,7 @@ static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc, conn->lnk = link; spin_unlock_bh(&conn->send_lock); sock_hold(&smc->sk); /* sock_put in abort_work */ - if (!schedule_work(&conn->abort_work)) + if (!queue_work(smc_close_wq, &conn->abort_work)) sock_put(&smc->sk); } } @@ -368,7 +368,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(&smc->sk, SOCK_DONE); sock_hold(&smc->sk); /* sock_put in close_work */ - if (!schedule_work(&conn->close_work)) + if (!queue_work(smc_close_wq, &conn->close_work)) sock_put(&smc->sk); } } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 779f4142a11d..bd116e1949b9 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -153,7 +153,6 @@ static int smc_clc_prfx_set(struct socket *clcsock, struct sockaddr_in *addr; int rc = -ENOENT; - memset(prop, 0, sizeof(*prop)); if (!dst) { rc = -ENOTCONN; goto out; @@ -320,7 +319,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || - (clcm->version < SMC_CLC_V1) || + (clcm->version < SMC_V1) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; @@ -389,7 +388,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); dclc.hdr.type = SMC_CLC_DECLINE; dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); - dclc.hdr.version = SMC_CLC_V1; + dclc.hdr.version = SMC_V1; dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) && smc_ib_is_valid_local_systemid()) @@ -412,138 +411,171 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, struct smc_init_info *ini) { - struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; - struct smc_clc_msg_proposal_prefix pclc_prfx; - struct smc_clc_msg_smcd pclc_smcd; - struct smc_clc_msg_proposal pclc; - struct smc_clc_msg_trail trl; + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct smc_clc_msg_proposal *pclc_base; + struct smc_clc_msg_proposal_area *pclc; + struct smc_clc_ipv6_prefix *ipv6_prfx; + struct smc_clc_msg_smcd *pclc_smcd; + struct smc_clc_msg_trail *trl; int len, i, plen, rc; int reason_code = 0; struct kvec vec[5]; struct msghdr msg; + pclc = kzalloc(sizeof(*pclc), GFP_KERNEL); + if (!pclc) + return -ENOMEM; + + pclc_base = &pclc->pclc_base; + pclc_smcd = &pclc->pclc_smcd; + pclc_prfx = &pclc->pclc_prfx; + ipv6_prfx = pclc->pclc_prfx_ipv6; + trl = &pclc->pclc_trl; + /* retrieve ip prefixes for CLC proposal msg */ - rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx); - if (rc) + rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx); + if (rc) { + kfree(pclc); return SMC_CLC_DECL_CNFERR; /* configuration error */ + } /* send SMC Proposal CLC message */ - plen = sizeof(pclc) + sizeof(pclc_prfx) + - (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) + - sizeof(trl); - memset(&pclc, 0, sizeof(pclc)); - memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); - pclc.hdr.type = SMC_CLC_PROPOSAL; - pclc.hdr.version = SMC_CLC_V1; /* SMC version */ - pclc.hdr.path = smc_type; + plen = sizeof(*pclc_base) + sizeof(*pclc_prfx) + + (pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) + + sizeof(*trl); + memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + pclc_base->hdr.type = SMC_CLC_PROPOSAL; + pclc_base->hdr.version = SMC_V1; /* SMC version */ + pclc_base->hdr.path = smc_type; if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) { /* add SMC-R specifics */ - memcpy(pclc.lcl.id_for_peer, local_systemid, + memcpy(pclc_base->lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE); - memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], + memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE); + memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], ETH_ALEN); - pclc.iparea_offset = htons(0); + pclc_base->iparea_offset = htons(0); } if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { /* add SMC-D specifics */ - memset(&pclc_smcd, 0, sizeof(pclc_smcd)); - plen += sizeof(pclc_smcd); - pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET); - pclc_smcd.gid = ini->ism_dev->local_gid; + plen += sizeof(*pclc_smcd); + pclc_base->iparea_offset = htons(sizeof(*pclc_smcd)); + pclc_smcd->gid = ini->ism_dev->local_gid; } - pclc.hdr.length = htons(plen); + pclc_base->hdr.length = htons(plen); - memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); i = 0; - vec[i].iov_base = &pclc; - vec[i++].iov_len = sizeof(pclc); + vec[i].iov_base = pclc_base; + vec[i++].iov_len = sizeof(*pclc_base); if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { - vec[i].iov_base = &pclc_smcd; - vec[i++].iov_len = sizeof(pclc_smcd); + vec[i].iov_base = pclc_smcd; + vec[i++].iov_len = sizeof(*pclc_smcd); } - vec[i].iov_base = &pclc_prfx; - vec[i++].iov_len = sizeof(pclc_prfx); - if (pclc_prfx.ipv6_prefixes_cnt > 0) { - vec[i].iov_base = &ipv6_prfx[0]; - vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt * + vec[i].iov_base = pclc_prfx; + vec[i++].iov_len = sizeof(*pclc_prfx); + if (pclc_prfx->ipv6_prefixes_cnt > 0) { + vec[i].iov_base = ipv6_prfx; + vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0]); } - vec[i].iov_base = &trl; - vec[i++].iov_len = sizeof(trl); + vec[i].iov_base = trl; + vec[i++].iov_len = sizeof(*trl); /* due to the few bytes needed for clc-handshake this cannot block */ len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); if (len < 0) { smc->sk.sk_err = smc->clcsock->sk->sk_err; reason_code = -smc->sk.sk_err; - } else if (len < (int)sizeof(pclc)) { + } else if (len < ntohs(pclc_base->hdr.length)) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; } + kfree(pclc); return reason_code; } -/* send CLC CONFIRM message across internal TCP socket */ -int smc_clc_send_confirm(struct smc_sock *smc) +/* build and send CLC CONFIRM / ACCEPT message */ +static int smc_clc_send_confirm_accept(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc, + int first_contact) { struct smc_connection *conn = &smc->conn; - struct smc_clc_msg_accept_confirm cclc; - struct smc_link *link; - int reason_code = 0; struct msghdr msg; struct kvec vec; - int len; /* send SMC Confirm CLC msg */ - memset(&cclc, 0, sizeof(cclc)); - cclc.hdr.type = SMC_CLC_CONFIRM; - cclc.hdr.version = SMC_CLC_V1; /* SMC version */ - if (smc->conn.lgr->is_smcd) { + clc->hdr.version = SMC_V1; /* SMC version */ + if (first_contact) + clc->hdr.flag = 1; + if (conn->lgr->is_smcd) { /* SMC-D specific settings */ - memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER, + memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); - cclc.hdr.path = SMC_TYPE_D; - cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); - cclc.gid = conn->lgr->smcd->local_gid; - cclc.token = conn->rmb_desc->token; - cclc.dmbe_size = conn->rmbe_size_short; - cclc.dmbe_idx = 0; - memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); - memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, + clc->hdr.path = SMC_TYPE_D; + clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + clc->d0.gid = conn->lgr->smcd->local_gid; + clc->d0.token = conn->rmb_desc->token; + clc->d0.dmbe_size = conn->rmbe_size_short; + clc->d0.dmbe_idx = 0; + memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + memcpy(clc->d0.smcd_trl.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); } else { + struct smc_link *link = conn->lnk; + /* SMC-R specific settings */ link = conn->lnk; - memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, + memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); - cclc.hdr.path = SMC_TYPE_R; - cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); - memcpy(cclc.lcl.id_for_peer, local_systemid, + clc->hdr.path = SMC_TYPE_R; + clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + memcpy(clc->r0.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE); - memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], + memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE); + memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); - hton24(cclc.qpn, link->roce_qp->qp_num); - cclc.rmb_rkey = + hton24(clc->r0.qpn, link->roce_qp->qp_num); + clc->r0.rmb_rkey = htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); - cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ - cclc.rmbe_alert_token = htonl(conn->alert_token_local); - cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); - cclc.rmbe_size = conn->rmbe_size_short; - cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address + clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ + clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); + switch (clc->hdr.type) { + case SMC_CLC_ACCEPT: + clc->r0.qp_mtu = link->path_mtu; + break; + case SMC_CLC_CONFIRM: + clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + break; + } + clc->r0.rmbe_size = conn->rmbe_size_short; + clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address (conn->rmb_desc->sgt[link->link_idx].sgl)); - hton24(cclc.psn, link->psn_initial); - memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, + hton24(clc->r0.psn, link->psn_initial); + memcpy(clc->r0.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); } memset(&msg, 0, sizeof(msg)); - vec.iov_base = &cclc; - vec.iov_len = ntohs(cclc.hdr.length); - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, - ntohs(cclc.hdr.length)); + vec.iov_base = clc; + vec.iov_len = ntohs(clc->hdr.length); + return kernel_sendmsg(smc->clcsock, &msg, &vec, 1, + ntohs(clc->hdr.length)); +} + +/* send CLC CONFIRM message across internal TCP socket */ +int smc_clc_send_confirm(struct smc_sock *smc) +{ + struct smc_clc_msg_accept_confirm cclc; + int reason_code = 0; + int len; + + /* send SMC Confirm CLC msg */ + memset(&cclc, 0, sizeof(cclc)); + cclc.hdr.type = SMC_CLC_CONFIRM; + len = smc_clc_send_confirm_accept(smc, &cclc, 0); if (len < ntohs(cclc.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; @@ -557,65 +589,14 @@ int smc_clc_send_confirm(struct smc_sock *smc) } /* send CLC ACCEPT message across internal TCP socket */ -int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) +int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact) { - struct smc_connection *conn = &new_smc->conn; struct smc_clc_msg_accept_confirm aclc; - struct smc_link *link; - struct msghdr msg; - struct kvec vec; int len; memset(&aclc, 0, sizeof(aclc)); aclc.hdr.type = SMC_CLC_ACCEPT; - aclc.hdr.version = SMC_CLC_V1; /* SMC version */ - if (srv_first_contact) - aclc.hdr.flag = 1; - - if (new_smc->conn.lgr->is_smcd) { - /* SMC-D specific settings */ - aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); - memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER, - sizeof(SMCD_EYECATCHER)); - aclc.hdr.path = SMC_TYPE_D; - aclc.gid = conn->lgr->smcd->local_gid; - aclc.token = conn->rmb_desc->token; - aclc.dmbe_size = conn->rmbe_size_short; - aclc.dmbe_idx = 0; - memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); - memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, - sizeof(SMCD_EYECATCHER)); - } else { - /* SMC-R specific settings */ - aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); - memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, - sizeof(SMC_EYECATCHER)); - aclc.hdr.path = SMC_TYPE_R; - link = conn->lnk; - memcpy(aclc.lcl.id_for_peer, local_systemid, - sizeof(local_systemid)); - memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); - memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], - ETH_ALEN); - hton24(aclc.qpn, link->roce_qp->qp_num); - aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); - aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ - aclc.rmbe_alert_token = htonl(conn->alert_token_local); - aclc.qp_mtu = link->path_mtu; - aclc.rmbe_size = conn->rmbe_size_short, - aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[link->link_idx].sgl)); - hton24(aclc.psn, link->psn_initial); - memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, - sizeof(SMC_EYECATCHER)); - } - - memset(&msg, 0, sizeof(msg)); - vec.iov_base = &aclc; - vec.iov_len = ntohs(aclc.hdr.length); - len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, - ntohs(aclc.hdr.length)); + len = smc_clc_send_confirm_accept(new_smc, &aclc, srv_first_contact); if (len < ntohs(aclc.hdr.length)) len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index cf7b45306f4e..fcd8521c7737 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -22,7 +22,6 @@ #define SMC_CLC_CONFIRM 0x03 #define SMC_CLC_DECLINE 0x04 -#define SMC_CLC_V1 0x1 /* SMC version */ #define SMC_TYPE_R 0 /* SMC-R only */ #define SMC_TYPE_D 1 /* SMC-D only */ #define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */ @@ -38,7 +37,6 @@ #define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ #define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ #define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ -#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -111,55 +109,58 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ __be16 iparea_offset; /* offset to IP address information area */ } __aligned(4); -#define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28 -#define SMC_CLC_PROPOSAL_MAX_PREFIX (SMC_CLC_MAX_V6_PREFIX * \ - sizeof(struct smc_clc_ipv6_prefix)) -#define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \ - SMC_CLC_PROPOSAL_MAX_OFFSET + \ - sizeof(struct smc_clc_msg_proposal_prefix) + \ - SMC_CLC_PROPOSAL_MAX_PREFIX + \ - sizeof(struct smc_clc_msg_trail)) +struct smc_clc_msg_proposal_area { + struct smc_clc_msg_proposal pclc_base; + struct smc_clc_msg_smcd pclc_smcd; + struct smc_clc_msg_proposal_prefix pclc_prfx; + struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; + struct smc_clc_msg_trail pclc_trl; +}; -struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ - struct smc_clc_msg_hdr hdr; - union { - struct { /* SMC-R */ - struct smc_clc_msg_local lcl; - u8 qpn[3]; /* QP number */ - __be32 rmb_rkey; /* RMB rkey */ - u8 rmbe_idx; /* Index of RMBE in RMB */ - __be32 rmbe_alert_token;/* unique connection id */ -#if defined(__BIG_ENDIAN_BITFIELD) - u8 rmbe_size : 4, /* buf size (compressed) */ - qp_mtu : 4; /* QP mtu */ +struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ + struct smc_clc_msg_local lcl; + u8 qpn[3]; /* QP number */ + __be32 rmb_rkey; /* RMB rkey */ + u8 rmbe_idx; /* Index of RMBE in RMB */ + __be32 rmbe_alert_token; /* unique connection id */ + #if defined(__BIG_ENDIAN_BITFIELD) + u8 rmbe_size : 4, /* buf size (compressed) */ + qp_mtu : 4; /* QP mtu */ #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 qp_mtu : 4, - rmbe_size : 4; + u8 qp_mtu : 4, + rmbe_size : 4; #endif - u8 reserved; - __be64 rmb_dma_addr; /* RMB virtual address */ - u8 reserved2; - u8 psn[3]; /* packet sequence number */ - struct smc_clc_msg_trail smcr_trl; - /* eye catcher "SMCR" EBCDIC */ - } __packed; - struct { /* SMC-D */ - u64 gid; /* Sender GID */ - u64 token; /* DMB token */ - u8 dmbe_idx; /* DMBE index */ + u8 reserved; + __be64 rmb_dma_addr; /* RMB virtual address */ + u8 reserved2; + u8 psn[3]; /* packet sequence number */ + struct smc_clc_msg_trail smcr_trl; + /* eye catcher "SMCR" EBCDIC */ +} __packed; + +struct smcd_clc_msg_accept_confirm { /* SMCD accept/confirm */ + u64 gid; /* Sender GID */ + u64 token; /* DMB token */ + u8 dmbe_idx; /* DMBE index */ #if defined(__BIG_ENDIAN_BITFIELD) - u8 dmbe_size : 4, /* buf size (compressed) */ - reserved3 : 4; + u8 dmbe_size : 4, /* buf size (compressed) */ + reserved3 : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 reserved3 : 4, - dmbe_size : 4; + u8 reserved3 : 4, + dmbe_size : 4; #endif - u16 reserved4; - u32 linkid; /* Link identifier */ - u32 reserved5[3]; - struct smc_clc_msg_trail smcd_trl; - /* eye catcher "SMCD" EBCDIC */ - } __packed; + u16 reserved4; + u32 linkid; /* Link identifier */ + u32 reserved5[3]; + struct smc_clc_msg_trail smcd_trl; + /* eye catcher "SMCD" EBCDIC */ +} __packed; + +struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ + struct smc_clc_msg_hdr hdr; + union { + struct smcr_clc_msg_accept_confirm r0; /* SMC-R */ + struct smcd_clc_msg_accept_confirm d0; /* SMC-D */ }; } __packed; /* format defined in RFC7609 */ @@ -200,6 +201,6 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc); -int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); +int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact); #endif diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 0e7409e469c0..0f9ffba07d26 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -210,9 +210,9 @@ again: sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { + smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + smc->clcsock->sk->sk_user_data = NULL; rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); - /* wake up kernel_accept of smc_tcp_listen_worker */ - smc->clcsock->sk->sk_data_ready(smc->clcsock->sk); } smc_close_cleanup_listen(sk); release_sock(sk); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a406627b1d55..c811ae1a8add 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -34,7 +34,6 @@ #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) -#define SMC_LGR_FREE_DELAY_FAST (8 * HZ) static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), @@ -70,7 +69,7 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ - if (!lgr->freeing && !lgr->freefast) { + if (!lgr->freeing) { mod_delayed_work(system_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? SMC_LGR_FREE_DELAY_CLNT : @@ -78,15 +77,6 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) } } -void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) -{ - if (!lgr->freeing && !lgr->freefast) { - lgr->freefast = 1; - mod_delayed_work(system_wq, &lgr->free_work, - SMC_LGR_FREE_DELAY_FAST); - } -} - /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock @@ -227,7 +217,7 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) if (!list_empty(lgr_list)) list_del_init(lgr_list); spin_unlock_bh(lgr_lock); - smc_lgr_schedule_free_work_fast(lgr); + __smc_lgr_terminate(lgr, true); } static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) @@ -396,10 +386,15 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; } + lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0, + SMC_LGR_ID_SIZE, &lgr->id); + if (!lgr->tx_wq) { + rc = -ENOMEM; + goto free_lgr; + } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; lgr->terminating = 0; - lgr->freefast = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; mutex_init(&lgr->sndbufs_lock); @@ -418,7 +413,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) if (ini->is_smcd) { /* SMC-D specific settings */ get_device(&ini->ism_dev->dev); - lgr->peer_gid = ini->ism_gid; + lgr->peer_gid = ini->ism_peer_gid; lgr->smcd = ini->ism_dev; lgr_list = &ini->ism_dev->lgr_list; lgr_lock = &lgr->smcd->lgr_lock; @@ -437,7 +432,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) - goto free_lgr; + goto free_wq; lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); @@ -448,6 +443,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_unlock_bh(lgr_lock); return 0; +free_wq: + destroy_workqueue(lgr->tx_wq); free_lgr: kfree(lgr); ism_put_vlan: @@ -517,7 +514,7 @@ static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, smc->sk.sk_state != SMC_CLOSED) { rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); if (!rc) { - schedule_delayed_work(&conn->tx_work, 0); + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0); smc->sk.sk_data_ready(&smc->sk); } } else { @@ -824,11 +821,10 @@ static void smc_lgr_free(struct smc_link_group *lgr) } smc_lgr_free_bufs(lgr); + destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { - if (!lgr->terminating) { - smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(&lgr->smcd->dev); - } + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { @@ -889,8 +885,6 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); smcd_unregister_all_dmbs(lgr); - smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(&lgr->smcd->dev); } else { u32 rsn = lgr->llc_termination_rsn; @@ -1296,9 +1290,9 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list; lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock; - ini->cln_first_contact = SMC_FIRST_CONTACT; + ini->first_contact_local = 1; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; - if (role == SMC_CLNT && ini->srv_first_contact) + if (role == SMC_CLNT && ini->first_contact_peer) /* create new link group as well */ goto create; @@ -1307,14 +1301,14 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? - smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : + smcd_lgr_match(lgr, ini->ism_dev, ini->ism_peer_gid) : smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && !lgr->sync_err && lgr->vlan_id == ini->vlan_id && (role == SMC_CLNT || ini->is_smcd || lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ - ini->cln_first_contact = SMC_REUSE_CONTACT; + ini->first_contact_local = 0; conn->lgr = lgr; rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); @@ -1328,8 +1322,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) if (rc) return rc; - if (role == SMC_CLNT && !ini->srv_first_contact && - ini->cln_first_contact == SMC_FIRST_CONTACT) { + if (role == SMC_CLNT && !ini->first_contact_peer && + ini->first_contact_local) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. error @@ -1338,7 +1332,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) } create: - if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + if (ini->first_contact_local) { rc = smc_lgr_create(smc, ini); if (rc) goto out; @@ -1892,8 +1886,8 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr, - clc->rmb_rkey); + conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr, + clc->r0.rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; return 0; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 1c4d5439d0ff..37a5903789b0 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -137,9 +137,6 @@ struct smc_link { #define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 -#define SMC_FIRST_CONTACT 1 /* first contact to a peer */ -#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/ - /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { struct list_head list; @@ -228,9 +225,9 @@ struct smc_link_group { u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ struct work_struct terminate_work; /* abnormal lgr termination */ + struct workqueue_struct *tx_wq; /* wq for conn. tx workers */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ - u8 freefast : 1; /* free worker scheduled fast */ u8 freeing : 1; /* lgr is being freed */ bool is_smcd; /* SMC-R or SMC-D */ @@ -294,9 +291,9 @@ struct smc_clc_msg_local; struct smc_init_info { u8 is_smcd; + u8 first_contact_peer; + u8 first_contact_local; unsigned short vlan_id; - int srv_first_contact; - int cln_first_contact; /* SMC-R */ struct smc_clc_msg_local *ib_lcl; struct smc_ib_device *ib_dev; @@ -304,7 +301,7 @@ struct smc_init_info { u8 ib_port; u32 ib_clcqpn; /* SMC-D */ - u64 ism_gid; + u64 ism_peer_gid; struct smcd_dev *ism_dev; }; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index da9ba6d1679b..f15fca59b4b2 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -22,6 +22,15 @@ #include "smc.h" #include "smc_core.h" +struct smc_diag_dump_ctx { + int pos[2]; +}; + +static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) +{ + return (struct smc_diag_dump_ctx *)cb->ctx; +} + static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", @@ -193,13 +202,15 @@ errout: } static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, int p_type) { + struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb); struct net *net = sock_net(skb->sk); + int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; + int rc = 0, num = 0; struct sock *sk; - int rc = 0; read_lock(&prot->h.smc_hash->lock); head = &prot->h.smc_hash->ht; @@ -209,13 +220,18 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, sk_for_each(sk, head) { if (!net_eq(sock_net(sk), net)) continue; + if (num < snum) + goto next; rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc) - break; + if (rc < 0) + goto out; +next: + num++; } out: read_unlock(&prot->h.smc_hash->lock); + cb_ctx->pos[p_type] = num; return rc; } @@ -223,10 +239,10 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { int rc = 0; - rc = smc_diag_dump_proto(&smc_proto, skb, cb); + rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC); if (!rc) - rc = smc_diag_dump_proto(&smc_proto6, skb, cb); - return rc; + smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6); + return skb->len; } static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 3ea33466ebe9..2db967f2fb50 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1691,7 +1691,7 @@ static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) spin_lock_irqsave(&lgr->llc_event_q_lock, flags); list_add_tail(&qentry->list, &lgr->llc_event_q); spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); - schedule_work(&lgr->llc_event_work); + queue_work(system_highpri_wq, &lgr->llc_event_work); } /* copy received msg and add it to the event queue */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 30e5fac7034e..70684c49510e 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -928,7 +928,10 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && - !ismdev->going_away) { + !ismdev->going_away && + (!ini->ism_peer_gid || + !smc_ism_cantalk(ini->ism_peer_gid, ini->vlan_id, + ismdev))) { ini->ism_dev = ismdev; break; } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 54ba0443847e..4532c16bf85e 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -228,8 +228,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ - schedule_delayed_work(&conn->tx_work, - SMC_TX_CORK_DELAY); + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, + SMC_TX_CORK_DELAY); else smc_tx_sndbuf_nonempty(conn); } /* while (msg_data_left(msg)) */ @@ -499,7 +499,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (conn->killed) return -EPIPE; rc = 0; - mod_delayed_work(system_wq, &conn->tx_work, + mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_WORK_DELAY); } return rc; @@ -623,8 +623,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && !conn->killed) { - schedule_delayed_work(&conn->tx_work, - SMC_TX_WORK_DELAY); + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); return; } } diff --git a/net/socket.c b/net/socket.c index 0c0144604f81..82262e1922f9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2628,9 +2628,11 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags) { - /* disallow ancillary data requests from this path */ - if (msg->msg_control || msg->msg_controllen) - return -EINVAL; + if (msg->msg_control || msg->msg_controllen) { + /* disallow ancillary data reqs unless cmsg is plain data */ + if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY)) + return -EINVAL; + } return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); } diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 999eee1ed61c..6c86e2a7d942 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -108,8 +108,10 @@ proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, left -= (s - tmpbuf); if (left && !isspace(*s)) return -EINVAL; - while (left && isspace(*s)) - left--, s++; + while (left && isspace(*s)) { + left--; + s++; + } } else left = 0; *(unsigned int *) table->data = value; diff --git a/net/tipc/core.c b/net/tipc/core.c index 4f6dc74adf45..c2ff42900b53 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -60,6 +60,7 @@ static int __net_init tipc_init_net(struct net *net) tn->trial_addr = 0; tn->addr_trial_end = 0; tn->capabilities = TIPC_NODE_CAPABILITIES; + INIT_WORK(&tn->final_work.work, tipc_net_finalize_work); memset(tn->node_id, 0, sizeof(tn->node_id)); memset(tn->node_id_string, 0, sizeof(tn->node_id_string)); tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; @@ -107,8 +108,13 @@ out_crypto: static void __net_exit tipc_exit_net(struct net *net) { + struct tipc_net *tn = tipc_net(net); + tipc_detach_loopback(net); + /* Make sure the tipc_net_finalize_work() finished */ + cancel_work_sync(&tn->final_work.work); tipc_net_stop(net); + tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); diff --git a/net/tipc/core.h b/net/tipc/core.h index 631d83c9705f..1d57a4d3b05e 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -90,6 +90,12 @@ extern unsigned int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; +struct tipc_net_work { + struct work_struct work; + struct net *net; + u32 addr; +}; + struct tipc_net { u8 node_id[NODE_ID_LEN]; u32 node_addr; @@ -143,6 +149,8 @@ struct tipc_net { /* TX crypto handler */ struct tipc_crypto *crypto_tx; #endif + /* Work item for net finalize */ + struct tipc_net_work final_work; }; static inline struct tipc_net *tipc_net(struct net *net) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 7c523dc81575..40c44101fe8e 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -36,22 +36,28 @@ #include <crypto/aead.h> #include <crypto/aes.h> +#include <crypto/rng.h> #include "crypto.h" +#include "msg.h" +#include "bcast.h" -#define TIPC_TX_PROBE_LIM msecs_to_jiffies(1000) /* > 1s */ -#define TIPC_TX_LASTING_LIM msecs_to_jiffies(120000) /* 2 mins */ +#define TIPC_TX_GRACE_PERIOD msecs_to_jiffies(5000) /* 5s */ +#define TIPC_TX_LASTING_TIME msecs_to_jiffies(10000) /* 10s */ #define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ -#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(180000) /* 3 mins */ +#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(15000) /* 15s */ + #define TIPC_MAX_TFMS_DEF 10 #define TIPC_MAX_TFMS_LIM 1000 +#define TIPC_REKEYING_INTV_DEF (60 * 24) /* default: 1 day */ + /** * TIPC Key ids */ enum { - KEY_UNUSED = 0, - KEY_MIN, - KEY_1 = KEY_MIN, + KEY_MASTER = 0, + KEY_MIN = KEY_MASTER, + KEY_1 = 1, KEY_2, KEY_3, KEY_MAX = KEY_3, @@ -81,6 +87,8 @@ static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok", /* Max TFMs number per key */ int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; +/* Key exchange switch, default: on */ +int sysctl_tipc_key_exchange_enabled __read_mostly = 1; /** * struct tipc_key - TIPC keys' status indicator @@ -132,6 +140,8 @@ struct tipc_tfm { * @mode: crypto mode is applied to the key * @hint[]: a hint for user key * @rcu: struct rcu_head + * @key: the aead key + * @gen: the key's generation * @seqno: the key seqno (cluster scope) * @refcnt: the key reference counter */ @@ -144,8 +154,10 @@ struct tipc_aead { u32 salt; u8 authsize; u8 mode; - char hint[TIPC_AEAD_HINT_LEN + 1]; + char hint[2 * TIPC_AEAD_HINT_LEN + 1]; struct rcu_head rcu; + struct tipc_aead_key *key; + u16 gen; atomic64_t seqno ____cacheline_aligned; refcount_t refcnt ____cacheline_aligned; @@ -165,26 +177,56 @@ struct tipc_crypto_stats { * @node: TIPC node (RX) * @aead: array of pointers to AEAD keys for encryption/decryption * @peer_rx_active: replicated peer RX active key index + * @key_gen: TX/RX key generation * @key: the key states - * @working: the crypto is working or not + * @skey_mode: session key's mode + * @skey: received session key + * @wq: common workqueue on TX crypto + * @work: delayed work sched for TX/RX + * @key_distr: key distributing state + * @rekeying_intv: rekeying interval (in minutes) * @stats: the crypto statistics + * @name: the crypto name * @sndnxt: the per-peer sndnxt (TX) * @timer1: general timer 1 (jiffies) - * @timer2: general timer 1 (jiffies) + * @timer2: general timer 2 (jiffies) + * @working: the crypto is working or not + * @key_master: flag indicates if master key exists + * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.) + * @nokey: no key indication * @lock: tipc_key lock */ struct tipc_crypto { struct net *net; struct tipc_node *node; - struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */ + struct tipc_aead __rcu *aead[KEY_MAX + 1]; atomic_t peer_rx_active; + u16 key_gen; struct tipc_key key; - u8 working:1; + u8 skey_mode; + struct tipc_aead_key *skey; + struct workqueue_struct *wq; + struct delayed_work work; +#define KEY_DISTR_SCHED 1 +#define KEY_DISTR_COMPL 2 + atomic_t key_distr; + u32 rekeying_intv; + struct tipc_crypto_stats __percpu *stats; + char name[48]; atomic64_t sndnxt ____cacheline_aligned; unsigned long timer1; unsigned long timer2; + union { + struct { + u8 working:1; + u8 key_master:1; + u8 legacy_user:1; + u8 nokey: 1; + }; + u8 flags; + }; spinlock_t lock; /* crypto lock */ } ____cacheline_aligned; @@ -234,23 +276,35 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_active, u8 new_pending); static int tipc_crypto_key_attach(struct tipc_crypto *c, - struct tipc_aead *aead, u8 pos); + struct tipc_aead *aead, u8 pos, + bool master_key); static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, - struct sk_buff *skb); -static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, - struct tipc_msg *hdr); + struct sk_buff *skb, + u8 tx_key); +static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb); static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); +static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode, u8 type); static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_bearer *b, struct sk_buff **skb, int err); static void tipc_crypto_do_cmd(struct net *net, int cmd); static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf); -#ifdef TIPC_CRYPTO_DEBUG static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, char *buf); -#endif +static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, + u16 gen, u8 mode, u32 dnode); +static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr); +static void tipc_crypto_work_tx(struct work_struct *work); +static void tipc_crypto_work_rx(struct work_struct *work); +static int tipc_aead_key_generate(struct tipc_aead_key *skey); + +#define is_tx(crypto) (!(crypto)->node) +#define is_rx(crypto) (!is_tx(crypto)) #define key_next(cur) ((cur) % KEY_MAX + 1) @@ -271,30 +325,55 @@ do { \ /** * tipc_aead_key_validate - Validate a AEAD user key */ -int tipc_aead_key_validate(struct tipc_aead_key *ukey) +int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) { int keylen; /* Check if algorithm exists */ if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { - pr_info("Not found cipher: \"%s\"!\n", ukey->alg_name); + GENL_SET_ERR_MSG(info, "unable to load the algorithm (module existed?)"); return -ENODEV; } /* Currently, we only support the "gcm(aes)" cipher algorithm */ - if (strcmp(ukey->alg_name, "gcm(aes)")) + if (strcmp(ukey->alg_name, "gcm(aes)")) { + GENL_SET_ERR_MSG(info, "not supported yet the algorithm"); return -ENOTSUPP; + } /* Check if key size is correct */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && keylen != TIPC_AES_GCM_KEY_SIZE_192 && - keylen != TIPC_AES_GCM_KEY_SIZE_256)) - return -EINVAL; + keylen != TIPC_AES_GCM_KEY_SIZE_256)) { + GENL_SET_ERR_MSG(info, "incorrect key length (20, 28 or 36 octets?)"); + return -EKEYREJECTED; + } return 0; } +/** + * tipc_aead_key_generate - Generate new session key + * @skey: input/output key with new content + * + * Return: 0 in case of success, otherwise < 0 + */ +static int tipc_aead_key_generate(struct tipc_aead_key *skey) +{ + int rc = 0; + + /* Fill the key's content with a random value via RNG cipher */ + rc = crypto_get_default_rng(); + if (likely(!rc)) { + rc = crypto_rng_get_bytes(crypto_default_rng, skey->key, + skey->keylen); + crypto_put_default_rng(); + } + + return rc; +} + static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; @@ -339,6 +418,7 @@ static void tipc_aead_free(struct rcu_head *rp) kfree(head); } free_percpu(aead->tfm_entry); + kzfree(aead->key); kfree(aead); } @@ -501,14 +581,15 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, return err; } - /* Copy some chars from the user key as a hint */ - memcpy(tmp->hint, ukey->key, TIPC_AEAD_HINT_LEN); - tmp->hint[TIPC_AEAD_HINT_LEN] = '\0'; + /* Form a hex string of some last bytes as the key's hint */ + bin2hex(tmp->hint, ukey->key + keylen - TIPC_AEAD_HINT_LEN, + TIPC_AEAD_HINT_LEN); /* Initialize the other data */ tmp->mode = mode; tmp->cloned = NULL; tmp->authsize = TIPC_AES_GCM_TAG_SIZE; + tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL); memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); atomic_set(&tmp->users, 0); atomic64_set(&tmp->seqno, 0); @@ -663,13 +744,11 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, * but there is no frag_list, it should be still fine! * Otherwise, we must cow it to be a writable buffer with the tailroom. */ -#ifdef TIPC_CRYPTO_DEBUG SKB_LINEAR_ASSERT(skb); if (tailen > skb_tailroom(skb)) { - pr_warn("TX: skb tailroom is not enough: %d, requires: %d\n", - skb_tailroom(skb), tailen); + pr_debug("TX(): skb tailroom is not enough: %d, requires: %d\n", + skb_tailroom(skb), tailen); } -#endif if (unlikely(!skb_cloned(skb) && tailen <= skb_tailroom(skb))) { nsg = 1; @@ -940,8 +1019,6 @@ bool tipc_ehdr_validate(struct sk_buff *skb) return false; if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) return false; - if (unlikely(!ehdr->tx_key)) - return false; return true; } @@ -994,6 +1071,8 @@ static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, ehdr->tx_key = tx_key; ehdr->destined = (__rx) ? 1 : 0; ehdr->rx_key_active = (__rx) ? __rx->key.active : 0; + ehdr->rx_nokey = (__rx) ? __rx->nokey : 0; + ehdr->master_key = aead->crypto->key_master; ehdr->reserved_1 = 0; ehdr->reserved_2 = 0; @@ -1019,23 +1098,16 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_active, u8 new_pending) { -#ifdef TIPC_CRYPTO_DEBUG struct tipc_key old = c->key; char buf[32]; -#endif c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) | ((new_active & KEY_MASK) << (KEY_BITS)) | ((new_pending & KEY_MASK)); -#ifdef TIPC_CRYPTO_DEBUG - pr_info("%s(%s): key changing %s ::%pS\n", - (c->node) ? "RX" : "TX", - (c->node) ? tipc_node_get_id_str(c->node) : - tipc_own_id_string(c->net), - tipc_key_change_dump(old, c->key, buf), - __builtin_return_address(0)); -#endif + pr_debug("%s: key changing %s ::%pS\n", c->name, + tipc_key_change_dump(old, c->key, buf), + __builtin_return_address(0)); } /** @@ -1043,6 +1115,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, * @c: TIPC crypto to which new key is attached * @ukey: the user key * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) + * @master_key: specify this is a cluster master key * * A new TIPC AEAD key will be allocated and initiated with the specified user * key, then attached to the TIPC crypto. @@ -1050,7 +1123,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, * Return: new key id in case of success, otherwise: < 0 */ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, - u8 mode) + u8 mode, bool master_key) { struct tipc_aead *aead = NULL; int rc = 0; @@ -1060,17 +1133,11 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, /* Attach it to the crypto */ if (likely(!rc)) { - rc = tipc_crypto_key_attach(c, aead, 0); + rc = tipc_crypto_key_attach(c, aead, 0, master_key); if (rc < 0) tipc_aead_free(&aead->rcu); } - pr_info("%s(%s): key initiating, rc %d!\n", - (c->node) ? "RX" : "TX", - (c->node) ? tipc_node_get_id_str(c->node) : - tipc_own_id_string(c->net), - rc); - return rc; } @@ -1079,58 +1146,58 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, * @c: TIPC crypto to which the new AEAD key is attached * @aead: the new AEAD key pointer * @pos: desired slot in the crypto key array, = 0 if any! + * @master_key: specify this is a cluster master key * * Return: new key id in case of success, otherwise: -EBUSY */ static int tipc_crypto_key_attach(struct tipc_crypto *c, - struct tipc_aead *aead, u8 pos) + struct tipc_aead *aead, u8 pos, + bool master_key) { - u8 new_pending, new_passive, new_key; struct tipc_key key; int rc = -EBUSY; + u8 new_key; spin_lock_bh(&c->lock); key = c->key; + if (master_key) { + new_key = KEY_MASTER; + goto attach; + } if (key.active && key.passive) goto exit; - if (key.passive && !tipc_aead_users(c->aead[key.passive])) - goto exit; if (key.pending) { - if (pos) - goto exit; if (tipc_aead_users(c->aead[key.pending]) > 0) goto exit; + /* if (pos): ok with replacing, will be aligned when needed */ /* Replace it */ - new_pending = key.pending; - new_passive = key.passive; - new_key = new_pending; + new_key = key.pending; } else { if (pos) { if (key.active && pos != key_next(key.active)) { - new_pending = key.pending; - new_passive = pos; - new_key = new_passive; + key.passive = pos; + new_key = pos; goto attach; } else if (!key.active && !key.passive) { - new_pending = pos; - new_passive = key.passive; - new_key = new_pending; + key.pending = pos; + new_key = pos; goto attach; } } - new_pending = key_next(key.active ?: key.passive); - new_passive = key.passive; - new_key = new_pending; + key.pending = key_next(key.active ?: key.passive); + new_key = key.pending; } attach: aead->crypto = c; - tipc_crypto_key_set_state(c, new_passive, key.active, new_pending); + aead->gen = (is_tx(c)) ? ++c->key_gen : c->key_gen; tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock); - + if (likely(c->key.keys != key.keys)) + tipc_crypto_key_set_state(c, key.passive, key.active, + key.pending); c->working = 1; - c->timer1 = jiffies; - c->timer2 = jiffies; + c->nokey = 0; + c->key_master |= master_key; rc = new_key; exit: @@ -1140,14 +1207,33 @@ exit: void tipc_crypto_key_flush(struct tipc_crypto *c) { + struct tipc_crypto *tx, *rx; int k; spin_lock_bh(&c->lock); - c->working = 0; + if (is_rx(c)) { + /* Try to cancel pending work */ + rx = c; + tx = tipc_net(rx->net)->crypto_tx; + if (cancel_delayed_work(&rx->work)) { + kfree(rx->skey); + rx->skey = NULL; + atomic_xchg(&rx->key_distr, 0); + tipc_node_put(rx->node); + } + /* RX stopping => decrease TX key users if any */ + k = atomic_xchg(&rx->peer_rx_active, 0); + if (k) { + tipc_aead_users_dec(tx->aead[k], 0); + /* Mark the point TX key users changed */ + tx->timer1 = jiffies; + } + } + + c->flags = 0; tipc_crypto_key_set_state(c, 0, 0, 0); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_crypto_key_detach(c->aead[k], &c->lock); - atomic_set(&c->peer_rx_active, 0); atomic64_set(&c->sndnxt, 0); spin_unlock_bh(&c->lock); } @@ -1206,7 +1292,8 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) rcu_assign_pointer(rx->aead[new_passive], tmp2); refcount_set(&tmp1->refcnt, 1); aligned = true; - pr_info("RX(%s): key is aligned!\n", tipc_node_get_id_str(rx->node)); + pr_info_ratelimited("%s: key[%d] -> key[%d]\n", rx->name, key.pending, + new_pending); exit: spin_unlock(&rx->lock); @@ -1218,6 +1305,7 @@ exit: * @tx: TX crypto handle * @rx: RX crypto handle (can be NULL) * @skb: the message skb which will be decrypted later + * @tx_key: peer TX key id * * This function looks up the existing TX keys and pick one which is suitable * for the message decryption, that must be a cluster key and not used before @@ -1227,7 +1315,8 @@ exit: */ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, - struct sk_buff *skb) + struct sk_buff *skb, + u8 tx_key) { struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); struct tipc_aead *aead = NULL; @@ -1246,6 +1335,10 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, /* Pick one TX key */ spin_lock(&tx->lock); + if (tx_key == KEY_MASTER) { + aead = tipc_aead_rcu_ptr(tx->aead[KEY_MASTER], &tx->lock); + goto done; + } do { k = (i == 0) ? key.pending : ((i == 1) ? key.active : key.passive); @@ -1265,9 +1358,12 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, skb->next = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb->next)) pr_warn("Failed to clone skb for next round if any\n"); - WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); break; } while (++i < 3); + +done: + if (likely(aead)) + WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); spin_unlock(&tx->lock); return aead; @@ -1276,53 +1372,73 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, /** * tipc_crypto_key_synch: Synch own key data according to peer key status * @rx: RX crypto handle - * @new_rx_active: latest RX active key from peer - * @hdr: TIPCv2 message + * @skb: TIPCv2 message buffer (incl. the ehdr from peer) * * This function updates the peer node related data as the peer RX active key * has changed, so the number of TX keys' users on this node are increased and * decreased correspondingly. * + * It also considers if peer has no key, then we need to make own master key + * (if any) taking over i.e. starting grace period and also trigger key + * distributing process. + * * The "per-peer" sndnxt is also reset when the peer key has switched. */ -static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, - struct tipc_msg *hdr) +static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb) { - struct net *net = rx->net; - struct tipc_crypto *tx = tipc_net(net)->crypto_tx; - u8 cur_rx_active; + struct tipc_ehdr *ehdr = (struct tipc_ehdr *)skb_network_header(skb); + struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; + struct tipc_msg *hdr = buf_msg(skb); + u32 self = tipc_own_addr(rx->net); + u8 cur, new; + unsigned long delay; - /* TX might be even not ready yet */ - if (unlikely(!tx->key.active && !tx->key.pending)) - return; + /* Update RX 'key_master' flag according to peer, also mark "legacy" if + * a peer has no master key. + */ + rx->key_master = ehdr->master_key; + if (!rx->key_master) + tx->legacy_user = 1; - cur_rx_active = atomic_read(&rx->peer_rx_active); - if (likely(cur_rx_active == new_rx_active)) + /* For later cases, apply only if message is destined to this node */ + if (!ehdr->destined || msg_short(hdr) || msg_destnode(hdr) != self) return; - /* Make sure this message destined for this node */ - if (unlikely(msg_short(hdr) || - msg_destnode(hdr) != tipc_own_addr(net))) - return; + /* Case 1: Peer has no keys, let's make master key take over */ + if (ehdr->rx_nokey) { + /* Set or extend grace period */ + tx->timer2 = jiffies; + /* Schedule key distributing for the peer if not yet */ + if (tx->key.keys && + !atomic_cmpxchg(&rx->key_distr, 0, KEY_DISTR_SCHED)) { + get_random_bytes(&delay, 2); + delay %= 5; + delay = msecs_to_jiffies(500 * ++delay); + if (queue_delayed_work(tx->wq, &rx->work, delay)) + tipc_node_get(rx->node); + } + } else { + /* Cancel a pending key distributing if any */ + atomic_xchg(&rx->key_distr, 0); + } - /* Peer RX active key has changed, try to update owns' & TX users */ - if (atomic_cmpxchg(&rx->peer_rx_active, - cur_rx_active, - new_rx_active) == cur_rx_active) { - if (new_rx_active) - tipc_aead_users_inc(tx->aead[new_rx_active], INT_MAX); - if (cur_rx_active) - tipc_aead_users_dec(tx->aead[cur_rx_active], 0); + /* Case 2: Peer RX active key has changed, let's update own TX users */ + cur = atomic_read(&rx->peer_rx_active); + new = ehdr->rx_key_active; + if (tx->key.keys && + cur != new && + atomic_cmpxchg(&rx->peer_rx_active, cur, new) == cur) { + if (new) + tipc_aead_users_inc(tx->aead[new], INT_MAX); + if (cur) + tipc_aead_users_dec(tx->aead[cur], 0); atomic64_set(&rx->sndnxt, 0); /* Mark the point TX key users changed */ tx->timer1 = jiffies; -#ifdef TIPC_CRYPTO_DEBUG - pr_info("TX(%s): key users changed %d-- %d++, peer RX(%s)\n", - tipc_own_id_string(net), cur_rx_active, - new_rx_active, tipc_node_get_id_str(rx->node)); -#endif + pr_debug("%s: key users changed %d-- %d++, peer %s\n", + tx->name, cur, new, rx->name); } } @@ -1340,7 +1456,7 @@ static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); spin_unlock(&tx->lock); - pr_warn("TX(%s): key is revoked!\n", tipc_own_id_string(net)); + pr_warn("%s: key is revoked\n", tx->name); return -EKEYREVOKED; } @@ -1357,6 +1473,15 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, if (!c) return -ENOMEM; + /* Allocate workqueue on TX */ + if (!node) { + c->wq = alloc_ordered_workqueue("tipc_crypto", 0); + if (!c->wq) { + kfree(c); + return -ENOMEM; + } + } + /* Allocate statistic structure */ c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); if (!c->stats) { @@ -1364,53 +1489,52 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, return -ENOMEM; } - c->working = 0; + c->flags = 0; c->net = net; c->node = node; + get_random_bytes(&c->key_gen, 2); tipc_crypto_key_set_state(c, 0, 0, 0); + atomic_set(&c->key_distr, 0); atomic_set(&c->peer_rx_active, 0); atomic64_set(&c->sndnxt, 0); c->timer1 = jiffies; c->timer2 = jiffies; + c->rekeying_intv = TIPC_REKEYING_INTV_DEF; spin_lock_init(&c->lock); - *crypto = c; + scnprintf(c->name, 48, "%s(%s)", (is_rx(c)) ? "RX" : "TX", + (is_rx(c)) ? tipc_node_get_id_str(c->node) : + tipc_own_id_string(c->net)); + + if (is_rx(c)) + INIT_DELAYED_WORK(&c->work, tipc_crypto_work_rx); + else + INIT_DELAYED_WORK(&c->work, tipc_crypto_work_tx); + *crypto = c; return 0; } void tipc_crypto_stop(struct tipc_crypto **crypto) { - struct tipc_crypto *c, *tx, *rx; - bool is_rx; + struct tipc_crypto *c = *crypto; u8 k; - if (!*crypto) + if (!c) return; - rcu_read_lock(); - /* RX stopping? => decrease TX key users if any */ - is_rx = !!((*crypto)->node); - if (is_rx) { - rx = *crypto; - tx = tipc_net(rx->net)->crypto_tx; - k = atomic_read(&rx->peer_rx_active); - if (k) { - tipc_aead_users_dec(tx->aead[k], 0); - /* Mark the point TX key users changed */ - tx->timer1 = jiffies; - } + /* Flush any queued works & destroy wq */ + if (is_tx(c)) { + c->rekeying_intv = 0; + cancel_delayed_work_sync(&c->work); + destroy_workqueue(c->wq); } /* Release AEAD keys */ - c = *crypto; + rcu_read_lock(); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_aead_put(rcu_dereference(c->aead[k])); rcu_read_unlock(); - - pr_warn("%s(%s) has been purged, node left!\n", - (is_rx) ? "RX" : "TX", - (is_rx) ? tipc_node_get_id_str((*crypto)->node) : - tipc_own_id_string((*crypto)->net)); + pr_debug("%s: has been stopped\n", c->name); /* Free this crypto statistics */ free_percpu(c->stats); @@ -1424,106 +1548,91 @@ void tipc_crypto_timeout(struct tipc_crypto *rx) struct tipc_net *tn = tipc_net(rx->net); struct tipc_crypto *tx = tn->crypto_tx; struct tipc_key key; - u8 new_pending, new_passive; int cmd; - /* TX key activating: - * The pending key (users > 0) -> active - * The active key if any (users == 0) -> free - */ + /* TX pending: taking all users & stable -> active */ spin_lock(&tx->lock); key = tx->key; if (key.active && tipc_aead_users(tx->aead[key.active]) > 0) goto s1; if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0) goto s1; - if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_LIM)) + if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_TIME)) goto s1; tipc_crypto_key_set_state(tx, key.passive, key.pending, 0); if (key.active) tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); this_cpu_inc(tx->stats->stat[STAT_SWITCHES]); - pr_info("TX(%s): key %d is activated!\n", tipc_own_id_string(tx->net), - key.pending); + pr_info("%s: key[%d] is activated\n", tx->name, key.pending); s1: spin_unlock(&tx->lock); - /* RX key activating: - * The pending key (users > 0) -> active - * The active key if any -> passive, freed later - */ + /* RX pending: having user -> active */ spin_lock(&rx->lock); key = rx->key; if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0) goto s2; - new_pending = (key.passive && - !tipc_aead_users(rx->aead[key.passive])) ? - key.passive : 0; - new_passive = (key.active) ?: ((new_pending) ? 0 : key.passive); - tipc_crypto_key_set_state(rx, new_passive, key.pending, new_pending); + if (key.active) + key.passive = key.active; + key.active = key.pending; + rx->timer2 = jiffies; + tipc_crypto_key_set_state(rx, key.passive, key.active, 0); this_cpu_inc(rx->stats->stat[STAT_SWITCHES]); - pr_info("RX(%s): key %d is activated!\n", - tipc_node_get_id_str(rx->node), key.pending); + pr_info("%s: key[%d] is activated\n", rx->name, key.pending); goto s5; s2: - /* RX key "faulty" switching: - * The faulty pending key (users < -30) -> passive - * The passive key (users = 0) -> pending - * Note: This only happens after RX deactivated - s3! - */ - key = rx->key; - if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -30) - goto s3; - if (!key.passive || tipc_aead_users(rx->aead[key.passive]) != 0) + /* RX pending: not working -> remove */ + if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -10) goto s3; - new_pending = key.passive; - new_passive = key.pending; - tipc_crypto_key_set_state(rx, new_passive, key.active, new_pending); + tipc_crypto_key_set_state(rx, key.passive, key.active, 0); + tipc_crypto_key_detach(rx->aead[key.pending], &rx->lock); + pr_debug("%s: key[%d] is removed\n", rx->name, key.pending); goto s5; s3: - /* RX key deactivating: - * The passive key if any -> pending - * The active key -> passive (users = 0) / pending - * The pending key if any -> passive (users = 0) - */ - key = rx->key; + /* RX active: timed out or no user -> pending */ if (!key.active) goto s4; - if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM)) + if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM) && + tipc_aead_users(rx->aead[key.active]) > 0) goto s4; - new_pending = (key.passive) ?: key.active; - new_passive = (key.passive) ? key.active : key.pending; - tipc_aead_users_set(rx->aead[new_pending], 0); - if (new_passive) - tipc_aead_users_set(rx->aead[new_passive], 0); - tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); - pr_info("RX(%s): key %d is deactivated!\n", - tipc_node_get_id_str(rx->node), key.active); + if (key.pending) + key.passive = key.active; + else + key.pending = key.active; + rx->timer2 = jiffies; + tipc_crypto_key_set_state(rx, key.passive, 0, key.pending); + tipc_aead_users_set(rx->aead[key.pending], 0); + pr_debug("%s: key[%d] is deactivated\n", rx->name, key.active); goto s5; s4: - /* RX key passive -> freed: */ - key = rx->key; - if (!key.passive || !tipc_aead_users(rx->aead[key.passive])) + /* RX passive: outdated or not working -> free */ + if (!key.passive) goto s5; - if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM)) + if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM) && + tipc_aead_users(rx->aead[key.passive]) > -10) goto s5; tipc_crypto_key_set_state(rx, 0, key.active, key.pending); tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock); - pr_info("RX(%s): key %d is freed!\n", tipc_node_get_id_str(rx->node), - key.passive); + pr_debug("%s: key[%d] is freed\n", rx->name, key.passive); s5: spin_unlock(&rx->lock); + /* Relax it here, the flag will be set again if it really is, but only + * when we are not in grace period for safety! + */ + if (time_after(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) + tx->legacy_user = 0; + /* Limit max_tfms & do debug commands if needed */ if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) return; @@ -1533,6 +1642,22 @@ s5: tipc_crypto_do_cmd(rx->net, cmd); } +static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode, u8 type) +{ + struct sk_buff *skb; + + skb = skb_clone(_skb, GFP_ATOMIC); + if (skb) { + TIPC_SKB_CB(skb)->xmit_type = type; + tipc_crypto_xmit(net, &skb, b, dst, __dnode); + if (skb) + b->media->send_msg(net, skb, b, dst); + } +} + /** * tipc_crypto_xmit - Build & encrypt TIPC message for xmit * @net: struct net @@ -1542,7 +1667,8 @@ s5: * @__dnode: destination node for reference if any * * First, build an encryption message header on the top of the message, then - * encrypt the original TIPC message by using the active or pending TX key. + * encrypt the original TIPC message by using the pending, master or active + * key with this preference order. * If the encryption is successful, the encrypted skb is returned directly or * via the callback. * Otherwise, the skb is freed! @@ -1562,46 +1688,67 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode); struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_crypto_stats __percpu *stats = tx->stats; + struct tipc_msg *hdr = buf_msg(*skb); struct tipc_key key = tx->key; struct tipc_aead *aead = NULL; - struct sk_buff *probe; + u32 user = msg_user(hdr); + u32 type = msg_type(hdr); int rc = -ENOKEY; - u8 tx_key; + u8 tx_key = 0; /* No encryption? */ if (!tx->working) return 0; - /* Try with the pending key if available and: - * 1) This is the only choice (i.e. no active key) or; - * 2) Peer has switched to this key (unicast only) or; - * 3) It is time to do a pending key probe; - */ + /* Pending key if peer has active on it or probing time */ if (unlikely(key.pending)) { tx_key = key.pending; - if (!key.active) + if (!tx->key_master && !key.active) goto encrypt; if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key) goto encrypt; - if (TIPC_SKB_CB(*skb)->probe) + if (TIPC_SKB_CB(*skb)->xmit_type == SKB_PROBING) { + pr_debug("%s: probing for key[%d]\n", tx->name, + key.pending); + goto encrypt; + } + if (user == LINK_CONFIG || user == LINK_PROTOCOL) + tipc_crypto_clone_msg(net, *skb, b, dst, __dnode, + SKB_PROBING); + } + + /* Master key if this is a *vital* message or in grace period */ + if (tx->key_master) { + tx_key = KEY_MASTER; + if (!key.active) + goto encrypt; + if (TIPC_SKB_CB(*skb)->xmit_type == SKB_GRACING) { + pr_debug("%s: gracing for msg (%d %d)\n", tx->name, + user, type); goto encrypt; - if (!__rx && - time_after(jiffies, tx->timer2 + TIPC_TX_PROBE_LIM)) { - tx->timer2 = jiffies; - probe = skb_clone(*skb, GFP_ATOMIC); - if (probe) { - TIPC_SKB_CB(probe)->probe = 1; - tipc_crypto_xmit(net, &probe, b, dst, __dnode); - if (probe) - b->media->send_msg(net, probe, b, dst); + } + if (user == LINK_CONFIG || + (user == LINK_PROTOCOL && type == RESET_MSG) || + (user == MSG_CRYPTO && type == KEY_DISTR_MSG) || + time_before(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) { + if (__rx && __rx->key_master && + !atomic_read(&__rx->peer_rx_active)) + goto encrypt; + if (!__rx) { + if (likely(!tx->legacy_user)) + goto encrypt; + tipc_crypto_clone_msg(net, *skb, b, dst, + __dnode, SKB_GRACING); } } } + /* Else, use the active key if any */ if (likely(key.active)) { tx_key = key.active; goto encrypt; } + goto exit; encrypt: @@ -1667,30 +1814,21 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct tipc_aead *aead = NULL; struct tipc_key key; int rc = -ENOKEY; - u8 tx_key = 0; + u8 tx_key, n; + + tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; /* New peer? * Let's try with TX key (i.e. cluster mode) & verify the skb first! */ - if (unlikely(!rx)) + if (unlikely(!rx || tx_key == KEY_MASTER)) goto pick_tx; - /* Pick RX key according to TX key, three cases are possible: - * 1) The current active key (likely) or; - * 2) The pending (new or deactivated) key (if any) or; - * 3) The passive or old active key (i.e. users > 0); - */ - tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; + /* Pick RX key according to TX key if any */ key = rx->key; - if (likely(tx_key == key.active)) + if (tx_key == key.active || tx_key == key.pending || + tx_key == key.passive) goto decrypt; - if (tx_key == key.pending) - goto decrypt; - if (tx_key == key.passive) { - rx->timer2 = jiffies; - if (tipc_aead_users(rx->aead[key.passive]) > 0) - goto decrypt; - } /* Unknown key, let's try to align RX key(s) */ if (tipc_crypto_key_try_align(rx, tx_key)) @@ -1698,7 +1836,7 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, pick_tx: /* No key suitable? Try to pick one from TX... */ - aead = tipc_crypto_key_pick_tx(tx, rx, *skb); + aead = tipc_crypto_key_pick_tx(tx, rx, *skb, tx_key); if (aead) goto decrypt; goto exit; @@ -1726,8 +1864,19 @@ exit: if (rc == -ENOKEY) { kfree_skb(*skb); *skb = NULL; - if (rx) + if (rx) { + /* Mark rx->nokey only if we dont have a + * pending received session key, nor a newer + * one i.e. in the next slot. + */ + n = key_next(tx_key); + rx->nokey = !(rx->skey || + rcu_access_pointer(rx->aead[n])); + pr_debug_ratelimited("%s: nokey %d, key %d/%x\n", + rx->name, rx->nokey, + tx_key, rx->key.keys); tipc_node_put(rx->node); + } this_cpu_inc(stats->stat[STAT_NOKEYS]); return rc; } else if (rc == -EBADMSG) { @@ -1749,21 +1898,17 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_aead *tmp = NULL; struct tipc_ehdr *ehdr; struct tipc_node *n; - u8 rx_key_active; - bool destined; /* Is this completed by TX? */ - if (unlikely(!rx->node)) { + if (unlikely(is_tx(aead->crypto))) { rx = skb_cb->tx_clone_ctx.rx; -#ifdef TIPC_CRYPTO_DEBUG - pr_info("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n", - (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead, - (*skb)->next, skb_cb->flags); - pr_info("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n", - skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last, - aead->crypto->aead[1], aead->crypto->aead[2], - aead->crypto->aead[3]); -#endif + pr_debug("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n", + (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead, + (*skb)->next, skb_cb->flags); + pr_debug("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n", + skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last, + aead->crypto->aead[1], aead->crypto->aead[2], + aead->crypto->aead[3]); if (unlikely(err)) { if (err == -EBADMSG && (*skb)->next) tipc_rcv(net, (*skb)->next, b); @@ -1784,12 +1929,12 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, goto free_skb; } - /* Skip cloning this time as we had a RX pending key */ - if (rx->key.pending) + /* Ignore cloning if it was TX master key */ + if (ehdr->tx_key == KEY_MASTER) goto rcv; if (tipc_aead_clone(&tmp, aead) < 0) goto rcv; - if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) { + if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key, false) < 0) { tipc_aead_free(&tmp->rcu); goto rcv; } @@ -1805,14 +1950,18 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, /* Set the RX key's user */ tipc_aead_users_set(aead, 1); -rcv: /* Mark this point, RX works */ rx->timer1 = jiffies; +rcv: /* Remove ehdr & auth. tag prior to tipc_rcv() */ ehdr = (struct tipc_ehdr *)(*skb)->data; - destined = ehdr->destined; - rx_key_active = ehdr->rx_key_active; + + /* Mark this point, RX passive still works */ + if (rx->key.passive && ehdr->tx_key == rx->key.passive) + rx->timer2 = jiffies; + + skb_reset_network_header(*skb); skb_pull(*skb, tipc_ehdr_size(ehdr)); pskb_trim(*skb, (*skb)->len - aead->authsize); @@ -1822,9 +1971,8 @@ rcv: goto free_skb; } - /* Update peer RX active key & TX users */ - if (destined) - tipc_crypto_key_synch(rx, rx_key_active, buf_msg(*skb)); + /* Ok, everything's fine, try to synch own keys according to peers' */ + tipc_crypto_key_synch(rx, *skb); /* Mark skb decrypted */ skb_cb->decrypted = 1; @@ -1883,7 +2031,7 @@ print_stats: /* Print crypto statistics */ for (i = 0, j = 0; i < MAX_STATS; i++) j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]); - pr_info("\nCounter %s", buf); + pr_info("Counter %s", buf); memset(buf, '-', 115); buf[115] = '\0'; @@ -1927,21 +2075,31 @@ static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) char *s; for (k = KEY_MIN; k <= KEY_MAX; k++) { - if (k == key.passive) - s = "PAS"; - else if (k == key.active) - s = "ACT"; - else if (k == key.pending) - s = "PEN"; - else - s = "-"; + if (k == KEY_MASTER) { + if (is_rx(c)) + continue; + if (time_before(jiffies, + c->timer2 + TIPC_TX_GRACE_PERIOD)) + s = "ACT"; + else + s = "PAS"; + } else { + if (k == key.passive) + s = "PAS"; + else if (k == key.active) + s = "ACT"; + else if (k == key.pending) + s = "PEN"; + else + s = "-"; + } i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); rcu_read_lock(); aead = rcu_dereference(c->aead[k]); if (aead) i += scnprintf(buf + i, 200 - i, - "{\"%s...\", \"%s\"}/%d:%d", + "{\"0x...%s\", \"%s\"}/%d:%d", aead->hint, (aead->mode == CLUSTER_KEY) ? "c" : "p", atomic_read(&aead->users), @@ -1950,14 +2108,13 @@ static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) i += scnprintf(buf + i, 200 - i, "\n"); } - if (c->node) + if (is_rx(c)) i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n", atomic_read(&c->peer_rx_active)); return buf; } -#ifdef TIPC_CRYPTO_DEBUG static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, char *buf) { @@ -1968,7 +2125,7 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ again: i += scnprintf(buf + i, 32 - i, "["); - for (k = KEY_MIN; k <= KEY_MAX; k++) { + for (k = KEY_1; k <= KEY_3; k++) { if (k == key->passive) s = "pas"; else if (k == key->active) @@ -1978,7 +2135,7 @@ again: else s = "-"; i += scnprintf(buf + i, 32 - i, - (k != KEY_MAX) ? "%s " : "%s", s); + (k != KEY_3) ? "%s " : "%s", s); } if (key != &new) { i += scnprintf(buf + i, 32 - i, "] -> "); @@ -1988,4 +2145,320 @@ again: i += scnprintf(buf + i, 32 - i, "]"); return buf; } -#endif + +/** + * tipc_crypto_msg_rcv - Common 'MSG_CRYPTO' processing point + * @net: the struct net + * @skb: the receiving message buffer + */ +void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb) +{ + struct tipc_crypto *rx; + struct tipc_msg *hdr; + + if (unlikely(skb_linearize(skb))) + goto exit; + + hdr = buf_msg(skb); + rx = tipc_node_crypto_rx_by_addr(net, msg_prevnode(hdr)); + if (unlikely(!rx)) + goto exit; + + switch (msg_type(hdr)) { + case KEY_DISTR_MSG: + if (tipc_crypto_key_rcv(rx, hdr)) + goto exit; + break; + default: + break; + } + + tipc_node_put(rx->node); + +exit: + kfree_skb(skb); +} + +/** + * tipc_crypto_key_distr - Distribute a TX key + * @tx: the TX crypto + * @key: the key's index + * @dest: the destination tipc node, = NULL if distributing to all nodes + * + * Return: 0 in case of success, otherwise < 0 + */ +int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key, + struct tipc_node *dest) +{ + struct tipc_aead *aead; + u32 dnode = tipc_node_get_addr(dest); + int rc = -ENOKEY; + + if (!sysctl_tipc_key_exchange_enabled) + return 0; + + if (key) { + rcu_read_lock(); + aead = tipc_aead_get(tx->aead[key]); + if (likely(aead)) { + rc = tipc_crypto_key_xmit(tx->net, aead->key, + aead->gen, aead->mode, + dnode); + tipc_aead_put(aead); + } + rcu_read_unlock(); + } + + return rc; +} + +/** + * tipc_crypto_key_xmit - Send a session key + * @net: the struct net + * @skey: the session key to be sent + * @gen: the key's generation + * @mode: the key's mode + * @dnode: the destination node address, = 0 if broadcasting to all nodes + * + * The session key 'skey' is packed in a TIPC v2 'MSG_CRYPTO/KEY_DISTR_MSG' + * as its data section, then xmit-ed through the uc/bc link. + * + * Return: 0 in case of success, otherwise < 0 + */ +static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, + u16 gen, u8 mode, u32 dnode) +{ + struct sk_buff_head pkts; + struct tipc_msg *hdr; + struct sk_buff *skb; + u16 size, cong_link_cnt; + u8 *data; + int rc; + + size = tipc_aead_key_size(skey); + skb = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + hdr = buf_msg(skb); + tipc_msg_init(tipc_own_addr(net), hdr, MSG_CRYPTO, KEY_DISTR_MSG, + INT_H_SIZE, dnode); + msg_set_size(hdr, INT_H_SIZE + size); + msg_set_key_gen(hdr, gen); + msg_set_key_mode(hdr, mode); + + data = msg_data(hdr); + *((__be32 *)(data + TIPC_AEAD_ALG_NAME)) = htonl(skey->keylen); + memcpy(data, skey->alg_name, TIPC_AEAD_ALG_NAME); + memcpy(data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->key, + skey->keylen); + + __skb_queue_head_init(&pkts); + __skb_queue_tail(&pkts, skb); + if (dnode) + rc = tipc_node_xmit(net, &pkts, dnode, 0); + else + rc = tipc_bcast_xmit(net, &pkts, &cong_link_cnt); + + return rc; +} + +/** + * tipc_crypto_key_rcv - Receive a session key + * @rx: the RX crypto + * @hdr: the TIPC v2 message incl. the receiving session key in its data + * + * This function retrieves the session key in the message from peer, then + * schedules a RX work to attach the key to the corresponding RX crypto. + * + * Return: "true" if the key has been scheduled for attaching, otherwise + * "false". + */ +static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) +{ + struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; + struct tipc_aead_key *skey = NULL; + u16 key_gen = msg_key_gen(hdr); + u16 size = msg_data_sz(hdr); + u8 *data = msg_data(hdr); + + spin_lock(&rx->lock); + if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) { + pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name, + rx->skey, key_gen, rx->key_gen); + goto exit; + } + + /* Allocate memory for the key */ + skey = kmalloc(size, GFP_ATOMIC); + if (unlikely(!skey)) { + pr_err("%s: unable to allocate memory for skey\n", rx->name); + goto exit; + } + + /* Copy key from msg data */ + skey->keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); + memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME); + memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32), + skey->keylen); + + /* Sanity check */ + if (unlikely(size != tipc_aead_key_size(skey))) { + kfree(skey); + skey = NULL; + goto exit; + } + + rx->key_gen = key_gen; + rx->skey_mode = msg_key_mode(hdr); + rx->skey = skey; + rx->nokey = 0; + mb(); /* for nokey flag */ + +exit: + spin_unlock(&rx->lock); + + /* Schedule the key attaching on this crypto */ + if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0))) + return true; + + return false; +} + +/** + * tipc_crypto_work_rx - Scheduled RX works handler + * @work: the struct RX work + * + * The function processes the previous scheduled works i.e. distributing TX key + * or attaching a received session key on RX crypto. + */ +static void tipc_crypto_work_rx(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct tipc_crypto *rx = container_of(dwork, struct tipc_crypto, work); + struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; + unsigned long delay = msecs_to_jiffies(5000); + bool resched = false; + u8 key; + int rc; + + /* Case 1: Distribute TX key to peer if scheduled */ + if (atomic_cmpxchg(&rx->key_distr, + KEY_DISTR_SCHED, + KEY_DISTR_COMPL) == KEY_DISTR_SCHED) { + /* Always pick the newest one for distributing */ + key = tx->key.pending ?: tx->key.active; + rc = tipc_crypto_key_distr(tx, key, rx->node); + if (unlikely(rc)) + pr_warn("%s: unable to distr key[%d] to %s, err %d\n", + tx->name, key, tipc_node_get_id_str(rx->node), + rc); + + /* Sched for key_distr releasing */ + resched = true; + } else { + atomic_cmpxchg(&rx->key_distr, KEY_DISTR_COMPL, 0); + } + + /* Case 2: Attach a pending received session key from peer if any */ + if (rx->skey) { + rc = tipc_crypto_key_init(rx, rx->skey, rx->skey_mode, false); + if (unlikely(rc < 0)) + pr_warn("%s: unable to attach received skey, err %d\n", + rx->name, rc); + switch (rc) { + case -EBUSY: + case -ENOMEM: + /* Resched the key attaching */ + resched = true; + break; + default: + synchronize_rcu(); + kfree(rx->skey); + rx->skey = NULL; + break; + } + } + + if (resched && queue_delayed_work(tx->wq, &rx->work, delay)) + return; + + tipc_node_put(rx->node); +} + +/** + * tipc_crypto_rekeying_sched - (Re)schedule rekeying w/o new interval + * @tx: TX crypto + * @changed: if the rekeying needs to be rescheduled with new interval + * @new_intv: new rekeying interval (when "changed" = true) + */ +void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, + u32 new_intv) +{ + unsigned long delay; + bool now = false; + + if (changed) { + if (new_intv == TIPC_REKEYING_NOW) + now = true; + else + tx->rekeying_intv = new_intv; + cancel_delayed_work_sync(&tx->work); + } + + if (tx->rekeying_intv || now) { + delay = (now) ? 0 : tx->rekeying_intv * 60 * 1000; + queue_delayed_work(tx->wq, &tx->work, msecs_to_jiffies(delay)); + } +} + +/** + * tipc_crypto_work_tx - Scheduled TX works handler + * @work: the struct TX work + * + * The function processes the previous scheduled work, i.e. key rekeying, by + * generating a new session key based on current one, then attaching it to the + * TX crypto and finally distributing it to peers. It also re-schedules the + * rekeying if needed. + */ +static void tipc_crypto_work_tx(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct tipc_crypto *tx = container_of(dwork, struct tipc_crypto, work); + struct tipc_aead_key *skey = NULL; + struct tipc_key key = tx->key; + struct tipc_aead *aead; + int rc = -ENOMEM; + + if (unlikely(key.pending)) + goto resched; + + /* Take current key as a template */ + rcu_read_lock(); + aead = rcu_dereference(tx->aead[key.active ?: KEY_MASTER]); + if (unlikely(!aead)) { + rcu_read_unlock(); + /* At least one key should exist for securing */ + return; + } + + /* Lets duplicate it first */ + skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC); + rcu_read_unlock(); + + /* Now, generate new key, initiate & distribute it */ + if (likely(skey)) { + rc = tipc_aead_key_generate(skey) ?: + tipc_crypto_key_init(tx, skey, PER_NODE_KEY, false); + if (likely(rc > 0)) + rc = tipc_crypto_key_distr(tx, rc, NULL); + kzfree(skey); + } + + if (unlikely(rc)) + pr_warn_ratelimited("%s: rekeying returns %d\n", tx->name, rc); + +resched: + /* Re-schedule rekeying if any */ + tipc_crypto_rekeying_sched(tx, false, 0); +} diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h index c3de769f49e8..e71193bd5e36 100644 --- a/net/tipc/crypto.h +++ b/net/tipc/crypto.h @@ -67,6 +67,7 @@ enum { }; extern int sysctl_tipc_max_tfms __read_mostly; +extern int sysctl_tipc_key_exchange_enabled __read_mostly; /** * TIPC encryption message format: @@ -74,7 +75,7 @@ extern int sysctl_tipc_max_tfms __read_mostly; * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 * 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * w0:|Ver=7| User |D|TX |RX |K| Rsvd | + * w0:|Ver=7| User |D|TX |RX |K|M|N| Rsvd | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * w1:| Seqno | * w2:| (8 octets) | @@ -101,6 +102,9 @@ extern int sysctl_tipc_max_tfms __read_mostly; * RX : Currently RX active key corresponding to the destination * node's TX key (when the "D" bit is set) * K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only) + * M : Bit indicates if sender has master key + * N : Bit indicates if sender has no RX keys corresponding to the + * receiver's TX (when the "D" bit is set) * Rsvd : Reserved bit, field * Word1-2: * Seqno : The 64-bit sequence number of the encrypted message, also @@ -117,7 +121,9 @@ struct tipc_ehdr { __u8 destined:1, user:4, version:3; - __u8 reserved_1:3, + __u8 reserved_1:1, + rx_nokey:1, + master_key:1, keepalive:1, rx_key_active:2, tx_key:2; @@ -128,7 +134,9 @@ struct tipc_ehdr { __u8 tx_key:2, rx_key_active:2, keepalive:1, - reserved_1:3; + master_key:1, + rx_nokey:1, + reserved_1:1; #else #error "Please fix <asm/byteorder.h>" #endif @@ -158,10 +166,35 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct sk_buff **skb, struct tipc_bearer *b); int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, - u8 mode); + u8 mode, bool master_key); void tipc_crypto_key_flush(struct tipc_crypto *c); -int tipc_aead_key_validate(struct tipc_aead_key *ukey); +int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key, + struct tipc_node *dest); +void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb); +void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, + u32 new_intv); +int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info); bool tipc_ehdr_validate(struct sk_buff *skb); +static inline u32 msg_key_gen(struct tipc_msg *m) +{ + return msg_bits(m, 4, 16, 0xffff); +} + +static inline void msg_set_key_gen(struct tipc_msg *m, u32 gen) +{ + msg_set_bits(m, 4, 16, 0xffff, gen); +} + +static inline u32 msg_key_mode(struct tipc_msg *m) +{ + return msg_bits(m, 4, 0, 0xf); +} + +static inline void msg_set_key_mode(struct tipc_msg *m, u32 mode) +{ + msg_set_bits(m, 4, 0, 0xf, mode); +} + #endif /* _TIPC_CRYPTO_H */ #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index cef38a910107..06b880da2a8e 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -216,11 +216,6 @@ enum { #define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10)) #define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1)) -/* - * Interval between NACKs when packets arrive out of order - */ -#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) - /* Link FSM states: */ enum { @@ -1256,6 +1251,11 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, case MSG_FRAGMENTER: case BCAST_PROTOCOL: return false; +#ifdef CONFIG_TIPC_CRYPTO + case MSG_CRYPTO: + tipc_crypto_msg_rcv(l->net, skb); + return true; +#endif default: pr_warn("Dropping received illegal msg type\n"); kfree_skb(skb); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 52e93ba4d8e2..11a429f4c6cd 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -582,7 +582,7 @@ bundle: * @pos: position in outer message of msg to be extracted. * Returns position of next msg * Consumes outer buffer when last packet extracted - * Returns true when when there is an extracted buffer, otherwise false + * Returns true when there is an extracted buffer, otherwise false */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 1016e96db5c4..5d64596ba987 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -82,6 +82,7 @@ struct plist; #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 +#define MSG_CRYPTO 14 #define SOCK_WAKEUP 14 /* pseudo user */ #define TOP_SRV 15 /* pseudo user */ @@ -127,7 +128,9 @@ struct tipc_skb_cb { #ifdef CONFIG_TIPC_CRYPTO u8 encrypted:1; u8 decrypted:1; - u8 probe:1; +#define SKB_PROBING 1 +#define SKB_GRACING 2 + u8 xmit_type:2; u8 tx_clone_deferred:1; #endif }; @@ -747,6 +750,9 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) #define GRP_RECLAIM_MSG 4 #define GRP_REMIT_MSG 5 +/* Crypto message types */ +#define KEY_DISTR_MSG 0 + /* * Word 1 */ diff --git a/net/tipc/net.c b/net/tipc/net.c index 85400e4242de..0bb2323201da 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -105,12 +105,6 @@ * - A local spin_lock protecting the queue of subscriber events. */ -struct tipc_net_work { - struct work_struct work; - struct net *net; - u32 addr; -}; - static void tipc_net_finalize(struct net *net, u32 addr); int tipc_net_init(struct net *net, u8 *node_id, u32 addr) @@ -142,25 +136,21 @@ static void tipc_net_finalize(struct net *net, u32 addr) TIPC_CLUSTER_SCOPE, 0, addr); } -static void tipc_net_finalize_work(struct work_struct *work) +void tipc_net_finalize_work(struct work_struct *work) { struct tipc_net_work *fwork; fwork = container_of(work, struct tipc_net_work, work); tipc_net_finalize(fwork->net, fwork->addr); - kfree(fwork); } void tipc_sched_net_finalize(struct net *net, u32 addr) { - struct tipc_net_work *fwork = kzalloc(sizeof(*fwork), GFP_ATOMIC); + struct tipc_net *tn = tipc_net(net); - if (!fwork) - return; - INIT_WORK(&fwork->work, tipc_net_finalize_work); - fwork->net = net; - fwork->addr = addr; - schedule_work(&fwork->work); + tn->final_work.net = net; + tn->final_work.addr = addr; + schedule_work(&tn->final_work.work); } void tipc_net_stop(struct net *net) diff --git a/net/tipc/net.h b/net/tipc/net.h index 6740d97c706e..d0c91d2df20a 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -42,6 +42,7 @@ extern const struct nla_policy tipc_nl_net_policy[]; int tipc_net_init(struct net *net, u8 *node_id, u32 addr); +void tipc_net_finalize_work(struct work_struct *work); void tipc_sched_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index c4aee6247d55..c447cb5f879e 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -108,6 +108,8 @@ const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { .len = TIPC_NODEID_LEN}, [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY, .len = TIPC_AEAD_KEY_SIZE_MAX}, + [TIPC_NLA_NODE_KEY_MASTER] = { .type = NLA_FLAG }, + [TIPC_NLA_NODE_REKEYING] = { .type = NLA_U32 }, }; /* Properties valid for media, bearer and link */ diff --git a/net/tipc/node.c b/net/tipc/node.c index 4edcee3088da..cf4b239fc569 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -278,6 +278,14 @@ struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) { return container_of(pos, struct tipc_node, list)->crypto_rx; } + +struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr) +{ + struct tipc_node *n; + + n = tipc_node_find(net, addr); + return (n) ? n->crypto_rx : NULL; +} #endif static void tipc_node_free(struct rcu_head *rp) @@ -303,7 +311,7 @@ void tipc_node_put(struct tipc_node *node) kref_put(&node->kref, tipc_node_kref_release); } -static void tipc_node_get(struct tipc_node *node) +void tipc_node_get(struct tipc_node *node) { kref_get(&node->kref); } @@ -584,6 +592,9 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) static void tipc_node_delete_from_list(struct tipc_node *node) { +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_key_flush(node->crypto_rx); +#endif list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); @@ -2868,15 +2879,27 @@ static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) return 0; } +static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) +{ + struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; + + if (!attr) + return -ENODATA; + + *intv = nla_get_u32(attr); + return 0; +} + static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); - struct tipc_net *tn = tipc_net(net); + struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; - struct tipc_crypto *c; - u8 *id, *own_id; + bool rekeying = true, master_key = false; + u8 *id, *own_id, mode; + u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) @@ -2886,52 +2909,66 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) info->attrs[TIPC_NLA_NODE], tipc_nl_node_policy, info->extack); if (rc) - goto exit; + return rc; own_id = tipc_own_id(net); if (!own_id) { - rc = -EPERM; - goto exit; + GENL_SET_ERR_MSG(info, "not found own node identity (set id?)"); + return -EPERM; } + rc = tipc_nl_retrieve_rekeying(attrs, &intv); + if (rc == -ENODATA) + rekeying = false; + rc = tipc_nl_retrieve_key(attrs, &ukey); - if (rc) - goto exit; + if (rc == -ENODATA && rekeying) + goto rekeying; + else if (rc) + return rc; - rc = tipc_aead_key_validate(ukey); + rc = tipc_aead_key_validate(ukey, info); if (rc) - goto exit; + return rc; rc = tipc_nl_retrieve_nodeid(attrs, &id); switch (rc) { case -ENODATA: - /* Cluster key mode */ - rc = tipc_crypto_key_init(tn->crypto_tx, ukey, CLUSTER_KEY); + mode = CLUSTER_KEY; + master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: - /* Per-node key mode */ - if (!memcmp(id, own_id, NODE_ID_LEN)) { - c = tn->crypto_tx; - } else { + mode = PER_NODE_KEY; + if (memcmp(id, own_id, NODE_ID_LEN)) { n = tipc_node_find_by_id(net, id) ?: tipc_node_create(net, 0, id, 0xffffu, 0, true); - if (unlikely(!n)) { - rc = -ENOMEM; - break; - } + if (unlikely(!n)) + return -ENOMEM; c = n->crypto_rx; } - - rc = tipc_crypto_key_init(c, ukey, PER_NODE_KEY); - if (n) - tipc_node_put(n); break; default: - break; + return rc; } -exit: - return (rc < 0) ? rc : 0; + /* Initiate the TX/RX key */ + rc = tipc_crypto_key_init(c, ukey, mode, master_key); + if (n) + tipc_node_put(n); + + if (unlikely(rc < 0)) { + GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); + return rc; + } else if (c == tx) { + /* Distribute TX key but not master one */ + if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) + GENL_SET_ERR_MSG(info, "failed to replicate new key"); +rekeying: + /* Schedule TX rekeying if needed */ + tipc_crypto_rekeying_sched(tx, rekeying, intv); + } + + return 0; } int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) @@ -2958,7 +2995,6 @@ static int __tipc_nl_node_flush_key(struct sk_buff *skb, tipc_crypto_key_flush(n->crypto_rx); rcu_read_unlock(); - pr_info("All keys are flushed!\n"); return 0; } diff --git a/net/tipc/node.h b/net/tipc/node.h index 9f6f13f1604f..154a5bbb0d29 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -79,12 +79,14 @@ bool tipc_node_get_id(struct net *net, u32 addr, u8 *id); u32 tipc_node_get_addr(struct tipc_node *node); char *tipc_node_get_id_str(struct tipc_node *node); void tipc_node_put(struct tipc_node *node); +void tipc_node_get(struct tipc_node *node); struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, u16 capabilities, u32 hash_mixes, bool preliminary); #ifdef CONFIG_TIPC_CRYPTO struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n); struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos); +struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr); #endif u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 11b27ddc75ba..e795a8a2955b 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -52,10 +52,9 @@ #define NAGLE_START_MAX 1024 #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ -#define TIPC_FWD_MSG 1 #define TIPC_MAX_PORT 0xffffffff #define TIPC_MIN_PORT 1 -#define TIPC_ACK_RATE 4 /* ACK at 1/4 of of rcv window size */ +#define TIPC_ACK_RATE 4 /* ACK at 1/4 of rcv window size */ enum { TIPC_LISTEN = TCP_LISTEN, diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 97a6264a2993..9fb65c988f7f 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -74,6 +74,15 @@ static struct ctl_table tipc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "key_exchange_enabled", + .data = &sysctl_tipc_key_exchange_enabled, + .maxlen = sizeof(sysctl_tipc_key_exchange_enabled), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, #endif { .procname = "bc_retruni", diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 1489cfb941d8..5f6f86051c83 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -48,7 +48,6 @@ #define MAX_SEND_MSG_COUNT 25 #define MAX_RECV_MSG_COUNT 25 #define CF_CONNECTED 1 -#define CF_SERVER 2 #define TIPC_SERVER_NAME_LEN 32 diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 911d13cd2e67..1d17f4470ee2 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -52,6 +52,7 @@ #include "bearer.h" #include "netlink.h" #include "msg.h" +#include "udp_media.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index bbc52b088d29..002b0859fed5 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -330,12 +330,13 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) tls_ctx_free(sk, ctx); } -static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, - int __user *optlen) +static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, + int __user *optlen, int tx) { int rc = 0; struct tls_context *ctx = tls_get_ctx(sk); struct tls_crypto_info *crypto_info; + struct cipher_context *cctx; int len; if (get_user(len, optlen)) @@ -352,7 +353,13 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } /* get user crypto info */ - crypto_info = &ctx->crypto_send.info; + if (tx) { + crypto_info = &ctx->crypto_send.info; + cctx = &ctx->tx; + } else { + crypto_info = &ctx->crypto_recv.info; + cctx = &ctx->rx; + } if (!TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; @@ -379,9 +386,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } lock_sock(sk); memcpy(crypto_info_aes_gcm_128->iv, - ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); - memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq, + memcpy(crypto_info_aes_gcm_128->rec_seq, cctx->rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); release_sock(sk); if (copy_to_user(optval, @@ -403,9 +410,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } lock_sock(sk); memcpy(crypto_info_aes_gcm_256->iv, - ctx->tx.iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE, + cctx->iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE, TLS_CIPHER_AES_GCM_256_IV_SIZE); - memcpy(crypto_info_aes_gcm_256->rec_seq, ctx->tx.rec_seq, + memcpy(crypto_info_aes_gcm_256->rec_seq, cctx->rec_seq, TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE); release_sock(sk); if (copy_to_user(optval, @@ -429,7 +436,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, switch (optname) { case TLS_TX: - rc = do_tls_getsockopt_tx(sk, optval, optlen); + case TLS_RX: + rc = do_tls_getsockopt_conf(sk, optval, optlen, + optname == TLS_TX); break; default: rc = -ENOPROTOOPT; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 92784e51ee7d..eb82bdc6cf7c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -878,7 +878,6 @@ static int unix_autobind(struct socket *sock) if (err) return err; - err = 0; if (u->addr) goto out; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 6a6f2f214c10..96e24ee4c7e8 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -141,9 +141,62 @@ static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef) return true; } +static int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width) +{ + int mhz; + + switch (chan_width) { + case NL80211_CHAN_WIDTH_1: + mhz = 1; + break; + case NL80211_CHAN_WIDTH_2: + mhz = 2; + break; + case NL80211_CHAN_WIDTH_4: + mhz = 4; + break; + case NL80211_CHAN_WIDTH_8: + mhz = 8; + break; + case NL80211_CHAN_WIDTH_16: + mhz = 16; + break; + case NL80211_CHAN_WIDTH_5: + mhz = 5; + break; + case NL80211_CHAN_WIDTH_10: + mhz = 10; + break; + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_20_NOHT: + mhz = 20; + break; + case NL80211_CHAN_WIDTH_40: + mhz = 40; + break; + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_80: + mhz = 80; + break; + case NL80211_CHAN_WIDTH_160: + mhz = 160; + break; + default: + WARN_ON_ONCE(1); + return -1; + } + return mhz; +} + +static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) +{ + return nl80211_chan_width_to_mhz(c->width); +} + bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { - u32 control_freq; + u32 control_freq, oper_freq; + int oper_width, control_width; if (!chandef->chan) return false; @@ -155,10 +208,6 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) switch (chandef->width) { case NL80211_CHAN_WIDTH_1: - case NL80211_CHAN_WIDTH_2: - case NL80211_CHAN_WIDTH_4: - case NL80211_CHAN_WIDTH_8: - case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: @@ -169,6 +218,30 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) if (chandef->center_freq2) return false; break; + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: + control_freq = ieee80211_channel_to_khz(chandef->chan); + oper_freq = ieee80211_chandef_to_khz(chandef); + control_width = nl80211_chan_width_to_mhz( + ieee80211_s1g_channel_width( + chandef->chan)); + oper_width = cfg80211_chandef_get_width(chandef); + + if (oper_width < 0 || control_width < 0) + return false; + if (chandef->center_freq2) + return false; + + if (control_freq + MHZ_TO_KHZ(control_width) / 2 > + oper_freq + MHZ_TO_KHZ(oper_width) / 2) + return false; + + if (control_freq - MHZ_TO_KHZ(control_width) / 2 < + oper_freq - MHZ_TO_KHZ(oper_width) / 2) + return false; + break; case NL80211_CHAN_WIDTH_40: if (chandef->center_freq1 != control_freq + 10 && chandef->center_freq1 != control_freq - 10) @@ -264,53 +337,6 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c, } } -static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) -{ - int width; - - switch (c->width) { - case NL80211_CHAN_WIDTH_1: - width = 1; - break; - case NL80211_CHAN_WIDTH_2: - width = 2; - break; - case NL80211_CHAN_WIDTH_4: - width = 4; - break; - case NL80211_CHAN_WIDTH_8: - width = 8; - break; - case NL80211_CHAN_WIDTH_16: - width = 16; - break; - case NL80211_CHAN_WIDTH_5: - width = 5; - break; - case NL80211_CHAN_WIDTH_10: - width = 10; - break; - case NL80211_CHAN_WIDTH_20: - case NL80211_CHAN_WIDTH_20_NOHT: - width = 20; - break; - case NL80211_CHAN_WIDTH_40: - width = 40; - break; - case NL80211_CHAN_WIDTH_80P80: - case NL80211_CHAN_WIDTH_80: - width = 80; - break; - case NL80211_CHAN_WIDTH_160: - width = 160; - break; - default: - WARN_ON_ONCE(1); - return -1; - } - return width; -} - const struct cfg80211_chan_def * cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, const struct cfg80211_chan_def *c2) diff --git a/net/wireless/core.h b/net/wireless/core.h index 67b0389fca4d..2ebc2a66680d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -466,8 +466,8 @@ extern struct work_struct cfg80211_disconnect_work; * * Checks if chandef is usable and we can/need start CAC on such channel. * - * Return: Return true if all channels available and at least - * one channel require CAC (NL80211_DFS_USABLE) + * Return: true if all channels available and at least + * one channel requires CAC (NL80211_DFS_USABLE) */ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef); diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c index cc7b9fd5c166..d66a913027e0 100644 --- a/net/wireless/lib80211.c +++ b/net/wireless/lib80211.c @@ -26,8 +26,6 @@ #include <net/lib80211.h> -#define DRV_NAME "lib80211" - #define DRV_DESCRIPTION "common routines for IEEE802.11 drivers" MODULE_DESCRIPTION(DRV_DESCRIPTION); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2c9e9a2d1688..1a212db7a300 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -209,14 +209,23 @@ static int validate_beacon_head(const struct nlattr *attr, unsigned int len = nla_len(attr); const struct element *elem; const struct ieee80211_mgmt *mgmt = (void *)data; - unsigned int fixedlen = offsetof(struct ieee80211_mgmt, - u.beacon.variable); + bool s1g_bcn = ieee80211_is_s1g_beacon(mgmt->frame_control); + unsigned int fixedlen, hdrlen; + + if (s1g_bcn) { + fixedlen = offsetof(struct ieee80211_ext, + u.s1g_beacon.variable); + hdrlen = offsetof(struct ieee80211_ext, u.s1g_beacon); + } else { + fixedlen = offsetof(struct ieee80211_mgmt, + u.beacon.variable); + hdrlen = offsetof(struct ieee80211_mgmt, u.beacon); + } if (len < fixedlen) goto err; - if (ieee80211_hdrlen(mgmt->frame_control) != - offsetof(struct ieee80211_mgmt, u.beacon)) + if (ieee80211_hdrlen(mgmt->frame_control) != hdrlen) goto err; data += fixedlen; @@ -336,6 +345,13 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { .len = NL80211_MAX_SUPP_HT_RATES }, [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), [NL80211_TXRATE_GI] = { .type = NLA_U8 }, + [NL80211_TXRATE_HE] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_he)), + [NL80211_TXRATE_HE_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_HE_GI_0_8, + NL80211_RATE_INFO_HE_GI_3_2), + [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_HE_1XLTF, + NL80211_RATE_INFO_HE_4XLTF), }; static const struct nla_policy @@ -360,6 +376,22 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_NESTED(nl80211_txattr_policy), }; +static const struct nla_policy +nl80211_fils_discovery_policy[NL80211_FILS_DISCOVERY_ATTR_MAX + 1] = { + [NL80211_FILS_DISCOVERY_ATTR_INT_MIN] = NLA_POLICY_MAX(NLA_U32, 10000), + [NL80211_FILS_DISCOVERY_ATTR_INT_MAX] = NLA_POLICY_MAX(NLA_U32, 10000), + NLA_POLICY_RANGE(NLA_BINARY, + NL80211_FILS_DISCOVERY_TMPL_MIN_LEN, + IEEE80211_MAX_DATA_LEN), +}; + +static const struct nla_policy +nl80211_unsol_bcast_probe_resp_policy[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1] = { + [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] = NLA_POLICY_MAX(NLA_U32, 20), + [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN } +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -539,7 +571,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 }, [NL80211_ATTR_WDEV] = { .type = NLA_U64 }, [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 }, - [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, }, + + /* need to include at least Auth Transaction and Status Code */ + [NL80211_ATTR_AUTH_DATA] = NLA_POLICY_MIN_LEN(4), + [NL80211_ATTR_VHT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_VHT_CAPABILITY_LEN), [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 }, [NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127), @@ -561,23 +596,30 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, [NL80211_ATTR_CRIT_PROT_ID] = { .type = NLA_U16 }, - [NL80211_ATTR_MAX_CRIT_PROT_DURATION] = { .type = NLA_U16 }, + [NL80211_ATTR_MAX_CRIT_PROT_DURATION] = + NLA_POLICY_MAX(NLA_U16, NL80211_CRIT_PROTO_MAX_DURATION), [NL80211_ATTR_PEER_AID] = NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID), [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED }, - [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_BINARY }, - [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_BINARY }, - [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = { .type = NLA_BINARY }, - [NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] = { .type = NLA_BINARY }, + [NL80211_ATTR_CNTDWN_OFFS_BEACON] = { .type = NLA_BINARY }, + [NL80211_ATTR_CNTDWN_OFFS_PRESP] = { .type = NLA_BINARY }, + [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = NLA_POLICY_MIN_LEN(2), + /* + * The value of the Length field of the Supported Operating + * Classes element is between 2 and 253. + */ + [NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] = + NLA_POLICY_RANGE(NLA_BINARY, 2, 253), [NL80211_ATTR_HANDLE_DFS] = { .type = NLA_FLAG }, [NL80211_ATTR_OPMODE_NOTIF] = { .type = NLA_U8 }, [NL80211_ATTR_VENDOR_ID] = { .type = NLA_U32 }, [NL80211_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, - [NL80211_ATTR_QOS_MAP] = { .type = NLA_BINARY, - .len = IEEE80211_QOS_MAP_LEN_MAX }, + [NL80211_ATTR_QOS_MAP] = NLA_POLICY_RANGE(NLA_BINARY, + IEEE80211_QOS_MAP_LEN_MIN, + IEEE80211_QOS_MAP_LEN_MAX), [NL80211_ATTR_MAC_HINT] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 }, [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, @@ -625,15 +667,17 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = FILS_ERP_MAX_RRK_LEN }, [NL80211_ATTR_FILS_CACHE_ID] = NLA_POLICY_EXACT_LEN_WARN(2), [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, + [NL80211_ATTR_PMKR0_NAME] = NLA_POLICY_EXACT_LEN(WLAN_PMK_NAME_LEN), [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, - [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY, - .len = NL80211_HE_MAX_CAPABILITY_LEN }, - + [NL80211_ATTR_HE_CAPABILITY] = + NLA_POLICY_RANGE(NLA_BINARY, + NL80211_HE_MIN_CAPABILITY_LEN, + NL80211_HE_MAX_CAPABILITY_LEN), [NL80211_ATTR_FTM_RESPONDER] = NLA_POLICY_NESTED(nl80211_ftm_responder_policy), [NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1), @@ -654,10 +698,12 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, - [NL80211_ATTR_HE_6GHZ_CAPABILITY] = { - .type = NLA_EXACT_LEN, - .len = sizeof(struct ieee80211_he_6ghz_capa), - }, + [NL80211_ATTR_HE_6GHZ_CAPABILITY] = + NLA_POLICY_EXACT_LEN(sizeof(struct ieee80211_he_6ghz_capa)), + [NL80211_ATTR_FILS_DISCOVERY] = + NLA_POLICY_NESTED(nl80211_fils_discovery_policy), + [NL80211_ATTR_UNSOL_BCAST_PROBE_RESP] = + NLA_POLICY_NESTED(nl80211_unsol_bcast_probe_resp_policy), }; /* policy for the key attributes */ @@ -703,7 +749,7 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = { [NL80211_WOWLAN_TCP_DST_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 }, [NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 }, - [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 }, + [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = NLA_POLICY_MIN_LEN(1), [NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ] = { .len = sizeof(struct nl80211_wowlan_tcp_data_seq) }, @@ -711,8 +757,8 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = { .len = sizeof(struct nl80211_wowlan_tcp_data_token) }, [NL80211_WOWLAN_TCP_DATA_INTERVAL] = { .type = NLA_U32 }, - [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 }, - [NL80211_WOWLAN_TCP_WAKE_MASK] = { .type = NLA_MIN_LEN, .len = 1 }, + [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = NLA_POLICY_MIN_LEN(1), + [NL80211_WOWLAN_TCP_WAKE_MASK] = NLA_POLICY_MIN_LEN(1), }; #endif /* CONFIG_PM */ @@ -738,7 +784,7 @@ nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { .type = NLA_BINARY, .len = NL80211_KCK_EXT_LEN }, - [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN), + [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN(NL80211_REPLAY_CTR_LEN), [NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 }, }; @@ -778,7 +824,8 @@ nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = { /* policy for NAN function attributes */ static const struct nla_policy nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = { - [NL80211_NAN_FUNC_TYPE] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_TYPE] = + NLA_POLICY_MAX(NLA_U8, NL80211_NAN_FUNC_MAX_TYPE), [NL80211_NAN_FUNC_SERVICE_ID] = { .len = NL80211_NAN_FUNC_SERVICE_ID_LEN }, [NL80211_NAN_FUNC_PUBLISH_TYPE] = { .type = NLA_U8 }, @@ -992,6 +1039,21 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_HE) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_1MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_2MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -4419,21 +4481,106 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, return true; } +static u16 he_mcs_map_to_mcs_mask(u8 he_mcs_map) +{ + switch (he_mcs_map) { + case IEEE80211_HE_MCS_NOT_SUPPORTED: + return 0; + case IEEE80211_HE_MCS_SUPPORT_0_7: + return 0x00FF; + case IEEE80211_HE_MCS_SUPPORT_0_9: + return 0x03FF; + case IEEE80211_HE_MCS_SUPPORT_0_11: + return 0xFFF; + default: + break; + } + return 0; +} + +static void he_build_mcs_mask(u16 he_mcs_map, + u16 he_mcs_mask[NL80211_HE_NSS_MAX]) +{ + u8 nss; + + for (nss = 0; nss < NL80211_HE_NSS_MAX; nss++) { + he_mcs_mask[nss] = he_mcs_map_to_mcs_mask(he_mcs_map & 0x03); + he_mcs_map >>= 2; + } +} + +static u16 he_get_txmcsmap(struct genl_info *info, + const struct ieee80211_sta_he_cap *he_cap) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + __le16 tx_mcs; + + switch (wdev->chandef.width) { + case NL80211_CHAN_WIDTH_80P80: + tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80p80; + break; + case NL80211_CHAN_WIDTH_160: + tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_160; + break; + default: + tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80; + break; + } + return le16_to_cpu(tx_mcs); +} + +static bool he_set_mcs_mask(struct genl_info *info, + struct wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_he *txrate, + u16 mcs[NL80211_HE_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + u16 tx_mcs_mask[NL80211_HE_NSS_MAX] = {}; + u16 tx_mcs_map = 0; + u8 i; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + memset(mcs, 0, sizeof(u16) * NL80211_HE_NSS_MAX); + + tx_mcs_map = he_get_txmcsmap(info, he_cap); + + /* Build he_mcs_mask from HE capabilities */ + he_build_mcs_mask(tx_mcs_map, tx_mcs_mask); + + for (i = 0; i < NL80211_HE_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, - struct cfg80211_bitrate_mask *mask) + struct cfg80211_bitrate_mask *mask, + struct net_device *dev) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = dev->ieee80211_ptr; int rem, i; struct nlattr *tx_rates; struct ieee80211_supported_band *sband; - u16 vht_tx_mcs_map; + u16 vht_tx_mcs_map, he_tx_mcs_map; memset(mask, 0, sizeof(*mask)); /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { + const struct ieee80211_sta_he_cap *he_cap; + sband = rdev->wiphy.bands[i]; if (!sband) @@ -4449,6 +4596,16 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs); + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + continue; + + he_tx_mcs_map = he_get_txmcsmap(info, he_cap); + he_build_mcs_mask(he_tx_mcs_map, mask->control[i].he_mcs); + + mask->control[i].he_gi = 0xFF; + mask->control[i].he_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -4504,13 +4661,25 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI) return -EINVAL; } + if (tb[NL80211_TXRATE_HE] && + !he_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_HE]), + mask->control[band].he_mcs)) + return -EINVAL; + if (tb[NL80211_TXRATE_HE_GI]) + mask->control[band].he_gi = + nla_get_u8(tb[NL80211_TXRATE_HE_GI]); + if (tb[NL80211_TXRATE_HE_LTF]) + mask->control[band].he_ltf = + nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT or VHT + /* don't allow empty legacy rates if HT, VHT or HE * are not even supported. */ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || - rdev->wiphy.bands[band]->vht_cap.vht_supported)) + rdev->wiphy.bands[band]->vht_cap.vht_supported || + ieee80211_get_he_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -4521,6 +4690,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].vht_mcs[i]) goto out; + for (i = 0; i < NL80211_HE_NSS_MAX; i++) + if (mask->control[band].he_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -4721,6 +4894,65 @@ static int nl80211_parse_he_bss_color(struct nlattr *attrs, return 0; } +static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev, + struct nlattr *attrs, + struct cfg80211_ap_settings *params) +{ + struct nlattr *tb[NL80211_FILS_DISCOVERY_ATTR_MAX + 1]; + int ret; + struct cfg80211_fils_discovery *fd = ¶ms->fils_discovery; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_FILS_DISCOVERY)) + return -EINVAL; + + ret = nla_parse_nested(tb, NL80211_FILS_DISCOVERY_ATTR_MAX, attrs, + NULL, NULL); + if (ret) + return ret; + + if (!tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN] || + !tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX] || + !tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]) + return -EINVAL; + + fd->tmpl_len = nla_len(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]); + fd->tmpl = nla_data(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]); + fd->min_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN]); + fd->max_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX]); + + return 0; +} + +static int +nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev, + struct nlattr *attrs, + struct cfg80211_ap_settings *params) +{ + struct nlattr *tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1]; + int ret; + struct cfg80211_unsol_bcast_probe_resp *presp = + ¶ms->unsol_bcast_probe_resp; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP)) + return -EINVAL; + + ret = nla_parse_nested(tb, NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX, + attrs, NULL, NULL); + if (ret) + return ret; + + if (!tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] || + !tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]) + return -EINVAL; + + presp->tmpl = nla_data(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]); + presp->tmpl_len = nla_len(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]); + presp->interval = nla_get_u32(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT]); + return 0; +} + static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, const u8 *rates) { @@ -4831,8 +5063,9 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return false; return true; case NL80211_CMD_START_AP: - /* SAE not supported yet */ - if (auth_type == NL80211_AUTHTYPE_SAE) + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP) && + auth_type == NL80211_AUTHTYPE_SAE) return false; /* FILS not supported yet */ if (auth_type == NL80211_AUTHTYPE_FILS_SK || @@ -4896,8 +5129,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) params.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); params.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); - if (params.ssid_len == 0 || - params.ssid_len > IEEE80211_MAX_SSID_LEN) + if (params.ssid_len == 0) return -EINVAL; } @@ -4966,7 +5198,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TX_RATES]) { err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, - ¶ms.beacon_rate); + ¶ms.beacon_rate, + dev); if (err) return err; @@ -5028,6 +5261,22 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } + if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) { + err = nl80211_parse_fils_discovery(rdev, + info->attrs[NL80211_ATTR_FILS_DISCOVERY], + ¶ms); + if (err) + goto out; + } + + if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) { + err = nl80211_parse_unsol_bcast_probe_resp( + rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP], + ¶ms); + if (err) + return err; + } + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) @@ -5837,11 +6086,9 @@ static int nl80211_parse_sta_channel_info(struct genl_info *info, nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS]); /* * Need to include at least one (first channel, number of - * channels) tuple for each subband, and must have proper - * tuples for the rest of the data as well. + * channels) tuple for each subband (checked in policy), + * and must have proper tuples for the rest of the data as well. */ - if (params->supported_channels_len < 2) - return -EINVAL; if (params->supported_channels_len % 2) return -EINVAL; } @@ -5851,13 +6098,6 @@ static int nl80211_parse_sta_channel_info(struct genl_info *info, nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]); params->supported_oper_classes_len = nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]); - /* - * The value of the Length field of the Supported Operating - * Classes element is between 2 and 253. - */ - if (params->supported_oper_classes_len < 2 || - params->supported_oper_classes_len > 253) - return -EINVAL; } return 0; } @@ -5880,9 +6120,6 @@ static int nl80211_set_station_tdls(struct genl_info *info, nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); params->he_capa_len = nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); - - if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) - return -EINVAL; } err = nl80211_parse_sta_channel_info(info, params); @@ -6141,10 +6378,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); params.he_capa_len = nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); - - /* max len is validated in nla policy */ - if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) - return -EINVAL; } if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) @@ -8416,23 +8649,14 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, } if (ssid) { - if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) { - err = -EINVAL; - goto out_free; - } memcpy(request->match_sets[i].ssid.ssid, nla_data(ssid), nla_len(ssid)); request->match_sets[i].ssid.ssid_len = nla_len(ssid); } - if (bssid) { - if (nla_len(bssid) != ETH_ALEN) { - err = -EINVAL; - goto out_free; - } + if (bssid) memcpy(request->match_sets[i].bssid, nla_data(bssid), ETH_ALEN); - } /* special attribute - old implementation w/a */ request->match_sets[i].rssi_thold = default_match_rssi; @@ -8787,10 +9011,10 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (err) return err; - if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]) + if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]) return -EINVAL; - len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); if (!len || (len % sizeof(u16))) return -EINVAL; @@ -8801,7 +9025,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) return -EINVAL; params.counter_offsets_beacon = - nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); /* sanity checks - counters should fit and be the same */ for (i = 0; i < params.n_counter_offsets_beacon; i++) { @@ -8814,8 +9038,8 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) { - len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + if (csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]) { + len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); if (!len || (len % sizeof(u16))) return -EINVAL; @@ -8826,7 +9050,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) return -EINVAL; params.counter_offsets_presp = - nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); /* sanity checks - counters should fit and be the same */ for (i = 0; i < params.n_counter_offsets_presp; i++) { @@ -9309,9 +9533,6 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return -EINVAL; auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); - /* need to include at least Auth Transaction and Status Code */ - if (auth_data_len < 4) - return -EINVAL; } local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; @@ -9451,7 +9672,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_SAE_OFFLOAD)) + NL80211_EXT_FEATURE_SAE_OFFLOAD) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP)) return -EINVAL; settings->sae_pwd = nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]); @@ -10798,7 +11021,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, return -EOPNOTSUPP; err = nl80211_parse_tx_bitrate_mask(info, info->attrs, - NL80211_ATTR_TX_RATES, &mask); + NL80211_ATTR_TX_RATES, &mask, + dev); if (err) return err; @@ -11406,7 +11630,8 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TX_RATES]) { err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, - &setup.beacon_rate); + &setup.beacon_rate, + dev); if (err) return err; @@ -12358,8 +12583,6 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] || !tb[NL80211_REKEY_DATA_KCK]) return -EINVAL; - if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) - return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN && !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KEK_EXT_LEN)) @@ -12684,8 +12907,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb, func->cookie = cfg80211_assign_cookie(rdev); - if (!tb[NL80211_NAN_FUNC_TYPE] || - nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) { + if (!tb[NL80211_NAN_FUNC_TYPE]) { err = -EINVAL; goto out; } @@ -13175,9 +13397,6 @@ static int nl80211_crit_protocol_start(struct sk_buff *skb, duration = nla_get_u16(info->attrs[NL80211_ATTR_MAX_CRIT_PROT_DURATION]); - if (duration > NL80211_CRIT_PROTO_MAX_DURATION) - return -ERANGE; - ret = rdev_crit_proto_start(rdev, wdev, proto, duration); if (!ret) rdev->crit_proto_nlportid = info->snd_portid; @@ -13562,8 +13781,7 @@ static int nl80211_set_qos_map(struct sk_buff *skb, pos = nla_data(info->attrs[NL80211_ATTR_QOS_MAP]); len = nla_len(info->attrs[NL80211_ATTR_QOS_MAP]); - if (len % 2 || len < IEEE80211_QOS_MAP_LEN_MIN || - len > IEEE80211_QOS_MAP_LEN_MAX) + if (len % 2) return -EINVAL; qos_map = kzalloc(sizeof(struct cfg80211_qos_map), GFP_KERNEL); @@ -13831,17 +14049,9 @@ static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info) goto out; } - if (info->attrs[NL80211_ATTR_PMKR0_NAME]) { - int r0_name_len = nla_len(info->attrs[NL80211_ATTR_PMKR0_NAME]); - - if (r0_name_len != WLAN_PMK_NAME_LEN) { - ret = -EINVAL; - goto out; - } - + if (info->attrs[NL80211_ATTR_PMKR0_NAME]) pmk_conf.pmk_r0_name = nla_data(info->attrs[NL80211_ATTR_PMKR0_NAME]); - } ret = rdev_set_pmk(rdev, dev, &pmk_conf); out: @@ -13900,8 +14110,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_SSID]) { params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); - if (params.ssid.ssid_len == 0 || - params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN) + if (params.ssid.ssid_len == 0) return -EINVAL; memcpy(params.ssid.ssid, nla_data(info->attrs[NL80211_ATTR_SSID]), @@ -14202,7 +14411,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { attr = NL80211_TID_CONFIG_ATTR_TX_RATE; err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, - &tid_conf->txrate_mask); + &tid_conf->txrate_mask, dev); if (err) return err; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index d8a90d397423..6043a9d33d61 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1594,7 +1594,7 @@ freq_reg_info_regd(u32 center_freq, /* * We only need to know if one frequency rule was - * was in center_freq's band, that's enough, so lets + * in center_freq's band, that's enough, so let's * not overwrite it once found */ if (!band_rule_found) @@ -1617,9 +1617,11 @@ __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw) { const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy); const struct ieee80211_reg_rule *reg_rule = NULL; + const u32 bws[] = {0, 1, 2, 4, 5, 8, 10, 16, 20}; + int i = ARRAY_SIZE(bws) - 1; u32 bw; - for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { + for (bw = MHZ_TO_KHZ(bws[i]); bw >= min_bw; bw = MHZ_TO_KHZ(bws[i--])) { reg_rule = freq_reg_info_regd(center_freq, regd, bw); if (!IS_ERR(reg_rule)) return reg_rule; @@ -1631,7 +1633,9 @@ __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw) const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, u32 center_freq) { - return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(20)); + u32 min_bw = center_freq < MHZ_TO_KHZ(1000) ? 1 : 20; + + return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(min_bw)); } EXPORT_SYMBOL(freq_reg_info); @@ -1659,6 +1663,7 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd { const struct ieee80211_freq_range *freq_range = NULL; u32 max_bandwidth_khz, center_freq_khz, bw_flags = 0; + bool is_s1g = chan->band == NL80211_BAND_S1GHZ; freq_range = ®_rule->freq_range; @@ -1678,70 +1683,72 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (is_s1g) { + /* S1G is strict about non overlapping channels. We can + * calculate which bandwidth is allowed per channel by finding + * the largest bandwidth which cleanly divides the freq_range. + */ + int edge_offset; + int ch_bw = max_bandwidth_khz; + + while (ch_bw) { + edge_offset = (center_freq_khz - ch_bw / 2) - + freq_range->start_freq_khz; + if (edge_offset % ch_bw == 0) { + switch (KHZ_TO_MHZ(ch_bw)) { + case 1: + bw_flags |= IEEE80211_CHAN_1MHZ; + break; + case 2: + bw_flags |= IEEE80211_CHAN_2MHZ; + break; + case 4: + bw_flags |= IEEE80211_CHAN_4MHZ; + break; + case 8: + bw_flags |= IEEE80211_CHAN_8MHZ; + break; + case 16: + bw_flags |= IEEE80211_CHAN_16MHZ; + break; + default: + /* If we got here, no bandwidths fit on + * this frequency, ie. band edge. + */ + bw_flags |= IEEE80211_CHAN_DISABLED; + break; + } + break; + } + ch_bw /= 2; + } + } else { + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags |= IEEE80211_CHAN_NO_HT40; + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; + } return bw_flags; } -/* - * Note that right now we assume the desired channel bandwidth - * is always 20 MHz for each individual channel (HT40 uses 20 MHz - * per channel, the primary and the extension channel). - */ -static void handle_channel(struct wiphy *wiphy, - enum nl80211_reg_initiator initiator, - struct ieee80211_channel *chan) +static void handle_channel_single_rule(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator, + struct ieee80211_channel *chan, + u32 flags, + struct regulatory_request *lr, + struct wiphy *request_wiphy, + const struct ieee80211_reg_rule *reg_rule) { - u32 flags, bw_flags = 0; - const struct ieee80211_reg_rule *reg_rule = NULL; + u32 bw_flags = 0; const struct ieee80211_power_rule *power_rule = NULL; - struct wiphy *request_wiphy = NULL; - struct regulatory_request *lr = get_last_request(); const struct ieee80211_regdomain *regd; - request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); - - flags = chan->orig_flags; - - reg_rule = freq_reg_info(wiphy, ieee80211_channel_to_khz(chan)); - if (IS_ERR(reg_rule)) { - /* - * We will disable all channels that do not match our - * received regulatory rule unless the hint is coming - * from a Country IE and the Country IE had no information - * about a band. The IEEE 802.11 spec allows for an AP - * to send only a subset of the regulatory rules allowed, - * so an AP in the US that only supports 2.4 GHz may only send - * a country IE with information for the 2.4 GHz band - * while 5 GHz is still supported. - */ - if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && - PTR_ERR(reg_rule) == -ERANGE) - return; - - if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && - request_wiphy && request_wiphy == wiphy && - request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - pr_debug("Disabling freq %d.%03d MHz for good\n", - chan->center_freq, chan->freq_offset); - chan->orig_flags |= IEEE80211_CHAN_DISABLED; - chan->flags = chan->orig_flags; - } else { - pr_debug("Disabling freq %d.%03d MHz\n", - chan->center_freq, chan->freq_offset); - chan->flags |= IEEE80211_CHAN_DISABLED; - } - return; - } - regd = reg_get_regdomain(wiphy); power_rule = ®_rule->power_rule; @@ -1803,6 +1810,204 @@ static void handle_channel(struct wiphy *wiphy, chan->max_power = chan->max_reg_power; } +static void handle_channel_adjacent_rules(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator, + struct ieee80211_channel *chan, + u32 flags, + struct regulatory_request *lr, + struct wiphy *request_wiphy, + const struct ieee80211_reg_rule *rrule1, + const struct ieee80211_reg_rule *rrule2, + struct ieee80211_freq_range *comb_range) +{ + u32 bw_flags1 = 0; + u32 bw_flags2 = 0; + const struct ieee80211_power_rule *power_rule1 = NULL; + const struct ieee80211_power_rule *power_rule2 = NULL; + const struct ieee80211_regdomain *regd; + + regd = reg_get_regdomain(wiphy); + + power_rule1 = &rrule1->power_rule; + power_rule2 = &rrule2->power_rule; + bw_flags1 = reg_rule_to_chan_bw_flags(regd, rrule1, chan); + bw_flags2 = reg_rule_to_chan_bw_flags(regd, rrule2, chan); + + if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && + request_wiphy && request_wiphy == wiphy && + request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { + /* This guarantees the driver's requested regulatory domain + * will always be used as a base for further regulatory + * settings + */ + chan->flags = + map_regdom_flags(rrule1->flags) | + map_regdom_flags(rrule2->flags) | + bw_flags1 | + bw_flags2; + chan->orig_flags = chan->flags; + chan->max_antenna_gain = + min_t(int, MBI_TO_DBI(power_rule1->max_antenna_gain), + MBI_TO_DBI(power_rule2->max_antenna_gain)); + chan->orig_mag = chan->max_antenna_gain; + chan->max_reg_power = + min_t(int, MBM_TO_DBM(power_rule1->max_eirp), + MBM_TO_DBM(power_rule2->max_eirp)); + chan->max_power = chan->max_reg_power; + chan->orig_mpwr = chan->max_reg_power; + + if (chan->flags & IEEE80211_CHAN_RADAR) { + chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; + if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms) + chan->dfs_cac_ms = max_t(unsigned int, + rrule1->dfs_cac_ms, + rrule2->dfs_cac_ms); + } + + return; + } + + chan->dfs_state = NL80211_DFS_USABLE; + chan->dfs_state_entered = jiffies; + + chan->beacon_found = false; + chan->flags = flags | bw_flags1 | bw_flags2 | + map_regdom_flags(rrule1->flags) | + map_regdom_flags(rrule2->flags); + + /* reg_rule_to_chan_bw_flags may forbids 10 and forbids 20 MHz + * (otherwise no adj. rule case), recheck therefore + */ + if (cfg80211_does_bw_fit_range(comb_range, + ieee80211_channel_to_khz(chan), + MHZ_TO_KHZ(10))) + chan->flags &= ~IEEE80211_CHAN_NO_10MHZ; + if (cfg80211_does_bw_fit_range(comb_range, + ieee80211_channel_to_khz(chan), + MHZ_TO_KHZ(20))) + chan->flags &= ~IEEE80211_CHAN_NO_20MHZ; + + chan->max_antenna_gain = + min_t(int, chan->orig_mag, + min_t(int, + MBI_TO_DBI(power_rule1->max_antenna_gain), + MBI_TO_DBI(power_rule2->max_antenna_gain))); + chan->max_reg_power = min_t(int, + MBM_TO_DBM(power_rule1->max_eirp), + MBM_TO_DBM(power_rule2->max_eirp)); + + if (chan->flags & IEEE80211_CHAN_RADAR) { + if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms) + chan->dfs_cac_ms = max_t(unsigned int, + rrule1->dfs_cac_ms, + rrule2->dfs_cac_ms); + else + chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; + } + + if (chan->orig_mpwr) { + /* Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER + * will always follow the passed country IE power settings. + */ + if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && + wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER) + chan->max_power = chan->max_reg_power; + else + chan->max_power = min(chan->orig_mpwr, + chan->max_reg_power); + } else { + chan->max_power = chan->max_reg_power; + } +} + +/* Note that right now we assume the desired channel bandwidth + * is always 20 MHz for each individual channel (HT40 uses 20 MHz + * per channel, the primary and the extension channel). + */ +static void handle_channel(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator, + struct ieee80211_channel *chan) +{ + const u32 orig_chan_freq = ieee80211_channel_to_khz(chan); + struct regulatory_request *lr = get_last_request(); + struct wiphy *request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); + const struct ieee80211_reg_rule *rrule = NULL; + const struct ieee80211_reg_rule *rrule1 = NULL; + const struct ieee80211_reg_rule *rrule2 = NULL; + + u32 flags = chan->orig_flags; + + rrule = freq_reg_info(wiphy, orig_chan_freq); + if (IS_ERR(rrule)) { + /* check for adjacent match, therefore get rules for + * chan - 20 MHz and chan + 20 MHz and test + * if reg rules are adjacent + */ + rrule1 = freq_reg_info(wiphy, + orig_chan_freq - MHZ_TO_KHZ(20)); + rrule2 = freq_reg_info(wiphy, + orig_chan_freq + MHZ_TO_KHZ(20)); + if (!IS_ERR(rrule1) && !IS_ERR(rrule2)) { + struct ieee80211_freq_range comb_range; + + if (rrule1->freq_range.end_freq_khz != + rrule2->freq_range.start_freq_khz) + goto disable_chan; + + comb_range.start_freq_khz = + rrule1->freq_range.start_freq_khz; + comb_range.end_freq_khz = + rrule2->freq_range.end_freq_khz; + comb_range.max_bandwidth_khz = + min_t(u32, + rrule1->freq_range.max_bandwidth_khz, + rrule2->freq_range.max_bandwidth_khz); + + if (!cfg80211_does_bw_fit_range(&comb_range, + orig_chan_freq, + MHZ_TO_KHZ(20))) + goto disable_chan; + + handle_channel_adjacent_rules(wiphy, initiator, chan, + flags, lr, request_wiphy, + rrule1, rrule2, + &comb_range); + return; + } + +disable_chan: + /* We will disable all channels that do not match our + * received regulatory rule unless the hint is coming + * from a Country IE and the Country IE had no information + * about a band. The IEEE 802.11 spec allows for an AP + * to send only a subset of the regulatory rules allowed, + * so an AP in the US that only supports 2.4 GHz may only send + * a country IE with information for the 2.4 GHz band + * while 5 GHz is still supported. + */ + if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && + PTR_ERR(rrule) == -ERANGE) + return; + + if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && + request_wiphy && request_wiphy == wiphy && + request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { + pr_debug("Disabling freq %d.%03d MHz for good\n", + chan->center_freq, chan->freq_offset); + chan->orig_flags |= IEEE80211_CHAN_DISABLED; + chan->flags = chan->orig_flags; + } else { + pr_debug("Disabling freq %d.%03d MHz\n", + chan->center_freq, chan->freq_offset); + chan->flags |= IEEE80211_CHAN_DISABLED; + } + return; + } + + handle_channel_single_rule(wiphy, initiator, chan, flags, lr, + request_wiphy, rrule); +} + static void handle_band(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, struct ieee80211_supported_band *sband) @@ -3170,7 +3375,7 @@ static void restore_custom_reg_settings(struct wiphy *wiphy) * - send a user regulatory hint if applicable * * Device drivers that send a regulatory hint for a specific country - * keep their own regulatory domain on wiphy->regd so that does does + * keep their own regulatory domain on wiphy->regd so that does * not need to be remembered. */ static void restore_regulatory_settings(bool reset_user, bool cached) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 04f2d198c215..84fc8ab16dd2 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -55,7 +55,7 @@ * * Also note that the hidden_beacon_bss pointer is only relevant * if the driver uses something other than the IEs, e.g. private - * data stored stored in the BSS struct, since the beacon IEs are + * data stored in the BSS struct, since the beacon IEs are * also linked into the probe response struct. */ @@ -1488,7 +1488,7 @@ static const struct element ielen - (mbssid_end - ie)); /* - * If is is not the last subelement in current MBSSID IE or there isn't + * If it is not the last subelement in current MBSSID IE or there isn't * a next MBSSID IE - profile is complete. */ if ((sub_elem->data + sub_elem->datalen < mbssid_end - 1) || diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 079ce320dc1e..38df713f2e2e 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -24,7 +24,7 @@ /* * Software SME in cfg80211, using auth/assoc/deauth calls to the - * driver. This is is for implementing nl80211's connect/disconnect + * driver. This is for implementing nl80211's connect/disconnect * and wireless extensions (if configured.) */ diff --git a/net/wireless/util.c b/net/wireless/util.c index 6fa99df52f86..f01746894a4e 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -111,6 +111,33 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); +enum nl80211_chan_width +ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) +{ + if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) + return NL80211_CHAN_WIDTH_20_NOHT; + + /*S1G defines a single allowed channel width per channel. + * Extract that width here. + */ + if (chan->flags & IEEE80211_CHAN_1MHZ) + return NL80211_CHAN_WIDTH_1; + else if (chan->flags & IEEE80211_CHAN_2MHZ) + return NL80211_CHAN_WIDTH_2; + else if (chan->flags & IEEE80211_CHAN_4MHZ) + return NL80211_CHAN_WIDTH_4; + else if (chan->flags & IEEE80211_CHAN_8MHZ) + return NL80211_CHAN_WIDTH_8; + else if (chan->flags & IEEE80211_CHAN_16MHZ) + return NL80211_CHAN_WIDTH_16; + + pr_err("unknown channel width for channel at %dKHz?\n", + ieee80211_channel_to_khz(chan)); + + return NL80211_CHAN_WIDTH_1; +} +EXPORT_SYMBOL(ieee80211_s1g_channel_width); + int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ @@ -399,6 +426,11 @@ unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) { unsigned int hdrlen = 24; + if (ieee80211_is_ext(fc)) { + hdrlen = 4; + goto out; + } + if (ieee80211_is_data(fc)) { if (ieee80211_has_a4(fc)) hdrlen = 30; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 4d2160c989a3..78f2927ead7f 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -497,7 +497,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, /* * We only need to store WEP keys, since they're the only keys that - * can be be set before a connection is established and persist after + * can be set before a connection is established and persist after * disconnecting. */ if (!addr && (params->cipher == WLAN_CIPHER_SUITE_WEP40 || diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index b010bfde0149..56d052bc65cb 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -23,162 +23,6 @@ static DEFINE_IDA(umem_ida); -void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) -{ - unsigned long flags; - - if (!xs->tx) - return; - - spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); - list_add_rcu(&xs->list, &umem->xsk_tx_list); - spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); -} - -void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) -{ - unsigned long flags; - - if (!xs->tx) - return; - - spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); - list_del_rcu(&xs->list); - spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); -} - -/* The umem is stored both in the _rx struct and the _tx struct as we do - * not know if the device has more tx queues than rx, or the opposite. - * This might also change during run time. - */ -static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem, - u16 queue_id) -{ - if (queue_id >= max_t(unsigned int, - dev->real_num_rx_queues, - dev->real_num_tx_queues)) - return -EINVAL; - - if (queue_id < dev->real_num_rx_queues) - dev->_rx[queue_id].umem = umem; - if (queue_id < dev->real_num_tx_queues) - dev->_tx[queue_id].umem = umem; - - return 0; -} - -struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, - u16 queue_id) -{ - if (queue_id < dev->real_num_rx_queues) - return dev->_rx[queue_id].umem; - if (queue_id < dev->real_num_tx_queues) - return dev->_tx[queue_id].umem; - - return NULL; -} -EXPORT_SYMBOL(xdp_get_umem_from_qid); - -static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id) -{ - if (queue_id < dev->real_num_rx_queues) - dev->_rx[queue_id].umem = NULL; - if (queue_id < dev->real_num_tx_queues) - dev->_tx[queue_id].umem = NULL; -} - -int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, - u16 queue_id, u16 flags) -{ - bool force_zc, force_copy; - struct netdev_bpf bpf; - int err = 0; - - ASSERT_RTNL(); - - force_zc = flags & XDP_ZEROCOPY; - force_copy = flags & XDP_COPY; - - if (force_zc && force_copy) - return -EINVAL; - - if (xdp_get_umem_from_qid(dev, queue_id)) - return -EBUSY; - - err = xdp_reg_umem_at_qid(dev, umem, queue_id); - if (err) - return err; - - umem->dev = dev; - umem->queue_id = queue_id; - - if (flags & XDP_USE_NEED_WAKEUP) { - umem->flags |= XDP_UMEM_USES_NEED_WAKEUP; - /* Tx needs to be explicitly woken up the first time. - * Also for supporting drivers that do not implement this - * feature. They will always have to call sendto(). - */ - xsk_set_tx_need_wakeup(umem); - } - - dev_hold(dev); - - if (force_copy) - /* For copy-mode, we are done. */ - return 0; - - if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) { - err = -EOPNOTSUPP; - goto err_unreg_umem; - } - - bpf.command = XDP_SETUP_XSK_UMEM; - bpf.xsk.umem = umem; - bpf.xsk.queue_id = queue_id; - - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - if (err) - goto err_unreg_umem; - - umem->zc = true; - return 0; - -err_unreg_umem: - if (!force_zc) - err = 0; /* fallback to copy mode */ - if (err) - xdp_clear_umem_at_qid(dev, queue_id); - return err; -} - -void xdp_umem_clear_dev(struct xdp_umem *umem) -{ - struct netdev_bpf bpf; - int err; - - ASSERT_RTNL(); - - if (!umem->dev) - return; - - if (umem->zc) { - bpf.command = XDP_SETUP_XSK_UMEM; - bpf.xsk.umem = NULL; - bpf.xsk.queue_id = umem->queue_id; - - err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf); - - if (err) - WARN(1, "failed to disable umem!\n"); - } - - xdp_clear_umem_at_qid(umem->dev, umem->queue_id); - - dev_put(umem->dev); - umem->dev = NULL; - umem->zc = false; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); @@ -195,38 +39,33 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem) } } -static void xdp_umem_release(struct xdp_umem *umem) +static void xdp_umem_addr_unmap(struct xdp_umem *umem) { - rtnl_lock(); - xdp_umem_clear_dev(umem); - rtnl_unlock(); - - ida_simple_remove(&umem_ida, umem->id); + vunmap(umem->addrs); + umem->addrs = NULL; +} - if (umem->fq) { - xskq_destroy(umem->fq); - umem->fq = NULL; - } +static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages, + u32 nr_pages) +{ + umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!umem->addrs) + return -ENOMEM; + return 0; +} - if (umem->cq) { - xskq_destroy(umem->cq); - umem->cq = NULL; - } +static void xdp_umem_release(struct xdp_umem *umem) +{ + umem->zc = false; + ida_simple_remove(&umem_ida, umem->id); - xp_destroy(umem->pool); + xdp_umem_addr_unmap(umem); xdp_umem_unpin_pages(umem); xdp_umem_unaccount_pages(umem); kfree(umem); } -static void xdp_umem_release_deferred(struct work_struct *work) -{ - struct xdp_umem *umem = container_of(work, struct xdp_umem, work); - - xdp_umem_release(umem); -} - void xdp_get_umem(struct xdp_umem *umem) { refcount_inc(&umem->users); @@ -237,10 +76,8 @@ void xdp_put_umem(struct xdp_umem *umem) if (!umem) return; - if (refcount_dec_and_test(&umem->users)) { - INIT_WORK(&umem->work, xdp_umem_release_deferred); - schedule_work(&umem->work); - } + if (refcount_dec_and_test(&umem->users)) + xdp_umem_release(umem); } static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) @@ -319,8 +156,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG | - XDP_UMEM_USES_NEED_WAKEUP)) + if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG) return -EINVAL; if (!unaligned_chunks && !is_power_of_2(chunk_size)) @@ -355,13 +191,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->size = size; umem->headroom = headroom; umem->chunk_size = chunk_size; + umem->chunks = chunks; umem->npgs = (u32)npgs; umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; - INIT_LIST_HEAD(&umem->xsk_tx_list); - spin_lock_init(&umem->xsk_tx_list_lock); + INIT_LIST_HEAD(&umem->xsk_dma_list); refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); @@ -372,15 +208,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, - headroom, size, unaligned_chunks); - if (!umem->pool) { - err = -ENOMEM; - goto out_pin; - } + err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs); + if (err) + goto out_unpin; + return 0; -out_pin: +out_unpin: xdp_umem_unpin_pages(umem); out_account: xdp_umem_unaccount_pages(umem); @@ -412,8 +246,3 @@ struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) return umem; } - -bool xdp_umem_validate_queues(struct xdp_umem *umem) -{ - return umem->fq && umem->cq; -} diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 32067fe98f65..181fdda2f2a8 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -8,14 +8,8 @@ #include <net/xdp_sock_drv.h> -int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, - u16 queue_id, u16 flags); -void xdp_umem_clear_dev(struct xdp_umem *umem); -bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); -void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs); -void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs); struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c3231620d210..5eb6662f562a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,68 +36,108 @@ static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && - READ_ONCE(xs->umem->fq); + (xs->pool->fq || READ_ONCE(xs->fq_tmp)); } -void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { - if (umem->need_wakeup & XDP_WAKEUP_RX) + if (pool->cached_need_wakeup & XDP_WAKEUP_RX) return; - umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP; - umem->need_wakeup |= XDP_WAKEUP_RX; + pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; + pool->cached_need_wakeup |= XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_set_rx_need_wakeup); -void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; - if (umem->need_wakeup & XDP_WAKEUP_TX) + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); - umem->need_wakeup |= XDP_WAKEUP_TX; + pool->cached_need_wakeup |= XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_set_tx_need_wakeup); -void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { - if (!(umem->need_wakeup & XDP_WAKEUP_RX)) + if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) return; - umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; - umem->need_wakeup &= ~XDP_WAKEUP_RX; + pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; + pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); -void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; - if (!(umem->need_wakeup & XDP_WAKEUP_TX)) + if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); - umem->need_wakeup &= ~XDP_WAKEUP_TX; + pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); -bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { - return umem->flags & XDP_UMEM_USES_NEED_WAKEUP; + return pool->uses_need_wakeup; +} +EXPORT_SYMBOL(xsk_uses_need_wakeup); + +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, + u16 queue_id) +{ + if (queue_id < dev->real_num_rx_queues) + return dev->_rx[queue_id].pool; + if (queue_id < dev->real_num_tx_queues) + return dev->_tx[queue_id].pool; + + return NULL; +} +EXPORT_SYMBOL(xsk_get_pool_from_qid); + +void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) +{ + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].pool = NULL; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].pool = NULL; +} + +/* The buffer pool is stored both in the _rx struct and the _tx struct as we do + * not know if the device has more tx queues than rx, or the opposite. + * This might also change during run time. + */ +int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, + u16 queue_id) +{ + if (queue_id >= max_t(unsigned int, + dev->real_num_rx_queues, + dev->real_num_tx_queues)) + return -EINVAL; + + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].pool = pool; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].pool = pool; + + return 0; } -EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); void xp_release(struct xdp_buff_xsk *xskb) { @@ -155,12 +195,12 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, struct xdp_buff *xsk_xdp; int err; - if (len > xsk_umem_get_rx_frame_size(xs->umem)) { + if (len > xsk_pool_get_rx_frame_size(xs->pool)) { xs->rx_dropped++; return -ENOSPC; } - xsk_xdp = xsk_buff_alloc(xs->umem); + xsk_xdp = xsk_buff_alloc(xs->pool); if (!xsk_xdp) { xs->rx_dropped++; return -ENOSPC; @@ -208,7 +248,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); - __xskq_cons_release(xs->umem->fq); + __xskq_cons_release(xs->pool->fq); sock_def_readable(&xs->sk); } @@ -249,32 +289,32 @@ void __xsk_map_flush(void) } } -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { - xskq_prod_submit_n(umem->cq, nb_entries); + xskq_prod_submit_n(pool->cq, nb_entries); } -EXPORT_SYMBOL(xsk_umem_complete_tx); +EXPORT_SYMBOL(xsk_tx_completed); -void xsk_umem_consume_tx_done(struct xdp_umem *umem) +void xsk_tx_release(struct xsk_buff_pool *pool) { struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { __xskq_cons_release(xs->tx); xs->sk.sk_write_space(&xs->sk); } rcu_read_unlock(); } -EXPORT_SYMBOL(xsk_umem_consume_tx_done); +EXPORT_SYMBOL(xsk_tx_release); -bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) +bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { - if (!xskq_cons_peek_desc(xs->tx, desc, umem)) { + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { + if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { xs->tx->queue_empty_descs++; continue; } @@ -284,7 +324,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - if (xskq_prod_reserve_addr(umem->cq, desc->addr)) + if (xskq_prod_reserve_addr(pool->cq, desc->addr)) goto out; xskq_cons_release(xs->tx); @@ -296,7 +336,7 @@ out: rcu_read_unlock(); return false; } -EXPORT_SYMBOL(xsk_umem_consume_tx); +EXPORT_SYMBOL(xsk_tx_peek_desc); static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { @@ -322,7 +362,7 @@ static void xsk_destruct_skb(struct sk_buff *skb) unsigned long flags; spin_lock_irqsave(&xs->tx_completion_lock, flags); - xskq_prod_submit_addr(xs->umem->cq, addr); + xskq_prod_submit_addr(xs->pool->cq, addr); spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); @@ -342,7 +382,7 @@ static int xsk_generic_xmit(struct sock *sk) if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) { + while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { char *buffer; u64 addr; u32 len; @@ -359,14 +399,14 @@ static int xsk_generic_xmit(struct sock *sk) skb_put(skb, len); addr = desc.addr; - buffer = xsk_buff_raw_get_data(xs->umem, addr); + buffer = xsk_buff_raw_get_data(xs->pool, addr); err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) { + if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { kfree_skb(skb); goto out; } @@ -431,16 +471,16 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); - struct xdp_umem *umem; + struct xsk_buff_pool *pool; if (unlikely(!xsk_is_bound(xs))) return mask; - umem = xs->umem; + pool = xs->pool; - if (umem->need_wakeup) { + if (pool->cached_need_wakeup) { if (xs->zc) - xsk_wakeup(xs, umem->need_wakeup); + xsk_wakeup(xs, pool->cached_need_wakeup); else /* Poll needs to drive Tx also in copy mode */ __xsk_sendmsg(sk); @@ -481,7 +521,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs) WRITE_ONCE(xs->state, XSK_UNBOUND); /* Wait for driver to stop using the xdp socket. */ - xdp_del_sk_umem(xs->umem, xs); + xp_del_xsk(xs->pool, xs); xs->dev = NULL; synchronize_net(); dev_put(dev); @@ -559,6 +599,8 @@ static int xsk_release(struct socket *sock) xskq_destroy(xs->rx); xskq_destroy(xs->tx); + xskq_destroy(xs->fq_tmp); + xskq_destroy(xs->cq_tmp); sock_orphan(sk); sock->sk = NULL; @@ -586,6 +628,11 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } +static bool xsk_validate_queues(struct xdp_sock *xs) +{ + return xs->fq_tmp && xs->cq_tmp; +} + static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -654,29 +701,64 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) sockfd_put(sock); goto out_unlock; } - if (umem_xs->dev != dev || umem_xs->queue_id != qid) { - err = -EINVAL; - sockfd_put(sock); - goto out_unlock; + + if (umem_xs->queue_id != qid || umem_xs->dev != dev) { + /* Share the umem with another socket on another qid + * and/or device. + */ + xs->pool = xp_create_and_assign_umem(xs, + umem_xs->umem); + if (!xs->pool) { + sockfd_put(sock); + goto out_unlock; + } + + err = xp_assign_dev_shared(xs->pool, umem_xs->umem, + dev, qid); + if (err) { + xp_destroy(xs->pool); + sockfd_put(sock); + goto out_unlock; + } + } else { + /* Share the buffer pool with the other socket. */ + if (xs->fq_tmp || xs->cq_tmp) { + /* Do not allow setting your own fq or cq. */ + err = -EINVAL; + sockfd_put(sock); + goto out_unlock; + } + + xp_get_pool(umem_xs->pool); + xs->pool = umem_xs->pool; } xdp_get_umem(umem_xs->umem); WRITE_ONCE(xs->umem, umem_xs->umem); sockfd_put(sock); - } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { + } else if (!xs->umem || !xsk_validate_queues(xs)) { err = -EINVAL; goto out_unlock; } else { /* This xsk has its own umem. */ - err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); - if (err) + xs->pool = xp_create_and_assign_umem(xs, xs->umem); + if (!xs->pool) { + err = -ENOMEM; + goto out_unlock; + } + + err = xp_assign_dev(xs->pool, dev, qid, flags); + if (err) { + xp_destroy(xs->pool); + xs->pool = NULL; goto out_unlock; + } } xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xdp_add_sk_umem(xs->umem, xs); + xp_add_xsk(xs->pool, xs); out_unlock: if (err) { @@ -782,16 +864,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_unlock(&xs->mutex); return -EBUSY; } - if (!xs->umem) { - mutex_unlock(&xs->mutex); - return -EINVAL; - } - q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : - &xs->umem->cq; + q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : + &xs->cq_tmp; err = xsk_init_queue(entries, q, true); - if (optname == XDP_UMEM_FILL_RING) - xp_set_fq(xs->umem->pool, *q); mutex_unlock(&xs->mutex); return err; } @@ -858,7 +934,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, if (extra_stats) { stats.rx_ring_full = xs->rx_queue_full; stats.rx_fill_ring_empty_descs = - xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); } else { stats.rx_dropped += xs->rx_queue_full; @@ -960,7 +1036,6 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; - struct xdp_umem *umem; unsigned long pfn; struct page *qpg; @@ -972,16 +1047,12 @@ static int xsk_mmap(struct file *file, struct socket *sock, } else if (offset == XDP_PGOFF_TX_RING) { q = READ_ONCE(xs->tx); } else { - umem = READ_ONCE(xs->umem); - if (!umem) - return -EINVAL; - /* Matches the smp_wmb() in XDP_UMEM_REG */ smp_rmb(); if (offset == XDP_UMEM_PGOFF_FILL_RING) - q = READ_ONCE(umem->fq); + q = READ_ONCE(xs->fq_tmp); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) - q = READ_ONCE(umem->cq); + q = READ_ONCE(xs->cq_tmp); } if (!q) @@ -1019,8 +1090,8 @@ static int xsk_notifier(struct notifier_block *this, xsk_unbind_dev(xs); - /* Clear device references in umem. */ - xdp_umem_clear_dev(xs->umem); + /* Clear device references. */ + xp_clear_dev(xs->pool); } mutex_unlock(&xs->mutex); } @@ -1064,7 +1135,7 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; - xdp_put_umem(xs->umem); + xp_put_pool(xs->pool); sk_refcnt_debug_dec(sk); } @@ -1072,8 +1143,8 @@ static void xsk_destruct(struct sock *sk) static int xsk_create(struct net *net, struct socket *sock, int protocol, int kern) { - struct sock *sk; struct xdp_sock *xs; + struct sock *sk; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index 455ddd480f3d..da1f73e43924 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -11,13 +11,6 @@ #define XSK_NEXT_PG_CONTIG_SHIFT 0 #define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) -/* Flags for the umem flags field. - * - * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public - * flags. See inlude/uapi/include/linux/if_xdp.h. - */ -#define XDP_UMEM_USES_NEED_WAKEUP BIT(1) - struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; @@ -51,5 +44,8 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); int xsk_map_inc(struct xsk_map *map); void xsk_map_put(struct xsk_map *map); +void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); +int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, + u16 queue_id); #endif /* XSK_H_ */ diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index a2044c245215..795d7c81c0ca 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -2,21 +2,37 @@ #include <net/xsk_buff_pool.h> #include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> +#include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> +#include <linux/swiotlb.h> #include "xsk_queue.h" +#include "xdp_umem.h" +#include "xsk.h" -static void xp_addr_unmap(struct xsk_buff_pool *pool) +void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - vunmap(pool->addrs); + unsigned long flags; + + if (!xs->tx) + return; + + spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + list_add_rcu(&xs->tx_list, &pool->xsk_tx_list); + spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); } -static int xp_addr_map(struct xsk_buff_pool *pool, - struct page **pages, u32 nr_pages) +void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (!pool->addrs) - return -ENOMEM; - return 0; + unsigned long flags; + + if (!xs->tx) + return; + + spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + list_del_rcu(&xs->tx_list); + spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); } void xp_destroy(struct xsk_buff_pool *pool) @@ -24,59 +40,61 @@ void xp_destroy(struct xsk_buff_pool *pool) if (!pool) return; - xp_addr_unmap(pool); kvfree(pool->heads); kvfree(pool); } -struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, - u32 chunk_size, u32 headroom, u64 size, - bool unaligned) +struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, + struct xdp_umem *umem) { struct xsk_buff_pool *pool; struct xdp_buff_xsk *xskb; - int err; u32 i; - pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL); + pool = kvzalloc(struct_size(pool, free_heads, umem->chunks), + GFP_KERNEL); if (!pool) goto out; - pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL); + pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL); if (!pool->heads) goto out; - pool->chunk_mask = ~((u64)chunk_size - 1); - pool->addrs_cnt = size; - pool->heads_cnt = chunks; - pool->free_heads_cnt = chunks; - pool->headroom = headroom; - pool->chunk_size = chunk_size; - pool->unaligned = unaligned; - pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; + pool->chunk_mask = ~((u64)umem->chunk_size - 1); + pool->addrs_cnt = umem->size; + pool->heads_cnt = umem->chunks; + pool->free_heads_cnt = umem->chunks; + pool->headroom = umem->headroom; + pool->chunk_size = umem->chunk_size; + pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; + pool->frame_len = umem->chunk_size - umem->headroom - + XDP_PACKET_HEADROOM; + pool->umem = umem; + pool->addrs = umem->addrs; INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->xsk_tx_list); + spin_lock_init(&pool->xsk_tx_list_lock); + refcount_set(&pool->users, 1); + + pool->fq = xs->fq_tmp; + pool->cq = xs->cq_tmp; + xs->fq_tmp = NULL; + xs->cq_tmp = NULL; for (i = 0; i < pool->free_heads_cnt; i++) { xskb = &pool->heads[i]; xskb->pool = pool; - xskb->xdp.frame_sz = chunk_size - headroom; + xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; pool->free_heads[i] = xskb; } - err = xp_addr_map(pool, pages, nr_pages); - if (!err) - return pool; + return pool; out: xp_destroy(pool); return NULL; } -void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq) -{ - pool->fq = fq; -} - void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { u32 i; @@ -86,70 +104,320 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) } EXPORT_SYMBOL(xp_set_rxq_info); -void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +static void xp_disable_drv_zc(struct xsk_buff_pool *pool) { - dma_addr_t *dma; - u32 i; + struct netdev_bpf bpf; + int err; - if (pool->dma_pages_cnt == 0) + ASSERT_RTNL(); + + if (pool->umem->zc) { + bpf.command = XDP_SETUP_XSK_POOL; + bpf.xsk.pool = NULL; + bpf.xsk.queue_id = pool->queue_id; + + err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf); + + if (err) + WARN(1, "Failed to disable zero-copy!\n"); + } +} + +static int __xp_assign_dev(struct xsk_buff_pool *pool, + struct net_device *netdev, u16 queue_id, u16 flags) +{ + bool force_zc, force_copy; + struct netdev_bpf bpf; + int err = 0; + + ASSERT_RTNL(); + + force_zc = flags & XDP_ZEROCOPY; + force_copy = flags & XDP_COPY; + + if (force_zc && force_copy) + return -EINVAL; + + if (xsk_get_pool_from_qid(netdev, queue_id)) + return -EBUSY; + + pool->netdev = netdev; + pool->queue_id = queue_id; + err = xsk_reg_pool_at_qid(netdev, pool, queue_id); + if (err) + return err; + + if (flags & XDP_USE_NEED_WAKEUP) { + pool->uses_need_wakeup = true; + /* Tx needs to be explicitly woken up the first time. + * Also for supporting drivers that do not implement this + * feature. They will always have to call sendto(). + */ + pool->cached_need_wakeup = XDP_WAKEUP_TX; + } + + dev_hold(netdev); + + if (force_copy) + /* For copy-mode, we are done. */ + return 0; + + if (!netdev->netdev_ops->ndo_bpf || + !netdev->netdev_ops->ndo_xsk_wakeup) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + + bpf.command = XDP_SETUP_XSK_POOL; + bpf.xsk.pool = pool; + bpf.xsk.queue_id = queue_id; + + err = netdev->netdev_ops->ndo_bpf(netdev, &bpf); + if (err) + goto err_unreg_pool; + + if (!pool->dma_pages) { + WARN(1, "Driver did not DMA map zero-copy buffers"); + goto err_unreg_xsk; + } + pool->umem->zc = true; + return 0; + +err_unreg_xsk: + xp_disable_drv_zc(pool); +err_unreg_pool: + if (!force_zc) + err = 0; /* fallback to copy mode */ + if (err) + xsk_clear_pool_at_qid(netdev, queue_id); + return err; +} + +int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, + u16 queue_id, u16 flags) +{ + return __xp_assign_dev(pool, dev, queue_id, flags); +} + +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, + struct net_device *dev, u16 queue_id) +{ + u16 flags; + + /* One fill and completion ring required for each queue id. */ + if (!pool->fq || !pool->cq) + return -EINVAL; + + flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; + if (pool->uses_need_wakeup) + flags |= XDP_USE_NEED_WAKEUP; + + return __xp_assign_dev(pool, dev, queue_id, flags); +} + +void xp_clear_dev(struct xsk_buff_pool *pool) +{ + if (!pool->netdev) return; - for (i = 0; i < pool->dma_pages_cnt; i++) { - dma = &pool->dma_pages[i]; + xp_disable_drv_zc(pool); + xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); + dev_put(pool->netdev); + pool->netdev = NULL; +} + +static void xp_release_deferred(struct work_struct *work) +{ + struct xsk_buff_pool *pool = container_of(work, struct xsk_buff_pool, + work); + + rtnl_lock(); + xp_clear_dev(pool); + rtnl_unlock(); + + if (pool->fq) { + xskq_destroy(pool->fq); + pool->fq = NULL; + } + + if (pool->cq) { + xskq_destroy(pool->cq); + pool->cq = NULL; + } + + xdp_put_umem(pool->umem); + xp_destroy(pool); +} + +void xp_get_pool(struct xsk_buff_pool *pool) +{ + refcount_inc(&pool->users); +} + +void xp_put_pool(struct xsk_buff_pool *pool) +{ + if (!pool) + return; + + if (refcount_dec_and_test(&pool->users)) { + INIT_WORK(&pool->work, xp_release_deferred); + schedule_work(&pool->work); + } +} + +static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool) +{ + struct xsk_dma_map *dma_map; + + list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) { + if (dma_map->netdev == pool->netdev) + return dma_map; + } + + return NULL; +} + +static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_device *netdev, + u32 nr_pages, struct xdp_umem *umem) +{ + struct xsk_dma_map *dma_map; + + dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL); + if (!dma_map) + return NULL; + + dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL); + if (!dma_map) { + kfree(dma_map); + return NULL; + } + + dma_map->netdev = netdev; + dma_map->dev = dev; + dma_map->dma_need_sync = false; + dma_map->dma_pages_cnt = nr_pages; + refcount_set(&dma_map->users, 0); + list_add(&dma_map->list, &umem->xsk_dma_list); + return dma_map; +} + +static void xp_destroy_dma_map(struct xsk_dma_map *dma_map) +{ + list_del(&dma_map->list); + kvfree(dma_map->dma_pages); + kfree(dma_map); +} + +static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs) +{ + dma_addr_t *dma; + u32 i; + + for (i = 0; i < dma_map->dma_pages_cnt; i++) { + dma = &dma_map->dma_pages[i]; if (*dma) { - dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE, + dma_unmap_page_attrs(dma_map->dev, *dma, PAGE_SIZE, DMA_BIDIRECTIONAL, attrs); *dma = 0; } } + xp_destroy_dma_map(dma_map); +} + +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +{ + struct xsk_dma_map *dma_map; + + if (pool->dma_pages_cnt == 0) + return; + + dma_map = xp_find_dma_map(pool); + if (!dma_map) { + WARN(1, "Could not find dma_map for device"); + return; + } + + if (!refcount_dec_and_test(&dma_map->users)) + return; + + __xp_dma_unmap(dma_map, attrs); kvfree(pool->dma_pages); pool->dma_pages_cnt = 0; pool->dev = NULL; } EXPORT_SYMBOL(xp_dma_unmap); -static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) +static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map) { u32 i; - for (i = 0; i < pool->dma_pages_cnt - 1; i++) { - if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1]) - pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; + for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) { + if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1]) + dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; else - pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; + dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; } } +static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map) +{ + pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL); + if (!pool->dma_pages) + return -ENOMEM; + + pool->dev = dma_map->dev; + pool->dma_pages_cnt = dma_map->dma_pages_cnt; + pool->dma_need_sync = dma_map->dma_need_sync; + refcount_inc(&dma_map->users); + memcpy(pool->dma_pages, dma_map->dma_pages, + pool->dma_pages_cnt * sizeof(*pool->dma_pages)); + + return 0; +} + int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages) { + struct xsk_dma_map *dma_map; dma_addr_t dma; + int err; u32 i; - pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages), - GFP_KERNEL); - if (!pool->dma_pages) - return -ENOMEM; + dma_map = xp_find_dma_map(pool); + if (dma_map) { + err = xp_init_dma_info(pool, dma_map); + if (err) + return err; + + return 0; + } - pool->dev = dev; - pool->dma_pages_cnt = nr_pages; - pool->dma_need_sync = false; + dma_map = xp_create_dma_map(dev, pool->netdev, nr_pages, pool->umem); + if (!dma_map) + return -ENOMEM; - for (i = 0; i < pool->dma_pages_cnt; i++) { + for (i = 0; i < dma_map->dma_pages_cnt; i++) { dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL, attrs); if (dma_mapping_error(dev, dma)) { - xp_dma_unmap(pool, attrs); + __xp_dma_unmap(dma_map, attrs); return -ENOMEM; } if (dma_need_sync(dev, dma)) - pool->dma_need_sync = true; - pool->dma_pages[i] = dma; + dma_map->dma_need_sync = true; + dma_map->dma_pages[i] = dma; } if (pool->unaligned) - xp_check_dma_contiguity(pool); + xp_check_dma_contiguity(dma_map); + + err = xp_init_dma_info(pool, dma_map); + if (err) { + __xp_dma_unmap(dma_map, attrs); + return err; + } + return 0; } EXPORT_SYMBOL(xp_dma_map); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 21e9c2d123ee..5bd8ea9d206a 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -46,6 +46,7 @@ static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs, static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) { + struct xsk_buff_pool *pool = xs->pool; struct xdp_umem *umem = xs->umem; struct xdp_diag_umem du = {}; int err; @@ -58,8 +59,8 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.num_pages = umem->npgs; du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; - du.ifindex = umem->dev ? umem->dev->ifindex : 0; - du.queue_id = umem->queue_id; + du.ifindex = pool->netdev ? pool->netdev->ifindex : 0; + du.queue_id = pool->queue_id; du.flags = 0; if (umem->zc) du.flags |= XDP_DU_F_ZEROCOPY; @@ -67,10 +68,11 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); - if (!err && umem->fq) - err = xsk_diag_put_ring(umem->fq, XDP_DIAG_UMEM_FILL_RING, nlskb); - if (!err && umem->cq) { - err = xsk_diag_put_ring(umem->cq, XDP_DIAG_UMEM_COMPLETION_RING, + if (!err && pool->fq) + err = xsk_diag_put_ring(pool->fq, + XDP_DIAG_UMEM_FILL_RING, nlskb); + if (!err && pool->cq) { + err = xsk_diag_put_ring(pool->cq, XDP_DIAG_UMEM_COMPLETION_RING, nlskb); } return err; @@ -83,7 +85,7 @@ static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) du.n_rx_dropped = xs->rx_dropped; du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx); du.n_rx_full = xs->rx_queue_full; - du.n_fill_ring_empty = xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + du.n_fill_ring_empty = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx); du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx); return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index bf42cfd74b89..2d883f631c85 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -166,9 +166,9 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool, static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, - struct xdp_umem *umem) + struct xsk_buff_pool *pool) { - if (!xp_validate_desc(umem->pool, d)) { + if (!xp_validate_desc(pool, d)) { q->invalid_descs++; return false; } @@ -177,14 +177,14 @@ static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, - struct xdp_umem *umem) + struct xsk_buff_pool *pool) { while (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; - if (xskq_cons_is_valid_desc(q, desc, umem)) + if (xskq_cons_is_valid_desc(q, desc, pool)) return true; q->cached_cons++; @@ -236,11 +236,11 @@ static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, - struct xdp_umem *umem) + struct xsk_buff_pool *pool) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); - return xskq_cons_read_desc(q, desc, umem); + return xskq_cons_read_desc(q, desc, pool); } static inline void xskq_cons_release(struct xsk_queue *q) diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 8367adbbe9df..2a4fd6677155 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -254,8 +254,16 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, spin_unlock_bh(&map->lock); } +static bool xsk_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + return meta0->max_entries == meta1->max_entries && + bpf_map_meta_equal(meta0, meta1); +} + static int xsk_map_btf_id; const struct bpf_map_ops xsk_map_ops = { + .map_meta_equal = xsk_map_meta_equal, .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, |