diff options
Diffstat (limited to 'net')
545 files changed, 15608 insertions, 9063 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index bad01b14a4ad..5505ee6ebdbe 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -360,6 +360,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); + int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); @@ -489,6 +490,26 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; + + case NETDEV_CVLAN_FILTER_PUSH_INFO: + err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); + if (err) + return notifier_from_errno(err); + break; + + case NETDEV_CVLAN_FILTER_DROP_INFO: + vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); + break; + + case NETDEV_SVLAN_FILTER_PUSH_INFO: + err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); + if (err) + return notifier_from_errno(err); + break; + + case NETDEV_SVLAN_FILTER_DROP_INFO: + vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); + break; } out: diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index a8ba51030b75..e23aac3e4d37 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -97,6 +97,9 @@ static inline struct net_device *vlan_find_dev(struct net_device *real_dev, if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \ (i) % VLAN_N_VID))) +int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto); +void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto); + /* found in vlan_dev.c */ void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 64aa9f755e1d..4f60e86f4b8d 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -48,8 +48,8 @@ bool vlan_do_receive(struct sk_buff **skbp) * original position later */ skb_push(skb, offset); - skb = *skbp = vlan_insert_tag(skb, 
skb->vlan_proto, - skb->vlan_tci); + skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto, + skb->vlan_tci, skb->mac_len); if (!skb) return false; skb_pull(skb, offset + VLAN_HLEN); @@ -165,13 +165,12 @@ struct vlan_vid_info { int refcount; }; -static bool vlan_hw_filter_capable(const struct net_device *dev, - const struct vlan_vid_info *vid_info) +static bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto) { - if (vid_info->proto == htons(ETH_P_8021Q) && + if (proto == htons(ETH_P_8021Q) && dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) return true; - if (vid_info->proto == htons(ETH_P_8021AD) && + if (proto == htons(ETH_P_8021AD) && dev->features & NETIF_F_HW_VLAN_STAG_FILTER) return true; return false; @@ -202,11 +201,73 @@ static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid) return vid_info; } +static int vlan_add_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid) +{ + if (!vlan_hw_filter_capable(dev, proto)) + return 0; + + if (netif_device_present(dev)) + return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid); + else + return -ENODEV; +} + +static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid) +{ + if (!vlan_hw_filter_capable(dev, proto)) + return 0; + + if (netif_device_present(dev)) + return dev->netdev_ops->ndo_vlan_rx_kill_vid(dev, proto, vid); + else + return -ENODEV; +} + +int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto) +{ + struct net_device *real_dev = vlan_info->real_dev; + struct vlan_vid_info *vlan_vid_info; + int err; + + list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) { + if (vlan_vid_info->proto == proto) { + err = vlan_add_rx_filter_info(real_dev, proto, + vlan_vid_info->vid); + if (err) + goto unwind; + } + } + + return 0; + +unwind: + list_for_each_entry_continue_reverse(vlan_vid_info, + &vlan_info->vid_list, list) { + if (vlan_vid_info->proto == proto) + vlan_kill_rx_filter_info(real_dev, proto, + vlan_vid_info->vid); + } 
+ + return err; +} +EXPORT_SYMBOL(vlan_filter_push_vids); + +void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto) +{ + struct vlan_vid_info *vlan_vid_info; + + list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) + if (vlan_vid_info->proto == proto) + vlan_kill_rx_filter_info(vlan_info->real_dev, + vlan_vid_info->proto, + vlan_vid_info->vid); +} +EXPORT_SYMBOL(vlan_filter_drop_vids); + static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid, struct vlan_vid_info **pvid_info) { struct net_device *dev = vlan_info->real_dev; - const struct net_device_ops *ops = dev->netdev_ops; struct vlan_vid_info *vid_info; int err; @@ -214,16 +275,12 @@ static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid, if (!vid_info) return -ENOMEM; - if (vlan_hw_filter_capable(dev, vid_info)) { - if (netif_device_present(dev)) - err = ops->ndo_vlan_rx_add_vid(dev, proto, vid); - else - err = -ENODEV; - if (err) { - kfree(vid_info); - return err; - } + err = vlan_add_rx_filter_info(dev, proto, vid); + if (err) { + kfree(vid_info); + return err; } + list_add(&vid_info->list, &vlan_info->vid_list); vlan_info->nr_vids++; *pvid_info = vid_info; @@ -270,21 +327,15 @@ static void __vlan_vid_del(struct vlan_info *vlan_info, struct vlan_vid_info *vid_info) { struct net_device *dev = vlan_info->real_dev; - const struct net_device_ops *ops = dev->netdev_ops; __be16 proto = vid_info->proto; u16 vid = vid_info->vid; int err; - if (vlan_hw_filter_capable(dev, vid_info)) { - if (netif_device_present(dev)) - err = ops->ndo_vlan_rx_kill_vid(dev, proto, vid); - else - err = -ENODEV; - if (err) { - pr_warn("failed to kill vid %04x/%d for device %s\n", - proto, vid, dev->name); - } - } + err = vlan_kill_rx_filter_info(dev, proto, vid); + if (err) + pr_warn("failed to kill vid %04x/%d for device %s\n", + proto, vid, dev->name); + list_del(&vid_info->list); kfree(vid_info); vlan_info->nr_vids--; diff --git a/net/8021q/vlan_dev.c 
b/net/8021q/vlan_dev.c index f7e83f6d2e64..236452ebbd9e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -29,6 +29,7 @@ #include <linux/net_tstamp.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> +#include <linux/phy.h> #include <net/arp.h> #include <net/switchdev.h> @@ -665,8 +666,11 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev, { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops; + struct phy_device *phydev = vlan->real_dev->phydev; - if (ops->get_ts_info) { + if (phydev && phydev->drv && phydev->drv->ts_info) { + return phydev->drv->ts_info(phydev, info); + } else if (ops->get_ts_info) { return ops->get_ts_info(vlan->real_dev, info); } else { info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index a662ccc166df..a627a5db2125 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -148,8 +148,8 @@ int __net_init vlan_proc_init(struct net *net) if (!vn->proc_vlan_dir) goto err; - vn->proc_vlan_conf = proc_create(name_conf, S_IFREG|S_IRUSR|S_IWUSR, - vn->proc_vlan_dir, &vlan_fops); + vn->proc_vlan_conf = proc_create(name_conf, S_IFREG | 0600, + vn->proc_vlan_dir, &vlan_fops); if (!vn->proc_vlan_conf) goto err; return 0; @@ -172,7 +172,7 @@ int vlan_proc_add_dev(struct net_device *vlandev) if (!strcmp(vlandev->name, name_conf)) return -EINVAL; vlan->dent = - proc_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR, + proc_create_data(vlandev->name, S_IFREG | 0600, vn->proc_vlan_dir, &vlandev_fops, vlandev); if (!vlan->dent) return -ENOBUFS; diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index a3bf9d519193..7214aea14cb3 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -257,22 +257,22 @@ int __init atalk_proc_init(void) if (!atalk_proc_dir) goto out; - p = proc_create("interface", S_IRUGO, atalk_proc_dir, + p = proc_create("interface", 0444, 
atalk_proc_dir, &atalk_seq_interface_fops); if (!p) goto out_interface; - p = proc_create("route", S_IRUGO, atalk_proc_dir, + p = proc_create("route", 0444, atalk_proc_dir, &atalk_seq_route_fops); if (!p) goto out_route; - p = proc_create("socket", S_IRUGO, atalk_proc_dir, + p = proc_create("socket", 0444, atalk_proc_dir, &atalk_seq_socket_fops); if (!p) goto out_socket; - p = proc_create("arp", S_IRUGO, atalk_proc_dir, &atalk_seq_arp_fops); + p = proc_create("arp", 0444, atalk_proc_dir, &atalk_seq_arp_fops); if (!p) goto out_arp; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 03a9fc0771c0..9b6bc5abe946 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1238,7 +1238,7 @@ out: * fields into the sockaddr. */ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_at sat; struct sock *sk = sock->sk; @@ -1251,7 +1251,6 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, if (atalk_autobind(sk) < 0) goto out; - *uaddr_len = sizeof(struct sockaddr_at); memset(&sat, 0, sizeof(sat)); if (peer) { @@ -1268,9 +1267,9 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, sat.sat_port = at->src_port; } - err = 0; sat.sat_family = AF_APPLETALK; memcpy(uaddr, &sat, sizeof(sat)); + err = sizeof(struct sockaddr_at); out: release_sock(sk); diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index 5d2fed9f5710..39b94ca5f65d 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -96,12 +96,12 @@ static ssize_t show_link_rate(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate); } -static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); -static DEVICE_ATTR(atmaddress, S_IRUGO, show_atmaddress, NULL); -static DEVICE_ATTR(atmindex, S_IRUGO, show_atmindex, NULL); -static DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); -static DEVICE_ATTR(type, S_IRUGO, show_type, NULL); -static DEVICE_ATTR(link_rate, S_IRUGO, 
show_link_rate, NULL); +static DEVICE_ATTR(address, 0444, show_address, NULL); +static DEVICE_ATTR(atmaddress, 0444, show_atmaddress, NULL); +static DEVICE_ATTR(atmindex, 0444, show_atmindex, NULL); +static DEVICE_ATTR(carrier, 0444, show_carrier, NULL); +static DEVICE_ATTR(type, 0444, show_type, NULL); +static DEVICE_ATTR(link_rate, 0444, show_link_rate, NULL); static struct device_attribute *atm_attrs[] = { &dev_attr_atmaddress, diff --git a/net/atm/clip.c b/net/atm/clip.c index d4f6029d5109..f07dbc632222 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -893,7 +893,7 @@ static int __init atm_clip_init(void) { struct proc_dir_entry *p; - p = proc_create("arp", S_IRUGO, atm_proc_root, &arp_seq_fops); + p = proc_create("arp", 0444, atm_proc_root, &arp_seq_fops); if (!p) { pr_err("Unable to initialize /proc/net/atm/arp\n"); atm_clip_exit_noproc(); diff --git a/net/atm/lec.c b/net/atm/lec.c index 09a1f056712a..01d5d20a6eb1 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1042,7 +1042,7 @@ static int __init lane_module_init(void) #ifdef CONFIG_PROC_FS struct proc_dir_entry *p; - p = proc_create("lec", S_IRUGO, atm_proc_root, &lec_seq_fops); + p = proc_create("lec", 0444, atm_proc_root, &lec_seq_fops); if (!p) { pr_err("Unable to initialize /proc/net/atm/lec\n"); return -ENOMEM; diff --git a/net/atm/proc.c b/net/atm/proc.c index edc48edc95c1..55410c00c7e2 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -474,7 +474,7 @@ int __init atm_proc_init(void) for (e = atm_proc_ents; e->name; e++) { struct proc_dir_entry *dirent; - dirent = proc_create(e->name, S_IRUGO, + dirent = proc_create(e->name, 0444, atm_proc_root, e->proc_fops); if (!dirent) goto err_out_remove; diff --git a/net/atm/pvc.c b/net/atm/pvc.c index e1140b3bdcaa..2cb10af16afc 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -87,21 +87,20 @@ static int pvc_getsockopt(struct socket *sock, int level, int optname, } static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr, - int *sockaddr_len, 
int peer) + int peer) { struct sockaddr_atmpvc *addr; struct atm_vcc *vcc = ATM_SD(sock); if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) return -ENOTCONN; - *sockaddr_len = sizeof(struct sockaddr_atmpvc); addr = (struct sockaddr_atmpvc *)sockaddr; memset(addr, 0, sizeof(*addr)); addr->sap_family = AF_ATMPVC; addr->sap_addr.itf = vcc->dev->number; addr->sap_addr.vpi = vcc->vpi; addr->sap_addr.vci = vcc->vci; - return 0; + return sizeof(struct sockaddr_atmpvc); } static const struct proto_ops pvc_proto_ops = { diff --git a/net/atm/svc.c b/net/atm/svc.c index c458adcbc177..2f91b766ac42 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -419,15 +419,14 @@ out: } static int svc_getname(struct socket *sock, struct sockaddr *sockaddr, - int *sockaddr_len, int peer) + int peer) { struct sockaddr_atmsvc *addr; - *sockaddr_len = sizeof(struct sockaddr_atmsvc); addr = (struct sockaddr_atmsvc *) sockaddr; memcpy(addr, peer ? &ATM_SD(sock)->remote : &ATM_SD(sock)->local, sizeof(struct sockaddr_atmsvc)); - return 0; + return sizeof(struct sockaddr_atmsvc); } int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 47fdd399626b..2b41366fcad2 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1388,7 +1388,7 @@ out: } static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; struct sock *sk = sock->sk; @@ -1427,7 +1427,7 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, fsa->fsa_digipeater[0] = null_ax25_address; } } - *uaddr_len = sizeof (struct full_sockaddr_ax25); + err = sizeof (struct full_sockaddr_ax25); out: release_sock(sk); @@ -1989,10 +1989,10 @@ static int __init ax25_init(void) dev_add_pack(&ax25_packet_type); register_netdevice_notifier(&ax25_dev_notifier); - proc_create("ax25_route", S_IRUGO, init_net.proc_net, + proc_create("ax25_route", 0444, 
init_net.proc_net, &ax25_route_fops); - proc_create("ax25", S_IRUGO, init_net.proc_net, &ax25_info_fops); - proc_create("ax25_calls", S_IRUGO, init_net.proc_net, &ax25_uid_fops); + proc_create("ax25", 0444, init_net.proc_net, &ax25_info_fops); + proc_create("ax25_calls", 0444, init_net.proc_net, &ax25_uid_fops); out: return rc; } diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index c44f6515be5e..e4e2e02b7380 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 022f6e77307b..b97ba6fb8353 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 80c72c7d3cad..ea309ad06175 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 029221615ba3..534b790c3753 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 79e326383726..be09a9883825 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -157,7 +157,7 @@ static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node) * Return: 0 on success, a negative error code otherwise. */ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, - int max_if_num) + unsigned int max_if_num) { void *data_ptr; size_t old_size; @@ -201,7 +201,8 @@ unlock: */ static void batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node, - int max_if_num, int del_if_num) + unsigned int max_if_num, + unsigned int del_if_num) { size_t chunk_size; size_t if_offset; @@ -239,7 +240,8 @@ batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node, */ static void batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node, - int max_if_num, int del_if_num) + unsigned int max_if_num, + unsigned int del_if_num) { size_t if_offset; void *data_ptr; @@ -276,7 +278,8 @@ batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node, * Return: 0 on success, a negative error code otherwise. 
*/ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, - int max_if_num, int del_if_num) + unsigned int max_if_num, + unsigned int del_if_num) { spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); @@ -311,7 +314,8 @@ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; - int size, hash_added; + int hash_added; + size_t size; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) @@ -893,7 +897,7 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) u32 i; size_t word_index; u8 *w; - int if_num; + unsigned int if_num; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1023,7 +1027,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_node_tmp; - int if_num; + unsigned int if_num; u8 sum_orig, sum_neigh; u8 *neigh_addr; u8 tq_avg; @@ -1182,7 +1186,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; - int if_num; + unsigned int if_num; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; unsigned int tq_iface_penalty; @@ -1702,9 +1706,9 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, if (is_my_orig) { unsigned long *word; - int offset; + size_t offset; s32 bit_pos; - s16 if_num; + unsigned int if_num; u8 *weight; orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, @@ -2729,7 +2733,7 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw; - int ret = -EINVAL; + int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); diff --git 
a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 9dc0dd5c83df..317cafd302cf 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 27e165ac9302..ec93337ee259 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * @@ -928,7 +928,7 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw; - int ret = -EINVAL; + int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index a17ab68bbce8..ec4a2a569750 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index a83478c46597..28687493599f 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. 
contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 5e39d0588a48..e8c7b7fd290d 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index ba59b77c605d..2948b41b06d4 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 6a4c14ccc3c6..ed36c5e79fde 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index bdc1ef06e05b..a296a4d851f5 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index ca9d0753dd6b..48f683289531 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. 
contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index fad47853ad3c..a2de5a44bd41 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -2161,22 +2161,25 @@ batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, { struct batadv_bla_claim *claim; int idx = 0; + int ret = 0; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { if (idx++ < *idx_skip) continue; - if (batadv_bla_claim_dump_entry(msg, portid, seq, - primary_if, claim)) { + + ret = batadv_bla_claim_dump_entry(msg, portid, seq, + primary_if, claim); + if (ret) { *idx_skip = idx - 1; goto unlock; } } - *idx_skip = idx; + *idx_skip = 0; unlock: rcu_read_unlock(); - return 0; + return ret; } /** @@ -2391,22 +2394,25 @@ batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, { struct batadv_bla_backbone_gw *backbone_gw; int idx = 0; + int ret = 0; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (idx++ < *idx_skip) continue; - if (batadv_bla_backbone_dump_entry(msg, portid, seq, - primary_if, backbone_gw)) { + + ret = batadv_bla_backbone_dump_entry(msg, portid, seq, + primary_if, backbone_gw); + if (ret) { *idx_skip = idx - 1; goto unlock; } } - *idx_skip = idx; + *idx_skip = 0; unlock: rcu_read_unlock(); - return 0; + return ret; } /** diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index b27571abcd2f..71f95a3e4d3f 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. 
contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 21d1189957a7..4229b01ac7b5 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 90a08d35c501..37b069698b04 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 9703c791ffc5..a60bacf7120b 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. 
contributors: * * Antonio Quartulli * @@ -33,6 +33,7 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> @@ -43,13 +44,19 @@ #include <linux/string.h> #include <linux/workqueue.h> #include <net/arp.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "send.h" +#include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -393,7 +400,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, batadv_arp_hw_src(skb, hdr_size), &ip_src, batadv_arp_hw_dst(skb, hdr_size), &ip_dst); - if (hdr_size == 0) + if (hdr_size < sizeof(struct batadv_unicast_packet)) return; unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; @@ -495,7 +502,7 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, * the one with the lowest address */ if (tmp_max == max && max_orig_node && - batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0) + batadv_compare_eth(candidate->orig, max_orig_node->orig)) goto out; ret = true; @@ -852,6 +859,151 @@ out: #endif /** + * batadv_dat_cache_dump_entry() - dump one entry of the DAT cache table to a + * netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @dat_entry: entry to dump + * + * Return: 0 or error code. 
+ */ +static int +batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_dat_entry *dat_entry) +{ + int msecs; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE); + if (!hdr) + return -ENOBUFS; + + msecs = jiffies_to_msecs(jiffies - dat_entry->last_update); + + if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS, + dat_entry->ip) || + nla_put(msg, BATADV_ATTR_DAT_CACHE_HWADDRESS, ETH_ALEN, + dat_entry->mac_addr) || + nla_put_u16(msg, BATADV_ATTR_DAT_CACHE_VID, dat_entry->vid) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; + } + + genlmsg_end(msg, hdr); + return 0; +} + +/** + * batadv_dat_cache_dump_bucket() - dump one bucket of the DAT cache table to + * a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: 0 or error code. + */ +static int +batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct hlist_head *head, int *idx_skip) +{ + struct batadv_dat_entry *dat_entry; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { + if (idx < *idx_skip) + goto skip; + + if (batadv_dat_cache_dump_entry(msg, portid, seq, + dat_entry)) { + rcu_read_unlock(); + *idx_skip = idx; + + return -EMSGSIZE; + } + +skip: + idx++; + } + rcu_read_unlock(); + + return 0; +} + +/** + * batadv_dat_cache_dump() - dump DAT cache table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. 
+ */ +int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hashtable *hash; + struct batadv_priv *bat_priv; + int bucket = cb->args[0]; + struct hlist_head *head; + int idx = cb->args[1]; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + hash = bat_priv->dat.hash; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_dat_cache_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, head, + &idx)) + break; + + bucket++; + idx = 0; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + +/** * batadv_arp_get_type() - parse an ARP packet and gets the type * @bat_priv: the bat priv with all the soft interface information * @skb: packet to analyse diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 12897eb46268..a04596028337 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. 
contributors: * * Antonio Quartulli * @@ -28,6 +28,7 @@ #include "originator.h" +struct netlink_callback; struct seq_file; struct sk_buff; @@ -81,6 +82,7 @@ batadv_dat_init_own_addr(struct batadv_priv *bat_priv, int batadv_dat_init(struct batadv_priv *bat_priv); void batadv_dat_free(struct batadv_priv *bat_priv); int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset); +int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb); /** * batadv_dat_inc_counter() - increment the correct DAT packet counter @@ -169,6 +171,12 @@ static inline void batadv_dat_free(struct batadv_priv *bat_priv) { } +static inline int +batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv, u8 subtype) { diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 22dde42fd80e..0fddc17106bd 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -288,7 +288,8 @@ batadv_frag_merge_packets(struct hlist_head *chain) /* Move the existing MAC header to just before the payload. (Override * the fragment header.) */ - skb_pull_rcsum(skb_out, hdr_size); + skb_pull(skb_out, hdr_size); + skb_out->ip_summed = CHECKSUM_NONE; memmove(skb_out->data - ETH_HLEN, skb_mac_header(skb_out), ETH_HLEN); skb_set_mac_header(skb_out, -ETH_HLEN); skb_reset_network_header(skb_out); diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 138b22a1836a..944512e07782 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. 
contributors: * * Martin Hundebøll <martin@hundeboll.net> * diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 37fe9a644f22..8b198ee798c9 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -746,7 +746,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, { struct batadv_neigh_node *neigh_curr = NULL; struct batadv_neigh_node *neigh_old = NULL; - struct batadv_orig_node *orig_dst_node; + struct batadv_orig_node *orig_dst_node = NULL; struct batadv_gw_node *gw_node = NULL; struct batadv_gw_node *curr_gw = NULL; struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo; @@ -757,6 +757,9 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, vid = batadv_get_vid(skb, 0); + if (is_multicast_ether_addr(ethhdr->h_dest)) + goto out; + orig_dst_node = batadv_transtable_search(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid); if (!orig_dst_node) diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 981f58421a32..f0b86fcb2493 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index b3e156af2256..936c107f3199 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. 
contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index afebd9c7edf4..80afb2793687 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 5f186bff284a..c405d15befd6 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -763,6 +763,11 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); + if (bat_priv->num_ifaces >= UINT_MAX) { + ret = -ENOSPC; + goto err_dev; + } + ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface, NULL, NULL, NULL); if (ret) @@ -876,7 +881,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); /* nobody uses this interface anymore */ - if (!bat_priv->num_ifaces) { + if (bat_priv->num_ifaces == 0) { batadv_gw_check_client_stop(bat_priv); if (autodel == BATADV_IF_CLEANUP_AUTO) @@ -912,7 +917,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (ret) goto free_if; - hard_iface->if_num = -1; + hard_iface->if_num = 0; hard_iface->net_dev = net_dev; hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index de5e9a374ece..d1c0f6189301 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,5 +1,5 @@ /* 
SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 04d964358c98..7b49e4001778 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 4ce1b6d3ad5c..9490a7ca2ba6 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index e91f29c7c638..55c358ad3331 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -24,6 +24,7 @@ #include <linux/debugfs.h> #include <linux/errno.h> #include <linux/etherdevice.h> +#include <linux/eventpoll.h> #include <linux/export.h> #include <linux/fcntl.h> #include <linux/fs.h> diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 84cddd01eeab..958be22beda9 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. 
contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index dc9fa37ddd14..853773e45f79 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -22,6 +22,7 @@ #include <linux/compiler.h> #include <linux/debugfs.h> #include <linux/errno.h> +#include <linux/eventpoll.h> #include <linux/export.h> #include <linux/fcntl.h> #include <linux/fs.h> diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 35e02b2b9e72..35f4f397ed57 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d31c8266e244..69c0d85bceb3 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index f7ba3f96d8f3..057a28a9fe88 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.0" +#define BATADV_SOURCE_VERSION "2018.1" #endif /* B.A.T.M.A.N. 
parameters */ @@ -331,11 +331,13 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, * * Return: true when x is a predecessor of y, false otherwise */ -#define batadv_seq_before(x, y) ({typeof(x)_d1 = (x); \ - typeof(y)_d2 = (y); \ - typeof(x)_dummy = (_d1 - _d2); \ - (void)(&_d1 == &_d2); \ - _dummy > batadv_smallest_signed_int(_dummy); }) +#define batadv_seq_before(x, y) ({ \ + typeof(x)_d1 = (x); \ + typeof(y)_d2 = (y); \ + typeof(x)_dummy = (_d1 - _d2); \ + (void)(&_d1 == &_d2); \ + _dummy > batadv_smallest_signed_int(_dummy); \ +}) /** * batadv_seq_after() - Checks if a sequence number x is a successor of y diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index cbdeb47ec3f6..a11d3d89f012 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -40,6 +40,7 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -52,14 +53,20 @@ #include <linux/types.h> #include <linux/workqueue.h> #include <net/addrconf.h> +#include <net/genetlink.h> #include <net/if_inet6.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/netlink.h> +#include <net/sock.h> #include <uapi/linux/batadv_packet.h> +#include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" +#include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -102,7 +109,36 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) } /** + * batadv_mcast_addr_is_ipv4() - check if multicast MAC is IPv4 + * @addr: the MAC address to check + * + * Return: True, if MAC address is one reserved for IPv4 multicast, false + * otherwise. 
+ */ +static bool batadv_mcast_addr_is_ipv4(const u8 *addr) +{ + static const u8 prefix[] = {0x01, 0x00, 0x5E}; + + return memcmp(prefix, addr, sizeof(prefix)) == 0; +} + +/** + * batadv_mcast_addr_is_ipv6() - check if multicast MAC is IPv6 + * @addr: the MAC address to check + * + * Return: True, if MAC address is one reserved for IPv6 multicast, false + * otherwise. + */ +static bool batadv_mcast_addr_is_ipv6(const u8 *addr) +{ + static const u8 prefix[] = {0x33, 0x33}; + + return memcmp(prefix, addr, sizeof(prefix)) == 0; +} + +/** * batadv_mcast_mla_softif_get() - get softif multicast listeners + * @bat_priv: the bat priv with all the soft interface information * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @@ -119,9 +155,12 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ -static int batadv_mcast_mla_softif_get(struct net_device *dev, +static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv, + struct net_device *dev, struct hlist_head *mcast_list) { + bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4; + bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6; struct net_device *bridge = batadv_mcast_get_bridge(dev); struct netdev_hw_addr *mc_list_entry; struct batadv_hw_addr *new; @@ -129,6 +168,12 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, netif_addr_lock_bh(bridge ? bridge : dev); netdev_for_each_mc_addr(mc_list_entry, bridge ? 
bridge : dev) { + if (all_ipv4 && batadv_mcast_addr_is_ipv4(mc_list_entry->addr)) + continue; + + if (all_ipv6 && batadv_mcast_addr_is_ipv6(mc_list_entry->addr)) + continue; + new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; @@ -193,6 +238,7 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) /** * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners + * @bat_priv: the bat priv with all the soft interface information * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into * @@ -204,10 +250,13 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ -static int batadv_mcast_mla_bridge_get(struct net_device *dev, +static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv, + struct net_device *dev, struct hlist_head *mcast_list) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); + bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4; + bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6; struct br_ip_list *br_ip_entry, *tmp; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; @@ -221,6 +270,12 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev, goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { + if (all_ipv4 && br_ip_entry->addr.proto == htons(ETH_P_IP)) + continue; + + if (all_ipv6 && br_ip_entry->addr.proto == htons(ETH_P_IPV6)) + continue; + batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; @@ -543,8 +598,8 @@ update: bat_priv->mcast.enabled = true; } - return !(mcast_data.flags & - (BATADV_MCAST_WANT_ALL_IPV4 | BATADV_MCAST_WANT_ALL_IPV6)); + return !(mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV4 && + mcast_data.flags & 
BATADV_MCAST_WANT_ALL_IPV6); } /** @@ -568,11 +623,11 @@ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) if (!batadv_mcast_mla_tvlv_update(bat_priv)) goto update; - ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list); + ret = batadv_mcast_mla_softif_get(bat_priv, soft_iface, &mcast_list); if (ret < 0) goto out; - ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list); + ret = batadv_mcast_mla_bridge_get(bat_priv, soft_iface, &mcast_list); if (ret < 0) goto out; @@ -814,8 +869,8 @@ static struct batadv_orig_node * batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { - return batadv_transtable_search(bat_priv, ethhdr->h_source, - ethhdr->h_dest, BATADV_NO_FLAGS); + return batadv_transtable_search(bat_priv, NULL, ethhdr->h_dest, + BATADV_NO_FLAGS); } /** @@ -1286,6 +1341,236 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) #endif /** + * batadv_mcast_mesh_info_put() - put multicast info into a netlink message + * @msg: buffer for the message + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 or error code. 
+ */ +int batadv_mcast_mesh_info_put(struct sk_buff *msg, + struct batadv_priv *bat_priv) +{ + u32 flags = bat_priv->mcast.flags; + u32 flags_priv = BATADV_NO_FLAGS; + + if (bat_priv->mcast.bridged) { + flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; + + if (bat_priv->mcast.querier_ipv4.exists) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; + if (bat_priv->mcast.querier_ipv6.exists) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; + if (bat_priv->mcast.querier_ipv4.shadowing) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; + if (bat_priv->mcast.querier_ipv6.shadowing) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; + } + + if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) || + nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv)) + return -EMSGSIZE; + + return 0; +} + +/** + * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @orig_node: originator to dump the multicast flags of + * + * Return: 0 or error code. 
+ */ +static int +batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_orig_node *orig_node) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + orig_node->orig)) { + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; + } + + if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capabilities)) { + if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, + orig_node->mcast_flags)) { + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; + } + } + + genlmsg_end(msg, hdr); + return 0; +} + +/** + * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags + * table to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: 0 or error code. + */ +static int +batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct hlist_head *head, long *idx_skip) +{ + struct batadv_orig_node *orig_node; + long idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capa_initialized)) + continue; + + if (idx < *idx_skip) + goto skip; + + if (batadv_mcast_flags_dump_entry(msg, portid, seq, + orig_node)) { + rcu_read_unlock(); + *idx_skip = idx; + + return -EMSGSIZE; + } + +skip: + idx++; + } + rcu_read_unlock(); + + return 0; +} + +/** + * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @bat_priv: the bat priv with all the soft interface information + * @bucket: current bucket to dump + * @idx: index in current bucket to the next entry to dump + * + * Return: 0 or error code. 
+ */ +static int +__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, long *bucket, long *idx) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + long bucket_tmp = *bucket; + struct hlist_head *head; + long idx_tmp = *idx; + + while (bucket_tmp < hash->size) { + head = &hash->table[bucket_tmp]; + + if (batadv_mcast_flags_dump_bucket(msg, portid, seq, head, + &idx_tmp)) + break; + + bucket_tmp++; + idx_tmp = 0; + } + + *bucket = bucket_tmp; + *idx = idx_tmp; + + return msg->len; +} + +/** + * batadv_mcast_netlink_get_primary() - get primary interface from netlink + * callback + * @cb: netlink callback structure + * @primary_if: the primary interface pointer to return the result in + * + * Return: 0 or error code. + */ +static int +batadv_mcast_netlink_get_primary(struct netlink_callback *cb, + struct batadv_hard_iface **primary_if) +{ + struct batadv_hard_iface *hard_iface = NULL; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + hard_iface = batadv_primary_if_get_selected(bat_priv); + if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + +out: + if (soft_iface) + dev_put(soft_iface); + + if (!ret && primary_if) + *primary_if = hard_iface; + else if (hard_iface) + batadv_hardif_put(hard_iface); + + return ret; +} + +/** + * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. 
+ */ +int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct batadv_priv *bat_priv; + long *bucket = &cb->args[0]; + long *idx = &cb->args[1]; + int ret; + + ret = batadv_mcast_netlink_get_primary(cb, &primary_if); + if (ret) + return ret; + + bat_priv = netdev_priv(primary_if->soft_iface); + ret = __batadv_mcast_flags_dump(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, bucket, idx); + + batadv_hardif_put(primary_if); + return ret; +} + +/** * batadv_mcast_free() - free the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 3ac06337ab71..3b04ab13f0eb 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2018 B.A.T.M.A.N. 
contributors: * * Linus Lüssing * @@ -21,6 +21,7 @@ #include "main.h" +struct netlink_callback; struct seq_file; struct sk_buff; @@ -54,6 +55,11 @@ void batadv_mcast_init(struct batadv_priv *bat_priv); int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset); +int batadv_mcast_mesh_info_put(struct sk_buff *msg, + struct batadv_priv *bat_priv); + +int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb); + void batadv_mcast_free(struct batadv_priv *bat_priv); void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node); @@ -72,6 +78,18 @@ static inline int batadv_mcast_init(struct batadv_priv *bat_priv) return 0; } +static inline int +batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline int batadv_mcast_flags_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + static inline void batadv_mcast_free(struct batadv_priv *bat_priv) { } diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a823d3899bad..0d9459b69bdb 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2018 B.A.T.M.A.N. 
contributors: * * Matthias Schiffer * @@ -45,8 +45,10 @@ #include "bat_algo.h" #include "bridge_loop_avoidance.h" +#include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" +#include "multicast.h" #include "originator.h" #include "soft-interface.h" #include "tp_meter.h" @@ -64,39 +66,44 @@ static const struct genl_multicast_group batadv_netlink_mcgrps[] = { }; static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { - [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, - [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, - [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, - [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, - [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, - [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, - [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, - [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, - [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, - [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, - [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, - [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, - [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, - [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, - [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, - [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, - [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, - [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, - [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_TQ] = { .type = NLA_U8 }, - [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, - [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, - [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, - [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, - [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, - [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, - 
[BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, - [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, - [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, + [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, + [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, + [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, + [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, + [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, + [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, + [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, + [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, + [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, + [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, + [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TQ] = { .type = NLA_U8 }, + [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, + [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, + [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, + [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = { .type = NLA_U32 }, + [BATADV_ATTR_DAT_CACHE_HWADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 }, + 
[BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 }, + [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 }, }; /** @@ -147,6 +154,9 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) goto out; #endif + if (batadv_mcast_mesh_info_put(msg, bat_priv)) + goto out; + primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { hard_iface = primary_if->net_dev; @@ -604,6 +614,18 @@ static const struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_bla_backbone_dump, }, + { + .cmd = BATADV_CMD_GET_DAT_CACHE, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_dat_cache_dump, + }, + { + .cmd = BATADV_CMD_GET_MCAST_FLAGS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_mcast_flags_dump, + }, }; diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 0e7e57b69b54..571d9a5ae7aa 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b48116bb24ef..c3578444f3cb 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index adaeafa4f71e..65c346812bc1 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. 
contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 58a7d9274435..716e5b43acfa 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -1569,7 +1569,7 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) * Return: 0 on success or negative error number in case of failure */ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, - int max_if_num) + unsigned int max_if_num) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_algo_ops *bao = bat_priv->algo_ops; @@ -1611,7 +1611,7 @@ err: * Return: 0 on success or negative error number in case of failure */ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, - int max_if_num) + unsigned int max_if_num) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 8e543a3cdc6c..3b3f59b881e1 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -73,9 +73,9 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, - int max_if_num); + unsigned int max_if_num); int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, - int max_if_num); + unsigned int max_if_num); struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid); diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index b6891e8b741c..cc3ed93a6d51 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -759,6 +759,7 @@ free_skb: /** * batadv_reroute_unicast_packet() - update the unicast header for re-routing * @bat_priv: the bat priv with all the soft interface information + * @skb: unicast packet to process * @unicast_packet: the unicast header to be updated * @dst_addr: the payload destination * @vid: VLAN identifier @@ -770,7 +771,7 @@ free_skb: * Return: true if the packet header has been updated, false otherwise */ static bool -batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, +batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_unicast_packet *unicast_packet, u8 *dst_addr, unsigned short vid) { @@ -799,8 +800,10 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, } /* update the packet header */ + skb_postpull_rcsum(skb, unicast_packet, sizeof(*unicast_packet)); ether_addr_copy(unicast_packet->dest, orig_addr); unicast_packet->ttvn = orig_ttvn; + skb_postpush_rcsum(skb, unicast_packet, sizeof(*unicast_packet)); ret = 
true; out: @@ -841,7 +844,7 @@ static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * the packet to */ if (batadv_tt_local_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) { - if (batadv_reroute_unicast_packet(bat_priv, unicast_packet, + if (batadv_reroute_unicast_packet(bat_priv, skb, unicast_packet, ethhdr->h_dest, vid)) batadv_dbg_ratelimited(BATADV_DBG_TT, bat_priv, @@ -887,7 +890,7 @@ static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * destination can possibly be updated and forwarded towards the new * target host */ - if (batadv_reroute_unicast_packet(bat_priv, unicast_packet, + if (batadv_reroute_unicast_packet(bat_priv, skb, unicast_packet, ethhdr->h_dest, vid)) { batadv_dbg_ratelimited(BATADV_DBG_TT, bat_priv, "Rerouting unicast packet to %pM (dst=%pM): TTVN mismatch old_ttvn=%u new_ttvn=%u\n", @@ -910,12 +913,14 @@ static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, if (!primary_if) return false; + /* update the packet header */ + skb_postpull_rcsum(skb, unicast_packet, sizeof(*unicast_packet)); ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr); + unicast_packet->ttvn = curr_ttvn; + skb_postpush_rcsum(skb, unicast_packet, sizeof(*unicast_packet)); batadv_hardif_put(primary_if); - unicast_packet->ttvn = curr_ttvn; - return true; } @@ -968,14 +973,10 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, struct batadv_orig_node *orig_node = NULL, *orig_node_gw = NULL; int check, hdr_size = sizeof(*unicast_packet); enum batadv_subtype subtype; - struct ethhdr *ethhdr; int ret = NET_RX_DROP; bool is4addr, is_gw; unicast_packet = (struct batadv_unicast_packet *)skb->data; - unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; - ethhdr = eth_hdr(skb); - is4addr = unicast_packet->packet_type == BATADV_UNICAST_4ADDR; /* the caller function should have already pulled 2 bytes */ if (is4addr) @@ -995,12 +996,14 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, if 
(!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size)) goto free_skb; + unicast_packet = (struct batadv_unicast_packet *)skb->data; + /* packet for me */ if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) { /* If this is a unicast packet from another backgone gw, * drop it. */ - orig_addr_gw = ethhdr->h_source; + orig_addr_gw = eth_hdr(skb)->h_source; orig_node_gw = batadv_orig_hash_find(bat_priv, orig_addr_gw); if (orig_node_gw) { is_gw = batadv_bla_is_backbone_gw(skb, orig_node_gw, @@ -1015,6 +1018,8 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, } if (is4addr) { + unicast_4addr_packet = + (struct batadv_unicast_4addr_packet *)skb->data; subtype = unicast_4addr_packet->subtype; batadv_dat_inc_counter(bat_priv, subtype); diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index a1289bc5f115..db54c2d9b8bf 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 2a5ab6f1076d..4a35f5c2f52b 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 1e8c79093623..64cce07b8fe6 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 900c5ce21cd4..edeffcb9f3a2 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -459,13 +459,7 @@ void batadv_interface_rx(struct net_device *soft_iface, /* skb->dev & skb->pkt_type are set here */ skb->protocol = eth_type_trans(skb, soft_iface); - - /* should not be necessary anymore as we use skb_pull_rcsum() - * TODO: please verify this and remove this TODO - * -- Dec 21st 2009, Simon Wunderlich - */ - - /* skb->ip_summed = CHECKSUM_UNNECESSARY; */ + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 075c5b5b2ce1..daf87f07fadd 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index c1578fa0b952..f2eef43bd2ec 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index bbeee61221fa..c1e3fb69952d 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2017 B.A.T.M.A.N. 
contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 8b576712d0c1..11520de96ccb 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index c8b8f2cb2c2b..68e600974759 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 7550a9ccd695..0225616d5771 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 8d9e3abec2c8..01b6c8eafaf9 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 5ffcb45ac6ff..a637458205d1 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index a74df33f446d..ef5867f49824 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index bb1578410e0c..476b052ad982 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -167,7 +167,7 @@ struct batadv_hard_iface { struct list_head list; /** @if_num: identificator of the interface */ - s16 if_num; + unsigned int if_num; /** @if_status: status of the interface for batman-adv */ char if_status; @@ -1596,7 +1596,7 @@ struct batadv_priv { atomic_t batman_queue_left; /** @num_ifaces: number of interfaces assigned to this mesh interface */ - char num_ifaces; + unsigned int num_ifaces; /** @mesh_obj: kobject for sysfs mesh subdirectory */ struct kobject *mesh_obj; @@ -2186,15 +2186,16 @@ struct batadv_algo_orig_ops { * orig_node due to a new hard-interface being added into the mesh * (optional) */ - int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num); + int (*add_if)(struct batadv_orig_node *orig_node, + unsigned int max_if_num); /** * @del_if: ask the routing algorithm to apply the needed changes to the * orig_node due to an hard-interface being removed from the mesh * (optional) */ - int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num, - int del_if_num); + int (*del_if)(struct batadv_orig_node *orig_node, + unsigned int max_if_num, unsigned int del_if_num); #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** @print: print the originator table (optional) */ diff 
--git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 3394e6791673..66c0781773df 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -934,8 +934,8 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) /* Slave connection state and connectable mode bit 38 * and scannable bit 21. */ - if (connectable && (!(hdev->le_states[4] & 0x01) || - !(hdev->le_states[2] & 0x40))) + if (connectable && (!(hdev->le_states[4] & 0x40) || + !(hdev->le_states[2] & 0x20))) return false; } @@ -948,7 +948,7 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) /* Master connection state and connectable mode bit 35 and * scannable 19. */ - if (connectable && (!(hdev->le_states[4] & 0x10) || + if (connectable && (!(hdev->le_states[4] & 0x08) || !(hdev->le_states[2] & 0x08))) return false; } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 923e9a271872..1506e1632394 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1340,7 +1340,7 @@ done: } static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, - int *addr_len, int peer) + int peer) { struct sockaddr_hci *haddr = (struct sockaddr_hci *)addr; struct sock *sk = sock->sk; @@ -1360,10 +1360,10 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, goto done; } - *addr_len = sizeof(*haddr); haddr->hci_family = AF_BLUETOOTH; haddr->hci_dev = hdev->id; haddr->hci_channel= hci_pi(sk)->channel; + err = sizeof(*haddr); done: release_sock(sk); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 67a8642f57ea..686bdc6b35b0 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -358,7 +358,7 @@ done: } static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, - int *len, int peer) + int peer) { struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; struct sock *sk = sock->sk; @@ -373,7 +373,6 @@ static int 
l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, memset(la, 0, sizeof(struct sockaddr_l2)); addr->sa_family = AF_BLUETOOTH; - *len = sizeof(struct sockaddr_l2); la->l2_psm = chan->psm; @@ -387,7 +386,7 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, la->l2_bdaddr_type = chan->src_type; } - return 0; + return sizeof(struct sockaddr_l2); } static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 6e9fc86d8daf..8a80d48d89c4 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4801,6 +4801,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, case MGMT_LTK_P256_DEBUG: authenticated = 0x00; type = SMP_LTK_P256_DEBUG; + /* fall through */ default: continue; } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 1aaccf637479..d606e9212291 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -221,6 +221,7 @@ static void __rfcomm_sock_close(struct sock *sk) case BT_CONFIG: case BT_CONNECTED: rfcomm_dlc_close(d, 0); + /* fall through */ default: sock_set_flag(sk, SOCK_ZAPPED); @@ -533,7 +534,7 @@ done: return err; } -static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer) +static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; struct sock *sk = sock->sk; @@ -552,8 +553,7 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int * else bacpy(&sa->rc_bdaddr, &rfcomm_pi(sk)->src); - *len = sizeof(struct sockaddr_rc); - return 0; + return sizeof(struct sockaddr_rc); } static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg, diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 5f3074cb6b4d..5e44d842cc5d 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -210,8 +210,8 @@ 
static ssize_t show_channel(struct device *tty_dev, struct device_attribute *att return sprintf(buf, "%d\n", dev->channel); } -static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); -static DEVICE_ATTR(channel, S_IRUGO, show_channel, NULL); +static DEVICE_ATTR(address, 0444, show_address, NULL); +static DEVICE_ATTR(channel, 0444, show_channel, NULL); static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 08df57665e1f..413b8ee49fec 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -680,7 +680,7 @@ done: } static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, - int *len, int peer) + int peer) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; @@ -688,14 +688,13 @@ static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; - *len = sizeof(struct sockaddr_sco); if (peer) bacpy(&sa->sco_bdaddr, &sco_pi(sk)->dst); else bacpy(&sa->sco_bdaddr, &sco_pi(sk)->src); - return 0; + return sizeof(struct sockaddr_sco); } static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 01117ae84f1d..a2ddae2f37d7 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2296,8 +2296,14 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) else sec_level = authreq_to_seclevel(auth); - if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) + if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) { + /* If link is already encrypted with sufficient security we + * still need refresh encryption as per Core Spec 5.0 Vol 3, + * Part H 2.4.6 + */ + smp_ltk_encrypt(conn, hcon->sec_level); return 0; + } if (sec_level > hcon->pending_sec_level) hcon->pending_sec_level = sec_level; diff --git a/net/bridge/br.c b/net/bridge/br.c index 
6bf06e756df2..671d13c10f6f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -52,7 +52,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v switch (event) { case NETDEV_CHANGEMTU: - dev_set_mtu(br->dev, br_min_mtu(br)); + br_mtu_auto_adjust(br); break; case NETDEV_CHANGEADDR: diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 1285ca30ab0a..e682a668ce57 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -224,11 +224,11 @@ static void br_get_stats64(struct net_device *dev, static int br_change_mtu(struct net_device *dev, int new_mtu) { struct net_bridge *br = netdev_priv(dev); - if (new_mtu > br_min_mtu(br)) - return -EINVAL; dev->mtu = new_mtu; + /* this flag will be cleared if the MTU was automatically adjusted */ + br->mtu_set_by_user = true; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* remember the MTU in the rtable for PMTU */ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 9ba4ed65c52b..82c1a6f430b3 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -425,22 +425,31 @@ int br_del_bridge(struct net *net, const char *name) } /* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ -int br_min_mtu(const struct net_bridge *br) +static int br_mtu_min(const struct net_bridge *br) { const struct net_bridge_port *p; - int mtu = 0; + int ret_mtu = 0; + list_for_each_entry(p, &br->port_list, list) + if (!ret_mtu || ret_mtu > p->dev->mtu) + ret_mtu = p->dev->mtu; + + return ret_mtu ? 
ret_mtu : ETH_DATA_LEN; +} + +void br_mtu_auto_adjust(struct net_bridge *br) +{ ASSERT_RTNL(); - if (list_empty(&br->port_list)) - mtu = ETH_DATA_LEN; - else { - list_for_each_entry(p, &br->port_list, list) { - if (!mtu || p->dev->mtu < mtu) - mtu = p->dev->mtu; - } - } - return mtu; + /* if the bridge MTU was manually configured don't mess with it */ + if (br->mtu_set_by_user) + return; + + /* change to the minimum MTU and clear the flag which was set by + * the bridge ndo_change_mtu callback + */ + dev_set_mtu(br->dev, br_mtu_min(br)); + br->mtu_set_by_user = false; } static void br_set_gso_limits(struct net_bridge *br) @@ -594,7 +603,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); - dev_set_mtu(br->dev, br_min_mtu(br)); + br_mtu_auto_adjust(br); br_set_gso_limits(br); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -641,7 +650,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) */ del_nbp(p); - dev_set_mtu(br->dev, br_min_mtu(br)); + br_mtu_auto_adjust(br); br_set_gso_limits(br); spin_lock_bh(&br->lock); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 27f1d4f2114a..9b16eaf33819 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -214,7 +214,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - goto inhdr_error; + goto csum_error; len = ntohs(iph->tot_len); if (skb->len < len) { @@ -236,6 +236,8 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) */ return 0; +csum_error: + __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8e13a64d8c99..a7cb3ece5031 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -410,6 +410,7 @@ struct net_bridge { int 
offload_fwd_mark; #endif bool neigh_suppress_enabled; + bool mtu_set_by_user; struct hlist_head fdb_list; }; @@ -578,7 +579,7 @@ int br_del_bridge(struct net *net, const char *name); int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); -int br_min_mtu(const struct net_bridge *br); +void br_mtu_auto_adjust(struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features); void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index b1be0dcfba6b..0318a69888d4 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -893,7 +893,7 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, static struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, - .mode = S_IRUGO, }, + .mode = 0444, }, .read = brforward_read, }; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 126a8ea73c96..fd31ad83ec7b 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -44,7 +44,7 @@ static int store_##_name(struct net_bridge_port *p, unsigned long v) \ { \ return store_flag(p, v, _mask); \ } \ -static BRPORT_ATTR(_name, S_IRUGO | S_IWUSR, \ +static BRPORT_ATTR(_name, 0644, \ show_##_name, store_##_name) static int store_flag(struct net_bridge_port *p, unsigned long v, @@ -71,7 +71,7 @@ static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) return sprintf(buf, "%d\n", p->path_cost); } -static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR, +static BRPORT_ATTR(path_cost, 0644, show_path_cost, br_stp_set_path_cost); static ssize_t show_priority(struct net_bridge_port *p, char *buf) @@ -79,91 +79,91 @@ static ssize_t show_priority(struct net_bridge_port *p, char *buf) return sprintf(buf, "%d\n", p->priority); } -static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR, +static 
BRPORT_ATTR(priority, 0644, show_priority, br_stp_set_port_priority); static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_root); } -static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL); +static BRPORT_ATTR(designated_root, 0444, show_designated_root, NULL); static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_bridge); } -static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL); +static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL); static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_port); } -static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL); +static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL); static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_cost); } -static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL); +static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL); static ssize_t show_port_id(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_id); } -static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL); +static BRPORT_ATTR(port_id, 0444, show_port_id, NULL); static ssize_t show_port_no(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_no); } -static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL); +static BRPORT_ATTR(port_no, 0444, show_port_no, NULL); static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->topology_change_ack); } -static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL); +static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL); static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", 
p->config_pending); } -static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL); +static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL); static ssize_t show_port_state(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->state); } -static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL); +static BRPORT_ATTR(state, 0444, show_port_state, NULL); static ssize_t show_message_age_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); } -static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL); +static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL); static ssize_t show_forward_delay_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); } -static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL); +static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL); static ssize_t show_hold_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); } -static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); +static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL); static int store_flush(struct net_bridge_port *p, unsigned long v) { br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry return 0; } -static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); +static BRPORT_ATTR(flush, 0200, NULL, store_flush); static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) { @@ -179,7 +179,7 @@ static int store_group_fwd_mask(struct net_bridge_port *p, return 0; } -static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask, +static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask, store_group_fwd_mask); BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); @@ -204,7 +204,7 @@ static int store_multicast_router(struct net_bridge_port *p, { return 
br_multicast_set_port_router(p, v); } -static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, +static BRPORT_ATTR(multicast_router, 0644, show_multicast_router, store_multicast_router); BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 51935270c651..9896f4975353 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -168,6 +168,8 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) return NULL; + refcount_set(&masterv->refcnt, 1); + return masterv; } refcount_inc(&masterv->refcnt); diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 225d1668dfdd..f212447794bd 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -5,7 +5,7 @@ menuconfig NF_TABLES_BRIDGE depends on BRIDGE && NETFILTER && NF_TABLES select NETFILTER_FAMILY_BRIDGE - tristate "Ethernet Bridge nf_tables support" + bool "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 2f28e16de6c7..4bc758dd4a8c 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,7 +3,6 @@ # Makefile for the netfilter modules for Link Layer filtering on a bridge. 
# -obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c index 279527f8b1fe..620e54f08296 100644 --- a/net/bridge/netfilter/ebt_among.c +++ b/net/bridge/netfilter/ebt_among.c @@ -172,32 +172,83 @@ ebt_among_mt(const struct sk_buff *skb, struct xt_action_param *par) return true; } +static bool poolsize_invalid(const struct ebt_mac_wormhash *w) +{ + return w && w->poolsize >= (INT_MAX / sizeof(struct ebt_mac_wormhash_tuple)); +} + +static bool wormhash_offset_invalid(int off, unsigned int len) +{ + if (off == 0) /* not present */ + return false; + + if (off < (int)sizeof(struct ebt_among_info) || + off % __alignof__(struct ebt_mac_wormhash)) + return true; + + off += sizeof(struct ebt_mac_wormhash); + + return off > len; +} + +static bool wormhash_sizes_valid(const struct ebt_mac_wormhash *wh, int a, int b) +{ + if (a == 0) + a = sizeof(struct ebt_among_info); + + return ebt_mac_wormhash_size(wh) + a == b; +} + static int ebt_among_mt_check(const struct xt_mtchk_param *par) { const struct ebt_among_info *info = par->matchinfo; const struct ebt_entry_match *em = container_of(par->matchinfo, const struct ebt_entry_match, data); - int expected_length = sizeof(struct ebt_among_info); + unsigned int expected_length = sizeof(struct ebt_among_info); const struct ebt_mac_wormhash *wh_dst, *wh_src; int err; + if (expected_length > em->match_size) + return -EINVAL; + + if (wormhash_offset_invalid(info->wh_dst_ofs, em->match_size) || + wormhash_offset_invalid(info->wh_src_ofs, em->match_size)) + return -EINVAL; + wh_dst = ebt_among_wh_dst(info); - wh_src = ebt_among_wh_src(info); + if (poolsize_invalid(wh_dst)) + return -EINVAL; + expected_length += ebt_mac_wormhash_size(wh_dst); + if (expected_length > em->match_size) + return -EINVAL; + + wh_src = ebt_among_wh_src(info); + if 
(poolsize_invalid(wh_src)) + return -EINVAL; + + if (info->wh_src_ofs < info->wh_dst_ofs) { + if (!wormhash_sizes_valid(wh_src, info->wh_src_ofs, info->wh_dst_ofs)) + return -EINVAL; + } else { + if (!wormhash_sizes_valid(wh_dst, info->wh_dst_ofs, info->wh_src_ofs)) + return -EINVAL; + } + expected_length += ebt_mac_wormhash_size(wh_src); if (em->match_size != EBT_ALIGN(expected_length)) { - pr_info("wrong size: %d against expected %d, rounded to %zd\n", - em->match_size, expected_length, - EBT_ALIGN(expected_length)); + pr_err_ratelimited("wrong size: %d against expected %d, rounded to %zd\n", + em->match_size, expected_length, + EBT_ALIGN(expected_length)); return -EINVAL; } if (wh_dst && (err = ebt_mac_wormhash_check_integrity(wh_dst))) { - pr_info("dst integrity fail: %x\n", -err); + pr_err_ratelimited("dst integrity fail: %x\n", -err); return -EINVAL; } if (wh_src && (err = ebt_mac_wormhash_check_integrity(wh_src))) { - pr_info("src integrity fail: %x\n", -err); + pr_err_ratelimited("src integrity fail: %x\n", -err); return -EINVAL; } return 0; diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index 2b46c50abce0..ffaa8ce2e724 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -19,9 +19,18 @@ #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_ip.h> -struct tcpudphdr { - __be16 src; - __be16 dst; +union pkthdr { + struct { + __be16 src; + __be16 dst; + } tcpudphdr; + struct { + u8 type; + u8 code; + } icmphdr; + struct { + u8 type; + } igmphdr; }; static bool @@ -30,8 +39,8 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct ebt_ip_info *info = par->matchinfo; const struct iphdr *ih; struct iphdr _iph; - const struct tcpudphdr *pptr; - struct tcpudphdr _ports; + const union pkthdr *pptr; + union pkthdr _pkthdr; ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) @@ -50,29 +59,43 @@ ebt_ip_mt(const struct sk_buff *skb, struct 
xt_action_param *par) if (info->bitmask & EBT_IP_PROTO) { if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol)) return false; - if (!(info->bitmask & EBT_IP_DPORT) && - !(info->bitmask & EBT_IP_SPORT)) + if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT | + EBT_IP_ICMP | EBT_IP_IGMP))) return true; if (ntohs(ih->frag_off) & IP_OFFSET) return false; + + /* min icmp/igmp headersize is 4, so sizeof(_pkthdr) is ok. */ pptr = skb_header_pointer(skb, ih->ihl*4, - sizeof(_ports), &_ports); + sizeof(_pkthdr), &_pkthdr); if (pptr == NULL) return false; if (info->bitmask & EBT_IP_DPORT) { - u32 dst = ntohs(pptr->dst); + u32 dst = ntohs(pptr->tcpudphdr.dst); if (NF_INVF(info, EBT_IP_DPORT, dst < info->dport[0] || dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP_SPORT) { - u32 src = ntohs(pptr->src); + u32 src = ntohs(pptr->tcpudphdr.src); if (NF_INVF(info, EBT_IP_SPORT, src < info->sport[0] || src > info->sport[1])) return false; } + if ((info->bitmask & EBT_IP_ICMP) && + NF_INVF(info, EBT_IP_ICMP, + pptr->icmphdr.type < info->icmp_type[0] || + pptr->icmphdr.type > info->icmp_type[1] || + pptr->icmphdr.code < info->icmp_code[0] || + pptr->icmphdr.code > info->icmp_code[1])) + return false; + if ((info->bitmask & EBT_IP_IGMP) && + NF_INVF(info, EBT_IP_IGMP, + pptr->igmphdr.type < info->igmp_type[0] || + pptr->igmphdr.type > info->igmp_type[1])) + return false; } return true; } @@ -101,6 +124,21 @@ static int ebt_ip_mt_check(const struct xt_mtchk_param *par) return -EINVAL; if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1]) return -EINVAL; + if (info->bitmask & EBT_IP_ICMP) { + if ((info->invflags & EBT_IP_PROTO) || + info->protocol != IPPROTO_ICMP) + return -EINVAL; + if (info->icmp_type[0] > info->icmp_type[1] || + info->icmp_code[0] > info->icmp_code[1]) + return -EINVAL; + } + if (info->bitmask & EBT_IP_IGMP) { + if ((info->invflags & EBT_IP_PROTO) || + info->protocol != IPPROTO_IGMP) + return -EINVAL; + if (info->igmp_type[0] 
> info->igmp_type[1]) + return -EINVAL; + } return 0; } diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c index 61a9f1be1263..165b9d678cf1 100644 --- a/net/bridge/netfilter/ebt_limit.c +++ b/net/bridge/netfilter/ebt_limit.c @@ -72,8 +72,8 @@ static int ebt_limit_mt_check(const struct xt_mtchk_param *par) /* Check for overflow. */ if (info->burst == 0 || user2credits(info->avg * info->burst) < user2credits(info->avg)) { - pr_info("overflow, try lower: %u/%u\n", - info->avg, info->burst); + pr_info_ratelimited("overflow, try lower: %u/%u\n", + info->avg, info->burst); return -EINVAL; } diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 3140eb912d7e..47ba98db145d 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -153,8 +153,6 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par) static int ebt_stp_mt_check(const struct xt_mtchk_param *par) { const struct ebt_stp_info *info = par->matchinfo; - const u8 bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00}; - const u8 msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; const struct ebt_entry *e = par->entryinfo; if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK || @@ -162,8 +160,8 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par) return -EINVAL; /* Make sure the match only receives stp frames */ if (!par->nft_compat && - (!ether_addr_equal(e->destmac, bridge_ula) || - !ether_addr_equal(e->destmsk, msk) || + (!ether_addr_equal(e->destmac, eth_stp_addr) || + !is_broadcast_ether_addr(e->destmsk) || !(e->bitmask & EBT_DESTMAC))) return -EINVAL; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 02c4b409d317..032e0fe45940 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -223,9 +223,7 @@ unsigned int ebt_do_table(struct sk_buff *skb, return NF_DROP; } - /* increase counter */ - (*(counter_base + i)).pcnt++; - 
(*(counter_base + i)).bcnt += skb->len; + ADD_COUNTER(*(counter_base + i), 1, skb->len); /* these should only watch: not modify, nor tell us * what to do with the packet @@ -358,12 +356,12 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, left - sizeof(struct ebt_entry_match) < m->match_size) return -EINVAL; - match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); + match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision); if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) { if (!IS_ERR(match)) module_put(match->me); request_module("ebt_%s", m->u.name); - match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); + match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision); } if (IS_ERR(match)) return PTR_ERR(match); @@ -968,10 +966,9 @@ static void get_counters(const struct ebt_counter *oldcounters, if (cpu == 0) continue; counter_base = COUNTER_BASE(oldcounters, nentries, cpu); - for (i = 0; i < nentries; i++) { - counters[i].pcnt += counter_base[i].pcnt; - counters[i].bcnt += counter_base[i].bcnt; - } + for (i = 0; i < nentries; i++) + ADD_COUNTER(counters[i], counter_base[i].pcnt, + counter_base[i].bcnt); } } @@ -1324,10 +1321,8 @@ static int do_update_counters(struct net *net, const char *name, write_lock_bh(&t->lock); /* we add to the counters of the first cpu */ - for (i = 0; i < num_counters; i++) { - t->private->counters[i].pcnt += tmp[i].pcnt; - t->private->counters[i].bcnt += tmp[i].bcnt; - } + for (i = 0; i < num_counters; i++) + ADD_COUNTER(t->private->counters[i], tmp[i].pcnt, tmp[i].bcnt); write_unlock_bh(&t->lock); ret = 0; @@ -1355,16 +1350,17 @@ static int update_counters(struct net *net, const void __user *user, static inline int ebt_obj_to_user(char __user *um, const char *_name, const char *data, int entrysize, - int usersize, int datasize) + int usersize, int datasize, u8 revision) { - char name[EBT_FUNCTION_MAXNAMELEN] = {0}; + char name[EBT_EXTENSION_MAXNAMELEN] = {0}; - /* ebtables expects 32 bytes long names 
but xt_match names are 29 bytes + /* ebtables expects 31 bytes long names but xt_match names are 29 bytes * long. Copy 29 bytes and fill remaining bytes with zeroes. */ strlcpy(name, _name, sizeof(name)); - if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) || - put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) || + if (copy_to_user(um, name, EBT_EXTENSION_MAXNAMELEN) || + put_user(revision, (u8 __user *)(um + EBT_EXTENSION_MAXNAMELEN)) || + put_user(datasize, (int __user *)(um + EBT_EXTENSION_MAXNAMELEN + 1)) || xt_data_to_user(um + entrysize, data, usersize, datasize, XT_ALIGN(datasize))) return -EFAULT; @@ -1377,7 +1373,8 @@ static inline int ebt_match_to_user(const struct ebt_entry_match *m, { return ebt_obj_to_user(ubase + ((char *)m - base), m->u.match->name, m->data, sizeof(*m), - m->u.match->usersize, m->match_size); + m->u.match->usersize, m->match_size, + m->u.match->revision); } static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w, @@ -1385,7 +1382,8 @@ static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w, { return ebt_obj_to_user(ubase + ((char *)w - base), w->u.watcher->name, w->data, sizeof(*w), - w->u.watcher->usersize, w->watcher_size); + w->u.watcher->usersize, w->watcher_size, + w->u.watcher->revision); } static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, @@ -1416,7 +1414,8 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, if (ret != 0) return ret; ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t), - t->u.target->usersize, t->target_size); + t->u.target->usersize, t->target_size, + t->u.target->revision); if (ret != 0) return ret; @@ -1604,7 +1603,10 @@ struct compat_ebt_replace { /* struct ebt_entry_match, _target and _watcher have same layout */ struct compat_ebt_entry_mwt { union { - char name[EBT_FUNCTION_MAXNAMELEN]; + struct { + char name[EBT_EXTENSION_MAXNAMELEN]; + u8 revision; + }; compat_uptr_t ptr; } u; compat_uint_t 
match_size; @@ -1641,10 +1643,12 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, int off = ebt_compat_match_offset(match, m->match_size); compat_uint_t msize = m->match_size - off; - BUG_ON(off >= m->match_size); + if (WARN_ON(off >= m->match_size)) + return -EINVAL; - if (copy_to_user(cm->u.name, match->name, - strlen(match->name) + 1) || put_user(msize, &cm->match_size)) + if (copy_to_user(cm->u.name, match->name, strlen(match->name) + 1) || + put_user(match->revision, &cm->u.revision) || + put_user(msize, &cm->match_size)) return -EFAULT; if (match->compat_to_user) { @@ -1671,10 +1675,12 @@ static int compat_target_to_user(struct ebt_entry_target *t, int off = xt_compat_target_offset(target); compat_uint_t tsize = t->target_size - off; - BUG_ON(off >= t->target_size); + if (WARN_ON(off >= t->target_size)) + return -EINVAL; - if (copy_to_user(cm->u.name, target->name, - strlen(target->name) + 1) || put_user(tsize, &cm->match_size)) + if (copy_to_user(cm->u.name, target->name, strlen(target->name) + 1) || + put_user(target->revision, &cm->u.revision) || + put_user(tsize, &cm->match_size)) return -EFAULT; if (target->compat_to_user) { @@ -1819,10 +1825,14 @@ static int compat_table_info(const struct ebt_table_info *info, { unsigned int size = info->entries_size; const void *entries = info->entries; + int ret; newinfo->entries_size = size; - xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); + ret = xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); + if (ret) + return ret; + return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); } @@ -1902,7 +1912,8 @@ static int ebt_buf_add(struct ebt_entries_buf_state *state, if (state->buf_kern_start == NULL) goto count_only; - BUG_ON(state->buf_kern_offset + sz > state->buf_kern_len); + if (WARN_ON(state->buf_kern_offset + sz > state->buf_kern_len)) + return -EINVAL; memcpy(state->buf_kern_start + state->buf_kern_offset, data, sz); @@ -1915,7 +1926,8 @@ 
static int ebt_buf_add_pad(struct ebt_entries_buf_state *state, unsigned int sz) { char *b = state->buf_kern_start; - BUG_ON(b && state->buf_kern_offset > state->buf_kern_len); + if (WARN_ON(b && state->buf_kern_offset > state->buf_kern_len)) + return -EINVAL; if (b != NULL && sz > 0) memset(b + state->buf_kern_offset, 0, sz); @@ -1934,7 +1946,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, struct ebt_entries_buf_state *state, const unsigned char *base) { - char name[EBT_FUNCTION_MAXNAMELEN]; + char name[EBT_EXTENSION_MAXNAMELEN]; struct xt_match *match; struct xt_target *wt; void *dst = NULL; @@ -1948,7 +1960,8 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, switch (compat_mwt) { case EBT_COMPAT_MATCH: - match = xt_request_find_match(NFPROTO_BRIDGE, name, 0); + match = xt_request_find_match(NFPROTO_BRIDGE, name, + mwt->u.revision); if (IS_ERR(match)) return PTR_ERR(match); @@ -1967,7 +1980,8 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, break; case EBT_COMPAT_WATCHER: /* fallthrough */ case EBT_COMPAT_TARGET: - wt = xt_request_find_target(NFPROTO_BRIDGE, name, 0); + wt = xt_request_find_target(NFPROTO_BRIDGE, name, + mwt->u.revision); if (IS_ERR(wt)) return PTR_ERR(wt); off = xt_compat_target_offset(wt); @@ -1992,8 +2006,10 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, pad = XT_ALIGN(size_kern) - size_kern; if (pad > 0 && dst) { - BUG_ON(state->buf_kern_len <= pad); - BUG_ON(state->buf_kern_offset - (match_size + off) + size_kern > state->buf_kern_len - pad); + if (WARN_ON(state->buf_kern_len <= pad)) + return -EINVAL; + if (WARN_ON(state->buf_kern_offset - (match_size + off) + size_kern > state->buf_kern_len - pad)) + return -EINVAL; memset(dst + size_kern, 0, pad); } return off + match_size; @@ -2043,7 +2059,8 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, if (ret < 0) return ret; - BUG_ON(ret < match32->match_size); + if (WARN_ON(ret < 
match32->match_size)) + return -EINVAL; growth += ret - match32->match_size; growth += ebt_compat_entry_padsize(); @@ -2053,7 +2070,9 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, if (match_kern) match_kern->match_size = ret; - WARN_ON(type == EBT_COMPAT_TARGET && size_left); + if (WARN_ON(type == EBT_COMPAT_TARGET && size_left)) + return -EINVAL; + match32 = (struct compat_ebt_entry_mwt *) buf; } @@ -2109,6 +2128,19 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, * * offsets are relative to beginning of struct ebt_entry (i.e., 0). */ + for (i = 0; i < 4 ; ++i) { + if (offsets[i] > *total) + return -EINVAL; + + if (i < 3 && offsets[i] == *total) + return -EINVAL; + + if (i == 0) + continue; + if (offsets[i-1] > offsets[i]) + return -EINVAL; + } + for (i = 0, j = 1 ; j < 4 ; j++, i++) { struct compat_ebt_entry_mwt *match32; unsigned int size; @@ -2140,7 +2172,8 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, startoff = state->buf_user_offset - startoff; - BUG_ON(*total < startoff); + if (WARN_ON(*total < startoff)) + return -EINVAL; *total -= startoff; return 0; } @@ -2245,7 +2278,9 @@ static int compat_do_replace(struct net *net, void __user *user, xt_compat_lock(NFPROTO_BRIDGE); - xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); + ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); + if (ret < 0) + goto out_unlock; ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); if (ret < 0) goto out_unlock; @@ -2267,7 +2302,8 @@ static int compat_do_replace(struct net *net, void __user *user, state.buf_kern_len = size64; ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); - BUG_ON(ret < 0); /* parses same data again */ + if (WARN_ON(ret < 0)) + goto out_unlock; vfree(entries_tmp); tmp.entries_size = size64; diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c deleted file mode 100644 index 
5160cf614176..000000000000 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/netfilter_bridge.h> -#include <net/netfilter/nf_tables.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <net/netfilter/nf_tables_ipv4.h> -#include <net/netfilter/nf_tables_ipv6.h> - -static unsigned int -nft_do_chain_bridge(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - - switch (eth_hdr(skb)->h_proto) { - case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); - break; - case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); - break; - default: - nft_set_pktinfo_unspec(&pkt, skb); - break; - } - - return nft_do_chain(&pkt, priv); -} - -static const struct nf_chain_type filter_bridge = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_BRIDGE, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_BR_PRE_ROUTING) | - (1 << NF_BR_LOCAL_IN) | - (1 << NF_BR_FORWARD) | - (1 << NF_BR_LOCAL_OUT) | - (1 << NF_BR_POST_ROUTING), - .hooks = { - [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, - [NF_BR_LOCAL_IN] = nft_do_chain_bridge, - [NF_BR_FORWARD] = nft_do_chain_bridge, - [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, - [NF_BR_POST_ROUTING] = nft_do_chain_bridge, - }, -}; - -static int __init nf_tables_bridge_init(void) -{ - return nft_register_chain_type(&filter_bridge); -} - -static void __exit nf_tables_bridge_exit(void) -{ - nft_unregister_chain_type(&filter_bridge); -} - 
-module_init(nf_tables_bridge_init); -module_exit(nf_tables_bridge_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_BRIDGE, "filter"); diff --git a/net/can/af_can.c b/net/can/af_can.c index 6da324550eec..1684ba5b51eb 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -72,7 +72,7 @@ MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, " MODULE_ALIAS_NETPROTO(PF_CAN); static int stats_timer __read_mostly = 1; -module_param(stats_timer, int, S_IRUGO); +module_param(stats_timer, int, 0444); MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); static struct kmem_cache *rcv_cache __read_mostly; diff --git a/net/can/gw.c b/net/can/gw.c index 398dd0395ad9..faa3da88a127 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -72,7 +72,7 @@ MODULE_ALIAS(CAN_GW_NAME); #define CGW_DEFAULT_HOPS 1 static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; -module_param(max_hops, uint, S_IRUGO); +module_param(max_hops, uint, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" diff --git a/net/can/raw.c b/net/can/raw.c index f2ecc43376a1..1051eee82581 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -470,7 +470,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) } static int raw_getname(struct socket *sock, struct sockaddr *uaddr, - int *len, int peer) + int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; @@ -483,9 +483,7 @@ static int raw_getname(struct socket *sock, struct sockaddr *uaddr, addr->can_family = AF_CAN; addr->can_ifindex = ro->ifindex; - *len = sizeof(*addr); - - return 0; + return sizeof(*addr); } static int raw_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 1e492ef2a33d..4adf07826f4a 100644 --- a/net/ceph/ceph_common.c +++ 
b/net/ceph/ceph_common.c @@ -54,7 +54,7 @@ static const struct kernel_param_ops param_ops_supported_features = { .get = param_get_supported_features, }; module_param_cb(supported_features, &param_ops_supported_features, NULL, - S_IRUGO); + 0444); const char *ceph_msg_type_name(int type) { @@ -418,6 +418,7 @@ ceph_parse_options(char *options, const char *dev_name, opt->flags |= CEPH_OPT_FSID; break; case Opt_name: + kfree(opt->name); opt->name = kstrndup(argstr[0].from, argstr[0].to-argstr[0].from, GFP_KERNEL); @@ -427,6 +428,9 @@ ceph_parse_options(char *options, const char *dev_name, } break; case Opt_secret: + ceph_crypto_key_destroy(opt->key); + kfree(opt->key); + opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); if (!opt->key) { err = -ENOMEM; @@ -437,6 +441,9 @@ ceph_parse_options(char *options, const char *dev_name, goto out; break; case Opt_key: + ceph_crypto_key_destroy(opt->key); + kfree(opt->key); + opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); if (!opt->key) { err = -ENOMEM; diff --git a/net/compat.c b/net/compat.c index 22381719718c..5ae7437d3853 100644 --- a/net/compat.c +++ b/net/compat.c @@ -383,8 +383,8 @@ static int compat_sock_setsockopt(struct socket *sock, int level, int optname, return sock_setsockopt(sock, level, optname, optval, optlen); } -COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, - char __user *, optval, unsigned int, optlen) +static int __compat_sys_setsockopt(int fd, int level, int optname, + char __user *optval, unsigned int optlen) { int err; struct socket *sock = sockfd_lookup(fd, &err); @@ -410,6 +410,12 @@ COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, return err; } +COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, + char __user *, optval, unsigned int, optlen) +{ + return __compat_sys_setsockopt(fd, level, optname, optval, optlen); +} + static int do_get_sock_timeout(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ 
-503,8 +509,9 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta } EXPORT_SYMBOL(compat_sock_get_timestampns); -COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, - char __user *, optval, int __user *, optlen) +static int __compat_sys_getsockopt(int fd, int level, int optname, + char __user *optval, + int __user *optlen) { int err; struct socket *sock = sockfd_lookup(fd, &err); @@ -530,6 +537,12 @@ COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, return err; } +COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, + char __user *, optval, int __user *, optlen) +{ + return __compat_sys_getsockopt(fd, level, optname, optval, optlen); +} + struct compat_group_req { __u32 gr_interface; struct __kernel_sockaddr_storage gr_group @@ -734,38 +747,72 @@ static unsigned char nas[21] = { }; #undef AL -COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags) +static inline long __compat_sys_sendmsg(int fd, + struct compat_msghdr __user *msg, + unsigned int flags) { - return __sys_sendmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT); + return __sys_sendmsg(fd, (struct user_msghdr __user *)msg, + flags | MSG_CMSG_COMPAT, false); +} + +COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg, + unsigned int, flags) +{ + return __compat_sys_sendmsg(fd, msg, flags); +} + +static inline long __compat_sys_sendmmsg(int fd, + struct compat_mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags) +{ + return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT, false); } COMPAT_SYSCALL_DEFINE4(sendmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { - return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT); + return __compat_sys_sendmmsg(fd, mmsg, vlen, flags); +} + +static inline long __compat_sys_recvmsg(int 
fd, + struct compat_msghdr __user *msg, + unsigned int flags) +{ + return __sys_recvmsg(fd, (struct user_msghdr __user *)msg, + flags | MSG_CMSG_COMPAT, false); +} + +COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg, + unsigned int, flags) +{ + return __compat_sys_recvmsg(fd, msg, flags); } -COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags) +static inline long __compat_sys_recvfrom(int fd, void __user *buf, + compat_size_t len, unsigned int flags, + struct sockaddr __user *addr, + int __user *addrlen) { - return __sys_recvmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT); + return __sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, + addrlen); } COMPAT_SYSCALL_DEFINE4(recv, int, fd, void __user *, buf, compat_size_t, len, unsigned int, flags) { - return sys_recv(fd, buf, len, flags | MSG_CMSG_COMPAT); + return __compat_sys_recvfrom(fd, buf, len, flags, NULL, NULL); } COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int __user *, addrlen) { - return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen); + return __compat_sys_recvfrom(fd, buf, len, flags, addr, addrlen); } -COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, - unsigned int, vlen, unsigned int, flags, - struct compat_timespec __user *, timeout) +static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags, + struct compat_timespec __user *timeout) { int datagrams; struct timespec ktspec; @@ -785,6 +832,13 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, return datagrams; } +COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct compat_timespec __user *, timeout) +{ + return __compat_sys_recvmmsg(fd, mmsg, vlen, 
flags, timeout); +} + COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) { u32 a[AUDITSC_ARGS]; @@ -810,68 +864,72 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) switch (call) { case SYS_SOCKET: - ret = sys_socket(a0, a1, a[2]); + ret = __sys_socket(a0, a1, a[2]); break; case SYS_BIND: - ret = sys_bind(a0, compat_ptr(a1), a[2]); + ret = __sys_bind(a0, compat_ptr(a1), a[2]); break; case SYS_CONNECT: - ret = sys_connect(a0, compat_ptr(a1), a[2]); + ret = __sys_connect(a0, compat_ptr(a1), a[2]); break; case SYS_LISTEN: - ret = sys_listen(a0, a1); + ret = __sys_listen(a0, a1); break; case SYS_ACCEPT: - ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0); + ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0); break; case SYS_GETSOCKNAME: - ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2])); + ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2])); break; case SYS_GETPEERNAME: - ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2])); + ret = __sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2])); break; case SYS_SOCKETPAIR: - ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3])); + ret = __sys_socketpair(a0, a1, a[2], compat_ptr(a[3])); break; case SYS_SEND: - ret = sys_send(a0, compat_ptr(a1), a[2], a[3]); + ret = __sys_sendto(a0, compat_ptr(a1), a[2], a[3], NULL, 0); break; case SYS_SENDTO: - ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]); + ret = __sys_sendto(a0, compat_ptr(a1), a[2], a[3], + compat_ptr(a[4]), a[5]); break; case SYS_RECV: - ret = compat_sys_recv(a0, compat_ptr(a1), a[2], a[3]); + ret = __compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], + NULL, NULL); break; case SYS_RECVFROM: - ret = compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], - compat_ptr(a[4]), compat_ptr(a[5])); + ret = __compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], + compat_ptr(a[4]), + compat_ptr(a[5])); break; case SYS_SHUTDOWN: - ret = sys_shutdown(a0, a1); + ret = 
__sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: - ret = compat_sys_setsockopt(a0, a1, a[2], - compat_ptr(a[3]), a[4]); + ret = __compat_sys_setsockopt(a0, a1, a[2], + compat_ptr(a[3]), a[4]); break; case SYS_GETSOCKOPT: - ret = compat_sys_getsockopt(a0, a1, a[2], - compat_ptr(a[3]), compat_ptr(a[4])); + ret = __compat_sys_getsockopt(a0, a1, a[2], + compat_ptr(a[3]), + compat_ptr(a[4])); break; case SYS_SENDMSG: - ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); + ret = __compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); break; case SYS_SENDMMSG: - ret = compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]); + ret = __compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]); break; case SYS_RECVMSG: - ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); + ret = __compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); break; case SYS_RECVMMSG: - ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3], - compat_ptr(a[4])); + ret = __compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3], + compat_ptr(a[4])); break; case SYS_ACCEPT4: - ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]); + ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]); break; default: ret = -EINVAL; diff --git a/net/core/dev.c b/net/core/dev.c index d4362befe7e2..9b04a9fd1dfd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1571,6 +1571,27 @@ static void dev_disable_gro_hw(struct net_device *dev) netdev_WARN(dev, "failed to disable GRO_HW!\n"); } +const char *netdev_cmd_to_name(enum netdev_cmd cmd) +{ +#define N(val) \ + case NETDEV_##val: \ + return "NETDEV_" __stringify(val); + switch (cmd) { + N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) + N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) + N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) + N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) + N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) + N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) 
+ N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) + N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) + N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) + }; +#undef N + return "UNKNOWN_NETDEV_EVENT"; +} +EXPORT_SYMBOL_GPL(netdev_cmd_to_name); + static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { @@ -1604,6 +1625,8 @@ int register_netdevice_notifier(struct notifier_block *nb) struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) @@ -1626,6 +1649,7 @@ int register_netdevice_notifier(struct notifier_block *nb) unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; rollback: @@ -1670,6 +1694,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb) struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) @@ -1687,6 +1713,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) } unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); @@ -2378,7 +2405,7 @@ EXPORT_SYMBOL(netdev_set_num_tc); /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues - * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. + * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. 
*/ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { @@ -2735,7 +2762,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) return 0; - eth = (struct ethhdr *)skb_mac_header(skb); + eth = (struct ethhdr *)skb->data; type = eth->h_proto; } @@ -3278,15 +3305,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { - struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + const struct netprio_map *map; + const struct sock *sk; + unsigned int prioidx; + + if (skb->priority) + return; + map = rcu_dereference_bh(skb->dev->priomap); + if (!map) + return; + sk = skb_to_full_sk(skb); + if (!sk) + return; - if (!skb->priority && skb->sk && map) { - unsigned int prioidx = - sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); + prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); - if (prioidx < map->priomap_len) - skb->priority = map->priomap[prioidx]; - } + if (prioidx < map->priomap_len) + skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) @@ -4351,6 +4386,9 @@ int netdev_rx_handler_register(struct net_device *dev, if (netdev_is_rx_handler_busy(dev)) return -EBUSY; + if (dev->priv_flags & IFF_NO_RX_HANDLER) + return -EINVAL; + /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); @@ -6396,6 +6434,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, .linking = true, .upper_info = upper_info, }; + struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); @@ -6407,11 +6446,14 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; - if (netdev_has_upper_dev(dev, upper_dev)) - return -EEXIST; - - if (master && netdev_master_upper_dev_get(dev)) - return -EBUSY; + if 
(!master) { + if (netdev_has_upper_dev(dev, upper_dev)) + return -EEXIST; + } else { + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + return master_dev == upper_dev ? -EEXIST : -EBUSY; + } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); @@ -7542,6 +7584,19 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + /* LRO/HW-GRO features cannot be combined with RX-FCS */ + if (features & NETIF_F_RXFCS) { + if (features & NETIF_F_LRO) { + netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_LRO; + } + + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + return features; } @@ -7613,6 +7668,24 @@ sync_lower: } } + if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { + if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_ctag_filter_info(dev); + } else { + vlan_drop_rx_ctag_filter_info(dev); + } + } + + if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { + if (features & NETIF_F_HW_VLAN_STAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_stag_filter_info(dev); + } else { + vlan_drop_rx_stag_filter_info(dev); + } + } + dev->features = features; } @@ -7998,7 +8071,8 @@ int register_netdev(struct net_device *dev) { int err; - rtnl_lock(); + if (rtnl_lock_killable()) + return -EINTR; err = register_netdevice(dev); rtnl_unlock(); return err; @@ -8048,7 +8122,6 @@ static void netdev_wait_allrefs(struct net_device *dev) rcu_barrier(); rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events @@ -8120,10 +8193,6 @@ void netdev_run_todo(void) = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); - rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - __rtnl_unlock(); - if 
(unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", dev->name, dev->reg_state); @@ -8141,8 +8210,9 @@ void netdev_run_todo(void) BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); +#if IS_ENABLED(CONFIG_DECNET) WARN_ON(dev->dn_ptr); - +#endif if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) @@ -8564,7 +8634,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); new_nsid = peernet2id_alloc(dev_net(dev), net); /* If there is an ifindex conflict assign a new one */ diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 0ab1af04296c..a04e1e88bf3a 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -402,8 +402,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c if (colon) *colon = 0; - dev_load(net, ifr->ifr_name); - /* * See which interface the caller is talking about. 
*/ @@ -423,6 +421,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCGIFMAP: case SIOCGIFINDEX: case SIOCGIFTXQLEN: + dev_load(net, ifr->ifr_name); rcu_read_lock(); ret = dev_ifsioc_locked(net, ifr, cmd); rcu_read_unlock(); @@ -431,6 +430,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c return ret; case SIOCETHTOOL: + dev_load(net, ifr->ifr_name); rtnl_lock(); ret = dev_ethtool(net, ifr); rtnl_unlock(); @@ -447,6 +447,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSIFNAME: + dev_load(net, ifr->ifr_name); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_lock(); @@ -494,6 +495,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c /* fall through */ case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: + dev_load(net, ifr->ifr_name); rtnl_lock(); ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); @@ -518,6 +520,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c cmd == SIOCGHWTSTAMP || (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { + dev_load(net, ifr->ifr_name); rtnl_lock(); ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); diff --git a/net/core/devlink.c b/net/core/devlink.c index 18d385ed8237..9236e421bd62 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1695,10 +1695,11 @@ static int devlink_dpipe_table_put(struct sk_buff *skb, goto nla_put_failure; if (table->resource_valid) { - nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, - table->resource_id, DEVLINK_ATTR_PAD); - nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, - table->resource_units, DEVLINK_ATTR_PAD); + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, + table->resource_id, DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, + table->resource_units, DEVLINK_ATTR_PAD)) + 
goto nla_put_failure; } if (devlink_dpipe_matches_put(table, skb)) goto nla_put_failure; @@ -1797,7 +1798,7 @@ send_done: if (!nlh) { err = devlink_dpipe_send_and_alloc_skb(&skb, info); if (err) - goto err_skb_send_alloc; + return err; goto send_done; } @@ -1806,7 +1807,6 @@ send_done: nla_put_failure: err = -EMSGSIZE; err_table_put: -err_skb_send_alloc: genlmsg_cancel(skb, hdr); nlmsg_free(skb); return err; @@ -2072,7 +2072,7 @@ static int devlink_dpipe_entries_fill(struct genl_info *info, table->counters_enabled, &dump_ctx); if (err) - goto err_entries_dump; + return err; send_done: nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq, @@ -2080,16 +2080,10 @@ send_done: if (!nlh) { err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info); if (err) - goto err_skb_send_alloc; + return err; goto send_done; } return genlmsg_reply(dump_ctx.skb, info); - -err_entries_dump: -err_skb_send_alloc: - genlmsg_cancel(dump_ctx.skb, dump_ctx.hdr); - nlmsg_free(dump_ctx.skb); - return err; } static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, @@ -2228,7 +2222,7 @@ send_done: if (!nlh) { err = devlink_dpipe_send_and_alloc_skb(&skb, info); if (err) - goto err_skb_send_alloc; + return err; goto send_done; } return genlmsg_reply(skb, info); @@ -2236,7 +2230,6 @@ send_done: nla_put_failure: err = -EMSGSIZE; err_table_put: -err_skb_send_alloc: genlmsg_cancel(skb, hdr); nlmsg_free(skb); return err; @@ -2332,12 +2325,38 @@ devlink_resource_validate_children(struct devlink_resource *resource) list_for_each_entry(child_resource, &resource->resource_list, list) parts_size += child_resource->size_new; - if (parts_size > resource->size) + if (parts_size > resource->size_new) size_valid = false; out: resource->size_valid = size_valid; } +static int +devlink_resource_validate_size(struct devlink_resource *resource, u64 size, + struct netlink_ext_ack *extack) +{ + u64 reminder; + int err = 0; + + if (size > resource->size_params.size_max) { + NL_SET_ERR_MSG_MOD(extack, 
"Size larger than maximum"); + err = -EINVAL; + } + + if (size < resource->size_params.size_min) { + NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum"); + err = -EINVAL; + } + + div64_u64_rem(size, resource->size_params.size_granularity, &reminder); + if (reminder) { + NL_SET_ERR_MSG_MOD(extack, "Wrong granularity"); + err = -EINVAL; + } + + return err; +} + static int devlink_nl_cmd_resource_set(struct sk_buff *skb, struct genl_info *info) { @@ -2356,12 +2375,8 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb, if (!resource) return -EINVAL; - if (!resource->resource_ops->size_validate) - return -EINVAL; - size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); - err = resource->resource_ops->size_validate(devlink, size, - info->extack); + err = devlink_resource_validate_size(resource, size, info->extack); if (err) return err; @@ -2372,20 +2387,22 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb, return 0; } -static void +static int devlink_resource_size_params_put(struct devlink_resource *resource, struct sk_buff *skb) { struct devlink_resource_size_params *size_params; - size_params = resource->size_params; - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, - size_params->size_granularity, DEVLINK_ATTR_PAD); - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, - size_params->size_max, DEVLINK_ATTR_PAD); - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, - size_params->size_min, DEVLINK_ATTR_PAD); - nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit); + size_params = &resource->size_params; + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, + size_params->size_granularity, DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, + size_params->size_max, DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, + size_params->size_min, DEVLINK_ATTR_PAD) || + nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit)) + return -EMSGSIZE; + return 
0; } static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, @@ -2409,10 +2426,12 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, resource->size_new, DEVLINK_ATTR_PAD); if (resource->resource_ops && resource->resource_ops->occ_get) - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, - resource->resource_ops->occ_get(devlink), - DEVLINK_ATTR_PAD); - devlink_resource_size_params_put(resource, skb); + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, + resource->resource_ops->occ_get(devlink), + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (devlink_resource_size_params_put(resource, skb)) + goto nla_put_failure; if (list_empty(&resource->resource_list)) goto out; @@ -2717,22 +2736,22 @@ static const struct genl_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, .doit = devlink_nl_cmd_dpipe_table_get, .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, .doit = devlink_nl_cmd_dpipe_entries_get, .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, .doit = devlink_nl_cmd_dpipe_headers_get, .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, @@ -2752,8 +2771,8 @@ static const struct genl_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_RESOURCE_DUMP, .doit = devlink_nl_cmd_resource_dump, .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_RELOAD, @@ -3147,17 +3166,19 @@ 
EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); */ int devlink_resource_register(struct devlink *devlink, const char *resource_name, - bool top_hierarchy, u64 resource_size, u64 resource_id, u64 parent_resource_id, - struct devlink_resource_size_params *size_params, + const struct devlink_resource_size_params *size_params, const struct devlink_resource_ops *resource_ops) { struct devlink_resource *resource; struct list_head *resource_list; + bool top_hierarchy; int err = 0; + top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP; + mutex_lock(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (resource) { @@ -3194,7 +3215,8 @@ int devlink_resource_register(struct devlink *devlink, resource->id = resource_id; resource->resource_ops = resource_ops; resource->size_valid = true; - resource->size_params = size_params; + memcpy(&resource->size_params, size_params, + sizeof(resource->size_params)); INIT_LIST_HEAD(&resource->resource_list); list_add_tail(&resource->list, resource_list); out: diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 554d36449231..64cef977484a 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -107,7 +107,7 @@ EXPORT_SYMBOL_GPL(dst_cache_set_ip4); #if IS_ENABLED(CONFIG_IPV6) void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, - const struct in6_addr *addr) + const struct in6_addr *saddr) { struct dst_cache_pcpu *idst; @@ -117,7 +117,7 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, rt6_get_cookie((struct rt6_info *)dst)); - idst->in6_saddr = *addr; + idst->in6_saddr = *saddr; } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 494e6a5d7306..03416e6dd5d7 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -22,6 +22,7 @@ #include <linux/bitops.h> #include 
<linux/uaccess.h> #include <linux/vmalloc.h> +#include <linux/sfp.h> #include <linux/slab.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> @@ -107,6 +108,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", + [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", }; static const char @@ -121,6 +123,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_ID_UNSPEC] = "Unspec", [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", + [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", }; static const char @@ -1022,6 +1025,15 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; + /* If FLOW_RSS was requested then user-space must be using the + * new definition, as FLOW_RSS is newer. 
+ */ + if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { + info_size = sizeof(info); + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + } + if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) @@ -1251,9 +1263,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || - rxfh.rsvd8[2] || rxfh.rsvd32) + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; + /* Most drivers don't handle rss_context, check it's 0 as well */ + if (rxfh.rss_context && !ops->get_rxfh_context) + return -EOPNOTSUPP; rxfh.indir_size = dev_indir_size; rxfh.key_size = dev_key_size; @@ -1276,7 +1290,12 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (user_key_size) hkey = rss_config + indir_bytes; - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + if (rxfh.rss_context) + ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey, + &dev_hfunc, + rxfh.rss_context); + else + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); if (ret) goto out; @@ -1306,6 +1325,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u8 *hkey = NULL; u8 *rss_config; u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); + bool delete = false; if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; @@ -1319,9 +1339,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, return -EFAULT; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || - rxfh.rsvd8[2] || rxfh.rsvd32) + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; + /* Most drivers don't handle rss_context, check it's 0 as well */ + if (rxfh.rss_context && !ops->set_rxfh_context) + return 
-EOPNOTSUPP; /* If either indir, hash key or function is valid, proceed further. * Must request at least one change: indir size, hash key or function. @@ -1346,7 +1368,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (ret) goto out; - /* rxfh.indir_size == 0 means reset the indir table to default. + /* rxfh.indir_size == 0 means reset the indir table to default (master + * context) or delete the context (other RSS contexts). * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. */ if (rxfh.indir_size && @@ -1359,9 +1382,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (ret) goto out; } else if (rxfh.indir_size == 0) { - indir = (u32 *)rss_config; - for (i = 0; i < dev_indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + if (rxfh.rss_context == 0) { + indir = (u32 *)rss_config; + for (i = 0; i < dev_indir_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } else { + delete = true; + } } if (rxfh.key_size) { @@ -1374,15 +1401,25 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } } - ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (rxfh.rss_context) + ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc, + &rxfh.rss_context, delete); + else + ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); if (ret) goto out; - /* indicate whether rxfh was set to default */ - if (rxfh.indir_size == 0) - dev->priv_flags &= ~IFF_RXFH_CONFIGURED; - else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) - dev->priv_flags |= IFF_RXFH_CONFIGURED; + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), + &rxfh.rss_context, sizeof(rxfh.rss_context))) + ret = -EFAULT; + + if (!rxfh.rss_context) { + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; + } 
out: kfree(rss_config); @@ -2210,6 +2247,9 @@ static int __ethtool_get_module_info(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; + if (dev->sfp_bus) + return sfp_get_module_info(dev->sfp_bus, modinfo); + if (phydev && phydev->drv && phydev->drv->module_info) return phydev->drv->module_info(phydev, modinfo); @@ -2244,6 +2284,9 @@ static int __ethtool_get_module_eeprom(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; + if (dev->sfp_bus) + return sfp_get_module_eeprom(dev->sfp_bus, ee, data); + if (phydev && phydev->drv && phydev->drv->module_eeprom) return phydev->drv->module_eeprom(phydev, ee, data); @@ -2277,6 +2320,11 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; break; + case ETHTOOL_PFC_PREVENTION_TOUT: + if (tuna->len != sizeof(u16) || + tuna->type_id != ETHTOOL_TUNABLE_U16) + return -EINVAL; + break; default: return -EINVAL; } @@ -2520,11 +2568,14 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr) static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) { struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM }; + int rc; if (!dev->ethtool_ops->get_fecparam) return -EOPNOTSUPP; - dev->ethtool_ops->get_fecparam(dev, &fecparam); + rc = dev->ethtool_ops->get_fecparam(dev, &fecparam); + if (rc) + return rc; if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) return -EFAULT; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 0c048bdeb016..13a40b831d6d 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -13,16 +13,22 @@ int call_fib_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + int err; + info->net = net; - return nb->notifier_call(nb, event_type, info); + err = nb->notifier_call(nb, 
event_type, info); + return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + int err; + info->net = net; - return atomic_notifier_call_chain(&fib_chain, event_type, info); + err = atomic_notifier_call_chain(&fib_chain, event_type, info); + return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifiers); @@ -33,6 +39,7 @@ static unsigned int fib_seq_sum(void) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) { rcu_read_lock(); list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { @@ -43,6 +50,7 @@ static unsigned int fib_seq_sum(void) } rcu_read_unlock(); } + up_read(&net_rwsem); rtnl_unlock(); return fib_seq; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 98e1066c3d55..33958f84c173 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule) if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; + if (fib_rule_port_range_set(&rule->sport_range)) + return false; + if (fib_rule_port_range_set(&rule->dport_range)) + return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); @@ -51,6 +55,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->pref = pref; r->table = table; r->flags = flags; + r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; @@ -220,6 +225,26 @@ static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } +static int nla_get_port_range(struct nlattr *pattr, + struct fib_rule_port_range *port_range) +{ + const struct fib_rule_port_range *pr = nla_data(pattr); + + if (!fib_rule_port_range_valid(pr)) + return -EINVAL; + + port_range->start = pr->start; + port_range->end = pr->end; + + return 0; +} + +static int 
nla_put_port_range(struct sk_buff *skb, int attrtype, + struct fib_rule_port_range *range) +{ + return nla_put(skb, attrtype, sizeof(*range), range); +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) @@ -424,6 +449,17 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; + if (r->ip_proto != rule->ip_proto) + continue; + + if (!fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (!fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -469,6 +505,9 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) : fib_default_rule_pref(ops); + rule->proto = tb[FRA_PROTOCOL] ? + nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; + if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -565,6 +604,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, rule->uid_range = fib_kuid_range_unset; } + if (tb[FRA_IP_PROTO]) + rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); + + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &rule->sport_range); + if (err) + goto errout_free; + } + + if (tb[FRA_DPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_DPORT_RANGE], + &rule->dport_range); + if (err) + goto errout_free; + } + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -575,6 +631,11 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout_free; + err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, + extack); + if (err < 0) + goto errout_free; + list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; @@ -611,7 +672,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if 
(rule->tun_id) ip_tunnel_need_metadata(); - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -630,6 +690,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rule_port_range sprange = {0, 0}; + struct fib_rule_port_range dprange = {0, 0}; struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *r; struct nlattr *tb[FRA_MAX+1]; @@ -663,7 +725,25 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, range = fib_kuid_range_unset; } + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &sprange); + if (err) + goto errout; + } + + if (tb[FRA_DPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_DPORT_RANGE], + &dprange); + if (err) + goto errout; + } + list_for_each_entry(rule, &ops->rules_list, list) { + if (tb[FRA_PROTOCOL] && + (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) + continue; + if (frh->action && (frh->action != rule->action)) continue; @@ -704,6 +784,18 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, !uid_eq(rule->uid_range.end, range.end))) continue; + if (tb[FRA_IP_PROTO] && + (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO]))) + continue; + + if (fib_rule_port_range_set(&sprange) && + !fib_rule_port_range_compare(&rule->sport_range, &sprange)) + continue; + + if (fib_rule_port_range_set(&dprange) && + !fib_rule_port_range_compare(&rule->dport_range, &dprange)) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -781,7 +873,11 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ - + nla_total_size(sizeof(struct fib_kuid_range)); + + nla_total_size(sizeof(struct fib_kuid_range)) + + nla_total_size(1) /* FRA_PROTOCOL */ + 
+ nla_total_size(1) /* FRA_IP_PROTO */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -812,6 +908,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->action = rule->action; frh->flags = rule->flags; + if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) + goto nla_put_failure; + if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; @@ -843,7 +942,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && - nla_put_uid_range(skb, &rule->uid_range))) + nla_put_uid_range(skb, &rule->uid_range)) || + (fib_rule_port_range_set(&rule->sport_range) && + nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (fib_rule_port_range_set(&rule->dport_range) && + nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/core/filter.c b/net/core/filter.c index 08ab4c65a998..d31aff93270d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -33,6 +33,7 @@ #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/gfp.h> +#include <net/inet_common.h> #include <net/ip.h> #include <net/protocol.h> #include <net/netlink.h> @@ -1855,7 +1856,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); /* If user passes invalid input drop the packet. 
*/ - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; tcb->bpf.key = key; @@ -1890,6 +1891,202 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + msg->key = key; + msg->flags = flags; + msg->map = map; + + return SK_PASS; +} + +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + struct sock *sk = NULL; + + if (msg->map) { + sk = __sock_map_lookup_elem(msg->map, msg->key); + + msg->key = 0; + msg->map = NULL; + } + + return sk; +} + +static const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->apply_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { + .func = bpf_msg_apply_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->cork_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { + .func = bpf_msg_cork_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_pull_data, + struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) +{ + unsigned int len = 0, offset = 0, copy = 0; + struct scatterlist *sg = msg->sg_data; + int first_sg, last_sg, i, shift; + unsigned char *p, *to, *from; + int bytes = end - start; + struct page *page; + + if 
(unlikely(flags || end <= start)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg_start; + do { + len = sg[i].length; + offset += len; + if (start < offset + len) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != msg->sg_end); + + if (unlikely(start >= offset + len)) + return -EINVAL; + + if (!msg->sg_copy[i] && bytes <= len) + goto out; + + first_sg = i; + + /* At this point we need to linearize multiple scatterlist + * elements or a single shared page. Either way we need to + * copy into a linear buffer exclusively owned by BPF. Then + * place the buffer in the scatterlist and fixup the original + * entries by removing the entries now in the linear buffer + * and shifting the remaining entries. For now we do not try + * to copy partial entries to avoid complexity of running out + * of sg_entry slots. The downside is reading a single byte + * will copy the entire sg entry. + */ + do { + copy += sg[i].length; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (bytes < copy) + break; + } while (i != msg->sg_end); + last_sg = i; + + if (unlikely(copy < end - start)) + return -EINVAL; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); + if (unlikely(!page)) + return -ENOMEM; + p = page_address(page); + offset = 0; + + i = first_sg; + do { + from = sg_virt(&sg[i]); + len = sg[i].length; + to = p + offset; + + memcpy(to, from, len); + offset += len; + sg[i].length = 0; + put_page(sg_page(&sg[i])); + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != last_sg); + + sg[first_sg].length = copy; + sg_set_page(&sg[first_sg], page, copy, 0); + + /* To repair sg ring we need to shift entries. If we only + * had a single entry though we can just replace it and + * be done. Otherwise walk the ring and shift the entries. 
+ */ + shift = last_sg - first_sg - 1; + if (!shift) + goto out; + + i = first_sg + 1; + do { + int move_from; + + if (i + shift >= MAX_SKB_FRAGS) + move_from = i + shift - MAX_SKB_FRAGS; + else + move_from = i + shift; + + if (move_from == msg->sg_end) + break; + + sg[i] = sg[move_from]; + sg[move_from].length = 0; + sg[move_from].page_link = 0; + sg[move_from].offset = 0; + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (1); + msg->sg_end -= shift; + if (msg->sg_end < 0) + msg->sg_end += MAX_SKB_FRAGS; +out: + msg->data = sg_virt(&sg[i]) + start - offset; + msg->data_end = msg->data + bytes; + + return 0; +} + +static const struct bpf_func_proto bpf_msg_pull_data_proto = { + .func = bpf_msg_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2087,6 +2284,10 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -2096,19 +2297,21 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* SKB_GSO_TCPV4 needs to be changed into * SKB_GSO_TCPV6. */ - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { - skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; - skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + if (shinfo->gso_type & SKB_GSO_TCPV4) { + shinfo->gso_type &= ~SKB_GSO_TCPV4; + shinfo->gso_type |= SKB_GSO_TCPV6; } /* Due to IPv6 header, MSS needs to be downgraded. */ - skb_shinfo(skb)->gso_size -= len_diff; + skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. 
*/ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IPV6); @@ -2123,6 +2326,10 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -2132,19 +2339,21 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* SKB_GSO_TCPV6 needs to be changed into * SKB_GSO_TCPV4. */ - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { - skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; - skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + if (shinfo->gso_type & SKB_GSO_TCPV6) { + shinfo->gso_type &= ~SKB_GSO_TCPV6; + shinfo->gso_type |= SKB_GSO_TCPV4; } /* Due to IPv4 header, MSS can be upgraded. */ - skb_shinfo(skb)->gso_size += len_diff; + skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IP); @@ -2243,6 +2452,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -2252,11 +2465,13 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* Due to header grow, MSS needs to be downgraded. 
*/ - skb_shinfo(skb)->gso_size -= len_diff; + skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } return 0; @@ -2267,6 +2482,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -2276,11 +2495,13 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* Due to header shrink, MSS can be upgraded. */ - skb_shinfo(skb)->gso_size += len_diff; + skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. 
*/ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } return 0; @@ -2831,7 +3052,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta) + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data) return true; return false; @@ -2991,7 +3213,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | - BPF_F_DONT_FRAGMENT))) + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -3025,6 +3247,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; if (flags & BPF_F_ZERO_CSUM_TX) info->key.tun_flags &= ~TUNNEL_CSUM; + if (flags & BPF_F_SEQ_NUMBER) + info->key.tun_flags |= TUNNEL_SEQ; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -3239,6 +3463,27 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET + } else if (level == SOL_IP) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IP_TOS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct inet_sock *inet = inet_sk(sk); + + if (val == -1) + val = 0; + inet->tos = val; + } + break; + default: + ret = -EINVAL; + } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { if (optlen != sizeof(int) || sk->sk_family != AF_INET6) @@ -3338,6 +3583,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } + } else if (level == SOL_IP) { + struct inet_sock *inet = inet_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != 
AF_INET) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IP_TOS: + *((int *)optval) = (int)inet->tos; + break; + default: + goto err_clear; + } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -3381,17 +3640,13 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; - if (!sk_fullsock(sk)) + if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; -#ifdef CONFIG_INET if (val) tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); -#else - return -EINVAL; -#endif } static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { @@ -3402,6 +3657,52 @@ static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .arg2_type = ARG_ANYTHING, }; +const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_bpf_stub); + +BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, + int, addr_len) +{ +#ifdef CONFIG_INET + struct sock *sk = ctx->sk; + int err; + + /* Binding to port can be expensive so it's prohibited in the helper. + * Only binding to IP is supported. 
+ */ + err = -EINVAL; + if (addr->sa_family == AF_INET) { + if (addr_len < sizeof(struct sockaddr_in)) + return err; + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + return err; + return __inet_bind(sk, addr, addr_len, true, false); +#if IS_ENABLED(CONFIG_IPV6) + } else if (addr->sa_family == AF_INET6) { + if (addr_len < SIN6_LEN_RFC2133) + return err; + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + return err; + /* ipv6_bpf_stub cannot be NULL, since it's called from + * bpf_cgroup_inet6_connect hook and ipv6 is already loaded + */ + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); +#endif /* CONFIG_IPV6 */ + } +#endif /* CONFIG_INET */ + + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_bind_proto = { + .func = bpf_bind, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3431,7 +3732,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sock_filter_func_proto(enum bpf_func_id func_id) +sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { /* inet and inet6 sockets are created in a process @@ -3445,7 +3746,29 @@ sock_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sk_filter_func_proto(enum bpf_func_id func_id) +sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + /* inet and inet6 sockets are created in a process + * context so there is always a valid uid/gid + */ + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_bind: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_bind_proto; + default: + return NULL; + } + default: + return 
bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3460,7 +3783,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -tc_cls_act_func_proto(enum bpf_func_id func_id) +tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3527,7 +3850,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -xdp_func_proto(enum bpf_func_id func_id) +xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -3550,7 +3873,7 @@ xdp_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id) +lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3577,7 +3900,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * - sock_ops_func_proto(enum bpf_func_id func_id) +sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: @@ -3593,7 +3916,25 @@ static const struct bpf_func_proto * } } -static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_msg_redirect_map: + return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_apply_bytes: + return &bpf_msg_apply_bytes_proto; + case BPF_FUNC_msg_cork_bytes: + return &bpf_msg_cork_bytes_proto; + case BPF_FUNC_msg_pull_data: + return &bpf_msg_pull_data_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * 
+sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3618,7 +3959,7 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_xmit_func_proto(enum bpf_func_id func_id) +lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: @@ -3648,11 +3989,12 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id); + return lwt_inout_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -3696,6 +4038,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -3716,11 +4059,12 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -3750,32 +4094,83 @@ static bool lwt_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } -static bool sock_filter_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) + +/* Attach type specific accesses */ +static bool __sock_filter_check_attach_type(int off, + enum bpf_access_type access_type, + enum bpf_attach_type 
attach_type) { - if (type == BPF_WRITE) { - switch (off) { - case offsetof(struct bpf_sock, bound_dev_if): - case offsetof(struct bpf_sock, mark): - case offsetof(struct bpf_sock, priority): - break; + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + case offsetof(struct bpf_sock, mark): + case offsetof(struct bpf_sock, priority): + switch (attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + goto full_access; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_ip4): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + switch (attach_type) { + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_port): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; default: return false; } } +read_only: + return access_type == BPF_READ; +full_access: + return true; +} - if (off < 0 || off + size > sizeof(struct bpf_sock)) +static bool __sock_filter_check_size(int off, int size, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + switch (off) { + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } + + return size == size_default; +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; - /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; - if (size != sizeof(__u32)) + if (!__sock_filter_check_attach_type(off, type, + prog->expected_attach_type)) + return false; + if (!__sock_filter_check_size(off, size, info)) return false; - return true; } @@ -3826,6 +4221,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -3855,7 +4251,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, return false; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool __is_valid_xdp_access(int off, int size) @@ -3872,6 +4268,7 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) @@ -3902,8 +4299,74 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool sock_addr_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_addr)) + return false; + if (off % size != 0) + return false; + + /* Disallow access to IPv6 fields from IPv4 contex and vise + * versa. 
+ */ + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET4_CONNECT: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET6_CONNECT: + break; + default: + return false; + } + break; + } + + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + /* Only narrow read access allowed for now. */ + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } else { + if (size != size_default) + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, user_port): + if (size != size_default) + return false; + break; + default: + if (type == BPF_READ) { + if (size != size_default) + return false; + } else { + return false; + } + } + + return true; +} + static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -3950,6 +4413,7 @@ static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -3979,7 +4443,34 @@ static bool sk_skb_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +static bool sk_msg_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case 
offsetof(struct sk_msg_md, data): + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct sk_msg_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + if (off < 0 || off >= sizeof(struct sk_msg_md)) + return false; + if (off % size != 0) + return false; + if (size != sizeof(__u64)) + return false; + + return true; } static u32 bpf_convert_ctx_access(enum bpf_access_type type, @@ -4287,6 +4778,7 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): @@ -4342,6 +4834,43 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; + + case offsetof(struct bpf_sock, src_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_rcv_saddr, + FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr), + target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, src_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off( + struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]), + target_size) + off); +#else + (void)off; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock, src_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_num), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_num, + FIELD_SIZEOF(struct sock_common, + skc_num), + target_size)); + break; } return insn - insn_buf; @@ -4417,6 +4946,152 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn 
- insn_buf; } +/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of + * context Structure, F is Field in context structure that contains a pointer + * to Nested Structure of type NS that has the field NF. + * + * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make + * sure that SIZE is not greater than actual size of S.F.NF. + * + * If offset OFF is provided, the load happens from that offset relative to + * offset of NF. + */ +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ + do { \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ + si->src_reg, offsetof(S, F)); \ + *insn++ = BPF_LDX_MEM( \ + SIZE, si->dst_reg, si->dst_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + } while (0) + +#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ + BPF_FIELD_SIZEOF(NS, NF), 0) + +/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to + * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. + * + * It doesn't support SIZE argument though since narrow stores are not + * supported for now. + * + * In addition it uses Temporary Field TF (member of struct S) as the 3rd + * "register" since two registers available in convert_ctx_access are not + * enough: we can't override neither SRC, since it contains value to store, nor + * DST since it contains pointer to context that may be used by later + * instructions. But we need a temporary place to save pointer to nested + * structure whose field we want to store to. 
+ */ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ + do { \ + int tmp_reg = BPF_REG_9; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ + offsetof(S, TF)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ + si->dst_reg, offsetof(S, F)); \ + *insn++ = BPF_STX_MEM( \ + BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ + offsetof(S, TF)); \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ + TF) \ + do { \ + if (type == BPF_WRITE) { \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ + TF); \ + } else { \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, SIZE, OFF); \ + } \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) + +static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_addr, user_family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr, uaddr, sa_family); + break; + + case offsetof(struct bpf_sock_addr, user_ip4): + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, + sin_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, user_ip6[0]); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct 
sockaddr_in6, uaddr, + sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, + tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, user_port): + /* To get port we need to know sa_family first and then treat + * sockaddr as either sockaddr_in or sockaddr_in6. + * Though we can simplify since port field has same offset and + * size in both structures. + * Here we check this invariant and use just one of the + * structures if it's true. + */ + BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != + offsetof(struct sockaddr_in6, sin6_port)); + BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) != + FIELD_SIZEOF(struct sockaddr_in6, sin6_port)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr_in6, uaddr, + sin6_port, tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_family); + break; + + case offsetof(struct bpf_sock_addr, type): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock_addr, protocol): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -4780,6 +5455,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + 
struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_msg_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data)); + break; + case offsetof(struct sk_msg_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data_end)); + break; + } + + return insn - insn_buf; +} + const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -4851,6 +5549,15 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = { const struct bpf_prog_ops cg_sock_prog_ops = { }; +const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { + .get_func_proto = sock_addr_func_proto, + .is_valid_access = sock_addr_is_valid_access, + .convert_ctx_access = sock_addr_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_addr_prog_ops = { +}; + const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, @@ -4870,6 +5577,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = { const struct bpf_prog_ops sk_skb_prog_ops = { }; +const struct bpf_verifier_ops sk_msg_verifier_ops = { + .get_func_proto = sk_msg_func_proto, + .is_valid_access = sk_msg_is_valid_access, + .convert_ctx_access = sk_msg_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_msg_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 559db9ea8d86..d29f09bc5ff9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1341,22 +1341,6 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) } EXPORT_SYMBOL(__get_hash_from_flowi6); -__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys) -{ - 
memset(keys, 0, sizeof(*keys)); - - keys->addrs.v4addrs.src = fl4->saddr; - keys->addrs.v4addrs.dst = fl4->daddr; - keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - keys->ports.src = fl4->fl4_sport; - keys->ports.dst = fl4->fl4_dport; - keys->keyid.keyid = fl4->fl4_gre_key; - keys->basic.ip_proto = fl4->flowi4_proto; - - return flow_hash_from_keys(keys); -} -EXPORT_SYMBOL(__get_hash_from_flowi4); - static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 0a3f88f08727..98fd12721221 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -66,6 +66,7 @@ struct net_rate_estimator { static void est_fetch_counters(struct net_rate_estimator *e, struct gnet_stats_basic_packed *b) { + memset(b, 0, sizeof(*b)); if (e->stats_lock) spin_lock(e->stats_lock); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index e010bb800d7b..9737302907b1 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -315,12 +315,12 @@ static int __net_init dev_proc_net_init(struct net *net) { int rc = -ENOMEM; - if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops)) + if (!proc_create("dev", 0444, net->proc_net, &dev_seq_fops)) goto out; - if (!proc_create("softnet_stat", S_IRUGO, net->proc_net, + if (!proc_create("softnet_stat", 0444, net->proc_net, &softnet_seq_fops)) goto out_dev; - if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops)) + if (!proc_create("ptype", 0444, net->proc_net, &ptype_seq_fops)) goto out_softnet; if (wext_proc_init(net)) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 60a5ad2c33ee..c476f0794132 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -431,7 +431,7 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr, return netdev_store(dev, attr, buf, len, change_group); } NETDEVICE_SHOW(group, fmt_dec); -static 
DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); +static DEVICE_ATTR(netdev_group, 0644, group_show, group_store); static int change_proto_down(struct net_device *dev, unsigned long proto_down) { @@ -854,10 +854,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, } static struct rx_queue_attribute rps_cpus_attribute __ro_after_init - = __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); + = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map); static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init - = __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, + = __ATTR(rps_flow_cnt, 0644, show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); #endif /* CONFIG_RPS */ @@ -1154,7 +1154,7 @@ static ssize_t bql_set_hold_time(struct netdev_queue *queue, } static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init - = __ATTR(hold_time, S_IRUGO | S_IWUSR, + = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); static ssize_t bql_show_inflight(struct netdev_queue *queue, @@ -1166,7 +1166,7 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue, } static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = - __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL); + __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ @@ -1182,7 +1182,7 @@ static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ } \ \ static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \ - = __ATTR(NAME, S_IRUGO | S_IWUSR, \ + = __ATTR(NAME, 0644, \ bql_show_ ## NAME, bql_set_ ## NAME) BQL_ATTR(limit, limit); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3cad5f51afd3..a11e03f920d3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -29,11 +29,14 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device 
= &pernet_list; -DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); +/* Protects net_namespace_list. Nests iside rtnl_lock() */ +DECLARE_RWSEM(net_rwsem); +EXPORT_SYMBOL_GPL(net_rwsem); + struct net init_net = { .count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), @@ -41,6 +44,14 @@ struct net init_net = { EXPORT_SYMBOL(init_net); static bool init_net_initialized; +/* + * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, + * init_net_initialized and first_device pointer. + * This is internal net namespace object. Please, don't use it + * outside. + */ +DECLARE_RWSEM(pernet_ops_rwsem); +EXPORT_SYMBOL_GPL(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) @@ -65,11 +76,10 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; - BUG_ON(!mutex_is_locked(&net_mutex)); BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, - lockdep_is_held(&net_mutex)); + lockdep_is_held(&pernet_ops_rwsem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; @@ -286,7 +296,7 @@ struct net *get_net_ns_by_id(struct net *net, int id) */ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { - /* Must be called with net_mutex held */ + /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; int error = 0; LIST_HEAD(net_exit_list); @@ -297,12 +307,16 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) net->user_ns = user_ns; idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); + mutex_init(&net->ipv4.ra_mutex); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); if (error < 0) goto out_undo; } + down_write(&net_rwsem); + list_add_tail_rcu(&net->list, &net_namespace_list); + up_write(&net_rwsem); out: return error; @@ -354,7 
+368,7 @@ static void dec_net_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); } -static struct kmem_cache *net_cachep; +static struct kmem_cache *net_cachep __ro_after_init; static struct workqueue_struct *netns_wq; static struct net *net_alloc(void) @@ -408,32 +422,27 @@ struct net *copy_net_ns(unsigned long flags, net = net_alloc(); if (!net) { - dec_net_namespaces(ucounts); - return ERR_PTR(-ENOMEM); + rv = -ENOMEM; + goto dec_ucounts; } - + refcount_set(&net->passive, 1); + net->ucounts = ucounts; get_user_ns(user_ns); - rv = mutex_lock_killable(&net_mutex); - if (rv < 0) { - net_free(net); - dec_net_namespaces(ucounts); - put_user_ns(user_ns); - return ERR_PTR(rv); - } + rv = down_read_killable(&pernet_ops_rwsem); + if (rv < 0) + goto put_userns; - net->ucounts = ucounts; rv = setup_net(net, user_ns); - if (rv == 0) { - rtnl_lock(); - list_add_tail_rcu(&net->list, &net_namespace_list); - rtnl_unlock(); - } - mutex_unlock(&net_mutex); + + up_read(&pernet_ops_rwsem); + if (rv < 0) { - dec_net_namespaces(ucounts); +put_userns: put_user_ns(user_ns); net_drop_ns(net); +dec_ucounts: + dec_net_namespaces(ucounts); return ERR_PTR(rv); } return net; @@ -446,7 +455,7 @@ static void unhash_nsid(struct net *net, struct net *last) * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below * is executing, the list may only grow. Thus, we do not - * use for_each_net_rcu() or rtnl_lock(). + * use for_each_net_rcu() or net_rwsem. 
*/ for_each_net(tmp) { int id; @@ -466,26 +475,23 @@ static void unhash_nsid(struct net *net, struct net *last) spin_unlock_bh(&net->nsid_lock); } -static DEFINE_SPINLOCK(cleanup_list_lock); -static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ +static LLIST_HEAD(cleanup_list); static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; struct net *net, *tmp, *last; - struct list_head net_kill_list; + struct llist_node *net_kill_list; LIST_HEAD(net_exit_list); /* Atomically snapshot the list of namespaces to cleanup */ - spin_lock_irq(&cleanup_list_lock); - list_replace_init(&cleanup_list, &net_kill_list); - spin_unlock_irq(&cleanup_list_lock); + net_kill_list = llist_del_all(&cleanup_list); - mutex_lock(&net_mutex); + down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ - rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) + down_write(&net_rwsem); + llist_for_each_entry(net, net_kill_list, cleanup_list) list_del_rcu(&net->list); /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer @@ -498,9 +504,9 @@ static void cleanup_net(struct work_struct *work) * useless anyway, as netns_ids are destroyed there. */ last = list_last_entry(&net_namespace_list, struct net, list); - rtnl_unlock(); + up_write(&net_rwsem); - list_for_each_entry(net, &net_kill_list, cleanup_list) { + llist_for_each_entry(net, net_kill_list, cleanup_list) { unhash_nsid(net, last); list_add_tail(&net->exit_list, &net_exit_list); } @@ -520,7 +526,7 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); - mutex_unlock(&net_mutex); + up_read(&pernet_ops_rwsem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. 
@@ -547,8 +553,8 @@ static void cleanup_net(struct work_struct *work) */ void net_ns_barrier(void) { - mutex_lock(&net_mutex); - mutex_unlock(&net_mutex); + down_write(&pernet_ops_rwsem); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL(net_ns_barrier); @@ -557,13 +563,8 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { /* Cleanup the network namespace in process context */ - unsigned long flags; - - spin_lock_irqsave(&cleanup_list_lock, flags); - list_add(&net->cleanup_list, &cleanup_list); - spin_unlock_irqrestore(&cleanup_list_lock, flags); - - queue_work(netns_wq, &net_cleanup_work); + if (llist_add(&net->cleanup_list, &cleanup_list)) + queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); @@ -861,7 +862,7 @@ static int __init net_ns_init(void) #ifdef CONFIG_NET_NS net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, - SLAB_PANIC, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); @@ -875,17 +876,12 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); - mutex_lock(&net_mutex); + down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); init_net_initialized = true; - - rtnl_lock(); - list_add_tail_rcu(&init_net.list, &net_namespace_list); - rtnl_unlock(); - - mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); register_pernet_subsys(&net_ns_ops); @@ -909,6 +905,9 @@ static int __register_pernet_operations(struct list_head *list, list_add_tail(&ops->list, list); if (ops->init || (ops->id && ops->size)) { + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. 
+ */ for_each_net(net) { error = ops_init(ops, net); if (error) @@ -932,6 +931,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) LIST_HEAD(net_exit_list); list_del(&ops->list); + /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); ops_exit_list(ops, &net_exit_list); @@ -996,7 +996,6 @@ again: static void unregister_pernet_operations(struct pernet_operations *ops) { - __unregister_pernet_operations(ops); rcu_barrier(); if (ops->id) @@ -1025,9 +1024,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops) int register_pernet_subsys(struct pernet_operations *ops) { int error; - mutex_lock(&net_mutex); + down_write(&pernet_ops_rwsem); error = register_pernet_operations(first_device, ops); - mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); @@ -1043,9 +1042,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys); */ void unregister_pernet_subsys(struct pernet_operations *ops) { - mutex_lock(&net_mutex); + down_write(&pernet_ops_rwsem); unregister_pernet_operations(ops); - mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); @@ -1071,11 +1070,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys); int register_pernet_device(struct pernet_operations *ops) { int error; - mutex_lock(&net_mutex); + down_write(&pernet_ops_rwsem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; - mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); @@ -1091,11 +1090,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device); */ void unregister_pernet_device(struct pernet_operations *ops) { - mutex_lock(&net_mutex); + down_write(&pernet_ops_rwsem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); - 
mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b8ab5c829511..7e4ede34cc52 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -906,13 +906,14 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (debug) { - size_t copy = min_t(size_t, count, 1023); - char tb[copy + 1]; - if (copy_from_user(tb, user_buffer, copy)) - return -EFAULT; - tb[copy] = 0; - pr_debug("%s,%lu buffer -:%s:-\n", - name, (unsigned long)count, tb); + size_t copy = min_t(size_t, count + 1, 1024); + char *tp = strndup_user(user_buffer, copy); + + if (IS_ERR(tp)) + return PTR_ERR(tp); + + pr_debug("%s,%zu buffer -:%s:-\n", name, count, tp); + kfree(tp); } if (!strcmp(name, "min_pkt_size")) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bc290413a49d..45936922d7e2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -75,6 +75,12 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_killable(void) +{ + return mutex_lock_killable(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_lock_killable); + static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) { @@ -406,7 +412,9 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister * - * The caller must hold the rtnl_mutex. + * The caller must hold the rtnl_mutex and guarantee net_namespace_list + * integrity (hold pernet_ops_rwsem for writing to close the race + * with setup_net() and cleanup_net()). */ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { @@ -432,6 +440,9 @@ static void rtnl_lock_unregistering_all(void) for (;;) { unregistering = false; rtnl_lock(); + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. 
+ */ for_each_net(net) { if (net->dev_unreg_count > 0) { unregistering = true; @@ -453,12 +464,12 @@ static void rtnl_lock_unregistering_all(void) */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { - /* Close the race with cleanup_net() */ - mutex_lock(&net_mutex); + /* Close the race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); - mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 09bd89c90a71..1bca1e0fc8f7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -77,8 +77,8 @@ #include <linux/capability.h> #include <linux/user_namespace.h> -struct kmem_cache *skbuff_head_cache __read_mostly; -static struct kmem_cache *skbuff_fclone_cache __read_mostly; +struct kmem_cache *skbuff_head_cache __ro_after_init; +static struct kmem_cache *skbuff_fclone_cache __ro_after_init; int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); -static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) +int mm_account_pinned_pages(struct mmpin *mmp, size_t size) { unsigned long max_pg, num_pg, new_pg, old_pg; struct user_struct *user; @@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) return 0; } +EXPORT_SYMBOL_GPL(mm_account_pinned_pages); -static void mm_unaccount_pinned_pages(struct mmpin *mmp) +void mm_unaccount_pinned_pages(struct mmpin *mmp) { if (mmp->user) { atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); free_uid(mmp->user); } } +EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) { @@ -3458,6 +3460,19 @@ void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) } 
EXPORT_SYMBOL_GPL(skb_pull_rcsum); +static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) +{ + skb_frag_t head_frag; + struct page *page; + + page = virt_to_head_page(frag_skb->head); + head_frag.page.p = page; + head_frag.page_offset = frag_skb->data - + (unsigned char *)page_address(page); + head_frag.size = skb_headlen(frag_skb); + return head_frag; +} + /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment @@ -3662,15 +3677,19 @@ normal: while (pos < offset + len) { if (i >= nfrags) { - BUG_ON(skb_headlen(list_skb)); - i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; + if (!skb_headlen(list_skb)) { + BUG_ON(!nfrags); + } else { + BUG_ON(!list_skb->head_frag); - BUG_ON(!nfrags); - + /* to make room for head_frag. */ + i--; + frag--; + } if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) @@ -3687,7 +3706,7 @@ normal: goto err; } - *nskb_frag = *frag; + *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; __skb_frag_ref(nskb_frag); size = skb_frag_size(nskb_frag); @@ -4179,7 +4198,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + sk->sk_error_report(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); @@ -4891,7 +4910,7 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet); * * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
*/ -unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int thlen = 0; @@ -4904,7 +4923,7 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) thlen += inner_tcp_hdrlen(skb); } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { thlen = tcp_hdrlen(skb); - } else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) { + } else if (unlikely(skb_is_gso_sctp(skb))) { thlen = sizeof(struct sctphdr); } /* UFO sets gso_size to the size of the fragmentation @@ -4913,7 +4932,40 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) */ return thlen + shinfo->gso_size; } -EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. + */ +static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_mac_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_mac_seglen is used to determine the real size of the + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 + * headers (TCP/UDP). 
+ */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} /** * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS @@ -4955,19 +5007,20 @@ static inline bool skb_gso_size_check(const struct sk_buff *skb, } /** - * skb_gso_validate_mtu - Return in case such skb fits a given MTU + * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? * * @skb: GSO skb * @mtu: MTU to validate against * - * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU - * once split. + * skb_gso_validate_network_len validates if a given skb will fit a + * wanted MTU once split. It considers L3 headers, L4 headers, and the + * payload. */ -bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu) +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) { return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); } -EXPORT_SYMBOL_GPL(skb_gso_validate_mtu); +EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); /** * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? 
@@ -4986,13 +5039,18 @@ EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { + int mac_len; + if (skb_cow(skb, skb_headroom(skb)) < 0) { kfree_skb(skb); return NULL; } - memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN, - 2 * ETH_ALEN); + mac_len = skb->data - skb_mac_header(skb); + if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { + memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), + mac_len - VLAN_HLEN - ETH_TLEN); + } skb->mac_header += VLAN_HLEN; return skb; } diff --git a/net/core/sock.c b/net/core/sock.c index c501499a04fe..6444525f610c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1049,16 +1049,18 @@ set_rcvbuf: break; case SO_ZEROCOPY: - if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { + if (sk->sk_protocol != IPPROTO_TCP) + ret = -ENOTSUPP; + } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; - else if (sk->sk_protocol != IPPROTO_TCP) - ret = -ENOTSUPP; - else if (sk->sk_state != TCP_CLOSE) - ret = -EBUSY; - else if (val < 0 || val > 1) - ret = -EINVAL; - else - sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + } + if (!ret) { + if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + } break; default: @@ -1274,7 +1276,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname, { char address[128]; - if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) + lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); + if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; @@ -1773,7 +1776,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) u32 max_segs = 1; sk_dst_set(sk, dst); - sk->sk_route_caps = dst->dev->features; + sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; sk->sk_route_caps &= ~sk->sk_route_nocaps; 
@@ -2234,6 +2237,67 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); +int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, + int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, + int first_coalesce) +{ + int sg_curr = *sg_curr_index, use = 0, rc = 0; + unsigned int size = *sg_curr_size; + struct page_frag *pfrag; + struct scatterlist *sge; + + len -= size; + pfrag = sk_page_frag(sk); + + while (len > 0) { + unsigned int orig_offset; + + if (!sk_page_frag_refill(sk, pfrag)) { + rc = -ENOMEM; + goto out; + } + + use = min_t(int, len, pfrag->size - pfrag->offset); + + if (!sk_wmem_schedule(sk, use)) { + rc = -ENOMEM; + goto out; + } + + sk_mem_charge(sk, use); + size += use; + orig_offset = pfrag->offset; + pfrag->offset += use; + + sge = sg + sg_curr - 1; + if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && + sg->offset + sg->length == orig_offset) { + sg->length += use; + } else { + sge = sg + sg_curr; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + sg_curr++; + + if (sg_curr == MAX_SKB_FRAGS) + sg_curr = 0; + + if (sg_curr == sg_start) { + rc = -ENOSPC; + break; + } + } + + len -= use; + } +out: + *sg_curr_size = size; + *sg_curr_index = sg_curr; + return rc; +} +EXPORT_SYMBOL(sk_alloc_sg); + static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) @@ -2497,7 +2561,7 @@ int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, - int *len, int peer) + int peer) { return -EOPNOTSUPP; } @@ -3261,6 +3325,27 @@ void proto_unregister(struct proto *prot) } EXPORT_SYMBOL(proto_unregister); +int sock_load_diag_module(int family, int protocol) +{ + if (!protocol) { + if (!sock_is_registered(family)) + return -ENOENT; + + return request_module("net-pf-%d-proto-%d-type-%d", 
PF_NETLINK, + NETLINK_SOCK_DIAG, family); + } + +#ifdef CONFIG_INET + if (family == AF_INET && + !rcu_access_pointer(inet_protos[protocol])) + return -ENOENT; +#endif + + return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family, protocol); +} +EXPORT_SYMBOL(sock_load_diag_module); + #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) @@ -3369,7 +3454,7 @@ static const struct file_operations proto_seq_fops = { static __net_init int proto_init_net(struct net *net) { - if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops)) + if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops)) return -ENOMEM; return 0; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 146b50e30659..c37b5be7c5e4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -220,8 +220,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (sock_diag_handlers[req->sdiag_family] == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, req->sdiag_family); + sock_load_diag_module(req->sdiag_family, 0); mutex_lock(&sock_diag_table_mutex); hndl = sock_diag_handlers[req->sdiag_family]; @@ -247,8 +246,7 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case TCPDIAG_GETSOCK: case DCCPDIAG_GETSOCK: if (inet_rcv_compat == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + sock_load_diag_module(AF_INET, 0); mutex_lock(&sock_diag_table_mutex); if (inet_rcv_compat != NULL) @@ -281,14 +279,12 @@ static int sock_diag_bind(struct net *net, int group) case SKNLGRP_INET_TCP_DESTROY: case SKNLGRP_INET_UDP_DESTROY: if (!sock_diag_handlers[AF_INET]) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + sock_load_diag_module(AF_INET, 0); break; case SKNLGRP_INET6_TCP_DESTROY: case 
SKNLGRP_INET6_UDP_DESTROY: if (!sock_diag_handlers[AF_INET6]) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET6); + sock_load_diag_module(AF_INET6, 0); break; } return 0; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index f2d0462611c3..b3b609f0eeb5 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -32,6 +32,9 @@ static int max_skb_frags = MAX_SKB_FRAGS; static int net_msg_warn; /* Unused, but still a sysctl */ +int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; +EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -513,6 +516,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, + { + .procname = "fb_tunnels_only_for_init_net", + .data = &sysctl_fb_tunnels_only_for_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 15bdc002d90c..84cd4e3fd01b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -794,6 +794,11 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (skb == NULL) goto out_release; + if (sk->sk_state == DCCP_CLOSED) { + rc = -ENOTCONN; + goto out_discard; + } + skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 791aff68af88..32751602767f 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1180,14 +1180,12 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags, } -static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len,int peer) +static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer) { struct 
sockaddr_dn *sa = (struct sockaddr_dn *)uaddr; struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); - *uaddr_len = sizeof(struct sockaddr_dn); - lock_sock(sk); if (peer) { @@ -1205,7 +1203,7 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len release_sock(sk); - return 0; + return sizeof(struct sockaddr_dn); } @@ -2385,7 +2383,7 @@ static int __init decnet_init(void) dev_add_pack(&dn_dix_packet_type); register_netdevice_notifier(&dn_dev_notifier); - proc_create("decnet", S_IRUGO, init_net.proc_net, &dn_socket_seq_fops); + proc_create("decnet", 0444, init_net.proc_net, &dn_socket_seq_fops); dn_register_sysctl(); out: return rc; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index c9f5e1ebb9c8..c03b046478c3 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1424,7 +1424,7 @@ void __init dn_dev_init(void) rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, 0); - proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops); + proc_create("decnet_dev", 0444, init_net.proc_net, &dn_dev_seq_fops); #ifdef CONFIG_SYSCTL { diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 6e37d9e6345e..13156165afa3 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -608,7 +608,7 @@ static const struct file_operations dn_neigh_seq_fops = { void __init dn_neigh_init(void) { neigh_table_init(NEIGH_DN_TABLE, &dn_neigh_table); - proc_create("decnet_neigh", S_IRUGO, init_net.proc_net, + proc_create("decnet_neigh", 0444, init_net.proc_net, &dn_neigh_seq_fops); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index ef20b8e31669..eca0cc6b761f 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1918,7 +1918,7 @@ void __init dn_route_init(void) dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1); - proc_create("decnet_cache", S_IRUGO, init_net.proc_net, + proc_create("decnet_cache", 0444, init_net.proc_net, &dn_rt_cache_seq_fops); #ifdef 
CONFIG_DECNET_ROUTER diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index e1d4d898a007..8396705deffc 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -38,7 +38,7 @@ MODULE_AUTHOR("Wang Lei"); MODULE_LICENSE("GPL"); unsigned int dns_resolver_debug; -module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO); +module_param_named(debug, dns_resolver_debug, uint, 0644); MODULE_PARM_DESC(debug, "DNS Resolver debugging mask"); const struct cred *dns_resolver_cache; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6a9d0f50fbee..e63c554e0623 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -23,6 +23,7 @@ #include <linux/netdevice.h> #include <linux/sysfs.h> #include <linux/phy_fixed.h> +#include <linux/ptp_classify.h> #include <linux/gpio/consumer.h> #include <linux/etherdevice.h> @@ -122,6 +123,38 @@ struct net_device *dsa_dev_to_net_device(struct device *dev) } EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); +/* Determine if we should defer delivery of skb until we have a rx timestamp. + * + * Called from dsa_switch_rcv. For now, this will only work if tagging is + * enabled on the switch. Normally the MAC driver would retrieve the hardware + * timestamp when it reads the packet out of the hardware. However in a DSA + * switch, the DSA driver owning the interface to which the packet is + * delivered is never notified unless we do so here. 
+ */ +static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p, + struct sk_buff *skb) +{ + struct dsa_switch *ds = p->dp->ds; + unsigned int type; + + if (skb_headroom(skb) < ETH_HLEN) + return false; + + __skb_push(skb, ETH_HLEN); + + type = ptp_classify_raw(skb); + + __skb_pull(skb, ETH_HLEN); + + if (type == PTP_CLASS_NONE) + return false; + + if (likely(ds->ops->port_rxtstamp)) + return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); + + return false; +} + static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { @@ -157,6 +190,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, s->rx_bytes += skb->len; u64_stats_update_end(&s->syncp); + if (dsa_skb_defer_rx_timestamp(p, skb)) + return 0; + netif_receive_skb(skb); return 0; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index cb54b81d0bd9..42a7b85b84e1 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -194,7 +194,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, ds->ports[i].dn = cd->port_dn[i]; ds->ports[i].cpu_dp = dst->cpu_dp; - if (dsa_is_user_port(ds, i)) + if (!dsa_is_user_port(ds, i)) continue; ret = dsa_slave_create(&ds->ports[i]); diff --git a/net/dsa/master.c b/net/dsa/master.c index 00589147f042..90e6df0351eb 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -42,7 +42,7 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset) count += ops->get_sset_count(dev, sset); if (sset == ETH_SS_STATS && ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds); + count += ds->ops->get_sset_count(ds, cpu_dp->index); return count; } @@ -76,7 +76,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, * constructed earlier */ ds->ops->get_strings(ds, port, ndata); - count = ds->ops->get_sset_count(ds); + count = ds->ops->get_sset_count(ds, port); for (i = 0; i < count; i++) { memmove(ndata + (i * len + sizeof(pfx)), ndata + i * len, len - 
sizeof(pfx)); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f52307296de4..18561af7a8f1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -21,6 +21,7 @@ #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/netpoll.h> +#include <linux/ptp_classify.h> #include "dsa_priv.h" @@ -255,6 +256,22 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->dp->ds; + int port = p->dp->index; + + /* Pass through to switch driver if it supports timestamping */ + switch (cmd) { + case SIOCGHWTSTAMP: + if (ds->ops->port_hwtstamp_get) + return ds->ops->port_hwtstamp_get(ds, port, ifr); + break; + case SIOCSHWTSTAMP: + if (ds->ops->port_hwtstamp_set) + return ds->ops->port_hwtstamp_set(ds, port, ifr); + break; + } + if (!dev->phydev) return -ENODEV; @@ -385,6 +402,30 @@ static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, return NETDEV_TX_OK; } +static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, + struct sk_buff *skb) +{ + struct dsa_switch *ds = p->dp->ds; + struct sk_buff *clone; + unsigned int type; + + type = ptp_classify_raw(skb); + if (type == PTP_CLASS_NONE) + return; + + if (!ds->ops->port_txtstamp) + return; + + clone = skb_clone_sk(skb); + if (!clone) + return; + + if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) + return; + + kfree_skb(clone); +} + static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -397,6 +438,11 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) s->tx_bytes += skb->len; u64_stats_update_end(&s->syncp); + /* Identify PTP protocol packets, clone them, and pass them to the + * switch driver + */ + dsa_skb_tx_timestamp(p, skb); + /* Transmit function may have to reallocate the original SKB, * in which case 
it must have freed it. Only free it here on error. */ @@ -559,7 +605,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) count = 4; if (ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds); + count += ds->ops->get_sset_count(ds, dp->index); return count; } @@ -918,6 +964,18 @@ static int dsa_slave_set_rxnfc(struct net_device *dev, return ds->ops->set_rxnfc(ds, dp->index, nfc); } +static int dsa_slave_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *ts) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->dp->ds; + + if (!ds->ops->get_ts_info) + return -EOPNOTSUPP; + + return ds->ops->get_ts_info(ds, p->dp->index, ts); +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, @@ -938,6 +996,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .set_link_ksettings = phy_ethtool_set_link_ksettings, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, + .get_ts_info = dsa_slave_get_ts_info, }; /* legacy way, bypassing the bridge *****************************************/ diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index d8de3bcfb103..b8d95cb71c25 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result; #define LOWPAN_DISPATCH_FRAG1 0xc0 #define LOWPAN_DISPATCH_FRAGN 0xe0 -struct lowpan_create_arg { +struct frag_lowpan_compare_key { u16 tag; u16 d_size; - const struct ieee802154_addr *src; - const struct ieee802154_addr *dst; + const struct ieee802154_addr src; + const struct ieee802154_addr dst; }; -/* Equivalent of ipv4 struct ip +/* Equivalent of ipv4 struct ipq */ struct lowpan_frag_queue { struct inet_frag_queue q; - - u16 tag; - u16 d_size; - struct ieee802154_addr saddr; - struct ieee802154_addr daddr; }; -static inline u32 ieee802154_addr_hash(const 
struct ieee802154_addr *a) -{ - switch (a->mode) { - case IEEE802154_ADDR_LONG: - return (((__force u64)a->extended_addr) >> 32) ^ - (((__force u64)a->extended_addr) & 0xffffffff); - case IEEE802154_ADDR_SHORT: - return (__force u32)(a->short_addr + (a->pan_id << 16)); - default: - return 0; - } -} - int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); void lowpan_net_frag_exit(void); int lowpan_net_frag_init(void); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 974765b7d92a..275449b0d633 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -104,6 +104,7 @@ static void lowpan_setup(struct net_device *ldev) /* We need an ipv6hdr as minimum len when calling xmit */ ldev->hard_header_len = sizeof(struct ipv6hdr); ldev->flags = IFF_BROADCAST | IFF_MULTICAST; + ldev->priv_flags |= IFF_NO_QUEUE; ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; @@ -206,9 +207,13 @@ static inline void lowpan_netlink_fini(void) static int lowpan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *wdev = netdev_notifier_info_to_dev(ptr); + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct wpan_dev *wpan_dev; - if (wdev->type != ARPHRD_IEEE802154) + if (ndev->type != ARPHRD_IEEE802154) + return NOTIFY_DONE; + wpan_dev = ndev->ieee802154_ptr; + if (!wpan_dev) return NOTIFY_DONE; switch (event) { @@ -217,8 +222,8 @@ static int lowpan_device_event(struct notifier_block *unused, * also delete possible lowpan interfaces which belongs * to the wpan interface. 
*/ - if (wdev->ieee802154_ptr->lowpan_dev) - lowpan_dellink(wdev->ieee802154_ptr->lowpan_dev, NULL); + if (wpan_dev->lowpan_dev) + lowpan_dellink(wpan_dev->lowpan_dev, NULL); break; default: return NOTIFY_DONE; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 85bf86ad6b18..1790b65944b3 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct net_device *ldev); -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, - const struct ieee802154_addr *saddr, - const struct ieee802154_addr *daddr) -{ - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); - return jhash_3words(ieee802154_addr_hash(saddr), - ieee802154_addr_hash(daddr), - (__force u32)(tag + (d_size << 16)), - lowpan_frags.rnd); -} - -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) -{ - const struct lowpan_frag_queue *fq; - - fq = container_of(q, struct lowpan_frag_queue, q); - return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); -} - -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct lowpan_frag_queue *fq; - const struct lowpan_create_arg *arg = a; - - fq = container_of(q, struct lowpan_frag_queue, q); - return fq->tag == arg->tag && fq->d_size == arg->d_size && - ieee802154_addr_equal(&fq->saddr, arg->src) && - ieee802154_addr_equal(&fq->daddr, arg->dst); -} - static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { - const struct lowpan_create_arg *arg = a; + const struct frag_lowpan_compare_key *key = a; struct lowpan_frag_queue *fq; fq = container_of(q, struct lowpan_frag_queue, q); - fq->tag = arg->tag; - fq->d_size = arg->d_size; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; + BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); + memcpy(&q->key, key, sizeof(*key)); } 
static void lowpan_frag_expire(struct timer_list *t) @@ -94,10 +62,10 @@ static void lowpan_frag_expire(struct timer_list *t) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q, &lowpan_frags); + inet_frag_kill(&fq->q); out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &lowpan_frags); + inet_frag_put(&fq->q); } static inline struct lowpan_frag_queue * @@ -105,25 +73,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { - struct inet_frag_queue *q; - struct lowpan_create_arg arg; - unsigned int hash; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + struct frag_lowpan_compare_key key = { + .tag = cb->d_tag, + .d_size = cb->d_size, + .src = *src, + .dst = *dst, + }; + struct inet_frag_queue *q; - arg.tag = cb->d_tag; - arg.d_size = cb->d_size; - arg.src = src; - arg.dst = dst; - - hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); - - q = inet_frag_find(&ieee802154_lowpan->frags, - &lowpan_frags, &arg, hash); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + q = inet_frag_find(&ieee802154_lowpan->frags, &key); + if (!q) return NULL; - } + return container_of(q, struct lowpan_frag_queue, q); } @@ -230,7 +193,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct sk_buff *fp, *head = fq->q.fragments; int sum_truesize; - inet_frag_kill(&fq->q, &lowpan_frags); + inet_frag_kill(&fq->q); /* Make the one we just received the head. 
*/ if (prev) { @@ -438,7 +401,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &lowpan_frags); + inet_frag_put(&fq->q); return ret; } @@ -448,24 +411,22 @@ err: } #ifdef CONFIG_SYSCTL -static int zero; static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .data = &init_net.ieee802154_lowpan.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", .data = &init_net.ieee802154_lowpan.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .proc_handler = proc_doulongvec_minmax, .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, { @@ -581,14 +542,20 @@ static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + int res; ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; + ieee802154_lowpan->frags.f = &lowpan_frags; - inet_frags_init_net(&ieee802154_lowpan->frags); - - return lowpan_frags_ns_sysctl_register(net); + res = inet_frags_init_net(&ieee802154_lowpan->frags); + if (res < 0) + return res; + res = lowpan_frags_ns_sysctl_register(net); + if (res < 0) + inet_frags_exit_net(&ieee802154_lowpan->frags); + return res; } static void __net_exit lowpan_frags_exit_net(struct net *net) @@ -597,7 +564,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net) net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); + 
inet_frags_exit_net(&ieee802154_lowpan->frags); } static struct pernet_operations lowpan_frags_ops = { @@ -605,32 +572,63 @@ static struct pernet_operations lowpan_frags_ops = { .exit = lowpan_frags_exit_net, }; -int __init lowpan_net_frag_init(void) +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) { - int ret; + return jhash2(data, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} - ret = lowpan_frags_sysctl_register(); - if (ret) - return ret; +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; - ret = register_pernet_subsys(&lowpan_frags_ops); - if (ret) - goto err_pernet; + return jhash2((const u32 *)&fq->key, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} + +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_lowpan_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params lowpan_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = lowpan_key_hashfn, + .obj_hashfn = lowpan_obj_hashfn, + .obj_cmpfn = lowpan_obj_cmpfn, + .automatic_shrinking = true, +}; + +int __init lowpan_net_frag_init(void) +{ + int ret; - lowpan_frags.hashfn = lowpan_hashfn; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); - lowpan_frags.match = lowpan_frag_match; lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frags_cache_name = lowpan_frags_cache_name; + lowpan_frags.rhash_params = lowpan_rhash_params; ret = inet_frags_init(&lowpan_frags); if (ret) - goto err_pernet; + goto out; + ret = lowpan_frags_sysctl_register(); + if (ret) + goto err_sysctl; + + ret = register_pernet_subsys(&lowpan_frags_ops); + if (ret) + goto err_pernet; +out: return ret; err_pernet: lowpan_frags_sysctl_unregister(); 
+err_sysctl: + inet_frags_fini(&lowpan_frags); return ret; } diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index f48fe6fc7e8c..80dad301361d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -212,9 +212,14 @@ config NET_IPGRE_BROADCAST Network), but can be distributed all over the Internet. If you want to do that, say Y here and to "IP multicast routing" below. +config IP_MROUTE_COMMON + bool + depends on IP_MROUTE || IPV6_MROUTE + config IP_MROUTE bool "IP: multicast routing" depends on IP_MULTICAST + select IP_MROUTE_COMMON help This is used if you want your machine to act as a router for IP packets that have several destination addresses. It is needed on the diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 47a0a6649a9d..a07b7dd06def 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o +obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o obj-$(CONFIG_NET_FOU) += fou.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e4329e161943..eaed0367e669 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -432,23 +432,37 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct net *net = sock_net(sk); - unsigned short snum; - int chk_addr_ret; - u32 tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. 
(RAW) */ if (sk->sk_prot->bind) { - err = sk->sk_prot->bind(sk, uaddr, addr_len); - goto out; + return sk->sk_prot->bind(sk, uaddr, addr_len); } - err = -EINVAL; if (addr_len < sizeof(struct sockaddr_in)) - goto out; + return -EINVAL; + + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + if (err) + return err; + + return __inet_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet_bind); + +int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + unsigned short snum; + int chk_addr_ret; + u32 tb_id = RT_TABLE_LOCAL; + int err; if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) @@ -492,7 +506,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - lock_sock(sk); + if (with_lock) + lock_sock(sk); /* Check these errors (active socket, double bind). */ err = -EINVAL; @@ -504,11 +519,18 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. 
*/ - if ((snum || !inet->bind_address_no_port) && - sk->sk_prot->get_port(sk, snum)) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - err = -EADDRINUSE; - goto out_release_sock; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } if (inet->inet_rcv_saddr) @@ -521,22 +543,29 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); err = 0; out_release_sock: - release_sock(sk); + if (with_lock) + release_sock(sk); out: return err; } -EXPORT_SYMBOL(inet_bind); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + return err; + } + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->connect(sk, uaddr, addr_len); @@ -617,6 +646,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_CLOSE) goto out; + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + goto out; + } + err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; @@ -723,7 +758,7 @@ EXPORT_SYMBOL(inet_accept); * This does both peername and sockname. 
*/ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -745,8 +780,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_addr.s_addr = addr; } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - *uaddr_len = sizeof(*sin); - return 0; + return sizeof(*sin); } EXPORT_SYMBOL(inet_getname); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f28f06c91ead..be4c595edccb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1434,7 +1434,7 @@ static const struct file_operations arp_seq_fops = { static int __net_init arp_net_init(struct net *net) { - if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops)) + if (!proc_create("arp", 0444, net->proc_net, &arp_seq_fops)) return -ENOMEM; return 0; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 296d0b956bfe..97689012b357 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -654,7 +654,7 @@ static void esp_input_restore_header(struct sk_buff *skb) static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) { struct xfrm_state *x = xfrm_input_state(skb); - struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data; + struct ip_esp_hdr *esph; /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. 
We will move it back after diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index da5635fc52c2..7cf755ef9efb 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -138,6 +138,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) + esp_features = features & ~NETIF_F_CSUM_MASK; xo->flags |= XFRM_GSO_SEGMENT; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 35d646a62ad4..737d11bc8838 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -182,6 +182,17 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) if (r->tos && (r->tos != fl4->flowi4_tos)) return 0; + if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) + return 0; + + if (fib_rule_port_range_set(&rule->sport_range) && + !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport)) + return 0; + + if (fib_rule_port_range_set(&rule->dport_range) && + !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport)) + return 0; + return 1; } @@ -244,6 +255,9 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } #endif + if (fib_rule_requires_fldissect(rule)) + net->ipv4.fib_rules_require_fldissect++; + rule4->src_len = frh->src_len; rule4->srcmask = inet_make_mask(rule4->src_len); rule4->dst_len = frh->dst_len; @@ -272,6 +286,10 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; + + if (net->ipv4.fib_rules_require_fldissect && + fib_rule_requires_fldissect(rule)) + net->ipv4.fib_rules_require_fldissect--; errout: return err; } @@ -389,6 +407,7 @@ int __net_init fib4_rules_init(struct net *net) goto fail; net->ipv4.rules_ops = ops; net->ipv4.fib_has_custom_rules = false; + net->ipv4.fib_rules_require_fldissect = 0; 
return 0; fail: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7d36a950d961..c27122f01b87 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -171,7 +171,7 @@ static void free_nh_exceptions(struct fib_nh *nh) fnhe = rcu_dereference_protected(hash[i].chain, 1); while (fnhe) { struct fib_nh_exception *next; - + next = rcu_dereference_protected(fnhe->fnhe_next, 1); rt_fibinfo_free(&fnhe->fnhe_rth_input); @@ -1746,18 +1746,20 @@ void fib_select_multipath(struct fib_result *res, int hash) bool first = false; for_nexthops(fi) { + if (net->ipv4.sysctl_fib_multipath_use_neigh) { + if (!fib_good_nh(nh)) + continue; + if (!first) { + res->nh_sel = nhsel; + first = true; + } + } + if (hash > atomic_read(&nh->nh_upper_bound)) continue; - if (!net->ipv4.sysctl_fib_multipath_use_neigh || - fib_good_nh(nh)) { - res->nh_sel = nhsel; - return; - } - if (!first) { - res->nh_sel = nhsel; - first = true; - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); } #endif @@ -1765,14 +1767,12 @@ void fib_select_multipath(struct fib_result *res, int hash) void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb) { - bool oif_check; - - oif_check = (fl4->flowi4_oif == 0 || - fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF); + if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) + goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1 && oif_check) { - int h = fib_multipath_hash(res->fi, fl4, skb); + if (res->fi->fib_nhs > 1) { + int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); } @@ -1780,10 +1780,10 @@ void fib_select_path(struct net *net, struct fib_result *res, #endif if (!res->prefixlen && res->table->tb_num_default > 1 && - res->type == RTN_UNICAST && oif_check) + res->type == RTN_UNICAST) fib_select_default(fl4, res); +check_saddr: if (!fl4->saddr) fl4->saddr = FIB_RES_PREFSRC(net, *res); } 
-EXPORT_SYMBOL_GPL(fib_select_path); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5530cd6fdbc7..3dcffd3ce98c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -50,6 +50,7 @@ #define VERSION "0.409" +#include <linux/cache.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> @@ -191,8 +192,8 @@ static size_t tnode_free_size; */ static const int sync_pages = 128; -static struct kmem_cache *fn_alias_kmem __read_mostly; -static struct kmem_cache *trie_leaf_kmem __read_mostly; +static struct kmem_cache *fn_alias_kmem __ro_after_init; +static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { @@ -1064,6 +1065,9 @@ noleaf: return -ENOMEM; } +/* fib notifier for ADD is sent before calling fib_insert_alias with + * the expectation that the only possible failure ENOMEM + */ static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) @@ -1215,8 +1219,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa, extack); + err = call_fib_entry_notifiers(net, + FIB_EVENT_ENTRY_REPLACE, + key, plen, new_fa, + extack); + if (err) + goto out_free_new_fa; + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1262,21 +1271,32 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); + if (err) + goto out_free_new_fa; + /* Insert new entry to the list. 
*/ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) - goto out_free_new_fa; + goto out_fib_notif; if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; +out_fib_notif: + /* notifier was sent that entry would be added to trie, but + * the add failed and need to recover. Only failure for + * fib_insert_alias is ENOMEM. + */ + NL_SET_ERR_MSG(extack, "Failed to insert route into trie"); + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, + plen, new_fa, NULL); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -2721,14 +2741,14 @@ static const struct file_operations fib_route_fops = { int __net_init fib_proc_init(struct net *net) { - if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops)) + if (!proc_create("fib_trie", 0444, net->proc_net, &fib_trie_fops)) goto out1; - if (!proc_create("fib_triestat", S_IRUGO, net->proc_net, + if (!proc_create("fib_triestat", 0444, net->proc_net, &fib_triestat_fops)) goto out2; - if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops)) + if (!proc_create("route", 0444, net->proc_net, &fib_route_fops)) goto out3; return 0; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f2402581fef1..b26a81a7de42 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2993,10 +2993,10 @@ static int __net_init igmp_net_init(struct net *net) struct proc_dir_entry *pde; int err; - pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); + pde = proc_create("igmp", 0444, net->proc_net, &igmp_mc_seq_fops); if (!pde) goto out_igmp; - pde = proc_create("mcfilter", S_IRUGO, net->proc_net, + pde = proc_create("mcfilter", 0444, net->proc_net, &igmp_mcf_seq_fops); if (!pde) goto out_mcfilter; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index a383f299ce24..4e5bc4b2f14e 100644 --- 
a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -53,8 +53,7 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { if (!inet_diag_table[proto]) - request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET, proto); + sock_load_diag_module(AF_INET, proto); mutex_lock(&inet_diag_table_mutex); if (!inet_diag_table[proto]) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 26a3d0315728..c9e35b81d093 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,12 +25,6 @@ #include <net/inet_frag.h> #include <net/inet_ecn.h> -#define INETFRAGS_EVICT_BUCKETS 128 -#define INETFRAGS_EVICT_MAX 512 - -/* don't rebuild inetfrag table with new secret more often than this */ -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) - /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -52,154 +46,8 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static unsigned int -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) -{ - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); -} - -static bool inet_frag_may_rebuild(struct inet_frags *f) -{ - return time_after(jiffies, - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); -} - -static void inet_frag_secret_rebuild(struct inet_frags *f) -{ - int i; - - write_seqlock_bh(&f->rnd_seqlock); - - if (!inet_frag_may_rebuild(f)) - goto out; - - get_random_bytes(&f->rnd, sizeof(u32)); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - struct hlist_node *n; - - hb = &f->hash[i]; - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = inet_frag_hashfn(f, q); - - if (hval != i) { - struct inet_frag_bucket *hb_dest; - - 
hlist_del(&q->list); - - /* Relink to new hash chain. */ - hb_dest = &f->hash[hval]; - - /* This is the only place where we take - * another chain_lock while already holding - * one. As this will not run concurrently, - * we cannot deadlock on hb_dest lock below, if its - * already locked it will be released soon since - * other caller cannot be waiting for hb lock - * that we've taken above. - */ - spin_lock_nested(&hb_dest->chain_lock, - SINGLE_DEPTH_NESTING); - hlist_add_head(&q->list, &hb_dest->chain); - spin_unlock(&hb_dest->chain_lock); - } - } - spin_unlock(&hb->chain_lock); - } - - f->rebuild = false; - f->last_rebuild_jiffies = jiffies; -out: - write_sequnlock_bh(&f->rnd_seqlock); -} - -static bool inet_fragq_should_evict(const struct inet_frag_queue *q) -{ - return q->net->low_thresh == 0 || - frag_mem_limit(q->net) >= q->net->low_thresh; -} - -static unsigned int -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) -{ - struct inet_frag_queue *fq; - struct hlist_node *n; - unsigned int evicted = 0; - HLIST_HEAD(expired); - - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &hb->chain, list) { - if (!inet_fragq_should_evict(fq)) - continue; - - if (!del_timer(&fq->timer)) - continue; - - hlist_add_head(&fq->list_evictor, &expired); - ++evicted; - } - - spin_unlock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire(&fq->timer); - - return evicted; -} - -static void inet_frag_worker(struct work_struct *work) -{ - unsigned int budget = INETFRAGS_EVICT_BUCKETS; - unsigned int i, evicted = 0; - struct inet_frags *f; - - f = container_of(work, struct inet_frags, frags_work); - - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); - - local_bh_disable(); - - for (i = READ_ONCE(f->next_bucket); budget; --budget) { - evicted += inet_evict_bucket(f, &f->hash[i]); - i = (i + 1) & (INETFRAGS_HASHSZ - 1); - if (evicted > INETFRAGS_EVICT_MAX) - break; - } - - f->next_bucket = i; - - 
local_bh_enable(); - - if (f->rebuild && inet_frag_may_rebuild(f)) - inet_frag_secret_rebuild(f); -} - -static void inet_frag_schedule_worker(struct inet_frags *f) -{ - if (unlikely(!work_pending(&f->frags_work))) - schedule_work(&f->frags_work); -} - int inet_frags_init(struct inet_frags *f) { - int i; - - INIT_WORK(&f->frags_work, inet_frag_worker); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb = &f->hash[i]; - - spin_lock_init(&hb->chain_lock); - INIT_HLIST_HEAD(&hb->chain); - } - - seqlock_init(&f->rnd_seqlock); - f->last_rebuild_jiffies = 0; f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) @@ -211,83 +59,75 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - cancel_work_sync(&f->frags_work); + /* We must wait that all inet_frag_destroy_rcu() have completed. */ + rcu_barrier(); + kmem_cache_destroy(f->frags_cachep); + f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) -{ - unsigned int seq; - int i; - - nf->low_thresh = 0; - -evict_again: - local_bh_disable(); - seq = read_seqbegin(&f->rnd_seqlock); - - for (i = 0; i < INETFRAGS_HASHSZ ; i++) - inet_evict_bucket(f, &f->hash[i]); - - local_bh_enable(); - cond_resched(); - - if (read_seqretry(&f->rnd_seqlock, seq) || - sum_frag_mem_limit(nf)) - goto evict_again; -} -EXPORT_SYMBOL(inet_frags_exit_net); - -static struct inet_frag_bucket * -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) -__acquires(hb->chain_lock) +static void inet_frags_free_cb(void *ptr, void *arg) { - struct inet_frag_bucket *hb; - unsigned int seq, hash; + struct inet_frag_queue *fq = ptr; - restart: - seq = read_seqbegin(&f->rnd_seqlock); - - hash = inet_frag_hashfn(f, fq); - hb = &f->hash[hash]; + /* If we can not cancel the timer, it means this frag_queue + * is already disappearing, we have nothing to do. 
+ * Otherwise, we own a refcount until the end of this function. + */ + if (!del_timer(&fq->timer)) + return; - spin_lock(&hb->chain_lock); - if (read_seqretry(&f->rnd_seqlock, seq)) { - spin_unlock(&hb->chain_lock); - goto restart; + spin_lock_bh(&fq->lock); + if (!(fq->flags & INET_FRAG_COMPLETE)) { + fq->flags |= INET_FRAG_COMPLETE; + refcount_dec(&fq->refcnt); } + spin_unlock_bh(&fq->lock); - return hb; + inet_frag_put(fq); } -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +void inet_frags_exit_net(struct netns_frags *nf) { - struct inet_frag_bucket *hb; + nf->low_thresh = 0; /* prevent creation of new frags */ - hb = get_frag_bucket_locked(fq, f); - hlist_del(&fq->list); - fq->flags |= INET_FRAG_COMPLETE; - spin_unlock(&hb->chain_lock); + rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } +EXPORT_SYMBOL(inet_frags_exit_net); -void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) +void inet_frag_kill(struct inet_frag_queue *fq) { if (del_timer(&fq->timer)) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - fq_unlink(fq, f); + struct netns_frags *nf = fq->net; + + fq->flags |= INET_FRAG_COMPLETE; + rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); refcount_dec(&fq->refcnt); } } EXPORT_SYMBOL(inet_frag_kill); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) +static void inet_frag_destroy_rcu(struct rcu_head *head) +{ + struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, + rcu); + struct inet_frags *f = q->net->f; + + if (f->destructor) + f->destructor(q); + kmem_cache_free(f->frags_cachep, q); +} + +void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; struct netns_frags *nf; unsigned int sum, sum_truesize = 0; + struct inet_frags *f; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); @@ -295,6 +135,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct 
inet_frags *f) /* Release all fragment data. */ fp = q->fragments; nf = q->net; + f = nf->f; while (fp) { struct sk_buff *xp = fp->next; @@ -304,59 +145,20 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) } sum = sum_truesize + f->qsize; - if (f->destructor) - f->destructor(q); - kmem_cache_free(f->frags_cachep, q); + call_rcu(&q->rcu, inet_frag_destroy_rcu); sub_frag_mem_limit(nf, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, - struct inet_frags *f, - void *arg) -{ - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); - struct inet_frag_queue *qp; - -#ifdef CONFIG_SMP - /* With SMP race we have to recheck hash table, because - * such entry could have been created on other cpu before - * we acquired hash bucket lock. - */ - hlist_for_each_entry(qp, &hb->chain, list) { - if (qp->net == nf && f->match(qp, arg)) { - refcount_inc(&qp->refcnt); - spin_unlock(&hb->chain_lock); - qp_in->flags |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in, f); - return qp; - } - } -#endif - qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) - refcount_inc(&qp->refcnt); - - refcount_inc(&qp->refcnt); - hlist_add_head(&qp->list, &hb->chain); - - spin_unlock(&hb->chain_lock); - - return qp; -} - static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { - inet_frag_schedule_worker(f); + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; - } q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) @@ -368,70 +170,51 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 1); + refcount_set(&q->refcnt, 3); return q; } static struct inet_frag_queue 
*inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg) { + struct inet_frags *f = nf->f; struct inet_frag_queue *q; + int err; q = inet_frag_alloc(nf, f, arg); if (!q) return NULL; - return inet_frag_intern(nf, q, f, arg); + mod_timer(&q->timer, jiffies + nf->timeout); + + err = rhashtable_insert_fast(&nf->rhashtable, &q->node, + f->rhash_params); + if (err < 0) { + q->flags |= INET_FRAG_COMPLETE; + inet_frag_kill(q); + inet_frag_destroy(q); + return NULL; + } + return q; } -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, - unsigned int hash) +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - int depth = 0; - - if (frag_mem_limit(nf) > nf->low_thresh) - inet_frag_schedule_worker(f); - - hash &= (INETFRAGS_HASHSZ - 1); - hb = &f->hash[hash]; - - spin_lock(&hb->chain_lock); - hlist_for_each_entry(q, &hb->chain, list) { - if (q->net == nf && f->match(q, key)) { - refcount_inc(&q->refcnt); - spin_unlock(&hb->chain_lock); - return q; - } - depth++; - } - spin_unlock(&hb->chain_lock); + struct inet_frag_queue *fq; - if (depth <= INETFRAGS_MAXDEPTH) - return inet_frag_create(nf, f, key); + rcu_read_lock(); - if (inet_frag_may_rebuild(f)) { - if (!f->rebuild) - f->rebuild = true; - inet_frag_schedule_worker(f); + fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (fq) { + if (!refcount_inc_not_zero(&fq->refcnt)) + fq = NULL; + rcu_read_unlock(); + return fq; } + rcu_read_unlock(); - return ERR_PTR(-ENOBUFS); + return inet_frag_create(nf, key); } EXPORT_SYMBOL(inet_frag_find); - -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix) -{ - static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) - ". 
Dropping fragment.\n"; - - if (PTR_ERR(q) == -ENOBUFS) - net_dbg_ratelimited("%s%s", prefix, msg); -} -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 914d56928578..1f04bd91fc2e 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -6,6 +6,7 @@ * Authors: Andrey V. Savochkin <saw@msu.ru> */ +#include <linux/cache.h> #include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> @@ -51,7 +52,7 @@ * daddr: unchangeable */ -static struct kmem_cache *peer_cachep __read_mostly; +static struct kmem_cache *peer_cachep __ro_after_init; void inet_peer_base_init(struct inet_peer_base *bp) { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 2dd21c3281a1..b54b948b0596 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -55,7 +55,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index bbf1b94942c0..8e9528ebaa8e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,27 +57,13 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; -struct ipfrag_skb_cb -{ - struct inet_skb_parm h; - int offset; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - /* Describe an entry in the "incomplete datagrams" queue. 
*/ struct ipq { struct inet_frag_queue q; - u32 user; - __be32 saddr; - __be32 daddr; - __be16 id; - u8 protocol; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; - int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -89,49 +75,9 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -int ip_frag_mem(struct net *net) -{ - return sum_frag_mem_limit(&net->ipv4.frags); -} - static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev); -struct ip4_create_arg { - struct iphdr *iph; - u32 user; - int vif; -}; - -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) -{ - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); - return jhash_3words((__force u32)id << 16 | prot, - (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd); -} - -static unsigned int ip4_hashfn(const struct inet_frag_queue *q) -{ - const struct ipq *ipq; - - ipq = container_of(q, struct ipq, q); - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); -} - -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct ipq *qp; - const struct ip4_create_arg *arg = a; - - qp = container_of(q, struct ipq, q); - return qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user && - qp->vif == arg->vif; -} static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { @@ -140,17 +86,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) frags); struct net *net = container_of(ipv4, struct net, ipv4); - const struct ip4_create_arg *arg = a; + const struct frag_v4_compare_key *key = a; - qp->protocol = arg->iph->protocol; - qp->id = arg->iph->id; - qp->ecn = ip4_frag_ecn(arg->iph->tos); - qp->saddr = arg->iph->saddr; - qp->daddr = arg->iph->daddr; - qp->vif = arg->vif; - qp->user = arg->user; + 
q->key.v4 = *key; + qp->ecn = 0; qp->peer = q->net->max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -168,7 +109,7 @@ static void ip4_frag_free(struct inet_frag_queue *q) static void ipq_put(struct ipq *ipq) { - inet_frag_put(&ipq->q, &ip4_frags); + inet_frag_put(&ipq->q); } /* Kill ipq entry. It is not destroyed immediately, @@ -176,7 +117,7 @@ static void ipq_put(struct ipq *ipq) */ static void ipq_kill(struct ipq *ipq) { - inet_frag_kill(&ipq->q, &ip4_frags); + inet_frag_kill(&ipq->q); } static bool frag_expire_skip_icmp(u32 user) @@ -194,8 +135,11 @@ static bool frag_expire_skip_icmp(u32 user) static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); - struct ipq *qp; + const struct iphdr *iph; + struct sk_buff *head; struct net *net; + struct ipq *qp; + int err; qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); @@ -209,46 +153,38 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - if (!inet_frag_evicting(&qp->q)) { - struct sk_buff *clone, *head = qp->q.fragments; - const struct iphdr *iph; - int err; + head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); + __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) - goto out; + if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; - head->dev = dev_get_by_index_rcu(net, qp->iif); - if (!head->dev) - goto out; + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out; - /* skb has no dst, perform route lookup again */ - iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, + /* skb has no dst, perform route lookup again */ + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); - if (err) - goto 
out; + if (err) + goto out; + + /* Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (frag_expire_skip_icmp(qp->q.key.v4.user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) + goto out; + + skb_get(head); + spin_unlock(&qp->q.lock); + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; - /* Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (frag_expire_skip_icmp(qp->user) && - (skb_rtable(head)->rt_type != RTN_LOCAL)) - goto out; - - clone = skb_clone(head, GFP_ATOMIC); - - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if (clone) { - spin_unlock(&qp->q.lock); - icmp_send(clone, ICMP_TIME_EXCEEDED, - ICMP_EXC_FRAGTIME, 0); - consume_skb(clone); - goto out_rcu_unlock; - } - } out: spin_unlock(&qp->q.lock); out_rcu_unlock: @@ -262,21 +198,20 @@ out_rcu_unlock: static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { + struct frag_v4_compare_key key = { + .saddr = iph->saddr, + .daddr = iph->daddr, + .user = user, + .vif = vif, + .id = iph->id, + .protocol = iph->protocol, + }; struct inet_frag_queue *q; - struct ip4_create_arg arg; - unsigned int hash; - - arg.iph = iph; - arg.user = user; - arg.vif = vif; - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - - q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + q = inet_frag_find(&net->ipv4.frags, &key); + if (!q) return NULL; - } + return container_of(q, struct ipq, q); } @@ -410,13 +345,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * this fragment, right? 
*/ prev = qp->q.fragments_tail; - if (!prev || FRAG_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = qp->q.fragments; next != NULL; next = next->next) { - if (FRAG_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -427,7 +362,7 @@ found: * any overlaps are eliminated. */ if (prev) { - int i = (FRAG_CB(prev)->offset + prev->len) - offset; + int i = (prev->ip_defrag_offset + prev->len) - offset; if (i > 0) { offset += i; @@ -444,8 +379,8 @@ found: err = -ENOMEM; - while (next && FRAG_CB(next)->offset < end) { - int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + while (next && next->ip_defrag_offset < end) { + int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */ if (i < next->len) { /* Eat head of the next overlapped fragment @@ -453,7 +388,7 @@ found: */ if (!pskb_pull(next, i)) goto err; - FRAG_CB(next)->offset += i; + next->ip_defrag_offset += i; qp->q.meat -= i; if (next->ip_summed != CHECKSUM_UNNECESSARY) next->ip_summed = CHECKSUM_NONE; @@ -477,7 +412,13 @@ found: } } - FRAG_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + dev = skb->dev; + if (dev) + qp->iif = dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -488,11 +429,6 @@ found: else qp->q.fragments = skb; - dev = skb->dev; - if (dev) { - qp->iif = dev->ifindex; - skb->dev = NULL; - } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -568,7 +504,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } WARN_ON(!head); - WARN_ON(FRAG_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. 
*/ ihlen = ip_hdrlen(head); @@ -656,7 +592,7 @@ out_nomem: err = -ENOMEM; goto out_fail; out_oversize: - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; @@ -731,24 +667,23 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL -static int zero; +static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .data = &init_net.ipv4.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .proc_handler = proc_doulongvec_minmax, .extra2 = &init_net.ipv4.frags.high_thresh }, { @@ -764,7 +699,7 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .extra1 = &dist_min, }, { } }; @@ -846,6 +781,8 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { + int res; + /* Fragment cache limits. 
* * The fragment memory accounting code, (tries to) account for @@ -870,16 +807,21 @@ static int __net_init ipv4_frags_init_net(struct net *net) net->ipv4.frags.timeout = IP_FRAG_TIME; net->ipv4.frags.max_dist = 64; - - inet_frags_init_net(&net->ipv4.frags); - - return ip4_frags_ns_ctl_register(net); + net->ipv4.frags.f = &ip4_frags; + + res = inet_frags_init_net(&net->ipv4.frags); + if (res < 0) + return res; + res = ip4_frags_ns_ctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->ipv4.frags); + return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); + inet_frags_exit_net(&net->ipv4.frags); } static struct pernet_operations ip4_frags_ops = { @@ -887,17 +829,49 @@ static struct pernet_operations ip4_frags_ops = { .exit = ipv4_frags_exit_net, }; + +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v4, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v4_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params ip4_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .key_offset = offsetof(struct inet_frag_queue, key), + .key_len = sizeof(struct frag_v4_compare_key), + .hashfn = ip4_key_hashfn, + .obj_hashfn = ip4_obj_hashfn, + .obj_cmpfn = ip4_obj_cmpfn, + .automatic_shrinking = true, +}; + void __init ipfrag_init(void) { - ip4_frags_ctl_register(); - register_pernet_subsys(&ip4_frags_ops); - ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; 
ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); - ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; + ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); + ip4_frags_ctl_register(); + register_pernet_subsys(&ip4_frags_ops); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 45d97e9b2759..a8772a978224 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -522,6 +522,7 @@ err_free_skb: static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { + struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct rtable *rt = NULL; @@ -545,9 +546,11 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) goto err_free_rt; - flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + flags = tun_info->key.tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); gre_build_header(skb, tunnel_hlen, flags, proto, - tunnel_id_to_key32(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), + (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; @@ -970,9 +973,6 @@ static void __gre_tunnel_init(struct net_device *dev) t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; - dev->mtu = ETH_DATA_LEN - t_hlen - 4; - dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -1290,8 +1290,6 @@ static int erspan_tunnel_init(struct net_device *dev) erspan_hdr_len(tunnel->erspan_ver); t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; - dev->mtu = ETH_DATA_LEN - t_hlen - 4; dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; @@ -1322,6 +1320,12 @@ static void ipgre_tap_setup(struct net_device *dev) ip_tunnel_setup(dev, gre_tap_net_id); } +bool is_gretap_dev(const struct net_device *dev) +{ + return dev->netdev_ops == &gre_tap_netdev_ops; +} +EXPORT_SYMBOL_GPL(is_gretap_dev); + static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 57fc13c6ab2b..7582713dd18f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { + for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report @@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || - sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), net)) { + sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 
e8e675be60ec..4c11b810a447 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -248,7 +248,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, /* common case: seglen is <= mtu */ - if (skb_gso_validate_mtu(skb, mtu)) + if (skb_gso_validate_network_len(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length exceeds the egress MTU. @@ -876,6 +876,7 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; + unsigned int wmem_alloc_delta = 0; u32 tskey = 0; skb = skb_peek_tail(queue); @@ -971,11 +972,10 @@ alloc_new_skb: (flags & MSG_DONTWAIT), &err); } else { skb = NULL; - if (refcount_read(&sk->sk_wmem_alloc) <= + if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = sock_wmalloc(sk, - alloclen + hh_len + 15, 1, - sk->sk_allocation); + skb = alloc_skb(alloclen + hh_len + 15, + sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } @@ -1033,6 +1033,11 @@ alloc_new_skb: /* * Put the packet on the pending queue. 
*/ + if (!skb->destructor) { + skb->destructor = sock_wfree; + skb->sk = sk; + wmem_alloc_delta += skb->truesize; + } __skb_queue_tail(queue, skb); continue; } @@ -1079,12 +1084,14 @@ alloc_new_skb: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - refcount_add(copy, &sk->sk_wmem_alloc); + wmem_alloc_delta += copy; } offset += copy; length -= copy; } + if (wmem_alloc_delta) + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: @@ -1092,6 +1099,7 @@ error_efault: error: cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return err; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 008be04ac1cc..5ad2d8ed3a3f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -258,7 +258,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) return -EINVAL; - ipc->oif = src_info->ipi6_ifindex; + if (src_info->ipi6_ifindex) + ipc->oif = src_info->ipi6_ifindex; ipc->addr = src_info->ipi6_addr.s6_addr32[3]; continue; } @@ -288,7 +289,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) return -EINVAL; info = (struct in_pktinfo *)CMSG_DATA(cmsg); - ipc->oif = info->ipi_ifindex; + if (info->ipi_ifindex) + ipc->oif = info->ipi_ifindex; ipc->addr = info->ipi_spec_dst.s_addr; break; } @@ -320,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, return 0; } - -/* Special input handler for packets caught by router alert option. - They are selected only by protocol field, and then processed likely - local ones; but only if someone wants them! Otherwise, router - not running rsvpd will kill RSVP. - - It is user level problem, what it will make with them. 
- I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), - but receiver should be enough clever f.e. to forward mtrace requests, - sent to multicast group to reach destination designated router. - */ -struct ip_ra_chain __rcu *ip_ra_chain; - - static void ip_ra_destroy_rcu(struct rcu_head *head) { struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); @@ -347,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on, { struct ip_ra_chain *ra, *new_ra; struct ip_ra_chain __rcu **rap; + struct net *net = sock_net(sk); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; - for (rap = &ip_ra_chain; - (ra = rtnl_dereference(*rap)) != NULL; + mutex_lock(&net->ipv4.ra_mutex); + for (rap = &net->ipv4.ra_chain; + (ra = rcu_dereference_protected(*rap, + lockdep_is_held(&net->ipv4.ra_mutex))) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { + mutex_unlock(&net->ipv4.ra_mutex); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); + mutex_unlock(&net->ipv4.ra_mutex); if (ra->destructor) ra->destructor(sk); @@ -377,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on, return 0; } } - if (!new_ra) + if (!new_ra) { + mutex_unlock(&net->ipv4.ra_mutex); return -ENOBUFS; + } new_ra->sk = sk; new_ra->destructor = destructor; RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); + mutex_unlock(&net->ipv4.ra_mutex); return 0; } @@ -584,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname) case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_UNBLOCK_SOURCE: - case IP_ROUTER_ALERT: return true; } return false; @@ -637,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* If optlen==0, it is equivalent to val == 0 */ + if (optname == IP_ROUTER_ALERT) + return ip_ra_control(sk, val ? 
1 : 0, NULL); if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); @@ -1147,9 +1144,6 @@ mc_msf_out: goto e_inval; inet->mc_all = val; break; - case IP_ROUTER_ALERT: - err = ip_ra_control(sk, val ? 1 : 0, NULL); - break; case IP_FREEBIND: if (optlen < 1) @@ -1567,10 +1561,7 @@ int ip_getsockopt(struct sock *sk, int level, if (get_user(len, optlen)) return -EFAULT; - lock_sock(sk); - err = nf_getsockopt(sk, PF_INET, optname, optval, - &len); - release_sock(sk); + err = nf_getsockopt(sk, PF_INET, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); return err; @@ -1602,9 +1593,7 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, if (get_user(len, optlen)) return -EFAULT; - lock_sock(sk); err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len); - release_sock(sk); if (err >= 0) err = put_user(len, optlen); return err; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d786a8441bce..de6d94482fe7 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -290,22 +290,6 @@ failed: return ERR_PTR(err); } -static inline void init_tunnel_flow(struct flowi4 *fl4, - int proto, - __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif, - __u32 mark) -{ - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; - fl4->daddr = daddr; - fl4->saddr = saddr; - fl4->flowi4_tos = tos; - fl4->flowi4_proto = proto; - fl4->fl4_gre_key = key; - fl4->flowi4_mark = mark; -} - static int ip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; @@ -322,10 +306,10 @@ static int ip_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - init_tunnel_flow(&fl4, iph->protocol, iph->daddr, - iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), tunnel->parms.link, - tunnel->fwmark); + ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, + iph->saddr, tunnel->parms.o_key, + RT_TOS(iph->tos), tunnel->parms.link, + tunnel->fwmark); rt = ip_route_output_key(tunnel->net, 
&fl4); if (!IS_ERR(rt)) { @@ -362,13 +346,17 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel *nt; struct net_device *dev; int t_hlen; + int mtu; + int err; - BUG_ON(!itn->fb_tunnel_dev); - dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); + dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); if (IS_ERR(dev)) return ERR_CAST(dev); - dev->mtu = ip_tunnel_bind_dev(dev); + mtu = ip_tunnel_bind_dev(dev); + err = dev_set_mtu(dev, mtu); + if (err) + goto err_dev_set_mtu; nt = netdev_priv(dev); t_hlen = nt->hlen + sizeof(struct iphdr); @@ -376,6 +364,10 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; ip_tunnel_add(itn, nt); return nt; + +err_dev_set_mtu: + unregister_netdevice(dev); + return ERR_PTR(err); } int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, @@ -581,8 +573,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } - init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, - RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); + ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, + RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; rt = ip_route_output_key(tunnel->net, &fl4); @@ -710,16 +702,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } } - if (tunnel->fwmark) { - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, - tunnel->fwmark); - } - else { - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, - skb->mark); - } + ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + tunnel->fwmark); if 
(ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; @@ -845,7 +830,6 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) struct net *net = t->net; struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); - BUG_ON(!itn->fb_tunnel_dev); switch (cmd) { case SIOCGETTUNNEL: if (dev == itn->fb_tunnel_dev) { @@ -870,7 +854,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) p->o_key = 0; } - t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + t = ip_tunnel_find(itn, p, itn->type); if (cmd == SIOCADDTUNNEL) { if (!t) { @@ -1014,10 +998,15 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct ip_tunnel_parm parms; unsigned int i; + itn->rtnl_link_ops = ops; for (i = 0; i < IP_TNL_HASH_SIZE; i++) INIT_HLIST_HEAD(&itn->tunnels[i]); - if (!ops) { + if (!ops || !net_has_fallback_tunnels(net)) { + struct ip_tunnel_net *it_init_net; + + it_init_net = net_generic(&init_net, ip_tnl_net_id); + itn->type = it_init_net->type; itn->fb_tunnel_dev = NULL; return 0; } @@ -1035,6 +1024,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); + itn->type = itn->fb_tunnel_dev->type; } rtnl_unlock(); @@ -1042,10 +1032,10 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, +static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, + struct list_head *head, struct rtnl_link_ops *ops) { - struct net *net = dev_net(itn->fb_tunnel_dev); struct net_device *dev, *aux; int h; @@ -1077,7 +1067,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { itn = net_generic(net, 
id); - ip_tunnel_destroy(itn, &list, ops); + ip_tunnel_destroy(net, itn, &list, ops); } unregister_netdevice_many(&list); rtnl_unlock(); @@ -1109,17 +1099,29 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], nt->fwmark = fwmark; err = register_netdevice(dev); if (err) - goto out; + goto err_register_netdevice; if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); mtu = ip_tunnel_bind_dev(dev); - if (!tb[IFLA_MTU]) - dev->mtu = mtu; + if (tb[IFLA_MTU]) { + unsigned int max = 0xfff8 - dev->hard_header_len - nt->hlen; + + mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, + (unsigned int)(max - sizeof(struct iphdr))); + } + + err = dev_set_mtu(dev, mtu); + if (err) + goto err_dev_set_mtu; ip_tunnel_add(itn, nt); -out: + return 0; + +err_dev_set_mtu: + unregister_netdevice(dev); +err_register_netdevice: return err; } EXPORT_SYMBOL_GPL(ip_tunnel_newlink); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 51b1669334fe..3f091ccad9af 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -387,8 +387,6 @@ static int vti_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); - dev->mtu = ETH_DATA_LEN; dev->flags = IFF_NOARP; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f75802ad960f..43f620feb1c4 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1369,7 +1369,7 @@ static int __init ip_auto_config(void) unsigned int i; #ifdef CONFIG_PROC_FS - proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops); + proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b05689bbba31..2fb4de3f7f66 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -28,9 +28,9 @@ #include <linux/uaccess.h> #include <linux/types.h> +#include 
<linux/cache.h> #include <linux/capability.h> #include <linux/errno.h> -#include <linux/timer.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> @@ -52,7 +52,6 @@ #include <net/protocol.h> #include <linux/skbuff.h> #include <net/route.h> -#include <net/sock.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> @@ -96,7 +95,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock); * In this case data path is free of exclusive locks at all. */ -static struct kmem_cache *mrt_cachep __read_mostly; +static struct kmem_cache *mrt_cachep __ro_after_init; static struct mr_table *ipmr_new_table(struct net *net, u32 id); static void ipmr_free_table(struct mr_table *mrt); @@ -106,8 +105,6 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct mfc_cache *cache, int local); static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); -static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mfc_cache *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); @@ -118,6 +115,23 @@ static void ipmr_expire_process(struct timer_list *t); #define ipmr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list) +static struct mr_table *ipmr_mr_table_iter(struct net *net, + struct mr_table *mrt) +{ + struct mr_table *ret; + + if (!mrt) + ret = list_entry_rcu(net->ipv4.mr_tables.next, + struct mr_table, list); + else + ret = list_entry_rcu(mrt->list.next, + struct mr_table, list); + + if (&ret->list == &net->ipv4.mr_tables) + return NULL; + return ret; +} + static struct mr_table *ipmr_get_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -285,6 +299,14 @@ EXPORT_SYMBOL(ipmr_rule_default); #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) +static struct mr_table *ipmr_mr_table_iter(struct 
net *net, + struct mr_table *mrt) +{ + if (!mrt) + return net->ipv4.mrt; + return NULL; +} + static struct mr_table *ipmr_get_table(struct net *net, u32 id) { return net->ipv4.mrt; @@ -344,7 +366,7 @@ static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params ipmr_rht_params = { - .head_offset = offsetof(struct mfc_cache, mnode), + .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc_cache, cmparg), .key_len = sizeof(struct mfc_cache_cmp_arg), .nelem_hint = 3, @@ -353,6 +375,24 @@ static const struct rhashtable_params ipmr_rht_params = { .automatic_shrinking = true, }; +static void ipmr_new_table_set(struct mr_table *mrt, + struct net *net) +{ +#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); +#endif +} + +static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = { + .mfc_mcastgrp = htonl(INADDR_ANY), + .mfc_origin = htonl(INADDR_ANY), +}; + +static struct mr_table_ops ipmr_mr_table_ops = { + .rht_params = &ipmr_rht_params, + .cmparg_any = &ipmr_mr_table_ops_cmparg_any, +}; + static struct mr_table *ipmr_new_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -365,23 +405,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) if (mrt) return mrt; - mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); - if (!mrt) - return ERR_PTR(-ENOMEM); - write_pnet(&mrt->net, net); - mrt->id = id; - - rhltable_init(&mrt->mfc_hash, &ipmr_rht_params); - INIT_LIST_HEAD(&mrt->mfc_cache_list); - INIT_LIST_HEAD(&mrt->mfc_unres_queue); - - timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); - - mrt->mroute_reg_vif_num = -1; -#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES - list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); -#endif - return mrt; + return mr_table_alloc(net, id, &ipmr_mr_table_ops, + ipmr_expire_process, ipmr_new_table_set); } static void ipmr_free_table(struct mr_table *mrt) @@ -619,80 +644,22 @@ static struct net_device 
*ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif -static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, - struct net *net, - enum fib_event_type event_type, - struct vif_device *vif, - vifi_t vif_index, u32 tb_id) -{ - struct vif_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .dev = vif->dev, - .vif_index = vif_index, - .vif_flags = vif->flags, - .tb_id = tb_id, - }; - - return call_fib_notifier(nb, net, event_type, &info.info); -} - static int call_ipmr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, vifi_t vif_index, u32 tb_id) { - struct vif_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .dev = vif->dev, - .vif_index = vif_index, - .vif_flags = vif->flags, - .tb_id = tb_id, - }; - - ASSERT_RTNL(); - net->ipv4.ipmr_seq++; - return call_fib_notifiers(net, event_type, &info.info); -} - -static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, - struct net *net, - enum fib_event_type event_type, - struct mfc_cache *mfc, u32 tb_id) -{ - struct mfc_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .mfc = mfc, - .tb_id = tb_id - }; - - return call_fib_notifier(nb, net, event_type, &info.info); + return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type, + vif, vif_index, tb_id, + &net->ipv4.ipmr_seq); } static int call_ipmr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc_cache *mfc, u32 tb_id) { - struct mfc_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .mfc = mfc, - .tb_id = tb_id - }; - - ASSERT_RTNL(); - net->ipv4.ipmr_seq++; - return call_fib_notifiers(net, event_type, &info.info); + return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type, + &mfc->_c, tb_id, &net->ipv4.ipmr_seq); } /** @@ -760,16 +727,15 @@ static int vif_delete(struct mr_table *mrt, int vifi, int 
notify, static void ipmr_cache_free_rcu(struct rcu_head *head) { - struct mfc_cache *c = container_of(head, struct mfc_cache, rcu); + struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); - kmem_cache_free(mrt_cachep, c); + kmem_cache_free(mrt_cachep, (struct mfc_cache *)c); } -void ipmr_cache_free(struct mfc_cache *c) +static void ipmr_cache_free(struct mfc_cache *c) { - call_rcu(&c->rcu, ipmr_cache_free_rcu); + call_rcu(&c->_c.rcu, ipmr_cache_free_rcu); } -EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. @@ -782,7 +748,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) atomic_dec(&mrt->cache_resolve_queue_len); - while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { + while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); @@ -806,9 +772,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); - unsigned long now; + struct mr_mfc *c, *next; unsigned long expires; - struct mfc_cache *c, *next; + unsigned long now; if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10); @@ -830,8 +796,8 @@ static void ipmr_expire_process(struct timer_list *t) } list_del(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_destroy_unres(mrt, c); + mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE); + ipmr_destroy_unres(mrt, (struct mfc_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) @@ -842,7 +808,7 @@ out: } /* Fill oifs list. It is called under write locked mrt_lock. 
*/ -static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache, +static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; @@ -944,6 +910,10 @@ static int vif_add(struct net *net, struct mr_table *mrt, ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ + vif_device_init(v, dev, vifc->vifc_rate_limit, + vifc->vifc_threshold, + vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0), + (VIFF_TUNNEL | VIFF_REGISTER)); attr.orig_dev = dev; if (!switchdev_port_attr_get(dev, &attr)) { @@ -952,20 +922,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, } else { v->dev_parent_id.id_len = 0; } - v->rate_limit = vifc->vifc_rate_limit; + v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; - v->flags = vifc->vifc_flags; - if (!mrtsock) - v->flags |= VIFF_STATIC; - v->threshold = vifc->vifc_threshold; - v->bytes_in = 0; - v->bytes_out = 0; - v->pkt_in = 0; - v->pkt_out = 0; - v->link = dev->ifindex; - if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) - v->link = dev_get_iflink(dev); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); @@ -988,33 +947,8 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, .mfc_mcastgrp = mcastgrp, .mfc_origin = origin }; - struct rhlist_head *tmp, *list; - struct mfc_cache *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - return c; - - return NULL; -} - -/* Look for a (*,*,oif) entry */ -static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, - int vifi) -{ - struct mfc_cache_cmp_arg arg = { - .mfc_mcastgrp = htonl(INADDR_ANY), - .mfc_origin = htonl(INADDR_ANY) - }; - struct rhlist_head *tmp, *list; - struct mfc_cache *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - if (c->mfc_un.res.ttls[vifi] < 255) - return c; - return NULL; + return 
mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ @@ -1025,25 +959,10 @@ static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, .mfc_mcastgrp = mcastgrp, .mfc_origin = htonl(INADDR_ANY) }; - struct rhlist_head *tmp, *list; - struct mfc_cache *c, *proxy; if (mcastgrp == htonl(INADDR_ANY)) - goto skip; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) { - if (c->mfc_un.res.ttls[vifi] < 255) - return c; - - /* It's ok if the vifi is part of the static tree */ - proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent); - if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) - return c; - } - -skip: - return ipmr_cache_find_any_parent(mrt, vifi); + return mr_mfc_find_any_parent(mrt, vifi); + return mr_mfc_find_any(mrt, vifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ @@ -1055,15 +974,8 @@ static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt, .mfc_mcastgrp = mcastgrp, .mfc_origin = origin, }; - struct rhlist_head *tmp, *list; - struct mfc_cache *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - if (parent == -1 || parent == c->mfc_parent) - return c; - return NULL; + return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ @@ -1072,9 +984,10 @@ static struct mfc_cache *ipmr_cache_alloc(void) struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (c) { - c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; - c->mfc_un.res.minvif = MAXVIFS; - refcount_set(&c->mfc_un.res.refcount, 1); + c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; + c->_c.mfc_un.res.minvif = MAXVIFS; + c->_c.free = ipmr_cache_free_rcu; + refcount_set(&c->_c.mfc_un.res.refcount, 1); } return c; } @@ -1084,8 +997,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void) struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (c) { - 
skb_queue_head_init(&c->mfc_un.unres.unresolved); - c->mfc_un.unres.expires = jiffies + 10*HZ; + skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); + c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; } return c; } @@ -1098,12 +1011,13 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct nlmsgerr *e; /* Play the pending entries through our router */ - while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); - if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { + if (mr_fill_mroute(mrt, skb, &c->_c, + nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { @@ -1211,7 +1125,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, int err; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(c, &mrt->mfc_unres_queue, list) { + list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) { found = true; @@ -1230,12 +1144,13 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, } /* Fill in the new cache entry */ - c->mfc_parent = -1; + c->_c.mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; /* Reflect first query at mrouted. 
*/ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); + if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker @@ -1248,15 +1163,16 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, } atomic_inc(&mrt->cache_resolve_queue_len); - list_add(&c->list, &mrt->mfc_unres_queue); + list_add(&c->_c.list, &mrt->mfc_unres_queue); mroute_netlink_event(mrt, c, RTM_NEWROUTE); if (atomic_read(&mrt->cache_resolve_queue_len) == 1) - mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); + mod_timer(&mrt->ipmr_expire_timer, + c->_c.mfc_un.unres.expires); } /* See if we can append the packet */ - if (c->mfc_un.unres.unresolved.qlen > 3) { + if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { @@ -1264,7 +1180,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, skb->dev = dev; skb->skb_iif = dev->ifindex; } - skb_queue_tail(&c->mfc_un.unres.unresolved, skb); + skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -1286,11 +1202,11 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) rcu_read_unlock(); if (!c) return -ENOENT; - rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); - list_del_rcu(&c->list); + rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params); + list_del_rcu(&c->_c.list); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_put(c); + mr_cache_put(&c->_c); return 0; } @@ -1299,6 +1215,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent) { struct mfc_cache *uc, *c; + struct mr_mfc *_uc; bool found; int ret; @@ -1312,10 +1229,10 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, rcu_read_unlock(); if (c) { write_lock_bh(&mrt_lock); - c->mfc_parent = mfc->mfcc_parent; - ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); + c->_c.mfc_parent = mfc->mfcc_parent; + 
ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); @@ -1333,28 +1250,29 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, c->mfc_origin = mfc->mfcc_origin.s_addr; c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; - c->mfc_parent = mfc->mfcc_parent; - ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); + c->_c.mfc_parent = mfc->mfcc_parent; + ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; - ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode, + ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ipmr_rht_params); if (ret) { pr_err("ipmr: rhtable insert error %d\n", ret); ipmr_cache_free(c); return ret; } - list_add_tail_rcu(&c->list, &mrt->mfc_cache_list); + list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. 
*/ found = false; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(uc, &mrt->mfc_unres_queue, list) { + list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { + uc = (struct mfc_cache *)_uc; if (uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { - list_del(&uc->list); + list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; @@ -1377,7 +1295,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, static void mroute_clean_tables(struct mr_table *mrt, bool all) { struct net *net = read_pnet(&mrt->net); - struct mfc_cache *c, *tmp; + struct mr_mfc *c, *tmp; + struct mfc_cache *cache; LIST_HEAD(list); int i; @@ -1395,18 +1314,20 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); - call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + cache = (struct mfc_cache *)c; + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, mrt->id); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_put(c); + mroute_netlink_event(mrt, cache, RTM_DELROUTE); + mr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_destroy_unres(mrt, c); + cache = (struct mfc_cache *)c; + mroute_netlink_event(mrt, cache, RTM_DELROUTE); + ipmr_destroy_unres(mrt, cache); } spin_unlock_bh(&mfc_unres_lock); } @@ -1420,7 +1341,7 @@ static void mrtsock_destruct(struct sock *sk) struct net *net = sock_net(sk); struct mr_table *mrt; - ASSERT_RTNL(); + rtnl_lock(); ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; @@ -1432,6 +1353,7 @@ static void mrtsock_destruct(struct sock *sk) mroute_clean_tables(mrt, false); } } + rtnl_unlock(); } /* Socket options and virtual 
interface manipulation. The whole @@ -1496,8 +1418,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, if (sk != rcu_access_pointer(mrt->mroute_sk)) { ret = -EACCES; } else { + /* We need to unlock here because mrtsock_destruct takes + * care of rtnl itself and we can't change that due to + * the IP_ROUTER_ALERT setsockopt which runs without it. + */ + rtnl_unlock(); ret = ip_ra_control(sk, 0, NULL); - goto out_unlock; + goto out; } break; case MRT_ADD_VIF: @@ -1609,6 +1536,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, } out_unlock: rtnl_unlock(); +out: return ret; } @@ -1698,9 +1626,9 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1772,9 +1700,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1998,26 +1926,26 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) /* "local" means that we should preserve one skb (for local delivery) */ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, - struct mfc_cache *cache, int local) + struct mfc_cache *c, int local) { int true_vifi = ipmr_find_vif(mrt, dev); int psend = -1; int vif, ct; - vif = 
cache->mfc_parent; - cache->mfc_un.res.pkt++; - cache->mfc_un.res.bytes += skb->len; - cache->mfc_un.res.lastuse = jiffies; + vif = c->_c.mfc_parent; + c->_c.mfc_un.res.pkt++; + c->_c.mfc_un.res.bytes += skb->len; + c->_c.mfc_un.res.lastuse = jiffies; - if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { + if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; /* For an (*,G) entry, we only check that the incomming * interface is part of the static tree. */ - cache_proxy = ipmr_cache_find_any_parent(mrt, vif); + cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && - cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } @@ -2038,7 +1966,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, goto dont_forward; } - cache->mfc_un.res.wrong_if++; + c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -2047,10 +1975,11 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, * large chunk of pimd to kernel. Ough... 
--ANK */ (mrt->mroute_do_pim || - cache->mfc_un.res.ttls[true_vifi] < 255) && + c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, - cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { - cache->mfc_un.res.last_assert = jiffies; + c->_c.mfc_un.res.last_assert + + MFC_ASSERT_THRESH)) { + c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); } goto dont_forward; @@ -2061,33 +1990,33 @@ forward: mrt->vif_table[vif].bytes_in += skb->len; /* Forward the frame */ - if (cache->mfc_origin == htonl(INADDR_ANY) && - cache->mfc_mcastgrp == htonl(INADDR_ANY)) { + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == htonl(INADDR_ANY)) { if (true_vifi >= 0 && - true_vifi != cache->mfc_parent && + true_vifi != c->_c.mfc_parent && ip_hdr(skb)->ttl > - cache->mfc_un.res.ttls[cache->mfc_parent]) { + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. 
*/ - psend = cache->mfc_parent; + psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } - for (ct = cache->mfc_un.res.maxvif - 1; - ct >= cache->mfc_un.res.minvif; ct--) { + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ - if ((cache->mfc_origin != htonl(INADDR_ANY) || + if ((c->mfc_origin != htonl(INADDR_ANY) || ct != true_vifi) && - ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { + ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, - skb2, cache, psend); + skb2, c, psend); } psend = ct; } @@ -2099,9 +2028,9 @@ last_forward: if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, skb2, - cache, psend); + c, psend); } else { - ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend); return; } } @@ -2299,62 +2228,6 @@ drop: } #endif -static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mfc_cache *c, struct rtmsg *rtm) -{ - struct rta_mfc_stats mfcs; - struct nlattr *mp_attr; - struct rtnexthop *nhp; - unsigned long lastuse; - int ct; - - /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mfc_parent >= MAXVIFS) { - rtm->rtm_flags |= RTNH_F_UNRESOLVED; - return -ENOENT; - } - - if (VIF_EXISTS(mrt, c->mfc_parent) && - nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) - return -EMSGSIZE; - - if (c->mfc_flags & MFC_OFFLOAD) - rtm->rtm_flags |= RTNH_F_OFFLOAD; - - if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) - return -EMSGSIZE; - - for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { - nla_nest_cancel(skb, mp_attr); - return -EMSGSIZE; - } - - nhp->rtnh_flags = 0; - nhp->rtnh_hops = 
c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; - nhp->rtnh_len = sizeof(*nhp); - } - } - - nla_nest_end(skb, mp_attr); - - lastuse = READ_ONCE(c->mfc_un.res.lastuse); - lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; - - mfcs.mfcs_packets = c->mfc_un.res.pkt; - mfcs.mfcs_bytes = c->mfc_un.res.bytes; - mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), - RTA_PAD)) - return -EMSGSIZE; - - rtm->rtm_type = RTN_MULTICAST; - return 1; -} - int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid) @@ -2412,7 +2285,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, } read_lock(&mrt_lock); - err = __ipmr_fill_mroute(mrt, skb, cache, rtm); + err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); read_unlock(&mrt_lock); rcu_read_unlock(); return err; @@ -2440,7 +2313,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; - if (c->mfc_flags & MFC_STATIC) + if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; @@ -2449,7 +2322,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) || nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp)) goto nla_put_failure; - err = __ipmr_fill_mroute(mrt, skb, c, rtm); + err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; @@ -2462,6 +2335,14 @@ nla_put_failure: return -EMSGSIZE; } +static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, int cmd, + int flags) +{ + return ipmr_fill_mroute(mrt, skb, portid, seq, (struct 
mfc_cache *)c, + cmd, flags); +} + static size_t mroute_msgsize(bool unresolved, int maxvif) { size_t len = @@ -2490,7 +2371,8 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif), + skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS, + mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; @@ -2634,62 +2516,8 @@ errout_free: static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); - struct mr_table *mrt; - struct mfc_cache *mfc; - unsigned int t = 0, s_t; - unsigned int e = 0, s_e; - - s_t = cb->args[0]; - s_e = cb->args[1]; - - rcu_read_lock(); - ipmr_for_each_table(mrt, net) { - if (t < s_t) - goto next_table; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { - if (e < s_e) - goto next_entry; - if (ipmr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) - goto done; -next_entry: - e++; - } - e = 0; - s_e = 0; - - spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { - if (e < s_e) - goto next_entry2; - if (ipmr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) { - spin_unlock_bh(&mfc_unres_lock); - goto done; - } -next_entry2: - e++; - } - spin_unlock_bh(&mfc_unres_lock); - e = 0; - s_e = 0; -next_table: - t++; - } -done: - rcu_read_unlock(); - - cb->args[1] = e; - cb->args[0] = t; - - return skb->len; + return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, + _ipmr_fill_mroute, &mfc_unres_lock); } static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { @@ -2946,31 +2774,11 @@ out: /* The /proc interfaces to multicast routing : * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ -struct ipmr_vif_iter { - struct seq_net_private p; - struct mr_table *mrt; - int ct; -}; - -static 
struct vif_device *ipmr_vif_seq_idx(struct net *net, - struct ipmr_vif_iter *iter, - loff_t pos) -{ - struct mr_table *mrt = iter->mrt; - - for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { - if (!VIF_EXISTS(mrt, iter->ct)) - continue; - if (pos-- == 0) - return &mrt->vif_table[iter->ct]; - } - return NULL; -} static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(mrt_lock) { - struct ipmr_vif_iter *iter = seq->private; + struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; @@ -2981,26 +2789,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) iter->mrt = mrt; read_lock(&mrt_lock); - return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ipmr_vif_iter *iter = seq->private; - struct net *net = seq_file_net(seq); - struct mr_table *mrt = iter->mrt; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ipmr_vif_seq_idx(net, iter, 0); - - while (++iter->ct < mrt->maxvif) { - if (!VIF_EXISTS(mrt, iter->ct)) - continue; - return &mrt->vif_table[iter->ct]; - } - return NULL; + return mr_vif_seq_start(seq, pos); } static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) @@ -3011,7 +2800,7 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) static int ipmr_vif_seq_show(struct seq_file *seq, void *v) { - struct ipmr_vif_iter *iter = seq->private; + struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { @@ -3019,7 +2808,8 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); } else { const struct vif_device *vif = v; - const char *name = vif->dev ? vif->dev->name : "none"; + const char *name = vif->dev ? 
+ vif->dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", @@ -3033,7 +2823,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ipmr_vif_seq_ops = { .start = ipmr_vif_seq_start, - .next = ipmr_vif_seq_next, + .next = mr_vif_seq_next, .stop = ipmr_vif_seq_stop, .show = ipmr_vif_seq_show, }; @@ -3041,7 +2831,7 @@ static const struct seq_operations ipmr_vif_seq_ops = { static int ipmr_vif_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ipmr_vif_seq_ops, - sizeof(struct ipmr_vif_iter)); + sizeof(struct mr_vif_iter)); } static const struct file_operations ipmr_vif_fops = { @@ -3051,40 +2841,8 @@ static const struct file_operations ipmr_vif_fops = { .release = seq_release_net, }; -struct ipmr_mfc_iter { - struct seq_net_private p; - struct mr_table *mrt; - struct list_head *cache; -}; - -static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, - struct ipmr_mfc_iter *it, loff_t pos) -{ - struct mr_table *mrt = it->mrt; - struct mfc_cache *mfc; - - rcu_read_lock(); - it->cache = &mrt->mfc_cache_list; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - if (pos-- == 0) - return mfc; - rcu_read_unlock(); - - spin_lock_bh(&mfc_unres_lock); - it->cache = &mrt->mfc_unres_queue; - list_for_each_entry(mfc, it->cache, list) - if (pos-- == 0) - return mfc; - spin_unlock_bh(&mfc_unres_lock); - - it->cache = NULL; - return NULL; -} - - static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { - struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; @@ -3092,54 +2850,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) if (!mrt) return ERR_PTR(-ENOENT); - it->mrt = mrt; - it->cache = NULL; - return *pos ? 
ipmr_mfc_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ipmr_mfc_iter *it = seq->private; - struct net *net = seq_file_net(seq); - struct mr_table *mrt = it->mrt; - struct mfc_cache *mfc = v; - - ++*pos; - - if (v == SEQ_START_TOKEN) - return ipmr_mfc_seq_idx(net, seq->private, 0); - - if (mfc->list.next != it->cache) - return list_entry(mfc->list.next, struct mfc_cache, list); - - if (it->cache == &mrt->mfc_unres_queue) - goto end_of_list; - - /* exhausted cache_array, show unresolved */ - rcu_read_unlock(); - it->cache = &mrt->mfc_unres_queue; - - spin_lock_bh(&mfc_unres_lock); - if (!list_empty(it->cache)) - return list_first_entry(it->cache, struct mfc_cache, list); - -end_of_list: - spin_unlock_bh(&mfc_unres_lock); - it->cache = NULL; - - return NULL; -} - -static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) -{ - struct ipmr_mfc_iter *it = seq->private; - struct mr_table *mrt = it->mrt; - - if (it->cache == &mrt->mfc_unres_queue) - spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == &mrt->mfc_cache_list) - rcu_read_unlock(); + return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) @@ -3151,26 +2862,26 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) "Group Origin Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc_cache *mfc = v; - const struct ipmr_mfc_iter *it = seq->private; + const struct mr_mfc_iter *it = seq->private; const struct mr_table *mrt = it->mrt; seq_printf(seq, "%08X %08X %-3hd", (__force u32) mfc->mfc_mcastgrp, (__force u32) mfc->mfc_origin, - mfc->mfc_parent); + mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->mfc_un.res.pkt, - mfc->mfc_un.res.bytes, - mfc->mfc_un.res.wrong_if); - for (n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++) { + mfc->_c.mfc_un.res.pkt, + 
mfc->_c.mfc_un.res.bytes, + mfc->_c.mfc_un.res.wrong_if); + for (n = mfc->_c.mfc_un.res.minvif; + n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && - mfc->mfc_un.res.ttls[n] < 255) + mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", - n, mfc->mfc_un.res.ttls[n]); + n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain @@ -3185,15 +2896,15 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, - .next = ipmr_mfc_seq_next, - .stop = ipmr_mfc_seq_stop, + .next = mr_mfc_seq_next, + .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; static int ipmr_mfc_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ipmr_mfc_seq_ops, - sizeof(struct ipmr_mfc_iter)); + sizeof(struct mr_mfc_iter)); } static const struct file_operations ipmr_mfc_fops = { @@ -3220,37 +2931,8 @@ static unsigned int ipmr_seq_read(struct net *net) static int ipmr_dump(struct net *net, struct notifier_block *nb) { - struct mr_table *mrt; - int err; - - err = ipmr_rules_dump(net, nb); - if (err) - return err; - - ipmr_for_each_table(mrt, net) { - struct vif_device *v = &mrt->vif_table[0]; - struct mfc_cache *mfc; - int vifi; - - /* Notifiy on table VIF entries */ - read_lock(&mrt_lock); - for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { - if (!v->dev) - continue; - - call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); - } - read_unlock(&mrt_lock); - - /* Notify on table MFC entries */ - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - call_ipmr_mfc_entry_notifier(nb, net, - FIB_EVENT_ENTRY_ADD, mfc, - mrt->id); - } - - return 0; + return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, + ipmr_mr_table_iter, &mrt_lock); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c new file mode 100644 index 
000000000000..4fe97723b53f --- /dev/null +++ b/net/ipv4/ipmr_base.c @@ -0,0 +1,365 @@ +/* Linux multicast routing support + * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation + */ + +#include <linux/mroute_base.h> + +/* Sets everything common except 'dev', since that is done under locking */ +void vif_device_init(struct vif_device *v, + struct net_device *dev, + unsigned long rate_limit, + unsigned char threshold, + unsigned short flags, + unsigned short get_iflink_mask) +{ + v->dev = NULL; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->rate_limit = rate_limit; + v->flags = flags; + v->threshold = threshold; + if (v->flags & get_iflink_mask) + v->link = dev_get_iflink(dev); + else + v->link = dev->ifindex; +} +EXPORT_SYMBOL(vif_device_init); + +struct mr_table * +mr_table_alloc(struct net *net, u32 id, + struct mr_table_ops *ops, + void (*expire_func)(struct timer_list *t), + void (*table_set)(struct mr_table *mrt, + struct net *net)) +{ + struct mr_table *mrt; + + mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + if (!mrt) + return NULL; + mrt->id = id; + write_pnet(&mrt->net, net); + + mrt->ops = *ops; + rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params); + INIT_LIST_HEAD(&mrt->mfc_cache_list); + INIT_LIST_HEAD(&mrt->mfc_unres_queue); + + timer_setup(&mrt->ipmr_expire_timer, expire_func, 0); + + mrt->mroute_reg_vif_num = -1; + table_set(mrt, net); + return mrt; +} +EXPORT_SYMBOL(mr_table_alloc); + +void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) +{ + struct rhlist_head *tmp, *list; + struct mr_mfc *c; + + list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (parent == -1 || parent == c->mfc_parent) + return c; + + return NULL; +} +EXPORT_SYMBOL(mr_mfc_find_parent); + +void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) +{ + struct rhlist_head *tmp, *list; + struct mr_mfc *c; + + list = rhltable_lookup(&mrt->mfc_hash, 
mrt->ops.cmparg_any, + *mrt->ops.rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + return NULL; +} +EXPORT_SYMBOL(mr_mfc_find_any_parent); + +void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) +{ + struct rhlist_head *tmp, *list; + struct mr_mfc *c, *proxy; + + list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) { + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + /* It's ok if the vifi is part of the static tree */ + proxy = mr_mfc_find_any_parent(mrt, c->mfc_parent); + if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) + return c; + } + + return mr_mfc_find_any_parent(mrt, vifi); +} +EXPORT_SYMBOL(mr_mfc_find_any); + +#ifdef CONFIG_PROC_FS +void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) +{ + struct mr_table *mrt = iter->mrt; + + for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { + if (!VIF_EXISTS(mrt, iter->ct)) + continue; + if (pos-- == 0) + return &mrt->vif_table[iter->ct]; + } + return NULL; +} +EXPORT_SYMBOL(mr_vif_seq_idx); + +void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct mr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt = iter->mrt; + + ++*pos; + if (v == SEQ_START_TOKEN) + return mr_vif_seq_idx(net, iter, 0); + + while (++iter->ct < mrt->maxvif) { + if (!VIF_EXISTS(mrt, iter->ct)) + continue; + return &mrt->vif_table[iter->ct]; + } + return NULL; +} +EXPORT_SYMBOL(mr_vif_seq_next); + +void *mr_mfc_seq_idx(struct net *net, + struct mr_mfc_iter *it, loff_t pos) +{ + struct mr_table *mrt = it->mrt; + struct mr_mfc *mfc; + + rcu_read_lock(); + it->cache = &mrt->mfc_cache_list; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + if (pos-- == 0) + return mfc; + rcu_read_unlock(); + + spin_lock_bh(it->lock); + it->cache = &mrt->mfc_unres_queue; + list_for_each_entry(mfc, it->cache, list) + if 
(pos-- == 0) + return mfc; + spin_unlock_bh(it->lock); + + it->cache = NULL; + return NULL; +} +EXPORT_SYMBOL(mr_mfc_seq_idx); + +void *mr_mfc_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct mr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt = it->mrt; + struct mr_mfc *c = v; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return mr_mfc_seq_idx(net, seq->private, 0); + + if (c->list.next != it->cache) + return list_entry(c->list.next, struct mr_mfc, list); + + if (it->cache == &mrt->mfc_unres_queue) + goto end_of_list; + + /* exhausted cache_array, show unresolved */ + rcu_read_unlock(); + it->cache = &mrt->mfc_unres_queue; + + spin_lock_bh(it->lock); + if (!list_empty(it->cache)) + return list_first_entry(it->cache, struct mr_mfc, list); + +end_of_list: + spin_unlock_bh(it->lock); + it->cache = NULL; + + return NULL; +} +EXPORT_SYMBOL(mr_mfc_seq_next); +#endif + +int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mr_mfc *c, struct rtmsg *rtm) +{ + struct rta_mfc_stats mfcs; + struct nlattr *mp_attr; + struct rtnexthop *nhp; + unsigned long lastuse; + int ct; + + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mfc_parent >= MAXVIFS) { + rtm->rtm_flags |= RTNH_F_UNRESOLVED; + return -ENOENT; + } + + if (VIF_EXISTS(mrt, c->mfc_parent) && + nla_put_u32(skb, RTA_IIF, + mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) + return -EMSGSIZE; + + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + + mp_attr = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp_attr) + return -EMSGSIZE; + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { + if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + struct vif_device *vif; + + nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); + if (!nhp) { + nla_nest_cancel(skb, mp_attr); + return -EMSGSIZE; + } + + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + vif = &mrt->vif_table[ct]; + 
nhp->rtnh_ifindex = vif->dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + + nla_nest_end(skb, mp_attr); + + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + + mfcs.mfcs_packets = c->mfc_un.res.pkt; + mfcs.mfcs_bytes = c->mfc_un.res.bytes; + mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), + RTA_PAD)) + return -EMSGSIZE; + + rtm->rtm_type = RTN_MULTICAST; + return 1; +} +EXPORT_SYMBOL(mr_fill_mroute); + +int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, + struct mr_table *(*iter)(struct net *net, + struct mr_table *mrt), + int (*fill)(struct mr_table *mrt, + struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags), + spinlock_t *lock) +{ + unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1]; + struct net *net = sock_net(skb->sk); + struct mr_table *mrt; + struct mr_mfc *mfc; + + rcu_read_lock(); + for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) { + if (t < s_t) + goto next_table; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + if (e < s_e) + goto next_entry; + if (fill(mrt, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) + goto done; +next_entry: + e++; + } + e = 0; + s_e = 0; + + spin_lock_bh(lock); + list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { + if (e < s_e) + goto next_entry2; + if (fill(mrt, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) { + spin_unlock_bh(lock); + goto done; + } +next_entry2: + e++; + } + spin_unlock_bh(lock); + e = 0; + s_e = 0; +next_table: + t++; + } +done: + rcu_read_unlock(); + + cb->args[1] = e; + cb->args[0] = t; + + return skb->len; +} +EXPORT_SYMBOL(mr_rtm_dumproute); + +int mr_dump(struct net *net, struct notifier_block *nb, unsigned short 
family, + int (*rules_dump)(struct net *net, + struct notifier_block *nb), + struct mr_table *(*mr_iter)(struct net *net, + struct mr_table *mrt), + rwlock_t *mrt_lock) +{ + struct mr_table *mrt; + int err; + + err = rules_dump(net, nb); + if (err) + return err; + + for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) { + struct vif_device *v = &mrt->vif_table[0]; + struct mr_mfc *mfc; + int vifi; + + /* Notifiy on table VIF entries */ + read_lock(mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + mr_call_vif_notifier(nb, net, family, + FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + mr_call_mfc_notifier(nb, net, family, + FIB_EVENT_ENTRY_ADD, + mfc, mrt->id); + } + + return 0; +} +EXPORT_SYMBOL(mr_dump); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index dfe6fa4ea554..280048e1e395 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -34,7 +34,7 @@ config NF_SOCKET_IPV4 if NF_TABLES config NF_TABLES_IPV4 - tristate "IPv4 nf_tables support" + bool "IPv4 nf_tables support" help This option enables the IPv4 support for nf_tables. @@ -71,7 +71,7 @@ config NFT_FIB_IPV4 endif # NF_TABLES_IPV4 config NF_TABLES_ARP - tristate "ARP nf_tables support" + bool "ARP nf_tables support" select NETFILTER_FAMILY_ARP help This option enables the ARP support for nf_tables. 
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 2dad20eefd26..7523ddb2566b 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o nf_nat_snmp_basic-y := nf_nat_snmp_basic-asn1.o nf_nat_snmp_basic_main.o -nf_nat_snmp_basic-y : nf_nat_snmp_basic-asn1.h nf_nat_snmp_basic-asn1.c +$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic-asn1.h obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o clean-files := nf_nat_snmp_basic-asn1.c nf_nat_snmp_basic-asn1.h @@ -39,7 +39,6 @@ obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o -obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o @@ -47,7 +46,6 @@ obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o -obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # flow table support obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 4ffe302f9b82..2dc83de53f94 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -252,6 +252,10 @@ unsigned int arpt_do_table(struct sk_buff *skb, } if (table_base + v != arpt_next_entry(e)) { + if (unlikely(stackidx >= private->stacksize)) { + verdict = NF_DROP; + break; + } jumpstack[stackidx++] = e; } @@ -330,11 +334,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, t->verdict < 0) || visited) { unsigned int oldpos, size; - if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) - 
return 0; - /* Return: backtrack through the last * big jump. */ @@ -556,16 +555,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (i != repl->num_entries) goto out_free; - /* Check hooks all assigned */ - for (i = 0; i < NF_ARP_NUMHOOKS; i++) { - /* Only hooks which are valid */ - if (!(repl->valid_hooks & (1 << i))) - continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) - goto out_free; - if (newinfo->underflow[i] == 0xFFFFFFFF) - goto out_free; - } + ret = xt_check_table_hooks(newinfo, repl->valid_hooks); + if (ret) + goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; @@ -777,7 +769,9 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; - xt_compat_init_offsets(NFPROTO_ARP, info->number); + ret = xt_compat_init_offsets(NFPROTO_ARP, info->number); + if (ret) + return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -891,7 +885,7 @@ static int __do_replace(struct net *net, const char *name, struct arpt_entry *iter; ret = 0; - counters = vzalloc(num_counters * sizeof(struct xt_counters)); + counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; @@ -921,6 +915,8 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); + xt_table_unlock(t); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ @@ -935,7 +931,6 @@ static int __do_replace(struct net *net, const char *name, net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); } vfree(counters); - xt_table_unlock(t); return ret; put_module: @@ -1163,7 +1158,7 @@ static int translate_compat_table(struct xt_table_info **pinfo, struct compat_arpt_entry 
*iter0; struct arpt_replace repl; unsigned int size; - int ret = 0; + int ret; info = *pinfo; entry0 = *pentry0; @@ -1172,7 +1167,9 @@ static int translate_compat_table(struct xt_table_info **pinfo, j = 0; xt_compat_lock(NFPROTO_ARP); - xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); + ret = xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); + if (ret) + goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 9a71f3149507..44b308d93ec2 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -330,8 +330,13 @@ ipt_do_table(struct sk_buff *skb, continue; } if (table_base + v != ipt_next_entry(e) && - !(e->ip.flags & IPT_F_GOTO)) + !(e->ip.flags & IPT_F_GOTO)) { + if (unlikely(stackidx >= private->stacksize)) { + verdict = NF_DROP; + break; + } jumpstack[stackidx++] = e; + } e = get_entry(table_base, v); continue; @@ -397,11 +402,6 @@ mark_source_chains(const struct xt_table_info *newinfo, t->verdict < 0) || visited) { unsigned int oldpos, size; - if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) - return 0; - /* Return: backtrack through the last big jump. 
*/ do { @@ -702,16 +702,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (i != repl->num_entries) goto out_free; - /* Check hooks all assigned */ - for (i = 0; i < NF_INET_NUMHOOKS; i++) { - /* Only hooks which are valid */ - if (!(repl->valid_hooks & (1 << i))) - continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) - goto out_free; - if (newinfo->underflow[i] == 0xFFFFFFFF) - goto out_free; - } + ret = xt_check_table_hooks(newinfo, repl->valid_hooks); + if (ret) + goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; @@ -940,7 +933,9 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; - xt_compat_init_offsets(AF_INET, info->number); + ret = xt_compat_init_offsets(AF_INET, info->number); + if (ret) + return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1052,7 +1047,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct ipt_entry *iter; ret = 0; - counters = vzalloc(num_counters * sizeof(struct xt_counters)); + counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; @@ -1082,6 +1077,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); + xt_table_unlock(t); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ @@ -1095,7 +1092,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); } vfree(counters); - xt_table_unlock(t); return ret; put_module: @@ -1413,7 +1409,9 @@ translate_compat_table(struct net *net, j = 0; xt_compat_lock(AF_INET); - 
xt_compat_init_offsets(AF_INET, compatr->num_entries); + ret = xt_compat_init_offsets(AF_INET, compatr->num_entries); + if (ret) + goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 3a84a60f6b39..2c8d313ae216 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -107,12 +107,6 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c) local_bh_disable(); if (refcount_dec_and_lock(&c->entries, &cn->lock)) { - list_del_rcu(&c->list); - spin_unlock(&cn->lock); - local_bh_enable(); - - unregister_netdevice_notifier(&c->notifier); - /* In case anyone still accesses the file, the open/close * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. */ @@ -120,6 +114,12 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c) if (cn->procdir) proc_remove(c->pde); #endif + list_del_rcu(&c->list); + spin_unlock(&cn->lock); + local_bh_enable(); + + unregister_netdevice_notifier(&c->notifier); + return; } local_bh_enable(); @@ -154,8 +154,12 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) #endif if (unlikely(!refcount_inc_not_zero(&c->refcount))) c = NULL; - else if (entry) - refcount_inc(&c->entries); + else if (entry) { + if (unlikely(!refcount_inc_not_zero(&c->entries))) { + clusterip_config_put(c); + c = NULL; + } + } } rcu_read_unlock_bh(); @@ -228,7 +232,6 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; refcount_set(&c->refcount, 1); - refcount_set(&c->entries, 1); spin_lock_bh(&cn->lock); if (__clusterip_config_find(net, ip)) { @@ -247,7 +250,7 @@ clusterip_config_init(struct net *net, const struct 
ipt_clusterip_tgt_info *i, /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); - c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR, + c->pde = proc_create_data(buffer, 0600, cn->procdir, &clusterip_proc_fops, c); if (!c->pde) { @@ -259,8 +262,10 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, c->notifier.notifier_call = clusterip_netdev_event; err = register_netdevice_notifier(&c->notifier); - if (!err) + if (!err) { + refcount_set(&c->entries, 1); return c; + } #ifdef CONFIG_PROC_FS proc_remove(c->pde); @@ -269,7 +274,7 @@ err: spin_lock_bh(&cn->lock); list_del_rcu(&c->list); spin_unlock_bh(&cn->lock); - kfree(c); + clusterip_config_put(c); return ERR_PTR(err); } @@ -492,12 +497,15 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) return PTR_ERR(config); } } - cipinfo->config = config; ret = nf_ct_netns_get(par->net, par->family); - if (ret < 0) + if (ret < 0) { pr_info("cannot load conntrack support for proto=%u\n", par->family); + clusterip_config_entry_put(par->net, config); + clusterip_config_put(config); + return ret; + } if (!par->net->xt.clusterip_deprecated_warning) { pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " @@ -505,6 +513,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) par->net->xt.clusterip_deprecated_warning = true; } + cipinfo->config = config; return ret; } diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 270765236f5e..aaaf9a81fbc9 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -98,17 +98,15 @@ static int ecn_tg_check(const struct xt_tgchk_param *par) const struct ipt_ECN_info *einfo = par->targinfo; const struct ipt_entry *e = par->entryinfo; - if (einfo->operation & IPT_ECN_OP_MASK) { - pr_info("unsupported ECN operation %x\n", einfo->operation); + if (einfo->operation & IPT_ECN_OP_MASK) return -EINVAL; - } - if (einfo->ip_ect & ~IPT_ECN_IP_MASK) { - pr_info("new ECT codepoint %x out of 
mask\n", einfo->ip_ect); + + if (einfo->ip_ect & ~IPT_ECN_IP_MASK) return -EINVAL; - } + if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) { - pr_info("cannot use TCP operations on a non-tcp rule\n"); + pr_info_ratelimited("cannot use operation on non-tcp rule\n"); return -EINVAL; } return 0; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 8bd0d7b26632..e8bed3390e58 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -74,13 +74,13 @@ static int reject_tg_check(const struct xt_tgchk_param *par) const struct ipt_entry *e = par->entryinfo; if (rejinfo->with == IPT_ICMP_ECHOREPLY) { - pr_info("ECHOREPLY no longer supported.\n"); + pr_info_ratelimited("ECHOREPLY no longer supported.\n"); return -EINVAL; } else if (rejinfo->with == IPT_TCP_RESET) { /* Must specify that it's a TCP packet */ if (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO)) { - pr_info("TCP_RESET invalid for non-tcp\n"); + pr_info_ratelimited("TCP_RESET invalid for non-tcp\n"); return -EINVAL; } } diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index f75fc6b53115..690b17ef6a44 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -16,6 +16,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> +#include <net/netfilter/nf_conntrack_ecache.h> static struct iphdr * synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, @@ -384,6 +385,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, synproxy->isn = ntohl(th->ack_seq); if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) @@ -392,8 +395,10 @@ static unsigned int ipv4_synproxy_hook(void 
*priv, if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } opts.options &= ~(XT_SYNPROXY_OPT_MSS | XT_SYNPROXY_OPT_WSCALE | @@ -403,6 +408,7 @@ static unsigned int ipv4_synproxy_hook(void *priv, synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack(net, skb, th, &opts); diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index a787d07f6cb7..7c6c20eaf4db 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -47,7 +47,7 @@ static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par) */ pr_debug("Dropping evil AH tinygram.\n"); par->hotdrop = true; - return 0; + return false; } return spi_match(ahinfo->spis[0], ahinfo->spis[1], diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 37fb9552e858..fd01f13c896a 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -105,14 +105,14 @@ static int rpfilter_check(const struct xt_mtchk_param *par) const struct xt_rpfilter_info *info = par->matchinfo; unsigned int options = ~XT_RPFILTER_OPTION_MASK; if (info->flags & options) { - pr_info("unknown options encountered"); + pr_info_ratelimited("unknown options\n"); return -EINVAL; } if (strcmp(par->table, "mangle") != 0 && strcmp(par->table, "raw") != 0) { - pr_info("match only valid in the \'raw\' " - "or \'mangle\' tables, not \'%s\'.\n", par->table); + pr_info_ratelimited("only valid in \'raw\' or \'mangle\' table, not \'%s\'\n", + par->table); return -EINVAL; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 
b50721d9d30e..9db988f9a4d7 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -154,8 +154,20 @@ static unsigned int ipv4_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */ + if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ + enum ip_conntrack_info ctinfo; + struct nf_conn *tmpl; + + tmpl = nf_ct_get(skb, &ctinfo); + if (tmpl && nf_ct_is_template(tmpl)) { + /* when skipping ct, clear templates to avoid fooling + * later targets/matches + */ + skb->_nfct = 0; + nf_ct_put(tmpl); + } return NF_ACCEPT; + } return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 25d2975da156..0cd46bffa469 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -111,6 +111,7 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, default: return -1; } + csum_replace4(&iph->check, addr, new_addr); return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); } @@ -185,7 +186,7 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index e9293bdebba0..4824b1e183a1 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -108,10 +108,12 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, int doff = 0; if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { - struct udphdr _hdr, *hp; + struct tcphdr _hdr; + struct udphdr *hp; hp = 
skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_hdr), &_hdr); + iph->protocol == IPPROTO_UDP ? + sizeof(*hp) : sizeof(_hdr), &_hdr); if (hp == NULL) return NULL; diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c deleted file mode 100644 index 036c074736b0..000000000000 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/netfilter_arp.h> -#include <net/netfilter/nf_tables.h> - -static unsigned int -nft_do_chain_arp(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_unspec(&pkt, skb); - - return nft_do_chain(&pkt, priv); -} - -static const struct nf_chain_type filter_arp = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_ARP, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_ARP_IN) | - (1 << NF_ARP_OUT), - .hooks = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - }, -}; - -static int __init nf_tables_arp_init(void) -{ - return nft_register_chain_type(&filter_arp); -} - -static void __exit nf_tables_arp_exit(void) -{ - nft_unregister_chain_type(&filter_arp); -} - -module_init(nf_tables_arp_init); -module_exit(nf_tables_arp_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(3, "filter"); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c deleted file mode 100644 index 
96f955496d5f..000000000000 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_tables.h> -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/netfilter/nf_tables_ipv4.h> - -static unsigned int nft_do_chain_ipv4(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); - - return nft_do_chain(&pkt, priv); -} - -static const struct nf_chain_type filter_ipv4 = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_IPV4, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING), - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, -}; - -static int __init nf_tables_ipv4_init(void) -{ - return nft_register_chain_type(&filter_ipv4); -} - -static void __exit nf_tables_ipv4_exit(void) -{ - nft_unregister_chain_type(&filter_ipv4); -} - -module_init(nf_tables_ipv4_init); -module_exit(nf_tables_ipv4_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET, "filter"); diff --git 
a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index f2a490981594..b5464a3f253b 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -67,7 +67,17 @@ static unsigned int nft_nat_ipv4_local_fn(void *priv, return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain); } -static const struct nf_chain_type nft_chain_nat_ipv4 = { +static int nft_nat_ipv4_init(struct nft_ctx *ctx) +{ + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_nat_ipv4_free(struct nft_ctx *ctx) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + +static const struct nft_chain_type nft_chain_nat_ipv4 = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_IPV4, @@ -82,15 +92,13 @@ static const struct nf_chain_type nft_chain_nat_ipv4 = { [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn, [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn, }, + .init = nft_nat_ipv4_init, + .free = nft_nat_ipv4_free, }; static int __init nft_chain_nat_init(void) { - int err; - - err = nft_register_chain_type(&nft_chain_nat_ipv4); - if (err < 0) - return err; + nft_register_chain_type(&nft_chain_nat_ipv4); return 0; } diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index d965c225b9f6..7d82934c46f4 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -58,7 +58,7 @@ static unsigned int nf_route_table_hook(void *priv, return ret; } -static const struct nf_chain_type nft_chain_route_ipv4 = { +static const struct nft_chain_type nft_chain_route_ipv4 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, .family = NFPROTO_IPV4, @@ -71,7 +71,9 @@ static const struct nf_chain_type nft_chain_route_ipv4 = { static int __init nft_chain_route_init(void) { - return nft_register_chain_type(&nft_chain_route_ipv4); + nft_register_chain_type(&nft_chain_route_ipv4); + + return 0; } static void __exit nft_chain_route_exit(void) diff --git 
a/net/ipv4/ping.c b/net/ipv4/ping.c index b8f0db54b197..05e47d777009 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1177,7 +1177,7 @@ static struct ping_seq_afinfo ping_v4_seq_afinfo = { int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo) { struct proc_dir_entry *p; - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(afinfo->name, 0444, net->proc_net, afinfo->seq_fops, afinfo); if (!p) return -ENOMEM; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index dc5edc8f7564..a058de677e94 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -54,7 +54,6 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; - unsigned int frag_mem; int orphans, sockets; orphans = percpu_counter_sum_positive(&tcp_orphan_count); @@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - frag_mem = ip_frag_mem(net); - seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); + seq_printf(seq, "FRAG: inuse %u memory %lu\n", + atomic_read(&net->ipv4.frags.rhashtable.nelems), + frag_mem_limit(&net->ipv4.frags)); return 0; } @@ -521,12 +521,12 @@ static const struct file_operations netstat_seq_fops = { static __net_init int ip_proc_init_net(struct net *net) { - if (!proc_create("sockstat", S_IRUGO, net->proc_net, + if (!proc_create("sockstat", 0444, net->proc_net, &sockstat_seq_fops)) goto out_sockstat; - if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops)) + if (!proc_create("netstat", 0444, net->proc_net, &netstat_seq_fops)) goto out_netstat; - if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops)) + if (!proc_create("snmp", 0444, net->proc_net, &snmp_seq_fops)) goto out_snmp; return 0; @@ -555,4 +555,3 @@ int __init ip_misc_proc_init(void) { return register_pernet_subsys(&ip_proc_ops); } - diff --git a/net/ipv4/raw.c 
b/net/ipv4/raw.c index 9b367fc48d7d..1b4d3355624a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout) /* * Raw sockets may have direct kernel references. Kill them. */ - rtnl_lock(); ip_ra_control(sk, 0, NULL); - rtnl_unlock(); sk_common_release(sk); } @@ -1142,7 +1140,7 @@ static const struct file_operations raw_seq_fops = { static __net_init int raw_init_net(struct net *net) { - if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops)) + if (!proc_create("raw", 0444, net->proc_net, &raw_seq_fops)) return -ENOMEM; return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 49cc1c1df1ba..8322e479f299 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -128,10 +128,11 @@ static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; -static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; +static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; + /* * Interface to generic destination cache. 
*/ @@ -378,12 +379,12 @@ static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_create("rt_cache", S_IRUGO, net->proc_net, + pde = proc_create("rt_cache", 0444, net->proc_net, &rt_cache_seq_fops); if (!pde) goto err1; - pde = proc_create("rt_cache", S_IRUGO, + pde = proc_create("rt_cache", 0444, net->proc_net_stat, &rt_cpu_seq_fops); if (!pde) goto err2; @@ -633,6 +634,7 @@ static inline u32 fnhe_hashfun(__be32 daddr) static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) { rt->rt_pmtu = fnhe->fnhe_pmtu; + rt->rt_mtu_locked = fnhe->fnhe_mtu_locked; rt->dst.expires = fnhe->fnhe_expires; if (fnhe->fnhe_gw) { @@ -643,7 +645,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh } static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, - u32 pmtu, unsigned long expires) + u32 pmtu, bool lock, unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; @@ -680,8 +682,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; - if (pmtu) + if (pmtu) { fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_mtu_locked = lock; + } fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); @@ -705,6 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = expires; /* Exception created; mark the cached routes for the nexthop @@ -786,7 +791,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, - 0, jiffies + ip_rt_gc_timeout); + 0, false, + jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -930,14 +936,23 
@@ out_put_peer: static int ip_error(struct sk_buff *skb) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); struct rtable *rt = skb_rtable(skb); + struct net_device *dev = skb->dev; + struct in_device *in_dev; struct inet_peer *peer; unsigned long now; struct net *net; bool send; int code; + if (netif_is_l3_master(skb->dev)) { + dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); + if (!dev) + goto out; + } + + in_dev = __in_dev_get_rcu(dev); + /* IP on this device is disabled. */ if (!in_dev) goto out; @@ -999,15 +1014,18 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct fib_result res; + bool lock = false; - if (dst_metric_locked(dst, RTAX_MTU)) + if (ip_mtu_locked(dst)) return; if (ipv4_mtu(dst) < mtu) return; - if (mtu < ip_rt_min_pmtu) + if (mtu < ip_rt_min_pmtu) { + lock = true; mtu = ip_rt_min_pmtu; + } if (rt->rt_pmtu == mtu && time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) @@ -1017,7 +1035,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); - update_or_create_fnhe(nh, fl4->daddr, 0, mtu, + update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock, jiffies + ip_rt_mtu_expires); } rcu_read_unlock(); @@ -1270,7 +1288,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); - if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { + if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1383,7 +1401,7 @@ struct uncached_list { static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); -static void rt_add_uncached_list(struct rtable *rt) +void rt_add_uncached_list(struct rtable *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); @@ -1394,14 +1412,8 @@ static void rt_add_uncached_list(struct rtable *rt) spin_unlock_bh(&ul->lock); } -static void 
ipv4_dst_destroy(struct dst_entry *dst) +void rt_del_uncached_list(struct rtable *rt) { - struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); - struct rtable *rt = (struct rtable *) dst; - - if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) - kfree(p); - if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1411,6 +1423,17 @@ static void ipv4_dst_destroy(struct dst_entry *dst) } } +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); + struct rtable *rt = (struct rtable *)dst; + + if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) + kfree(p); + + rt_del_uncached_list(rt); +} + void rt_flush_dev(struct net_device *dev) { struct net *net = dev_net(dev); @@ -1506,9 +1529,9 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_is_input = 0; rt->rt_iif = 0; rt->rt_pmtu = 0; + rt->rt_mtu_locked = 0; rt->rt_gateway = 0; rt->rt_uses_gateway = 0; - rt->rt_table_id = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; @@ -1644,19 +1667,6 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_unlock_bh(&fnhe_lock); } -static void set_lwt_redirect(struct rtable *rth) -{ - if (lwtunnel_output_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_output = rth->dst.output; - rth->dst.output = lwtunnel_output; - } - - if (lwtunnel_input_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_input = rth->dst.input; - rth->dst.input = lwtunnel_input; - } -} - /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1739,15 +1749,13 @@ rt_cache: } rth->rt_is_input = 1; - if (res->table) - rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); - set_lwt_redirect(rth); + lwtunnel_set_redirect(&rth->dst); 
skb_dst_set(skb, &rth->dst); out: err = 0; @@ -1763,44 +1771,45 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); + const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; - hash_keys->addrs.v4addrs.src = outer_iph->saddr; - hash_keys->addrs.v4addrs.dst = outer_iph->daddr; if (likely(outer_iph->protocol != IPPROTO_ICMP)) - return; + goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) - return; + goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) - return; + goto out; if (icmph->type != ICMP_DEST_UNREACH && icmph->type != ICMP_REDIRECT && icmph->type != ICMP_TIME_EXCEEDED && icmph->type != ICMP_PARAMETERPROB) - return; + goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) - return; - hash_keys->addrs.v4addrs.src = inner_iph->saddr; - hash_keys->addrs.v4addrs.dst = inner_iph->daddr; + goto out; + + key_iph = inner_iph; +out: + hash_keys->addrs.v4addrs.src = key_iph->saddr; + hash_keys->addrs.v4addrs.dst = key_iph->daddr; } /* if skb is set it will be used and fl4 can be NULL */ -int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, - const struct sk_buff *skb) +int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, + const struct sk_buff *skb, struct flow_keys *flkeys) { - struct net *net = fi->fib_net; struct flow_keys hash_keys; u32 mhash; @@ -1824,13 +1833,20 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; + memset(&hash_keys, 0, sizeof(hash_keys)); - skb_flow_dissect_flow_keys(skb, &keys, flag); - hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; - 
hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; - hash_keys.ports.src = keys.ports.src; - hash_keys.ports.dst = keys.ports.dst; - hash_keys.basic.ip_proto = keys.basic.ip_proto; + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, flag); + flkeys = &keys; + } + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -1846,17 +1862,17 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, return mhash >> 1; } -EXPORT_SYMBOL_GPL(fib_multipath_hash); #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos) + __be32 daddr, __be32 saddr, u32 tos, + struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) { - int h = fib_multipath_hash(res->fi, NULL, skb); + int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); } @@ -1882,13 +1898,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct fib_result *res) { struct in_device *in_dev = __in_dev_get_rcu(dev); + struct flow_keys *flkeys = NULL, _flkeys; + struct net *net = dev_net(dev); struct ip_tunnel_info *tun_info; - struct flowi4 fl4; + int err = -EINVAL; unsigned int flags = 0; u32 itag = 0; struct rtable *rth; - int err = -EINVAL; - struct net *net = dev_net(dev); + struct flowi4 fl4; bool do_cache; /* IP on this device is disabled. 
*/ @@ -1947,6 +1964,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); + + if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) + flkeys = &_flkeys; + err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) @@ -1972,7 +1993,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res->type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); out: return err; brd_input: @@ -2014,8 +2035,6 @@ local_input: rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; - if (res->table) - rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); if (res->type == RTN_UNREACHABLE) { @@ -2244,8 +2263,6 @@ add: return ERR_PTR(-ENOBUFS); rth->rt_iif = orig_oif; - if (res->table) - rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(out_slow_tot); @@ -2267,7 +2284,7 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); - set_lwt_redirect(rth); + lwtunnel_set_redirect(&rth->dst); return rth; } @@ -2529,6 +2546,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; + rt->rt_mtu_locked = ort->rt_mtu_locked; rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; @@ -2631,6 +2649,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; + if (rt->rt_mtu_locked && expires) + metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; @@ -2775,7 +2795,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr 
*nlh, rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) - table_id = rt->rt_table_id; + table_id = res.table ? res.table->tb_id : 0; if (rtm->rtm_flags & RTM_F_FIB_MATCH) { if (!res.fi) { @@ -2816,6 +2836,7 @@ void ip_rt_multicast_event(struct in_device *in_dev) static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; +static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, void __user *buffer, @@ -2931,7 +2952,8 @@ static struct ctl_table ipv4_route_table[] = { .data = &ip_rt_min_pmtu, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &ip_min_valid_pmtu, }, { .procname = "min_adv_mss", diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index fda37f2862c9..c3387dfd725b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -349,6 +349,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) req->ts_recent = tcp_opt.saw_tstamp ? 
tcp_opt.rcv_tsval : 0; treq->snt_synack = 0; treq->tfo_listener = false; + if (IS_ENABLED(CONFIG_SMC)) + ireq->smc_ok = 0; ireq->ir_iif = inet_request_bound_dev_if(sk, skb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 93e172118a94..4b195bac8ac0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,7 +400,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } @@ -520,22 +520,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { - .procname = "udp_rmem_min", - .data = &sysctl_udp_rmem_min, - .maxlen = sizeof(sysctl_udp_rmem_min), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, - { - .procname = "udp_wmem_min", - .data = &sysctl_udp_wmem_min, - .maxlen = sizeof(sysctl_udp_wmem_min), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, { } }; @@ -1167,6 +1151,22 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, + { + .procname = "udp_rmem_min", + .data = &init_net.ipv4.sysctl_udp_rmem_min, + .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, + { + .procname = "udp_wmem_min", + .data = &init_net.ipv4.sysctl_udp_wmem_min, + .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 48636aee23c3..bccc4c270087 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -453,6 +453,7 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk_sockets_allocated_inc(sk); + 
sk->sk_route_forced_caps = NETIF_F_GSO; } EXPORT_SYMBOL(tcp_init_sock); @@ -484,6 +485,14 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) } } +static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, + int target, struct sock *sk) +{ + return (tp->rcv_nxt - tp->copied_seq >= target) || + (sk->sk_prot->stream_memory_read ? + sk->sk_prot->stream_memory_read(sk) : false); +} + /* * Wait for a TCP event. * @@ -553,7 +562,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tp->rcv_nxt - tp->copied_seq >= target) + if (tcp_stream_is_readable(tp, target, sk)) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { @@ -897,7 +906,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, struct tcp_sock *tp = tcp_sk(sk); u32 new_size_goal, size_goal; - if (!large_allowed || !sk_can_gso(sk)) + if (!large_allowed) return mss_now; /* Note : tcp_tso_autosize() will eventually split this later */ @@ -993,7 +1002,9 @@ new_segment: get_page(page); skb_fill_page_desc(skb, i, page, offset, copy); } - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; skb->len += copy; skb->data_len += copy; @@ -1062,8 +1073,7 @@ EXPORT_SYMBOL_GPL(do_tcp_sendpages); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags) { - if (!(sk->sk_route_caps & NETIF_F_SG) || - !sk_check_csum_caps(sk)) + if (!(sk->sk_route_caps & NETIF_F_SG)) return sock_no_sendpage_locked(sk, page, offset, size, flags); tcp_rate_check_app_limited(sk); /* is sending application-limited? 
*/ @@ -1102,27 +1112,11 @@ static int linear_payload_sz(bool first_skb) return 0; } -static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc) +static int select_size(bool first_skb, bool zc) { - const struct tcp_sock *tp = tcp_sk(sk); - int tmp = tp->mss_cache; - - if (sg) { - if (zc) - return 0; - - if (sk_can_gso(sk)) { - tmp = linear_payload_sz(first_skb); - } else { - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); - - if (tmp >= pgbreak && - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) - tmp = pgbreak; - } - } - - return tmp; + if (zc) + return 0; + return linear_payload_sz(first_skb); } void tcp_free_fastopen_req(struct tcp_sock *tp) @@ -1187,7 +1181,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; bool process_backlog = false; - bool sg, zc = false; + bool zc = false; long timeo; flags = msg->msg_flags; @@ -1205,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG; + zc = sk->sk_route_caps & NETIF_F_SG; if (!zc) uarg->zerocopy = 0; } @@ -1268,18 +1262,12 @@ restart: if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - sg = !!(sk->sk_route_caps & NETIF_F_SG); - while (msg_data_left(msg)) { int copy = 0; - int max = size_goal; skb = tcp_write_queue_tail(sk); - if (skb) { - if (skb->ip_summed == CHECKSUM_NONE) - max = mss_now; - copy = max - skb->len; - } + if (skb) + copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; @@ -1297,22 +1285,17 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - linear = select_size(sk, sg, first_skb, zc); + linear = select_size(first_skb, zc); skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, first_skb); if (!skb) goto wait_for_memory; process_backlog = true; - /* - * Check whether we can use HW 
checksum. - */ - if (sk_check_csum_caps(sk)) - skb->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; skb_entail(sk, skb); copy = size_goal; - max = size_goal; /* All packets are restored as if they have * already been sent. skb_mstamp isn't set to @@ -1343,7 +1326,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i >= sysctl_max_skb_frags || !sg) { + if (i >= sysctl_max_skb_frags) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1396,7 +1379,7 @@ new_segment: goto out; } - if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) + if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { @@ -3058,8 +3041,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 3 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); + 5 * nla_total_size(sizeof(u32)) + + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3088,6 +3071,10 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); + nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); + + nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); + nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); return stats; } @@ -3566,6 +3553,7 @@ int tcp_abort(struct sock *sk, int err) bh_unlock_sock(sk); local_bh_enable(); + tcp_write_queue_purge(sk); release_sock(sk); return 0; } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index a471f696e13c..158d105e76da 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -97,10 +97,9 @@ struct bbr { packet_conservation:1, /* use packet conservation? 
*/ restore_cwnd:1, /* decided to revert cwnd to old value */ round_start:1, /* start of packet-timed tx->ack round? */ - tso_segs_goal:7, /* segments we want in each skb we send */ idle_restart:1, /* restarting after idle? */ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ - unused:5, + unused:12, lt_is_sampling:1, /* taking long-term ("LT") samples now? */ lt_rtt_cnt:7, /* round trips in long-term interval */ lt_use_bw:1; /* use lt_bw as our bw estimate? */ @@ -261,23 +260,25 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* Return count of segments we want in the skbs we send, or 0 for default. */ -static u32 bbr_tso_segs_goal(struct sock *sk) +/* override sysctl_tcp_min_tso_segs */ +static u32 bbr_min_tso_segs(struct sock *sk) { - struct bbr *bbr = inet_csk_ca(sk); - - return bbr->tso_segs_goal; + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } -static void bbr_set_tso_segs_goal(struct sock *sk) +static u32 bbr_tso_segs_goal(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - u32 min_segs; + u32 segs, bytes; + + /* Sort of tcp_tso_autosize() but ignoring + * driver provided sk_gso_max_size. + */ + bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, + GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); + segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; - bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), - 0x7FU); + return min(segs, 0x7FU); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ @@ -348,7 +349,7 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ - cwnd += 3 * bbr->tso_segs_goal; + cwnd += 3 * bbr_tso_segs_goal(sk); /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ cwnd = (cwnd + 1) & ~1U; @@ -730,6 +731,8 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) bbr->mode = BBR_DRAIN; /* drain queue we created */ bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ + tcp_sk(sk)->snd_ssthresh = + bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && tcp_packets_in_flight(tcp_sk(sk)) <= @@ -824,7 +827,6 @@ static void bbr_main(struct sock *sk, const struct rate_sample *rs) bw = bbr_bw(sk); bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); - bbr_set_tso_segs_goal(sk); bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); } @@ -834,7 +836,7 @@ static void bbr_init(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->prior_cwnd = 0; - bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; bbr->rtt_cnt = 0; bbr->next_rtt_delivered = 0; bbr->prev_ca_state = TCP_CA_Open; @@ -887,7 +889,7 @@ static u32 bbr_undo_cwnd(struct sock *sk) static u32 bbr_ssthresh(struct sock *sk) { bbr_save_cwnd(sk); - return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ + return tcp_sk(sk)->snd_ssthresh; } static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, @@ -936,7 +938,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, - .tso_segs_goal = bbr_tso_segs_goal, + .min_tso_segs = bbr_min_tso_segs, .get_info = bbr_get_info, .set_state = bbr_set_state, }; diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 7c843578f233..faddf4f9a707 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -6,7 +6,7 @@ * The algorithm is described in: * 
"TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm * for High-Speed Networks" - * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf + * http://tamerbasar.csl.illinois.edu/LiuBasarSrikantPerfEvalArtJun2008.pdf * * Implemented from description in paper and ns-2 simulation. * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 575d3c1fb6e8..367def6ddeda 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1358,9 +1358,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, int len; int in_sack; - if (!sk_can_gso(sk)) - goto fallback; - /* Normally R but no L won't result in plain S */ if (!dup_sack && (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) @@ -1971,11 +1968,6 @@ void tcp_enter_loss(struct sock *sk) /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous * loss recovery is underway except recurring timeout(s) on * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing - * - * In theory F-RTO can be used repeatedly during loss recovery. - * In practice this interacts badly with broken middle-boxes that - * falsely raise the receive window, which results in repeated - * timeouts and stop-and-go behavior. */ tp->frto = net->ipv4.sysctl_tcp_frto && (new_recovery || icsk->icsk_retransmits) && @@ -2631,18 +2623,14 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, tcp_try_undo_loss(sk, false)) return; - /* The ACK (s)acks some never-retransmitted data meaning not all - * the data packets before the timeout were lost. Therefore we - * undo the congestion window and state. This is essentially - * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since - * a retransmitted skb is permantly marked, we can apply such an - * operation even if F-RTO was not used. 
- */ - if ((flag & FLAG_ORIG_SACK_ACKED) && - tcp_try_undo_loss(sk, tp->undo_marker)) - return; - if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, true)) + return; + if (after(tp->snd_nxt, tp->high_seq)) { if (flag & FLAG_DATA_SACKED || is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ @@ -4001,6 +3989,7 @@ void tcp_reset(struct sock *sk) /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); + tcp_write_queue_purge(sk); tcp_done(sk); if (!sock_flag(sk, SOCK_DEAD)) @@ -5870,10 +5859,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { + bool req_stolen; + WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (!tcp_check_req(sk, skb, req, true)) + if (!tcp_check_req(sk, skb, req, true, &req_stolen)) goto discard; } @@ -6264,6 +6255,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); + if (IS_ENABLED(CONFIG_SMC) && want_cookie) + tmp_opt.smc_ok = 0; + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f8ad397e285e..f70586b50838 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -140,6 +140,21 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); +static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v4_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. 
+ */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); +} + /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -561,16 +576,9 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct tcphdr *th = tcp_hdr(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - } else { - th->check = tcp_v4_check(skb->len, saddr, daddr, - csum_partial(th, - th->doff << 2, - skb->csum)); - } + th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); } /* This routine computes an IPv4 TCP checksum. */ @@ -1672,6 +1680,7 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); + bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; @@ -1694,10 +1703,20 @@ process: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); - nsk = tcp_check_req(sk, skb, req, false); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } if (!nsk) { reqsk_put(req); + if (req_stolen) { + /* Another cpu got exclusive access to req + * and created a full blown socket. + * Try to feed this packet to this socket + * instead of discarding it. 
+ */ + tcp_v4_restore_cb(skb); + sock_put(sk); + goto lookup; + } goto discard_and_relse; } if (nsk == sk) { @@ -2211,7 +2230,7 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) afinfo->seq_ops.next = tcp_seq_next; afinfo->seq_ops.stop = tcp_seq_stop; - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(afinfo->name, 0444, net->proc_net, afinfo->seq_fops, afinfo); if (!p) rc = -ENOMEM; @@ -2404,6 +2423,7 @@ struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a8384b0c11f8..57b5468b5139 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -332,6 +332,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcp_update_metrics(sk); tcp_done(sk); } +EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { @@ -578,7 +579,7 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - bool fastopen) + bool fastopen, bool *req_stolen) { struct tcp_options_received tmp_opt; struct sock *child; @@ -785,6 +786,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); + *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b2bca373f8be..383cac0ff0ec 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1206,7 +1206,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Initialize TSO segments for a packet. 
*/ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { + if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ @@ -1335,21 +1335,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; tcp_skb_fragment_eor(skb, buff); - if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { - /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy_nocheck(skb->data + len, - skb_put(buff, nsize), - nsize, 0); - - skb_trim(skb, len); - - skb->csum = csum_block_sub(skb->csum, buff->csum, len); - } else { - skb->ip_summed = CHECKSUM_PARTIAL; - skb_split(skb, buff, len); - } + skb_split(skb, buff, len); - buff->ip_summed = skb->ip_summed; + buff->ip_summed = CHECKSUM_PARTIAL; buff->tstamp = skb->tstamp; tcp_fragment_tstamp(skb, buff); @@ -1715,8 +1703,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, /* Return how many segs we'd like on a TSO packet, * to send one TSO packet per ms */ -u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, - int min_tso_segs) +static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs) { u32 bytes, segs; @@ -1730,9 +1718,8 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, */ segs = max_t(u32, bytes / mss_now, min_tso_segs); - return min_t(u32, segs, sk->sk_gso_max_segs); + return segs; } -EXPORT_SYMBOL(tcp_tso_autosize); /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. @@ -1740,11 +1727,14 @@ EXPORT_SYMBOL(tcp_tso_autosize); static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - u32 tso_segs = ca_ops->tso_segs_goal ? 
ca_ops->tso_segs_goal(sk) : 0; + u32 min_tso, tso_segs; - return tso_segs ? : - tcp_tso_autosize(sk, mss_now, - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + min_tso = ca_ops->min_tso_segs ? + ca_ops->min_tso_segs(sk) : + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; + + tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); } /* Returns the portion of skb which can be sent right away */ @@ -1901,7 +1891,7 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, tcp_skb_fragment_eor(skb, buff); - buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; + buff->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); @@ -2134,7 +2124,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; TCP_SKB_CB(nskb)->sacked = 0; nskb->csum = 0; - nskb->ip_summed = skb->ip_summed; + nskb->ip_summed = CHECKSUM_PARTIAL; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); @@ -2142,14 +2132,7 @@ static int tcp_mtu_probe(struct sock *sk) len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); - if (nskb->ip_summed) { - skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); - } else { - __wsum csum = skb_copy_and_csum_bits(skb, 0, - skb_put(nskb, copy), - copy, 0); - nskb->csum = csum_block_add(nskb->csum, csum, len); - } + skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); if (skb->len <= copy) { /* We've eaten all the data from this skb. 
@@ -2166,9 +2149,6 @@ static int tcp_mtu_probe(struct sock *sk) ~(TCPHDR_FIN|TCPHDR_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); - if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_partial(skb->data, - skb->len, 0); } else { __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(skb, mss_now); @@ -2746,12 +2726,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } tcp_highest_sack_replace(sk, next_skb, skb); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_PARTIAL; - - if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); - /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 71fc60f1b326..f7d944855f8e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -34,6 +34,7 @@ static void tcp_write_err(struct sock *sk) sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; sk->sk_error_report(sk); + tcp_write_queue_purge(sk); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index ec35eaa5c029..c0630013c1ae 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -90,7 +90,7 @@ EXPORT_SYMBOL(xfrm4_tunnel_deregister); for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ - + static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e5ef7c38c934..24b5c59b1c53 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -122,12 +122,6 @@ EXPORT_SYMBOL(udp_table); long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); -int sysctl_udp_rmem_min __read_mostly; -EXPORT_SYMBOL(sysctl_udp_rmem_min); - -int sysctl_udp_wmem_min __read_mostly; -EXPORT_SYMBOL(sysctl_udp_wmem_min); - atomic_long_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); 
@@ -1664,6 +1658,19 @@ csum_copy_err: goto try_again; } +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + /* This check is replicated from __ip4_datagram_connect() and + * intended to prevent BPF program called below from accessing bytes + * that are out of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); +} +EXPORT_SYMBOL(udp_pre_connect); + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2533,35 +2540,36 @@ int udp_abort(struct sock *sk, int err) EXPORT_SYMBOL_GPL(udp_abort); struct proto udp_prot = { - .name = "UDP", - .owner = THIS_MODULE, - .close = udp_lib_close, - .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, - .ioctl = udp_ioctl, - .init = udp_init_sock, - .destroy = udp_destroy_sock, - .setsockopt = udp_setsockopt, - .getsockopt = udp_getsockopt, - .sendmsg = udp_sendmsg, - .recvmsg = udp_recvmsg, - .sendpage = udp_sendpage, - .release_cb = ip4_datagram_release_cb, - .hash = udp_lib_hash, - .unhash = udp_lib_unhash, - .rehash = udp_v4_rehash, - .get_port = udp_v4_get_port, - .memory_allocated = &udp_memory_allocated, - .sysctl_mem = sysctl_udp_mem, - .sysctl_wmem = &sysctl_udp_wmem_min, - .sysctl_rmem = &sysctl_udp_rmem_min, - .obj_size = sizeof(struct udp_sock), - .h.udp_table = &udp_table, + .name = "UDP", + .owner = THIS_MODULE, + .close = udp_lib_close, + .pre_connect = udp_pre_connect, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .init = udp_init_sock, + .destroy = udp_destroy_sock, + .setsockopt = udp_setsockopt, + .getsockopt = udp_getsockopt, + .sendmsg = udp_sendmsg, + .recvmsg = udp_recvmsg, + .sendpage = udp_sendpage, + .release_cb = ip4_datagram_release_cb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, + .get_port = udp_v4_get_port, + .memory_allocated = 
&udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), + .obj_size = sizeof(struct udp_sock), + .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udp_setsockopt, - .compat_getsockopt = compat_udp_getsockopt, + .compat_setsockopt = compat_udp_setsockopt, + .compat_getsockopt = compat_udp_getsockopt, #endif - .diag_destroy = udp_abort, + .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); @@ -2679,7 +2687,7 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) afinfo->seq_ops.next = udp_seq_next; afinfo->seq_ops.stop = udp_seq_stop; - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(afinfo->name, 0444, net->proc_net, afinfo->seq_fops, afinfo); if (!p) rc = -ENOMEM; @@ -2830,6 +2838,26 @@ u32 udp_flow_hashrnd(void) } EXPORT_SYMBOL(udp_flow_hashrnd); +static void __udp_sysctl_init(struct net *net) +{ + net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM; + net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM; + +#ifdef CONFIG_NET_L3_MASTER_DEV + net->ipv4.sysctl_udp_l3mdev_accept = 0; +#endif +} + +static int __net_init udp_sysctl_init(struct net *net) +{ + __udp_sysctl_init(net); + return 0; +} + +static struct pernet_operations __net_initdata udp_sysctl_ops = { + .init = udp_sysctl_init, +}; + void __init udp_init(void) { unsigned long limit; @@ -2842,8 +2870,7 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - sysctl_udp_rmem_min = SK_MEM_QUANTUM; - sysctl_udp_wmem_min = SK_MEM_QUANTUM; + __udp_sysctl_init(&init_net); /* 16 spinlocks per cpu */ udp_busylocks_log = ilog2(nr_cpu_ids) + 4; @@ -2853,4 +2880,7 @@ void __init udp_init(void) panic("UDP: failed to alloc udp_busylocks\n"); for (i = 0; i < (1U << udp_busylocks_log); i++) spin_lock_init(udp_busylocks + i); + + if 
(register_pernet_subsys(&udp_sysctl_ops)) + panic("UDP: failed to init sysctl parameters.\n"); } diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 63faeee989a9..2a9764bd1719 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -92,7 +92,8 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb_reset_network_header(skb); skb_mac_header_rebuild(skb); - eth_hdr(skb)->h_proto = skb->protocol; + if (skb->mac_len) + eth_hdr(skb)->h_proto = skb->protocol; err = 0; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 94b8702603bc..be980c195fc5 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -30,7 +30,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) mtu = dst_mtu(skb_dst(skb)); if ((!skb_is_gso(skb) && skb->len > mtu) || - (skb_is_gso(skb) && skb_gso_network_seglen(skb) > ip_skb_dst_mtu(skb->sk, skb))) { + (skb_is_gso(skb) && + !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { skb->protocol = htons(ETH_P_IP); if (skb->sk) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 05017e2c849c..d73a6d6652f6 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -100,8 +100,9 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; - xdst->u.rt.rt_table_id = rt->rt_table_id; + xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); + rt_add_uncached_list(&xdst->u.rt); return 0; } @@ -241,7 +242,8 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) struct xfrm_dst *xdst = (struct xfrm_dst *)dst; dst_destroy_metrics_generic(dst); - + if (xdst->u.rt.rt_uncached_list) + rt_del_uncached_list(&xdst->u.rt); xfrm_dst_destroy(xdst); } @@ -379,4 +381,3 @@ void __init xfrm4_init(void) xfrm4_protocol_init(); 
register_pernet_subsys(&xfrm4_net_ops); } - diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index ea71e4b0ab7a..6794ddf0547c 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -278,6 +278,7 @@ config IPV6_SUBTREES config IPV6_MROUTE bool "IPv6: multicast routing" depends on IPV6 + select IP_MROUTE_COMMON ---help--- Experimental support for IPv6 multicast forwarding. If unsure, say N. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e1846b97ee69..78cef00c9596 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -94,15 +94,6 @@ #include <linux/seq_file.h> #include <linux/export.h> -/* Set to 3 to get tracing... */ -#define ACONF_DEBUG 2 - -#if ACONF_DEBUG >= 3 -#define ADBG(fmt, ...) printk(fmt, ##__VA_ARGS__) -#else -#define ADBG(fmt, ...) do { if (0) printk(fmt, ##__VA_ARGS__); } while (0) -#endif - #define INFINITY_LIFE_TIME 0xFFFFFFFF #define IPV6_MAX_STRLEN \ @@ -409,9 +400,8 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) dev_hold(dev); if (snmp6_alloc_dev(ndev) < 0) { - ADBG(KERN_WARNING - "%s: cannot allocate memory for statistics; dev=%s.\n", - __func__, dev->name); + netdev_dbg(dev, "%s: cannot allocate memory for statistics\n", + __func__); neigh_parms_release(&nd_tbl, ndev->nd_parms); dev_put(dev); kfree(ndev); @@ -419,9 +409,8 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) } if (snmp6_register_dev(ndev) < 0) { - ADBG(KERN_WARNING - "%s: cannot create /proc/net/dev_snmp6/%s\n", - __func__, dev->name); + netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n", + __func__, dev->name); goto err_release; } @@ -984,7 +973,7 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) /* Ignore adding duplicate addresses on an interface */ if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) { - ADBG("ipv6_add_addr: already assigned\n"); + netdev_dbg(dev, "ipv6_add_addr: already assigned\n"); err = -EEXIST; } else { hlist_add_head_rcu(&ifa->addr_lst, 
&inet6_addr_lst[hash]); @@ -1044,7 +1033,6 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa = kzalloc(sizeof(*ifa), gfp_flags); if (!ifa) { - ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; goto out; } @@ -1459,6 +1447,21 @@ static bool ipv6_use_optimistic_addr(struct net *net, #endif } +static bool ipv6_allow_optimistic_dad(struct net *net, + struct inet6_dev *idev) +{ +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (!idev) + return false; + if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + return false; + + return true; +#else + return false; +#endif +} + static int ipv6_get_saddr_eval(struct net *net, struct ipv6_saddr_score *score, struct ipv6_saddr_dst *dst, @@ -1836,22 +1839,42 @@ static int ipv6_count_addresses(const struct inet6_dev *idev) int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict) { - return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE); + return ipv6_chk_addr_and_flags(net, addr, dev, !dev, + strict, IFA_F_TENTATIVE); } EXPORT_SYMBOL(ipv6_chk_addr); +/* device argument is used to find the L3 domain of interest. If + * skip_dev_check is set, then the ifp device is not checked against + * the passed in dev argument. So the 2 cases for addresses checks are: + * 1. does the address exist in the L3 domain that dev is part of + * (skip_dev_check = true), or + * + * 2. 
does the address exist on the specific device + * (skip_dev_check = false) + */ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, - const struct net_device *dev, int strict, - u32 banned_flags) + const struct net_device *dev, bool skip_dev_check, + int strict, u32 banned_flags) { unsigned int hash = inet6_addr_hash(net, addr); + const struct net_device *l3mdev; struct inet6_ifaddr *ifp; u32 ifp_flags; rcu_read_lock(); + + l3mdev = l3mdev_master_dev_rcu(dev); + if (skip_dev_check) + dev = NULL; + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; + + if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev) + continue; + /* Decouple optimistic from tentative for evaluation here. * Ban optimistic addresses explicitly, when required. */ @@ -1968,6 +1991,8 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) spin_lock_bh(&ifp->lock); addrconf_del_dad_work(ifp); ifp->flags |= IFA_F_TENTATIVE; + if (dad_failed) + ifp->flags &= ~IFA_F_OPTIMISTIC; spin_unlock_bh(&ifp->lock); if (dad_failed) ipv6_ifa_notify(0, ifp); @@ -2581,7 +2606,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) pinfo = (struct prefix_info *) opt; if (len < sizeof(struct prefix_info)) { - ADBG("addrconf: prefix option too short\n"); + netdev_dbg(dev, "addrconf: prefix option too short\n"); return; } @@ -4244,7 +4269,7 @@ static const struct file_operations if6_fops = { static int __net_init if6_proc_net_init(struct net *net) { - if (!proc_create("if_inet6", S_IRUGO, net->proc_net, &if6_fops)) + if (!proc_create("if_inet6", 0444, net->proc_net, &if6_fops)) return -ENOMEM; return 0; } @@ -4408,8 +4433,8 @@ restart: if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX; - ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", - now, next, next_sec, next_sched); + pr_debug("now = %lu, 
schedule = %lu, rounded schedule = %lu => %lu\n", + now, next, next_sec, next_sched); mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now); rcu_read_unlock_bh(); } @@ -4500,6 +4525,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) return -EINVAL; + if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED) + ifa_flags &= ~IFA_F_OPTIMISTIC; + timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) { expires = jiffies_to_clock_t(timeout * HZ); @@ -4573,6 +4601,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct in6_addr *pfx, *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; + struct inet6_dev *idev; u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; u32 ifa_flags; int err; @@ -4606,7 +4635,19 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, /* We ignore other flags so far. */ ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | - IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN; + IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; + + idev = ipv6_find_idev(dev); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + if (!ipv6_allow_optimistic_dad(net, idev)) + ifa_flags &= ~IFA_F_OPTIMISTIC; + + if (ifa_flags & IFA_F_NODAD && ifa_flags & IFA_F_OPTIMISTIC) { + NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive"); + return -EINVAL; + } ifa = ipv6_get_ifaddr(net, pfx, dev, 1); if (!ifa) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 416917719a6f..8da0b513f188 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -277,15 +277,7 @@ out_rcu_unlock: /* bind for INET6 API */ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - 
struct net *net = sock_net(sk); - __be32 v4addr = 0; - unsigned short snum; - bool saved_ipv6only; - int addr_type = 0; int err = 0; /* If the socket has its own bind function then use it. */ @@ -295,11 +287,35 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + if (err) + return err; + + return __inet6_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet6_bind); + +int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) +{ + struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + __be32 v4addr = 0; + unsigned short snum; + bool saved_ipv6only; + int addr_type = 0; + int err = 0; + if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; addr_type = ipv6_addr_type(&addr->sin6_addr); - if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM) return -EINVAL; snum = ntohs(addr->sin6_port); @@ -307,7 +323,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - lock_sock(sk); + if (with_lock) + lock_sock(sk); /* Check these errors (active socket, double bind). */ if (sk->sk_state != TCP_CLOSE || inet->inet_num) { @@ -395,12 +412,20 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. 
*/ - if ((snum || !inet->bind_address_no_port) && - sk->sk_prot->get_port(sk, snum)) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - err = -EADDRINUSE; - goto out; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } if (addr_type != IPV6_ADDR_ANY) @@ -411,13 +436,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_dport = 0; inet->inet_daddr = 0; out: - release_sock(sk); + if (with_lock) + release_sock(sk); return err; out_unlock: rcu_read_unlock(); goto out; } -EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) { @@ -470,7 +495,7 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); */ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -500,8 +525,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); - *uaddr_len = sizeof(*sin); - return 0; + return sizeof(*sin); } EXPORT_SYMBOL(inet6_getname); @@ -869,6 +893,10 @@ static const struct ipv6_stub ipv6_stub_impl = { .nd_tbl = &nd_tbl, }; +static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { + .inet6_bind = __inet6_bind, +}; + static int __init inet6_init(void) { struct list_head *r; @@ -1025,6 +1053,7 @@ static int __init inet6_init(void) /* ensure that ipv6 stubs are visible only after ipv6 is ready */ wmb(); ipv6_stub = &ipv6_stub_impl; + ipv6_bpf_stub = &ipv6_bpf_stub_impl; out: return err; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 8e085cc05aeb..bbcabbba9bd8 100644 --- a/net/ipv6/anycast.c +++ 
b/net/ipv6/anycast.c @@ -66,7 +66,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return -EPERM; if (ipv6_addr_is_multicast(addr)) return -EINVAL; - if (ipv6_chk_addr(net, addr, NULL, 0)) + + if (ifindex) + dev = __dev_get_by_index(net, ifindex); + + if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) return -EINVAL; pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); @@ -78,7 +82,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(net, addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); @@ -90,8 +94,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) dev = __dev_get_by_flags(net, IFF_UP, IFF_UP | IFF_LOOPBACK); } - } else - dev = __dev_get_by_index(net, ifindex); + } if (!dev) { err = -ENODEV; @@ -541,7 +544,7 @@ static const struct file_operations ac6_seq_fops = { int __net_init ac6_proc_init(struct net *net) { - if (!proc_create("anycast6", S_IRUGO, net->proc_net, &ac6_seq_fops)) + if (!proc_create("anycast6", 0444, net->proc_net, &ac6_seq_fops)) return -ENOMEM; return 0; @@ -552,4 +555,3 @@ void ac6_proc_exit(struct net *net) remove_proc_entry("anycast6", net->proc_net); } #endif - diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index fbf08ce3f5ab..a02ad100f0d7 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -106,14 +106,7 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) } } - ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, -#ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_equal(&fl6.saddr, &np->saddr) ? 
- &np->saddr : -#endif - NULL); + ip6_sk_dst_store_flow(sk, dst, &fl6); out: fl6_sock_release(flowlabel); @@ -146,10 +139,12 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *daddr; + struct in6_addr *daddr, old_daddr; + __be32 fl6_flowlabel = 0; + __be32 old_fl6_flowlabel; + __be16 old_dport; int addr_type; int err; - __be32 fl6_flowlabel = 0; if (usin->sin6_family == AF_INET) { if (__ipv6_only_sock(sk)) @@ -238,9 +233,13 @@ ipv4_connected: } } + /* save the current peer information before updating it */ + old_daddr = sk->sk_v6_daddr; + old_fl6_flowlabel = np->flow_label; + old_dport = inet->inet_dport; + sk->sk_v6_daddr = *daddr; np->flow_label = fl6_flowlabel; - inet->inet_dport = usin->sin6_port; /* @@ -250,11 +249,12 @@ ipv4_connected: err = ip6_datagram_dst_update(sk, true); if (err) { - /* Reset daddr and dport so that udp_v6_early_demux() - * fails to find this socket + /* Restore the socket peer info, to keep it consistent with + * the old socket state */ - memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); - inet->inet_dport = 0; + sk->sk_v6_daddr = old_daddr; + np->flow_label = old_fl6_flowlabel; + inet->inet_dport = old_dport; goto out; } @@ -801,8 +801,9 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && - !ipv6_chk_addr(net, &src_info->ipi6_addr, - strict ? 
dev : NULL, 0) && + !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, + dev, !strict, 0, + IFA_F_TENTATIVE) && !ipv6_chk_acast_addr_src(net, dev, &src_info->ipi6_addr)) err = -EINVAL; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 3fd1ec775dc2..27f59b61f70f 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -165,6 +165,8 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) + esp_features = features & ~NETIF_F_CSUM_MASK; xo->flags |= XFRM_GSO_SEGMENT; diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 11025f8d124b..b643f5ce6c80 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -279,4 +279,3 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, return nexthdr; } EXPORT_SYMBOL(ipv6_find_hdr); - diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index b240f24a6e52..df113c7b5fc8 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -61,11 +61,13 @@ unsigned int fib6_rules_seq_read(struct net *net) } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = lookup, + .lookup_data = skb, .flags = FIB_LOOKUP_NOREF, }; @@ -80,11 +82,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, } else { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put(rt); - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != 
-EAGAIN) return &rt->dst; ip6_rt_put(rt); @@ -130,7 +132,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } - rt = lookup(net, table, flp6, flags); + rt = lookup(net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { struct fib6_rule *r = (struct fib6_rule *)rule; @@ -223,6 +225,17 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel)) return 0; + if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) + return 0; + + if (fib_rule_port_range_set(&rule->sport_range) && + !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport)) + return 0; + + if (fib_rule_port_range_set(&rule->dport_range) && + !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport)) + return 0; + return 1; } @@ -258,12 +271,26 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule6->dst.plen = frh->dst_len; rule6->tclass = frh->tos; + if (fib_rule_requires_fldissect(rule)) + net->ipv6.fib6_rules_require_fldissect++; + net->ipv6.fib6_has_custom_rules = true; err = 0; errout: return err; } +static int fib6_rule_delete(struct fib_rule *rule) +{ + struct net *net = rule->fr_net; + + if (net->ipv6.fib6_rules_require_fldissect && + fib_rule_requires_fldissect(rule)) + net->ipv6.fib6_rules_require_fldissect--; + + return 0; +} + static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { @@ -323,6 +350,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .match = fib6_rule_match, .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, + .delete = fib6_rule_delete, .compare = fib6_rule_compare, .fill = fib6_rule_fill, .nlmsg_payload = fib6_rule_nlmsg_payload, @@ -350,6 +378,7 @@ static int __net_init fib6_rules_net_init(struct net *net) goto out_fib6_rules_ops; net->ipv6.fib6_rules_ops = ops; + net->ipv6.fib6_rules_require_fldissect = 0; out: 
return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6ae5dd3f4d0d..d8c4b6374377 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); - fl6.mp_hash = rt6_multipath_hash(&fl6, skb); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); @@ -629,7 +629,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, + skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 92b8d8c75eed..deab2db6692e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -299,11 +299,12 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put(rt); rt = net->ipv6.ip6_null_entry; @@ -1006,12 +1007,16 @@ add: if (err) return err; + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); + if (err) + return err; + rcu_assign_pointer(rt->rt6_next, iter); atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); - call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, - rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -1035,12 +1040,16 @@ add: if (err) return err; + 
err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_REPLACE, + rt, extack); + if (err) + return err; + atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); - call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, - rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 3dab664ff503..c05c4e82a7ca 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -844,7 +844,7 @@ static const struct file_operations ip6fl_seq_fops = { static int __net_init ip6_flowlabel_proc_init(struct net *net) { - if (!proc_create("ip6_flowlabel", S_IRUGO, net->proc_net, + if (!proc_create("ip6_flowlabel", 0444, net->proc_net, &ip6fl_seq_fops)) return -ENOMEM; return 0; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3c353125546d..f8a103bdbd60 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -126,7 +126,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, struct ip6_tnl *t, *cand = NULL; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); int dev_type = (gre_proto == htons(ETH_P_TEB) || - gre_proto == htons(ETH_P_ERSPAN)) ? + gre_proto == htons(ETH_P_ERSPAN) || + gre_proto == htons(ETH_P_ERSPAN2)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4; @@ -236,7 +237,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, return t; dev = ign->fb_tunnel_dev; - if (dev->flags & IFF_UP) + if (dev && dev->flags & IFF_UP) return netdev_priv(dev); return NULL; @@ -695,9 +696,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? 
htons(ETH_P_TEB) : proto; @@ -720,14 +718,20 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = key->tos; - flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + flags = key->tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tunnel->tun_hlen = gre_calc_hlen(flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, - tunnel_id_to_key32(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), + (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) + : 0); } else { + if (tunnel->parms.o_flags & TUNNEL_SEQ) + tunnel->o_seqno++; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); @@ -902,6 +906,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, truncate = true; } + if (skb_cow_head(skb, dev->needed_headroom)) + goto tx_err; + t->parms.o_flags &= ~TUNNEL_KEY; IPCB(skb)->flags = 0; @@ -944,6 +951,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, md->u.md2.dir, get_hwid(&md->u.md2), truncate, false); + } else { + goto tx_err; } } else { switch (skb->protocol) { @@ -1053,7 +1062,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (!rt) return; @@ -1469,6 +1478,8 @@ static int __net_init ip6gre_init_net(struct net *net) struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); int err; + if (!net_has_fallback_tunnels(net)) + return 0; ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", NET_NAME_UNKNOWN, ip6gre_tunnel_setup); @@ -1751,7 +1762,6 @@ static int ip6erspan_tap_init(struct net_device *dev) dev->mtu -= 8; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - tunnel = netdev_priv(dev); ip6gre_tnl_link_config(tunnel, 1); return 0; @@ -1784,6 +1794,12 @@ static void ip6gre_tap_setup(struct net_device *dev) netif_keep_dst(dev); } +bool 
is_ip6gretap_dev(const struct net_device *dev) +{ + return dev->netdev_ops == &ip6gre_tap_netdev_ops; +} +EXPORT_SYMBOL_GPL(is_ip6gretap_dev); + static bool ip6gre_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *ipencap) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 997c7f19ad62..b8ee50e94af3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -71,7 +71,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && - ((mroute6_socket(net, skb) && + ((mroute6_is_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr))) { @@ -412,7 +412,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; @@ -1105,23 +1105,32 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); * @sk: socket which provides the dst cache and route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup + * @connected: whether @sk is connected or not * * This function performs a route lookup on the given flow with the * possibility of using the cached route in the socket if it is valid. * It will take the socket dst lock when operating on the dst cache. * As a result, this function can only be used in process context. * + * In addition, for a connected socket, cache the dst in the socket + * if the current cache is not valid. + * * It returns a valid dst pointer on success, or a pointer encoded * error code. 
*/ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, - const struct in6_addr *final_dst) + const struct in6_addr *final_dst, + bool connected) { struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); dst = ip6_sk_dst_check(sk, dst, fl6); - if (!dst) - dst = ip6_dst_lookup_flow(sk, fl6, final_dst); + if (dst) + return dst; + + dst = ip6_dst_lookup_flow(sk, fl6, final_dst); + if (connected && !IS_ERR(dst)) + ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); return dst; } @@ -1246,7 +1255,7 @@ static int __ip6_append_data(struct sock *sk, const struct sockcm_cookie *sockc) { struct sk_buff *skb, *skb_prev = NULL; - unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu; + unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; int exthdrlen = 0; int dst_exthdrlen = 0; int hh_len; @@ -1259,6 +1268,7 @@ static int __ip6_append_data(struct sock *sk, struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; + unsigned int wmem_alloc_delta = 0; skb = skb_peek_tail(queue); if (!skb) { @@ -1282,6 +1292,12 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; + /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit + * the first fragment + */ + if (headersize + transhdrlen > mtu) + goto emsgsize; + if (cork->length + length > mtu - headersize && ipc6->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_RAW)) { @@ -1297,9 +1313,8 @@ static int __ip6_append_data(struct sock *sk, if (cork->length + length > maxnonfragsize - headersize) { emsgsize: - ipv6_local_error(sk, EMSGSIZE, fl6, - mtu - headersize + - sizeof(struct ipv6hdr)); + pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); + ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); return -EMSGSIZE; } @@ -1411,11 +1426,10 @@ alloc_new_skb: (flags & MSG_DONTWAIT), &err); } else { skb = NULL; - if (refcount_read(&sk->sk_wmem_alloc) <= + if 
(refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = sock_wmalloc(sk, - alloclen + hh_len, 1, - sk->sk_allocation); + skb = alloc_skb(alloclen + hh_len, + sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } @@ -1474,6 +1488,11 @@ alloc_new_skb: /* * Put the packet on the pending queue */ + if (!skb->destructor) { + skb->destructor = sock_wfree; + skb->sk = sk; + wmem_alloc_delta += skb->truesize; + } __skb_queue_tail(queue, skb); continue; } @@ -1520,12 +1539,14 @@ alloc_new_skb: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - refcount_add(copy, &sk->sk_wmem_alloc); + wmem_alloc_delta += copy; } offset += copy; length -= copy; } + if (wmem_alloc_delta) + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: @@ -1533,6 +1554,7 @@ error_efault: error: cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return err; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4b15fe928278..df4c29f7d59f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -679,7 +679,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, - NULL, 0, 0); + NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -758,9 +758,11 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || - likely(ipv6_chk_addr(net, laddr, ldev, 0))) && + likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, + 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || - likely(!ipv6_chk_addr(net, raddr, NULL, 0)))) + likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, + 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; @@ -990,12 +992,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, if (p->link) ldev = dev_get_by_index_rcu(net, 
p->link); - if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0))) + if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, + 0, IFA_F_TENTATIVE))) pr_warn("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && - unlikely(ipv6_chk_addr(net, raddr, NULL, 0))) + unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, + true, 0, IFA_F_TENTATIVE))) pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else @@ -1444,7 +1448,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (!rt) return; @@ -1982,14 +1986,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - struct ip6_tnl *nt, *t; struct ip_tunnel_encap ipencap; + struct ip6_tnl *nt, *t; + int err; nt = netdev_priv(dev); if (ip6_tnl_netlink_encap_parms(data, &ipencap)) { - int err = ip6_tnl_encap_setup(nt, &ipencap); - + err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } @@ -2005,7 +2009,11 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, return -EEXIST; } - return ip6_tnl_create2(dev); + err = ip6_tnl_create2(dev); + if (!err && tb[IFLA_MTU]) + ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + + return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], @@ -2201,6 +2209,8 @@ static int __net_init ip6_tnl_init_net(struct net *net) ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; + if (!net_has_fallback_tunnels(net)) + return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index fa3ae1cb50d3..6ebb2e8777f4 100644 --- a/net/ipv6/ip6_vti.c +++ 
b/net/ipv6/ip6_vti.c @@ -622,11 +622,12 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } -static void vti6_link_config(struct ip6_tnl *t) +static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct net_device *tdev = NULL; + int mtu; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); @@ -640,12 +641,17 @@ static void vti6_link_config(struct ip6_tnl *t) else dev->flags &= ~IFF_POINTOPOINT; + if (keep_mtu && dev->mtu) { + dev->mtu = clamp(dev->mtu, dev->min_mtu, dev->max_mtu); + return; + } + if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (rt) tdev = rt->dst.dev; @@ -656,20 +662,25 @@ static void vti6_link_config(struct ip6_tnl *t) tdev = __dev_get_by_index(t->net, p->link); if (tdev) - dev->mtu = max_t(int, tdev->mtu - dev->hard_header_len, - IPV6_MIN_MTU); + mtu = tdev->mtu - sizeof(struct ipv6hdr); + else + mtu = ETH_DATA_LEN - LL_MAX_HEADER - sizeof(struct ipv6hdr); + + dev->mtu = max_t(int, mtu, IPV6_MIN_MTU); } /** * vti6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters + * @keep_mtu: MTU was set from userspace, don't re-compute it * * Description: * vti6_tnl_change() updates the tunnel parameters **/ static int -vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) +vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, + bool keep_mtu) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; @@ -679,11 +690,12 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); - vti6_link_config(t); + vti6_link_config(t, 
keep_mtu); return 0; } -static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) +static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, + bool keep_mtu) { struct net *net = dev_net(t->dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); @@ -691,7 +703,7 @@ static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) vti6_tnl_unlink(ip6n, t); synchronize_net(); - err = vti6_tnl_change(t, p); + err = vti6_tnl_change(t, p, keep_mtu); vti6_tnl_link(ip6n, t); netdev_state_change(t->dev); return err; @@ -804,7 +816,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } else t = netdev_priv(dev); - err = vti6_update(t, &p1); + err = vti6_update(t, &p1, false); } if (t) { err = 0; @@ -866,10 +878,8 @@ static void vti6_dev_setup(struct net_device *dev) dev->priv_destructor = vti6_dev_free; dev->type = ARPHRD_TUNNEL6; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr); - dev->mtu = ETH_DATA_LEN; dev->min_mtu = IPV6_MIN_MTU; - dev->max_mtu = IP_MAX_MTU; + dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr); dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); @@ -905,7 +915,7 @@ static int vti6_dev_init(struct net_device *dev) if (err) return err; - vti6_link_config(t); + vti6_link_config(t, true); return 0; } @@ -1010,7 +1020,7 @@ static int vti6_changelink(struct net_device *dev, struct nlattr *tb[], } else t = netdev_priv(dev); - return vti6_update(t, &p); + return vti6_update(t, &p, tb && tb[IFLA_MTU]); } static size_t vti6_get_size(const struct net_device *dev) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 9f6cace9c817..298fd8b6ed17 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -20,7 +20,6 @@ #include <linux/types.h> #include <linux/sched.h> #include <linux/errno.h> -#include <linux/timer.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> @@ -32,11 +31,9 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include 
<linux/init.h> -#include <linux/slab.h> #include <linux/compat.h> #include <net/protocol.h> #include <linux/skbuff.h> -#include <net/sock.h> #include <net/raw.h> #include <linux/notifier.h> #include <linux/if_arp.h> @@ -54,30 +51,12 @@ #include <net/ip6_checksum.h> #include <linux/netconf.h> -struct mr6_table { - struct list_head list; - possible_net_t net; - u32 id; - struct sock *mroute6_sk; - struct timer_list ipmr_expire_timer; - struct list_head mfc6_unres_queue; - struct list_head mfc6_cache_array[MFC6_LINES]; - struct mif_device vif6_table[MAXMIFS]; - int maxvif; - atomic_t cache_resolve_queue_len; - bool mroute_do_assert; - bool mroute_do_pim; -#ifdef CONFIG_IPV6_PIMSM_V2 - int mroute_reg_vif_num; -#endif -}; - struct ip6mr_rule { struct fib_rule common; }; struct ip6mr_result { - struct mr6_table *mrt; + struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. @@ -86,11 +65,7 @@ struct ip6mr_result { static DEFINE_RWLOCK(mrt_lock); -/* - * Multicast router control variables - */ - -#define MIF_EXISTS(_mrt, _idx) ((_mrt)->vif6_table[_idx].dev != NULL) +/* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); @@ -105,30 +80,45 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; -static struct mr6_table *ip6mr_new_table(struct net *net, u32 id); -static void ip6mr_free_table(struct mr6_table *mrt); +static struct mr_table *ip6mr_new_table(struct net *net, u32 id); +static void ip6mr_free_table(struct mr_table *mrt); -static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, +static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc6_cache *cache); -static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, +static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); -static int 
__ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, - struct mfc6_cache *c, struct rtmsg *rtm); -static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, +static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); -static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt); +static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); -static void mroute_clean_tables(struct mr6_table *mrt, bool all); +static void mroute_clean_tables(struct mr_table *mrt, bool all); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list) -static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +static struct mr_table *ip6mr_mr_table_iter(struct net *net, + struct mr_table *mrt) { - struct mr6_table *mrt; + struct mr_table *ret; + + if (!mrt) + ret = list_entry_rcu(net->ipv6.mr6_tables.next, + struct mr_table, list); + else + ret = list_entry_rcu(mrt->list.next, + struct mr_table, list); + + if (&ret->list == &net->ipv6.mr6_tables) + return NULL; + return ret; +} + +static struct mr_table *ip6mr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; ip6mr_for_each_table(mrt, net) { if (mrt->id == id) @@ -138,7 +128,7 @@ static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, - struct mr6_table **mrt) + struct mr_table **mrt) { int err; struct ip6mr_result res; @@ -159,7 +149,7 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ip6mr_result *res = arg->result; - struct mr6_table *mrt; + struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: @@ -227,7 +217,7 @@ static const struct fib_rules_ops 
__net_initconst ip6mr_rules_ops_template = { static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; - struct mr6_table *mrt; + struct mr_table *mrt; int err; ops = fib_rules_register(&ip6mr_rules_ops_template, net); @@ -258,7 +248,7 @@ err1: static void __net_exit ip6mr_rules_exit(struct net *net) { - struct mr6_table *mrt, *next; + struct mr_table *mrt, *next; rtnl_lock(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { @@ -268,17 +258,42 @@ static void __net_exit ip6mr_rules_exit(struct net *net) fib_rules_unregister(net->ipv6.mr6_rules_ops); rtnl_unlock(); } + +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR); +} + +static unsigned int ip6mr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); +} + +bool ip6mr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL && + rule->table == RT6_TABLE_DFLT && !rule->l3mdev; +} +EXPORT_SYMBOL(ip6mr_rule_default); #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) -static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +static struct mr_table *ip6mr_mr_table_iter(struct net *net, + struct mr_table *mrt) +{ + if (!mrt) + return net->ipv6.mrt6; + return NULL; +} + +static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, - struct mr6_table **mrt) + struct mr_table **mrt) { *mrt = net->ipv6.mrt6; return 0; @@ -297,114 +312,87 @@ static void __net_exit ip6mr_rules_exit(struct net *net) net->ipv6.mrt6 = NULL; rtnl_unlock(); } -#endif -static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) { - struct mr6_table *mrt; - unsigned int i; - - mrt = ip6mr_get_table(net, id); - if (mrt) - 
return mrt; + return 0; +} - mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); - if (!mrt) - return NULL; - mrt->id = id; - write_pnet(&mrt->net, net); +static unsigned int ip6mr_rules_seq_read(struct net *net) +{ + return 0; +} +#endif - /* Forwarding cache */ - for (i = 0; i < MFC6_LINES; i++) - INIT_LIST_HEAD(&mrt->mfc6_cache_array[i]); +static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct mfc6_cache_cmp_arg *cmparg = arg->key; + struct mfc6_cache *c = (struct mfc6_cache *)ptr; - INIT_LIST_HEAD(&mrt->mfc6_unres_queue); + return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) || + !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin); +} - timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); +static const struct rhashtable_params ip6mr_rht_params = { + .head_offset = offsetof(struct mr_mfc, mnode), + .key_offset = offsetof(struct mfc6_cache, cmparg), + .key_len = sizeof(struct mfc6_cache_cmp_arg), + .nelem_hint = 3, + .locks_mul = 1, + .obj_cmpfn = ip6mr_hash_cmp, + .automatic_shrinking = true, +}; -#ifdef CONFIG_IPV6_PIMSM_V2 - mrt->mroute_reg_vif_num = -1; -#endif +static void ip6mr_new_table_set(struct mr_table *mrt, + struct net *net) +{ #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); #endif - return mrt; -} - -static void ip6mr_free_table(struct mr6_table *mrt) -{ - del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt, true); - kfree(mrt); } -#ifdef CONFIG_PROC_FS - -struct ipmr_mfc_iter { - struct seq_net_private p; - struct mr6_table *mrt; - struct list_head *cache; - int ct; +static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = { + .mf6c_origin = IN6ADDR_ANY_INIT, + .mf6c_mcastgrp = IN6ADDR_ANY_INIT, }; +static struct mr_table_ops ip6mr_mr_table_ops = { + .rht_params = &ip6mr_rht_params, + .cmparg_any = &ip6mr_mr_table_ops_cmparg_any, +}; -static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, - struct ipmr_mfc_iter 
*it, loff_t pos) +static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { - struct mr6_table *mrt = it->mrt; - struct mfc6_cache *mfc; + struct mr_table *mrt; - read_lock(&mrt_lock); - for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) { - it->cache = &mrt->mfc6_cache_array[it->ct]; - list_for_each_entry(mfc, it->cache, list) - if (pos-- == 0) - return mfc; - } - read_unlock(&mrt_lock); - - spin_lock_bh(&mfc_unres_lock); - it->cache = &mrt->mfc6_unres_queue; - list_for_each_entry(mfc, it->cache, list) - if (pos-- == 0) - return mfc; - spin_unlock_bh(&mfc_unres_lock); + mrt = ip6mr_get_table(net, id); + if (mrt) + return mrt; - it->cache = NULL; - return NULL; + return mr_table_alloc(net, id, &ip6mr_mr_table_ops, + ipmr_expire_process, ip6mr_new_table_set); } -/* - * The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif - */ - -struct ipmr_vif_iter { - struct seq_net_private p; - struct mr6_table *mrt; - int ct; -}; - -static struct mif_device *ip6mr_vif_seq_idx(struct net *net, - struct ipmr_vif_iter *iter, - loff_t pos) +static void ip6mr_free_table(struct mr_table *mrt) { - struct mr6_table *mrt = iter->mrt; - - for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { - if (!MIF_EXISTS(mrt, iter->ct)) - continue; - if (pos-- == 0) - return &mrt->vif6_table[iter->ct]; - } - return NULL; + del_timer_sync(&mrt->ipmr_expire_timer); + mroute_clean_tables(mrt, true); + rhltable_destroy(&mrt->mfc_hash); + kfree(mrt); } +#ifdef CONFIG_PROC_FS +/* The /proc interfaces to multicast routing + * /proc/ip6_mr_cache /proc/ip6_mr_vif + */ + static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(mrt_lock) { - struct ipmr_vif_iter *iter = seq->private; + struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); - struct mr6_table *mrt; + struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) @@ -413,26 +401,7 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) 
iter->mrt = mrt; read_lock(&mrt_lock); - return *pos ? ip6mr_vif_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ipmr_vif_iter *iter = seq->private; - struct net *net = seq_file_net(seq); - struct mr6_table *mrt = iter->mrt; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip6mr_vif_seq_idx(net, iter, 0); - - while (++iter->ct < mrt->maxvif) { - if (!MIF_EXISTS(mrt, iter->ct)) - continue; - return &mrt->vif6_table[iter->ct]; - } - return NULL; + return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) @@ -443,19 +412,19 @@ static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { - struct ipmr_vif_iter *iter = seq->private; - struct mr6_table *mrt = iter->mrt; + struct mr_vif_iter *iter = seq->private; + struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { - const struct mif_device *vif = v; + const struct vif_device *vif = v; const char *name = vif->dev ? 
vif->dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", - vif - mrt->vif6_table, + vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); @@ -465,7 +434,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ip6mr_vif_seq_ops = { .start = ip6mr_vif_seq_start, - .next = ip6mr_vif_seq_next, + .next = mr_vif_seq_next, .stop = ip6mr_vif_seq_stop, .show = ip6mr_vif_seq_show, }; @@ -473,7 +442,7 @@ static const struct seq_operations ip6mr_vif_seq_ops = { static int ip6mr_vif_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ip6mr_vif_seq_ops, - sizeof(struct ipmr_vif_iter)); + sizeof(struct mr_vif_iter)); } static const struct file_operations ip6mr_vif_fops = { @@ -485,72 +454,14 @@ static const struct file_operations ip6mr_vif_fops = { static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { - struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); - struct mr6_table *mrt; + struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); - it->mrt = mrt; - it->cache = NULL; - return *pos ? 
ipmr_mfc_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct mfc6_cache *mfc = v; - struct ipmr_mfc_iter *it = seq->private; - struct net *net = seq_file_net(seq); - struct mr6_table *mrt = it->mrt; - - ++*pos; - - if (v == SEQ_START_TOKEN) - return ipmr_mfc_seq_idx(net, seq->private, 0); - - if (mfc->list.next != it->cache) - return list_entry(mfc->list.next, struct mfc6_cache, list); - - if (it->cache == &mrt->mfc6_unres_queue) - goto end_of_list; - - BUG_ON(it->cache != &mrt->mfc6_cache_array[it->ct]); - - while (++it->ct < MFC6_LINES) { - it->cache = &mrt->mfc6_cache_array[it->ct]; - if (list_empty(it->cache)) - continue; - return list_first_entry(it->cache, struct mfc6_cache, list); - } - - /* exhausted cache_array, show unresolved */ - read_unlock(&mrt_lock); - it->cache = &mrt->mfc6_unres_queue; - it->ct = 0; - - spin_lock_bh(&mfc_unres_lock); - if (!list_empty(it->cache)) - return list_first_entry(it->cache, struct mfc6_cache, list); - - end_of_list: - spin_unlock_bh(&mfc_unres_lock); - it->cache = NULL; - - return NULL; -} - -static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) -{ - struct ipmr_mfc_iter *it = seq->private; - struct mr6_table *mrt = it->mrt; - - if (it->cache == &mrt->mfc6_unres_queue) - spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == &mrt->mfc6_cache_array[it->ct]) - read_unlock(&mrt_lock); + return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) @@ -564,25 +475,25 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) "Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc6_cache *mfc = v; - const struct ipmr_mfc_iter *it = seq->private; - struct mr6_table *mrt = it->mrt; + const struct mr_mfc_iter *it = seq->private; + struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, - mfc->mf6c_parent); + 
mfc->_c.mfc_parent); - if (it->cache != &mrt->mfc6_unres_queue) { + if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->mfc_un.res.pkt, - mfc->mfc_un.res.bytes, - mfc->mfc_un.res.wrong_if); - for (n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++) { - if (MIF_EXISTS(mrt, n) && - mfc->mfc_un.res.ttls[n] < 255) + mfc->_c.mfc_un.res.pkt, + mfc->_c.mfc_un.res.bytes, + mfc->_c.mfc_un.res.wrong_if); + for (n = mfc->_c.mfc_un.res.minvif; + n < mfc->_c.mfc_un.res.maxvif; n++) { + if (VIF_EXISTS(mrt, n) && + mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, - " %2d:%-3d", - n, mfc->mfc_un.res.ttls[n]); + " %2d:%-3d", n, + mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain @@ -597,15 +508,15 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, - .next = ipmr_mfc_seq_next, - .stop = ipmr_mfc_seq_stop, + .next = mr_mfc_seq_next, + .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; static int ipmr_mfc_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ipmr_mfc_seq_ops, - sizeof(struct ipmr_mfc_iter)); + sizeof(struct mr_mfc_iter)); } static const struct file_operations ip6mr_mfc_fops = { @@ -624,7 +535,7 @@ static int pim6_rcv(struct sk_buff *skb) struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); - struct mr6_table *mrt; + struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, @@ -658,7 +569,7 @@ static int pim6_rcv(struct sk_buff *skb) read_lock(&mrt_lock); if (reg_vif_num >= 0) - reg_dev = mrt->vif6_table[reg_vif_num].dev; + reg_dev = mrt->vif_table[reg_vif_num].dev; if (reg_dev) dev_hold(reg_dev); read_unlock(&mrt_lock); @@ -693,7 +604,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); - struct mr6_table *mrt; + struct 
mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, @@ -736,7 +647,7 @@ static void reg_vif_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; } -static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) +static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; @@ -769,21 +680,41 @@ failure: } #endif -/* - * Delete a VIF entry - */ +static int call_ip6mr_vif_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + mifi_t vif_index, u32 tb_id) +{ + return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, + vif, vif_index, tb_id, + &net->ipv6.ipmr_seq); +} + +static int call_ip6mr_mfc_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct mfc6_cache *mfc, u32 tb_id) +{ + return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type, + &mfc->_c, tb_id, &net->ipv6.ipmr_seq); +} -static int mif6_delete(struct mr6_table *mrt, int vifi, int notify, +/* Delete a VIF entry */ +static int mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { - struct mif_device *v; + struct vif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; - v = &mrt->vif6_table[vifi]; + v = &mrt->vif_table[vifi]; + + if (VIF_EXISTS(mrt, vifi)) + call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), + FIB_EVENT_VIF_DEL, v, vifi, + mrt->id); write_lock_bh(&mrt_lock); dev = v->dev; @@ -802,7 +733,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, int notify, if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { - if (MIF_EXISTS(mrt, tmp)) + if (VIF_EXISTS(mrt, tmp)) break; } mrt->maxvif = tmp + 1; @@ -827,23 +758,30 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, int notify, return 0; } +static inline void 
ip6mr_cache_free_rcu(struct rcu_head *head) +{ + struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); + + kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c); +} + static inline void ip6mr_cache_free(struct mfc6_cache *c) { - kmem_cache_free(mrt_cachep, c); + call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ -static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) +static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; atomic_dec(&mrt->cache_resolve_queue_len); - while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { + while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); @@ -862,13 +800,13 @@ static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) /* Timer process for all the unresolved queue. */ -static void ipmr_do_expire_process(struct mr6_table *mrt) +static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; - struct mfc6_cache *c, *next; + struct mr_mfc *c, *next; - list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... 
*/ unsigned long interval = c->mfc_un.unres.expires - now; @@ -878,24 +816,24 @@ static void ipmr_do_expire_process(struct mr6_table *mrt) } list_del(&c->list); - mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_destroy_unres(mrt, c); + mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); + ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } - if (!list_empty(&mrt->mfc6_unres_queue)) + if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { - struct mr6_table *mrt = from_timer(mrt, t, ipmr_expire_timer); + struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } - if (!list_empty(&mrt->mfc6_unres_queue)) + if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); @@ -903,7 +841,8 @@ static void ipmr_expire_process(struct timer_list *t) /* Fill oifs list. It is called under write locked mrt_lock. 
*/ -static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *cache, +static void ip6mr_update_thresholds(struct mr_table *mrt, + struct mr_mfc *cache, unsigned char *ttls) { int vifi; @@ -913,7 +852,7 @@ static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *ca memset(cache->mfc_un.res.ttls, 255, MAXMIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { - if (MIF_EXISTS(mrt, vifi) && + if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) @@ -925,17 +864,17 @@ static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *ca cache->mfc_un.res.lastuse = jiffies; } -static int mif6_add(struct net *net, struct mr6_table *mrt, +static int mif6_add(struct net *net, struct mr_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; - struct mif_device *v = &mrt->vif6_table[vifi]; + struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ - if (MIF_EXISTS(mrt, vifi)) + if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { @@ -980,21 +919,10 @@ static int mif6_add(struct net *net, struct mr6_table *mrt, dev->ifindex, &in6_dev->cnf); } - /* - * Fill in the VIF structures - */ - v->rate_limit = vifc->vifc_rate_limit; - v->flags = vifc->mif6c_flags; - if (!mrtsock) - v->flags |= VIFF_STATIC; - v->threshold = vifc->vifc_threshold; - v->bytes_in = 0; - v->bytes_out = 0; - v->pkt_in = 0; - v->pkt_out = 0; - v->link = dev->ifindex; - if (v->flags & MIFF_REGISTER) - v->link = dev_get_iflink(dev); + /* Fill in the VIF structures */ + vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, + vifc->mif6c_flags | (!mrtsock ? 
VIFF_STATIC : 0), + MIFF_REGISTER); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); @@ -1006,78 +934,63 @@ static int mif6_add(struct net *net, struct mr6_table *mrt, if (vifi + 1 > mrt->maxvif) mrt->maxvif = vifi + 1; write_unlock_bh(&mrt_lock); + call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); return 0; } -static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, +static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { - int line = MFC6_HASH(mcastgrp, origin); - struct mfc6_cache *c; - - list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { - if (ipv6_addr_equal(&c->mf6c_origin, origin) && - ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) - return c; - } - return NULL; -} - -/* Look for a (*,*,oif) entry */ -static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt, - mifi_t mifi) -{ - int line = MFC6_HASH(&in6addr_any, &in6addr_any); - struct mfc6_cache *c; - - list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) - if (ipv6_addr_any(&c->mf6c_origin) && - ipv6_addr_any(&c->mf6c_mcastgrp) && - (c->mfc_un.res.ttls[mifi] < 255)) - return c; + struct mfc6_cache_cmp_arg arg = { + .mf6c_origin = *origin, + .mf6c_mcastgrp = *mcastgrp, + }; - return NULL; + return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ -static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt, +static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { - int line = MFC6_HASH(mcastgrp, &in6addr_any); - struct mfc6_cache *c, *proxy; + struct mfc6_cache_cmp_arg arg = { + .mf6c_origin = in6addr_any, + .mf6c_mcastgrp = *mcastgrp, + }; if (ipv6_addr_any(mcastgrp)) - goto skip; - - list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) - if (ipv6_addr_any(&c->mf6c_origin) && - ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) { - if (c->mfc_un.res.ttls[mifi] < 255) - 
return c; - - /* It's ok if the mifi is part of the static tree */ - proxy = ip6mr_cache_find_any_parent(mrt, - c->mf6c_parent); - if (proxy && proxy->mfc_un.res.ttls[mifi] < 255) - return c; - } + return mr_mfc_find_any_parent(mrt, mifi); + return mr_mfc_find_any(mrt, mifi, &arg); +} -skip: - return ip6mr_cache_find_any_parent(mrt, mifi); +/* Look for a (S,G,iif) entry if parent != -1 */ +static struct mfc6_cache * +ip6mr_cache_find_parent(struct mr_table *mrt, + const struct in6_addr *origin, + const struct in6_addr *mcastgrp, + int parent) +{ + struct mfc6_cache_cmp_arg arg = { + .mf6c_origin = *origin, + .mf6c_mcastgrp = *mcastgrp, + }; + + return mr_mfc_find_parent(mrt, &arg, parent); } -/* - * Allocate a multicast cache entry - */ +/* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; - c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; - c->mfc_un.res.minvif = MAXMIFS; + c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; + c->_c.mfc_un.res.minvif = MAXMIFS; + c->_c.free = ip6mr_cache_free_rcu; + refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } @@ -1086,8 +999,8 @@ static struct mfc6_cache *ip6mr_cache_alloc_unres(void) struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; - skb_queue_head_init(&c->mfc_un.unres.unresolved); - c->mfc_un.unres.expires = jiffies + 10 * HZ; + skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); + c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } @@ -1095,7 +1008,7 @@ static struct mfc6_cache *ip6mr_cache_alloc_unres(void) * A cache entry has gone into a resolved state from queued */ -static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, +static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; @@ -1104,12 +1017,13 @@ static void 
ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, * Play the pending entries through our router */ - while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); - if (__ip6mr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { + if (mr_fill_mroute(mrt, skb, &c->_c, + nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; @@ -1129,9 +1043,10 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, * Called under mrt_lock. */ -static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, +static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { + struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; @@ -1201,17 +1116,19 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, skb->ip_summed = CHECKSUM_UNNECESSARY; } - if (!mrt->mroute6_sk) { + rcu_read_lock(); + mroute6_sk = rcu_dereference(mrt->mroute_sk); + if (!mroute6_sk) { + rcu_read_unlock(); kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); - /* - * Deliver to user space multicast routing algorithms - */ - ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb); + /* Deliver to user space multicast routing algorithms */ + ret = sock_queue_rcv_skb(mroute6_sk, skb); + rcu_read_unlock(); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); @@ -1220,19 +1137,16 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, return ret; } -/* - * Queue a packet for resolution. It gets locked cache entry! - */ - -static int -ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) +/* Queue a packet for resolution. It gets locked cache entry! 
*/ +static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, + struct sk_buff *skb) { + struct mfc6_cache *c; bool found = false; int err; - struct mfc6_cache *c; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(c, &mrt->mfc6_unres_queue, list) { + list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; @@ -1253,10 +1167,8 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) return -ENOBUFS; } - /* - * Fill in the new cache entry - */ - c->mf6c_parent = -1; + /* Fill in the new cache entry */ + c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; @@ -1276,20 +1188,18 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) } atomic_inc(&mrt->cache_resolve_queue_len); - list_add(&c->list, &mrt->mfc6_unres_queue); + list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } - /* - * See if we can append the packet - */ - if (c->mfc_un.unres.unresolved.qlen > 3) { + /* See if we can append the packet */ + if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { - skb_queue_tail(&c->mfc_un.unres.unresolved, skb); + skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -1301,29 +1211,26 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) * MFC6 cache manipulation by user space */ -static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc, +static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { - int line; - struct mfc6_cache *c, *next; - - line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); + struct mfc6_cache *c; - list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[line], list) { - if 
(ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && - ipv6_addr_equal(&c->mf6c_mcastgrp, - &mfc->mf6cc_mcastgrp.sin6_addr) && - (parent == -1 || parent == c->mf6c_parent)) { - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); + /* The entries are added/deleted only under RTNL */ + rcu_read_lock(); + c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, + &mfc->mf6cc_mcastgrp.sin6_addr, parent); + rcu_read_unlock(); + if (!c) + return -ENOENT; + rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); + list_del_rcu(&c->_c.list); - mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_cache_free(c); - return 0; - } - } - return -ENOENT; + call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), + FIB_EVENT_ENTRY_DEL, c, mrt->id); + mr6_netlink_event(mrt, c, RTM_DELROUTE); + mr_cache_put(&c->_c); + return 0; } static int ip6mr_device_event(struct notifier_block *this, @@ -1331,15 +1238,15 @@ static int ip6mr_device_event(struct notifier_block *this, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); - struct mr6_table *mrt; - struct mif_device *v; + struct mr_table *mrt; + struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { - v = &mrt->vif6_table[0]; + v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (v->dev == dev) mif6_delete(mrt, ct, 1, NULL); @@ -1349,21 +1256,63 @@ static int ip6mr_device_event(struct notifier_block *this, return NOTIFY_DONE; } +static unsigned int ip6mr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); +} + +static int ip6mr_dump(struct net *net, struct notifier_block *nb) +{ + return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, + ip6mr_mr_table_iter, &mrt_lock); +} + static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; -/* - * Setup for IP multicast routing - */ +static 
const struct fib_notifier_ops ip6mr_notifier_ops_template = { + .family = RTNL_FAMILY_IP6MR, + .fib_seq_read = ip6mr_seq_read, + .fib_dump = ip6mr_dump, + .owner = THIS_MODULE, +}; + +static int __net_init ip6mr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv6.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + net->ipv6.ip6mr_notifier_ops = ops; + + return 0; +} +static void __net_exit ip6mr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); + net->ipv6.ip6mr_notifier_ops = NULL; +} + +/* Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; + err = ip6mr_notifier_init(net); + if (err) + return err; + err = ip6mr_rules_init(net); if (err < 0) - goto fail; + goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -1381,7 +1330,8 @@ proc_cache_fail: proc_vif_fail: ip6mr_rules_exit(net); #endif -fail: +ip6mr_rules_fail: + ip6mr_notifier_exit(net); return err; } @@ -1392,6 +1342,7 @@ static void __net_exit ip6mr_net_exit(struct net *net) remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_rules_exit(net); + ip6mr_notifier_exit(net); } static struct pernet_operations ip6mr_net_ops = { @@ -1452,14 +1403,14 @@ void ip6_mr_cleanup(void) kmem_cache_destroy(mrt_cachep); } -static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, +static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { - bool found = false; - int line; - struct mfc6_cache *uc, *c; unsigned char ttls[MAXMIFS]; - int i; + struct mfc6_cache *uc, *c; + struct mr_mfc *_uc; + bool found; + int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; @@ -1468,28 +1419,22 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; - } - line = 
MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); - - list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { - if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && - ipv6_addr_equal(&c->mf6c_mcastgrp, - &mfc->mf6cc_mcastgrp.sin6_addr) && - (parent == -1 || parent == mfc->mf6cc_parent)) { - found = true; - break; - } - } - - if (found) { + /* The entries are added/deleted only under RTNL */ + rcu_read_lock(); + c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, + &mfc->mf6cc_mcastgrp.sin6_addr, parent); + rcu_read_unlock(); + if (c) { write_lock_bh(&mrt_lock); - c->mf6c_parent = mfc->mf6cc_parent; - ip6mr_update_thresholds(mrt, c, ttls); + c->_c.mfc_parent = mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, + c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1504,31 +1449,36 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; - c->mf6c_parent = mfc->mf6cc_parent; - ip6mr_update_thresholds(mrt, c, ttls); + c->_c.mfc_parent = mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; - write_lock_bh(&mrt_lock); - list_add(&c->list, &mrt->mfc6_cache_array[line]); - write_unlock_bh(&mrt_lock); + err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, + ip6mr_rht_params); + if (err) { + pr_err("ip6mr: rhtable insert error %d\n", err); + ip6mr_cache_free(c); + return err; + } + list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); - /* - * Check to see if we resolved a queued list. If so we - * need to send on the frames and tidy up. + /* Check to see if we resolved a queued list. 
If so we + * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(uc, &mrt->mfc6_unres_queue, list) { + list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { + uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { - list_del(&uc->list); + list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } - if (list_empty(&mrt->mfc6_unres_queue)) + if (list_empty(&mrt->mfc_unres_queue)) del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); @@ -1536,6 +1486,8 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } + call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, + c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1544,61 +1496,59 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr6_table *mrt, bool all) +static void mroute_clean_tables(struct mr_table *mrt, bool all) { - int i; + struct mr_mfc *c, *tmp; LIST_HEAD(list); - struct mfc6_cache *c, *next; + int i; - /* - * Shut down all active vif entries - */ + /* Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { - if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC)) + if (!all && (mrt->vif_table[i].flags & VIFF_STATIC)) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); - /* - * Wipe the cache - */ - for (i = 0; i < MFC6_LINES; i++) { - list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) { - if (!all && (c->mfc_flags & MFC_STATIC)) - continue; - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); - - mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_cache_free(c); - } + /* Wipe the cache */ + list_for_each_entry_safe(c, tmp, 
&mrt->mfc_cache_list, list) { + if (!all && (c->mfc_flags & MFC_STATIC)) + continue; + rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); + list_del_rcu(&c->list); + mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); + mr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); - list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); - mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_destroy_unres(mrt, c); + call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), + FIB_EVENT_ENTRY_DEL, + (struct mfc6_cache *)c, + mrt->id); + mr6_netlink_event(mrt, (struct mfc6_cache *)c, + RTM_DELROUTE); + ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } } -static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) +static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); write_lock_bh(&mrt_lock); - if (likely(mrt->mroute6_sk == NULL)) { - mrt->mroute6_sk = sk; - net->ipv6.devconf_all->mc_forwarding++; - } else { + if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; + } else { + rcu_assign_pointer(mrt->mroute_sk, sk); + sock_set_flag(sk, SOCK_RCU_FREE); + net->ipv6.devconf_all->mc_forwarding++; } write_unlock_bh(&mrt_lock); @@ -1616,7 +1566,7 @@ int ip6mr_sk_done(struct sock *sk) { int err = -EACCES; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) @@ -1624,9 +1574,13 @@ int ip6mr_sk_done(struct sock *sk) rtnl_lock(); ip6mr_for_each_table(mrt, net) { - if (sk == mrt->mroute6_sk) { + if (sk == rtnl_dereference(mrt->mroute_sk)) { write_lock_bh(&mrt_lock); - mrt->mroute6_sk = NULL; + RCU_INIT_POINTER(mrt->mroute_sk, NULL); + /* Note that mroute_sk had SOCK_RCU_FREE set, + * so the RCU grace period before sk 
freeing + * is guaranteed by sk_destruct() + */ net->ipv6.devconf_all->mc_forwarding--; write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -1644,9 +1598,9 @@ int ip6mr_sk_done(struct sock *sk) return err; } -struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) +bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { - struct mr6_table *mrt; + struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, @@ -1656,8 +1610,9 @@ struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; - return mrt->mroute6_sk; + return rcu_access_pointer(mrt->mroute_sk); } +EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. The whole @@ -1673,7 +1628,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) @@ -1684,7 +1639,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns return -ENOENT; if (optname != MRT6_INIT) { - if (sk != mrt->mroute6_sk && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (sk != rcu_access_pointer(mrt->mroute_sk) && + !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } @@ -1706,7 +1662,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); - ret = mif6_add(net, mrt, &vif, sk == mrt->mroute6_sk); + ret = mif6_add(net, mrt, &vif, + sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; @@ -1741,7 +1698,9 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, - sk == mrt->mroute6_sk, 
parent); + sk == + rtnl_dereference(mrt->mroute_sk), + parent); rtnl_unlock(); return ret; @@ -1793,7 +1752,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; - if (sk == mrt->mroute6_sk) + if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); @@ -1824,7 +1783,7 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int olr; int val; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) @@ -1872,10 +1831,10 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) { struct sioc_sg_req6 sr; struct sioc_mif_req6 vr; - struct mif_device *vif; + struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); if (!mrt) @@ -1888,8 +1847,8 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) if (vr.mifi >= mrt->maxvif) return -EINVAL; read_lock(&mrt_lock); - vif = &mrt->vif6_table[vr.mifi]; - if (MIF_EXISTS(mrt, vr.mifi)) { + vif = &mrt->vif_table[vr.mifi]; + if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; @@ -1906,19 +1865,19 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; - read_lock(&mrt_lock); + rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; - read_unlock(&mrt_lock); + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; + rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; @@ -1946,10 +1905,10 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; - struct mif_device *vif; + struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); if (!mrt) @@ -1962,8 +1921,8 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) if (vr.mifi >= mrt->maxvif) return -EINVAL; read_lock(&mrt_lock); - vif = &mrt->vif6_table[vr.mifi]; - if (MIF_EXISTS(mrt, vr.mifi)) { + vif = &mrt->vif_table[vr.mifi]; + if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; @@ -1980,19 +1939,19 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; - read_lock(&mrt_lock); + rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; - read_unlock(&mrt_lock); + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; + rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; @@ -2013,11 +1972,11 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct * Processing handlers for ip6mr_forward */ -static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, +static int ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc6_cache *c, int vifi) { struct ipv6hdr *ipv6h; - struct mif_device *vif = &mrt->vif6_table[vifi]; + struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; @@ -2087,46 +2046,50 @@ out_free: return 0; } -static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev) +static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; for (ct = mrt->maxvif - 1; ct >= 0; ct--) { - if (mrt->vif6_table[ct].dev == dev) + if (mrt->vif_table[ct].dev == dev) break; } return ct; } 
-static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, - struct sk_buff *skb, struct mfc6_cache *cache) +static void ip6_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, skb->dev); - vif = cache->mf6c_parent; - cache->mfc_un.res.pkt++; - cache->mfc_un.res.bytes += skb->len; - cache->mfc_un.res.lastuse = jiffies; + vif = c->_c.mfc_parent; + c->_c.mfc_un.res.pkt++; + c->_c.mfc_un.res.bytes += skb->len; + c->_c.mfc_un.res.lastuse = jiffies; - if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) { + if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ - cache_proxy = ip6mr_cache_find_any_parent(mrt, vif); + rcu_read_lock(); + cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && - cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) { + rcu_read_unlock(); goto forward; + } + rcu_read_unlock(); } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif6_table[vif].dev != skb->dev) { - cache->mfc_un.res.wrong_if++; + if (mrt->vif_table[vif].dev != skb->dev) { + c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -2135,52 +2098,55 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, large chunk of pimd to kernel. Ough... 
--ANK */ (mrt->mroute_do_pim || - cache->mfc_un.res.ttls[true_vifi] < 255) && + c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, - cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { - cache->mfc_un.res.last_assert = jiffies; + c->_c.mfc_un.res.last_assert + + MFC_ASSERT_THRESH)) { + c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); } goto dont_forward; } forward: - mrt->vif6_table[vif].pkt_in++; - mrt->vif6_table[vif].bytes_in += skb->len; + mrt->vif_table[vif].pkt_in++; + mrt->vif_table[vif].bytes_in += skb->len; /* * Forward the frame */ - if (ipv6_addr_any(&cache->mf6c_origin) && - ipv6_addr_any(&cache->mf6c_mcastgrp)) { + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && - true_vifi != cache->mf6c_parent && + true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > - cache->mfc_un.res.ttls[cache->mf6c_parent]) { + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. 
*/ - psend = cache->mf6c_parent; + psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } - for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) { + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ - if ((!ipv6_addr_any(&cache->mf6c_origin) || ct != true_vifi) && - ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) { + if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && + ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ip6mr_forward2(net, mrt, skb2, cache, psend); + ip6mr_forward2(net, mrt, skb2, + c, psend); } psend = ct; } } last_forward: if (psend != -1) { - ip6mr_forward2(net, mrt, skb, cache, psend); + ip6mr_forward2(net, mrt, skb, c, psend); return; } @@ -2197,7 +2163,7 @@ int ip6_mr_input(struct sk_buff *skb) { struct mfc6_cache *cache; struct net *net = dev_net(skb->dev); - struct mr6_table *mrt; + struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, @@ -2247,66 +2213,11 @@ int ip6_mr_input(struct sk_buff *skb) return 0; } - -static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, - struct mfc6_cache *c, struct rtmsg *rtm) -{ - struct rta_mfc_stats mfcs; - struct nlattr *mp_attr; - struct rtnexthop *nhp; - unsigned long lastuse; - int ct; - - /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mf6c_parent >= MAXMIFS) { - rtm->rtm_flags |= RTNH_F_UNRESOLVED; - return -ENOENT; - } - - if (MIF_EXISTS(mrt, c->mf6c_parent) && - nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) - return -EMSGSIZE; - mp_attr = nla_nest_start(skb, RTA_MULTIPATH); - if (!mp_attr) - return -EMSGSIZE; - - for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - nhp = 
nla_reserve_nohdr(skb, sizeof(*nhp)); - if (!nhp) { - nla_nest_cancel(skb, mp_attr); - return -EMSGSIZE; - } - - nhp->rtnh_flags = 0; - nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = mrt->vif6_table[ct].dev->ifindex; - nhp->rtnh_len = sizeof(*nhp); - } - } - - nla_nest_end(skb, mp_attr); - - lastuse = READ_ONCE(c->mfc_un.res.lastuse); - lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; - - mfcs.mfcs_packets = c->mfc_un.res.pkt; - mfcs.mfcs_bytes = c->mfc_un.res.bytes; - mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), - RTA_PAD)) - return -EMSGSIZE; - - rtm->rtm_type = RTN_MULTICAST; - return 1; -} - int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; - struct mr6_table *mrt; + struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); @@ -2367,15 +2278,12 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, return err; } - if (rtm->rtm_flags & RTM_F_NOTIFY) - cache->mfc_flags |= MFC_NOTIFY; - - err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); + err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); read_unlock(&mrt_lock); return err; } -static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, +static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { @@ -2397,7 +2305,7 @@ static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; - if (c->mfc_flags & MFC_STATIC) + if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; @@ -2406,7 +2314,7 @@ static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, if (nla_put_in6_addr(skb, 
RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; - err = __ip6mr_fill_mroute(mrt, skb, c, rtm); + err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; @@ -2419,6 +2327,14 @@ nla_put_failure: return -EMSGSIZE; } +static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags) +{ + return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, + cmd, flags); +} + static int mr6_msgsize(bool unresolved, int maxvif) { size_t len = @@ -2440,14 +2356,14 @@ static int mr6_msgsize(bool unresolved, int maxvif) return len; } -static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, +static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(mr6_msgsize(mfc->mf6c_parent >= MAXMIFS, mrt->maxvif), + skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; @@ -2482,7 +2398,7 @@ static size_t mrt6msg_netlink_msgsize(size_t payloadlen) return len; } -static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt) +static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; @@ -2532,65 +2448,6 @@ errout: static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); - struct mr6_table *mrt; - struct mfc6_cache *mfc; - unsigned int t = 0, s_t; - unsigned int h = 0, s_h; - unsigned int e = 0, s_e; - - s_t = cb->args[0]; - s_h = cb->args[1]; - s_e = cb->args[2]; - - read_lock(&mrt_lock); - ip6mr_for_each_table(mrt, net) { - if (t < s_t) - goto next_table; - if (t > s_t) - s_h = 0; - for (h = s_h; h < MFC6_LINES; 
h++) { - list_for_each_entry(mfc, &mrt->mfc6_cache_array[h], list) { - if (e < s_e) - goto next_entry; - if (ip6mr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) - goto done; -next_entry: - e++; - } - e = s_e = 0; - } - spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(mfc, &mrt->mfc6_unres_queue, list) { - if (e < s_e) - goto next_entry2; - if (ip6mr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) { - spin_unlock_bh(&mfc_unres_lock); - goto done; - } -next_entry2: - e++; - } - spin_unlock_bh(&mfc_unres_lock); - e = s_e = 0; - s_h = 0; -next_table: - t++; - } -done: - read_unlock(&mrt_lock); - - cb->args[2] = e; - cb->args[1] = h; - cb->args[0] = t; - - return skb->len; + return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, + _ip6mr_fill_mroute, &mfc_unres_lock); } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d78d41fc4b1a..4d780c7f0130 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1367,10 +1367,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, if (get_user(len, optlen)) return -EFAULT; - lock_sock(sk); - err = nf_getsockopt(sk, PF_INET6, optname, optval, - &len); - release_sock(sk); + err = nf_getsockopt(sk, PF_INET6, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); } @@ -1409,10 +1406,7 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, if (get_user(len, optlen)) return -EFAULT; - lock_sock(sk); - err = compat_nf_getsockopt(sk, PF_INET6, - optname, optval, &len); - release_sock(sk); + err = compat_nf_getsockopt(sk, PF_INET6, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); } @@ -1421,4 +1415,3 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL(compat_ipv6_getsockopt); #endif - diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9b9d2ff01b35..793159d77d8a 100644 --- 
a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -165,7 +165,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(net, addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); @@ -254,7 +254,7 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, struct inet6_dev *idev = NULL; if (ifindex == 0) { - struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0); + struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; @@ -2921,9 +2921,9 @@ static int __net_init igmp6_proc_init(struct net *net) int err; err = -ENOMEM; - if (!proc_create("igmp6", S_IRUGO, net->proc_net, &igmp6_mc_seq_fops)) + if (!proc_create("igmp6", 0444, net->proc_net, &igmp6_mc_seq_fops)) goto out; - if (!proc_create("mcfilter6", S_IRUGO, net->proc_net, + if (!proc_create("mcfilter6", 0444, net->proc_net, &igmp6_mcf_seq_fops)) goto out_proc_net_igmp6; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f61a5b613b52..9de4dfb126ba 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -527,7 +527,7 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, } if (!dev->addr_len) - inc_opt = 0; + inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_ADVERTISEMENT); @@ -707,7 +707,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) int probes = atomic_read(&neigh->probes); if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr, - dev, 1, + dev, false, 1, IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) saddr = &ipv6_hdr(skb)->saddr; probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); @@ -1554,7 +1554,8 @@ static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, *(opt++) = (rd_len >> 3); opt += 6; - memcpy(opt, ipv6_hdr(orig_skb), rd_len - 8); + skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt, + rd_len - 8); } void 
ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index d95ceca7ff8f..531d6957af36 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -21,18 +21,19 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); + struct sock *sk = sk_to_full_sk(skb->sk); unsigned int hh_len; struct dst_entry *dst; struct flowi6 fl6 = { - .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .flowi6_oif = sk ? sk->sk_bound_dev_if : 0, .flowi6_mark = skb->mark, - .flowi6_uid = sock_net_uid(net, skb->sk), + .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, .saddr = iph->saddr, }; int err; - dst = ip6_route_output(net, skb->sk, &fl6); + dst = ip6_route_output(net, sk, &fl6); err = dst->error; if (err) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -50,7 +51,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { skb_dst_set(skb, NULL); - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0); + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_set(skb, dst); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index d395d1590699..ccbfa83e4bb0 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -34,7 +34,7 @@ config NF_SOCKET_IPV6 if NF_TABLES config NF_TABLES_IPV6 - tristate "IPv6 nf_tables support" + bool "IPv6 nf_tables support" help This option enables the IPv6 support for nf_tables. 
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index d984057b8395..44273d6f03a5 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -36,7 +36,6 @@ obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o # nf_tables -obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index af4c917e0836..65c9e1a58305 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -352,6 +352,10 @@ ip6t_do_table(struct sk_buff *skb, } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { + if (unlikely(stackidx >= private->stacksize)) { + verdict = NF_DROP; + break; + } jumpstack[stackidx++] = e; } @@ -416,11 +420,6 @@ mark_source_chains(const struct xt_table_info *newinfo, t->verdict < 0) || visited) { unsigned int oldpos, size; - if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) - return 0; - /* Return: backtrack through the last big jump. 
*/ do { @@ -721,16 +720,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (i != repl->num_entries) goto out_free; - /* Check hooks all assigned */ - for (i = 0; i < NF_INET_NUMHOOKS; i++) { - /* Only hooks which are valid */ - if (!(repl->valid_hooks & (1 << i))) - continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) - goto out_free; - if (newinfo->underflow[i] == 0xFFFFFFFF) - goto out_free; - } + ret = xt_check_table_hooks(newinfo, repl->valid_hooks); + if (ret) + goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; @@ -958,7 +950,9 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; - xt_compat_init_offsets(AF_INET6, info->number); + ret = xt_compat_init_offsets(AF_INET6, info->number); + if (ret) + return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1071,7 +1065,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct ip6t_entry *iter; ret = 0; - counters = vzalloc(num_counters * sizeof(struct xt_counters)); + counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; @@ -1101,6 +1095,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); + xt_table_unlock(t); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ @@ -1114,7 +1110,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); } vfree(counters); - xt_table_unlock(t); return ret; put_module: @@ -1421,7 +1416,7 @@ translate_compat_table(struct net *net, struct compat_ip6t_entry *iter0; struct ip6t_replace 
repl; unsigned int size; - int ret = 0; + int ret; info = *pinfo; entry0 = *pentry0; @@ -1430,7 +1425,9 @@ translate_compat_table(struct net *net, j = 0; xt_compat_lock(AF_INET6); - xt_compat_init_offsets(AF_INET6, compatr->num_entries); + ret = xt_compat_init_offsets(AF_INET6, compatr->num_entries); + if (ret) + goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index fa51a205918d..38dea8ff680f 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -85,14 +85,14 @@ static int reject_tg6_check(const struct xt_tgchk_param *par) const struct ip6t_entry *e = par->entryinfo; if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { - pr_info("ECHOREPLY is not supported.\n"); + pr_info_ratelimited("ECHOREPLY is not supported\n"); return -EINVAL; } else if (rejinfo->with == IP6T_TCP_RESET) { /* Must specify that it's a TCP packet */ if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || (e->ipv6.invflags & XT_INV_PROTO)) { - pr_info("TCP_RESET illegal for non-tcp\n"); + pr_info_ratelimited("TCP_RESET illegal for non-tcp\n"); return -EINVAL; } } diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 437af8c95277..cb6d42b03cb5 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -18,6 +18,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> +#include <net/netfilter/nf_conntrack_ecache.h> static struct ipv6hdr * synproxy_build_ip(struct net *net, struct sk_buff *skb, @@ -405,6 +406,8 @@ static unsigned int ipv6_synproxy_hook(void *priv, synproxy->isn = ntohl(th->ack_seq); if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; + + 
nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) @@ -413,8 +416,10 @@ static unsigned int ipv6_synproxy_hook(void *priv, if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } opts.options &= ~(XT_SYNPROXY_OPT_MSS | XT_SYNPROXY_OPT_WSCALE | @@ -424,6 +429,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack(net, skb, th, &opts); diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index b12e61b7b16c..d12f511929f5 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -48,12 +48,8 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, } fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; - if ((flags & XT_RPFILTER_LOOSE) == 0) { - fl6.flowi6_oif = dev->ifindex; - lookup_flags |= RT6_LOOKUP_F_IFACE; - } - rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags); + rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); if (rt->dst.error) goto out; @@ -103,14 +99,14 @@ static int rpfilter_check(const struct xt_mtchk_param *par) unsigned int options = ~XT_RPFILTER_OPTION_MASK; if (info->flags & options) { - pr_info("unknown options encountered"); + pr_info_ratelimited("unknown options\n"); return -EINVAL; } if (strcmp(par->table, "mangle") != 0 && strcmp(par->table, "raw") != 0) { - pr_info("match only valid in the \'raw\' " - "or \'mangle\' tables, not \'%s\'.\n", par->table); + pr_info_ratelimited("only valid in \'raw\' or \'mangle\' table, not \'%s\'\n", + par->table); return -EINVAL; } diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c index 9642164107ce..33719d5560c8 100644 --- a/net/ipv6/netfilter/ip6t_srh.c +++ b/net/ipv6/netfilter/ip6t_srh.c @@ -122,12 +122,14 @@ static int srh_mt6_check(const struct xt_mtchk_param *par) const struct ip6t_srh *srhinfo = par->matchinfo; if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { - pr_err("unknown srh match flags %X\n", srhinfo->mt_flags); + pr_info_ratelimited("unknown srh match flags %X\n", + srhinfo->mt_flags); return -EINVAL; } if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { - pr_err("unknown srh invflags %X\n", srhinfo->mt_invflags); + pr_info_ratelimited("unknown srh invflags %X\n", + srhinfo->mt_invflags); return -EINVAL; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b84ce3e6d728..5e0332014c17 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -52,18 +52,9 @@ static const char nf_frags_cache_name[] = "nf-frags"; -struct nf_ct_frag6_skb_cb -{ - struct inet6_skb_parm h; - int offset; -}; - -#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb *)((skb)->cb)) 
- static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL -static int zero; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { @@ -76,18 +67,17 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_low_thresh", .data = &init_net.nf_frag.frags.low_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .proc_handler = proc_doulongvec_minmax, .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", .data = &init_net.nf_frag.frags.high_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.nf_frag.frags.low_thresh }, { } @@ -152,23 +142,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); -} - - -static unsigned int nf_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *nq; - - nq = container_of(q, struct frag_queue, q); - return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); -} - static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); @@ -178,34 +151,26 @@ static void nf_ct_frag6_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6_expire_frag_queue(net, fq, &nf_frags); + ip6_expire_frag_queue(net, fq); } /* Creation primitives. 
*/ -static inline struct frag_queue *fq_find(struct net *net, __be32 id, - u32 user, struct in6_addr *src, - struct in6_addr *dst, int iif, u8 ecn) +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, + const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = user, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = user; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - - local_bh_disable(); - hash = nf_hash_frag(id, src, dst); - - q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); - local_bh_enable(); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + + q = inet_frag_find(&net->nf_frag.frags, &key); + if (!q) return NULL; - } + return container_of(q, struct frag_queue, q); } @@ -264,7 +229,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); return -EPROTO; } if (end > fq->q.len) { @@ -295,13 +260,13 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this fragment, right? */ prev = fq->q.fragments_tail; - if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { - if (NFCT_FRAG6_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -317,14 +282,19 @@ found: /* Check for overlap with preceding fragment. */ if (prev && - (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) + (prev->ip_defrag_offset + prev->len) > offset) goto discard_fq; /* Look for overlap with succeeding segment. 
*/ - if (next && NFCT_FRAG6_CB(next)->offset < end) + if (next && next->ip_defrag_offset < end) goto discard_fq; - NFCT_FRAG6_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + if (skb->dev) + fq->iif = skb->dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -335,10 +305,6 @@ found: else fq->q.fragments = skb; - if (skb->dev) { - fq->iif = skb->dev->ifindex; - skb->dev = NULL; - } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; @@ -357,7 +323,7 @@ found: return 0; discard_fq: - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); err: return -EINVAL; } @@ -379,10 +345,10 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic int payload_len; u8 ecn; - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); WARN_ON(head == NULL); - WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -593,8 +559,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) fhdr = (struct frag_hdr *)skb_transport_header(skb); skb_orphan(skb); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + fq = fq_find(net, fhdr->identification, user, hdr, + skb->dev ? 
skb->dev->ifindex : 0); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; @@ -622,25 +588,33 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) out_unlock: spin_unlock_bh(&fq->q.lock); - inet_frag_put(&fq->q, &nf_frags); + inet_frag_put(&fq->q); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { + int res; + net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - inet_frags_init_net(&net->nf_frag.frags); - - return nf_ct_frag6_sysctl_register(net); + net->nf_frag.frags.f = &nf_frags; + + res = inet_frags_init_net(&net->nf_frag.frags); + if (res < 0) + return res; + res = nf_ct_frag6_sysctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->nf_frag.frags); + return res; } static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); + inet_frags_exit_net(&net->nf_frag.frags); } static struct pernet_operations nf_ct_net_ops = { @@ -652,13 +626,12 @@ int nf_ct_frag6_init(void) { int ret = 0; - nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); - nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; + nf_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index d346705d6ee6..207cb35569b1 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -178,7 +178,7 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if (skb->len <= mtu) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if 
(skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index bed57ee65f7b..6b7f075f811f 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -99,6 +99,10 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, !l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff, target, maniptype)) return false; + + /* must reload, offset might have changed */ + ipv6h = (void *)skb->data + iphdroff; + manip_addr: if (maniptype == NF_NAT_MANIP_SRC) ipv6h->saddr = target->src.u3.in6; diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index ebb2bf84232a..f14de4b6d639 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -116,9 +116,11 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, } if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) { - struct udphdr _hdr, *hp; + struct tcphdr _hdr; + struct udphdr *hp; - hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + hp = skb_header_pointer(skb, thoff, tproto == IPPROTO_UDP ? + sizeof(*hp) : sizeof(_hdr), &_hdr); if (hp == NULL) return NULL; diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c deleted file mode 100644 index 17e03589331c..000000000000 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/ipv6.h> -#include <linux/netfilter_ipv6.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_ipv6.h> - -static unsigned int nft_do_chain_ipv6(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); - - return nft_do_chain(&pkt, priv); -} - -static const struct nf_chain_type filter_ipv6 = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_IPV6, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING), - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, - [NF_INET_LOCAL_OUT] = nft_do_chain_ipv6, - [NF_INET_FORWARD] = nft_do_chain_ipv6, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, - }, -}; - -static int __init nf_tables_ipv6_init(void) -{ - return nft_register_chain_type(&filter_ipv6); -} - -static void __exit nf_tables_ipv6_exit(void) -{ - nft_unregister_chain_type(&filter_ipv6); -} - -module_init(nf_tables_ipv6_init); -module_exit(nf_tables_ipv6_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET6, "filter"); diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 73fe2bd13fcf..3557b114446c 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -65,7 +65,17 @@ static unsigned int nft_nat_ipv6_local_fn(void *priv, return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain); } -static const struct nf_chain_type nft_chain_nat_ipv6 = { +static int nft_nat_ipv6_init(struct nft_ctx *ctx) +{ + return nf_ct_netns_get(ctx->net, 
ctx->family); +} + +static void nft_nat_ipv6_free(struct nft_ctx *ctx) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + +static const struct nft_chain_type nft_chain_nat_ipv6 = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_IPV6, @@ -80,15 +90,13 @@ static const struct nf_chain_type nft_chain_nat_ipv6 = { [NF_INET_LOCAL_OUT] = nft_nat_ipv6_local_fn, [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn, }, + .init = nft_nat_ipv6_init, + .free = nft_nat_ipv6_free, }; static int __init nft_chain_nat_ipv6_init(void) { - int err; - - err = nft_register_chain_type(&nft_chain_nat_ipv6); - if (err < 0) - return err; + nft_register_chain_type(&nft_chain_nat_ipv6); return 0; } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 11d3c3b9aa18..da3f1f8cb325 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -60,7 +60,7 @@ static unsigned int nf_route_table_hook(void *priv, return ret; } -static const struct nf_chain_type nft_chain_route_ipv6 = { +static const struct nft_chain_type nft_chain_route_ipv6 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, .family = NFPROTO_IPV6, @@ -73,7 +73,9 @@ static const struct nf_chain_type nft_chain_route_ipv6 = { static int __init nft_chain_route_init(void) { - return nft_register_chain_type(&nft_chain_route_ipv6); + nft_register_chain_type(&nft_chain_route_ipv6); + + return 0; } static void __exit nft_chain_route_exit(void) diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index cc5174c7254c..36be3cf0adef 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -180,8 +180,8 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, } *dest = 0; - again: - rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, lookup_flags); + rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, + lookup_flags); if (rt->dst.error) goto put_rt_err; @@ -189,15 +189,8 @@ void 
nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; - if (oif && oif != rt->rt6i_idev->dev) { - /* multipath route? Try again with F_IFACE */ - if ((lookup_flags & RT6_LOOKUP_F_IFACE) == 0) { - lookup_flags |= RT6_LOOKUP_F_IFACE; - fl6.flowi6_oif = oif->ifindex; - ip6_rt_put(rt); - goto again; - } - } + if (oif && oif != rt->rt6i_idev->dev) + goto put_rt_err; switch (priv->result) { case NFT_FIB_RESULT_OIF: diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index d12c55dad7d1..746eeae7f581 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -121,7 +121,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.tclass = np->tclass; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); - dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); + dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); if (IS_ERR(dst)) return PTR_ERR(dst); rt = (struct rt6_info *) dst; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index b67814242f78..a85f7e0b14b1 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -38,7 +38,6 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; - unsigned int frag_mem = ip6_frag_mem(net); seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); @@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); + seq_printf(seq, "FRAG6: inuse %u memory %lu\n", + atomic_read(&net->ipv6.frags.rhashtable.nelems), + frag_mem_limit(&net->ipv6.frags)); return 0; } @@ -290,7 +291,7 @@ int snmp6_register_dev(struct inet6_dev *idev) if (!net->mib.proc_net_devsnmp6) return -ENOENT; - p = proc_create_data(idev->dev->name, S_IRUGO, + p = proc_create_data(idev->dev->name, 0444, 
net->mib.proc_net_devsnmp6, &snmp6_dev_seq_fops, idev); if (!p) @@ -314,11 +315,11 @@ int snmp6_unregister_dev(struct inet6_dev *idev) static int __net_init ipv6_proc_init_net(struct net *net) { - if (!proc_create("sockstat6", S_IRUGO, net->proc_net, + if (!proc_create("sockstat6", 0444, net->proc_net, &sockstat6_seq_fops)) return -ENOMEM; - if (!proc_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops)) + if (!proc_create("snmp6", 0444, net->proc_net, &snmp6_seq_fops)) goto proc_snmp6_fail; net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); @@ -354,4 +355,3 @@ void ipv6_misc_proc_exit(void) { unregister_pernet_subsys(&ipv6_proc_ops); } - diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4c25339b1984..5eb9b08947ed 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1318,7 +1318,7 @@ static const struct file_operations raw6_seq_fops = { static int __net_init raw6_init_net(struct net *net) { - if (!proc_create("raw6", S_IRUGO, net->proc_net, &raw6_seq_fops)) + if (!proc_create("raw6", 0444, net->proc_net, &raw6_seq_fops)) return -ENOMEM; return 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index afbc000ad4f2..4979610287e2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -62,13 +62,6 @@ static const char ip6_frag_cache_name[] = "ip6-frags"; -struct ip6frag_skb_cb { - struct inet6_skb_parm h; - int offset; -}; - -#define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb)) - static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); @@ -79,94 +72,58 @@ static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); -/* - * callers should be careful not to use the hash value outside the ipfrag_lock - * as doing so could race with ipfrag_hash_rnd being recalculated. 
- */ -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, ip6_frags.rnd); -} - -static unsigned int ip6_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *fq; - - fq = container_of(q, struct frag_queue, q); - return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); -} - -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct frag_queue *fq; - const struct ip6_create_arg *arg = a; - - fq = container_of(q, struct frag_queue, q); - return fq->id == arg->id && - fq->user == arg->user && - ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst) && - (arg->iif == fq->iif || - !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | - IPV6_ADDR_LINKLOCAL))); -} -EXPORT_SYMBOL(ip6_frag_match); - void ip6_frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); - const struct ip6_create_arg *arg = a; + const struct frag_v6_compare_key *key = a; - fq->id = arg->id; - fq->user = arg->user; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; - fq->ecn = arg->ecn; + q->key.v6 = *key; + fq->ecn = 0; } EXPORT_SYMBOL(ip6_frag_init); -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, - struct inet_frags *frags) +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; + struct sk_buff *head; + rcu_read_lock(); spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q, frags); + inet_frag_kill(&fq->q); - rcu_read_lock(); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) - goto out_rcu_unlock; + goto out; __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - - if (inet_frag_evicting(&fq->q)) - goto out_rcu_unlock; - __IP6_INC_STATS(net, 
__in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments) - goto out_rcu_unlock; + head = fq->q.fragments; + if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; /* But use as source device on which LAST ARRIVED * segment was received. And do not use fq->dev * pointer directly, device might already disappeared. */ - fq->q.fragments->dev = dev; - icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); -out_rcu_unlock: - rcu_read_unlock(); + head->dev = dev; + skb_get(head); + spin_unlock(&fq->q.lock); + + icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; + out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, frags); +out_rcu_unlock: + rcu_read_unlock(); + inet_frag_put(&fq->q); } EXPORT_SYMBOL(ip6_expire_frag_queue); @@ -179,31 +136,29 @@ static void ip6_frag_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); - ip6_expire_frag_queue(net, fq, &ip6_frags); + ip6_expire_frag_queue(net, fq); } static struct frag_queue * -fq_find(struct net *net, __be32 id, const struct in6_addr *src, - const struct in6_addr *dst, int iif, u8 ecn) +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = IP6_DEFRAG_LOCAL_DELIVER, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = IP6_DEFRAG_LOCAL_DELIVER; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - hash = inet6_hash_frag(id, src, dst); + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))) + key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); - if (IS_ERR_OR_NULL(q)) { - 
inet_frag_maybe_warn_overflow(q, pr_fmt()); + q = inet_frag_find(&net->ipv6.frags, &key); + if (!q) return NULL; - } + return container_of(q, struct frag_queue, q); } @@ -288,13 +243,13 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, * this fragment, right? */ prev = fq->q.fragments_tail; - if (!prev || FRAG6_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { - if (FRAG6_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -309,14 +264,20 @@ found: /* Check for overlap with preceding fragment. */ if (prev && - (FRAG6_CB(prev)->offset + prev->len) > offset) + (prev->ip_defrag_offset + prev->len) > offset) goto discard_fq; /* Look for overlap with succeeding segment. */ - if (next && FRAG6_CB(next)->offset < end) + if (next && next->ip_defrag_offset < end) goto discard_fq; - FRAG6_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + dev = skb->dev; + if (dev) + fq->iif = dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. 
*/ skb->next = next; @@ -327,11 +288,6 @@ found: else fq->q.fragments = skb; - dev = skb->dev; - if (dev) { - fq->iif = dev->ifindex; - skb->dev = NULL; - } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; @@ -364,7 +320,7 @@ found: return -1; discard_fq: - inet_frag_kill(&fq->q, &ip6_frags); + inet_frag_kill(&fq->q); err: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); @@ -391,7 +347,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, int sum_truesize; u8 ecn; - inet_frag_kill(&fq->q, &ip6_frags); + inet_frag_kill(&fq->q); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -418,7 +374,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } WARN_ON(head == NULL); - WARN_ON(FRAG6_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); /* Unfragmented part is taken from the first segment. */ payload_len = ((head->data - skb_network_header(head)) - @@ -531,6 +487,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); + int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -559,17 +516,18 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + iif = skb->dev ? 
skb->dev->ifindex : 0; + fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { int ret; spin_lock(&fq->q.lock); + fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &ip6_frags); + inet_frag_put(&fq->q); return ret; } @@ -590,24 +548,22 @@ static const struct inet6_protocol frag_protocol = { }; #ifdef CONFIG_SYSCTL -static int zero; static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .proc_handler = proc_doulongvec_minmax, .extra2 = &init_net.ipv6.frags.high_thresh }, { @@ -650,10 +606,6 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) table[1].data = &net->ipv6.frags.low_thresh; table[1].extra2 = &net->ipv6.frags.high_thresh; table[2].data = &net->ipv6.frags.timeout; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; } hdr = register_net_sysctl(net, "net/ipv6", table); @@ -715,19 +667,27 @@ static void ip6_frags_sysctl_unregister(void) static int __net_init ipv6_frags_init_net(struct net *net) { + int res; + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; + net->ipv6.frags.f = &ip6_frags; - inet_frags_init_net(&net->ipv6.frags); + res = inet_frags_init_net(&net->ipv6.frags); + if (res < 0) + return res; - return ip6_frags_ns_sysctl_register(net); + res = ip6_frags_ns_sysctl_register(net); + if (res < 0) + 
inet_frags_exit_net(&net->ipv6.frags); + return res; } static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); + inet_frags_exit_net(&net->ipv6.frags); } static struct pernet_operations ip6_frags_ops = { @@ -735,14 +695,55 @@ static struct pernet_operations ip6_frags_ops = { .exit = ipv6_frags_exit_net, }; +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v6, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v6_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +const struct rhashtable_params ip6_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = ip6_key_hashfn, + .obj_hashfn = ip6_obj_hashfn, + .obj_cmpfn = ip6_obj_cmpfn, + .automatic_shrinking = true, +}; +EXPORT_SYMBOL(ip6_rhash_params); + int __init ipv6_frag_init(void) { int ret; - ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); + ip6_frags.constructor = ip6_frag_init; + ip6_frags.destructor = NULL; + ip6_frags.qsize = sizeof(struct frag_queue); + ip6_frags.frag_expire = ip6_frag_expire; + ip6_frags.frags_cache_name = ip6_frag_cache_name; + ip6_frags.rhash_params = ip6_rhash_params; + ret = inet_frags_init(&ip6_frags); if (ret) goto out; + ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); + if (ret) + goto err_protocol; + ret = ip6_frags_sysctl_register(); if (ret) goto err_sysctl; @@ -751,16 +752,6 @@ int __init ipv6_frag_init(void) if (ret) goto err_pernet; - ip6_frags.hashfn = ip6_hashfn; - ip6_frags.constructor = 
ip6_frag_init; - ip6_frags.destructor = NULL; - ip6_frags.qsize = sizeof(struct frag_queue); - ip6_frags.match = ip6_frag_match; - ip6_frags.frag_expire = ip6_frag_expire; - ip6_frags.frags_cache_name = ip6_frag_cache_name; - ret = inet_frags_init(&ip6_frags); - if (ret) - goto err_pernet; out: return ret; @@ -768,6 +759,8 @@ err_pernet: ip6_frags_sysctl_unregister(); err_sysctl: inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); +err_protocol: + inet_frags_fini(&ip6_frags); goto out; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9dcfadddd800..49b954d6d0fa 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -128,7 +128,7 @@ struct uncached_list { static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); -static void rt6_uncached_list_add(struct rt6_info *rt) +void rt6_uncached_list_add(struct rt6_info *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); @@ -139,7 +139,7 @@ static void rt6_uncached_list_add(struct rt6_info *rt) spin_unlock_bh(&ul->lock); } -static void rt6_uncached_list_del(struct rt6_info *rt) +void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->rt6i_uncached)) { struct uncached_list *ul = rt->rt6i_uncached_list; @@ -450,8 +450,10 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct rt6_info *rt6_multipath_select(struct rt6_info *match, +static struct rt6_info *rt6_multipath_select(const struct net *net, + struct rt6_info *match, struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict) { struct rt6_info *sibling, *next_sibling; @@ -460,7 +462,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, * case it will always be non-zero. Otherwise now is the time to do it. 
*/ if (!fl6->mp_hash) - fl6->mp_hash = rt6_multipath_hash(fl6, NULL); + fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) return match; @@ -914,11 +916,16 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { struct rt6_info *rt, *rt_cache; struct fib6_node *fn; + if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) + flags &= ~RT6_LOOKUP_F_IFACE; + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: @@ -929,8 +936,8 @@ restart: rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, - fl6->flowi6_oif, flags); + rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, + skb, flags); } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); @@ -954,14 +961,15 @@ restart: } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, - int flags) + const struct sk_buff *skb, int flags) { - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); + return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); } EXPORT_SYMBOL_GPL(ip6_route_lookup); struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, - const struct in6_addr *saddr, int oif, int strict) + const struct in6_addr *saddr, int oif, + const struct sk_buff *skb, int strict) { struct flowi6 fl6 = { .flowi6_oif = oif, @@ -975,7 +983,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, flags |= RT6_LOOKUP_F_HAS_SADDR; } - dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup); + dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) return (struct rt6_info *) dst; @@ -1509,7 +1517,30 @@ 
static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) } } -static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) +static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, + struct rt6_info *rt, int mtu) +{ + /* If the new MTU is lower than the route PMTU, this new MTU will be the + * lowest MTU in the path: always allow updating the route PMTU to + * reflect PMTU decreases. + * + * If the new MTU is higher, and the route PMTU is equal to the local + * MTU, this means the old MTU is the lowest in the path, so allow + * updating it: if other nodes now have lower MTUs, PMTU discovery will + * handle this. + */ + + if (dst_mtu(&rt->dst) >= mtu) + return true; + + if (dst_mtu(&rt->dst) == idev->cnf.mtu6) + return true; + + return false; +} + +static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, + struct rt6_info *rt, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1518,20 +1549,22 @@ static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); - if (bucket) { - for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { - struct rt6_info *entry = rt6_ex->rt6i; - /* For RTF_CACHE with rt6i_pmtu == 0 - * (i.e. a redirected route), - * the metrics of its rt->dst.from has already - * been updated. - */ - if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu) - entry->rt6i_pmtu = mtu; - } - bucket++; + if (!bucket) + return; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + + /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected + * route), the metrics of its rt->dst.from have already + * been updated. 
+ */ + if (entry->rt6i_pmtu && + rt6_mtu_change_route_allowed(idev, entry, mtu)) + entry->rt6i_pmtu = mtu; } + bucket++; } } @@ -1601,11 +1634,10 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, struct neighbour *neigh; __u8 neigh_flags = 0; - neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); - if (neigh) { + neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + if (neigh) neigh_flags = neigh->flags; - neigh_release(neigh); - } + if (!(neigh_flags & NTF_ROUTER)) { RT6_TRACE("purging route %p via non-router but gateway\n", rt); @@ -1629,7 +1661,8 @@ void rt6_age_exceptions(struct rt6_info *rt, if (!rcu_access_pointer(rt->rt6i_exception_bucket)) return; - spin_lock_bh(&rt6_exception_lock); + rcu_read_lock_bh(); + spin_lock(&rt6_exception_lock); bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); @@ -1643,11 +1676,13 @@ void rt6_age_exceptions(struct rt6_info *rt, bucket++; } } - spin_unlock_bh(&rt6_exception_lock); + spin_unlock(&rt6_exception_lock); + rcu_read_unlock_bh(); } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int flags) + int oif, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; struct rt6_info *rt, *rt_cache; @@ -1669,7 +1704,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(rt, fl6, oif, strict); + rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -1768,28 +1803,35 @@ uncached_rt_out: } EXPORT_SYMBOL_GPL(ip6_pol_route); -static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) +static struct rt6_info *ip6_pol_route_input(struct net *net, + struct fib6_table 
*table, + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { - return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); + return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); } struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, - struct flowi6 *fl6, int flags) + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); + return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); static void ip6_multipath_l3_keys(const struct sk_buff *skb, - struct flow_keys *keys) + struct flow_keys *keys, + struct flow_keys *flkeys) { const struct ipv6hdr *outer_iph = ipv6_hdr(skb); const struct ipv6hdr *key_iph = outer_iph; + struct flow_keys *_flkeys = flkeys; const struct ipv6hdr *inner_iph; const struct icmp6hdr *icmph; struct ipv6hdr _inner_iph; @@ -1811,26 +1853,76 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, goto out; key_iph = inner_iph; + _flkeys = NULL; out: - memset(keys, 0, sizeof(*keys)); - keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - keys->addrs.v6addrs.src = key_iph->saddr; - keys->addrs.v6addrs.dst = key_iph->daddr; - keys->tags.flow_label = ip6_flowinfo(key_iph); - keys->basic.ip_proto = key_iph->nexthdr; + if (_flkeys) { + keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; + keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; + keys->tags.flow_label = _flkeys->tags.flow_label; + keys->basic.ip_proto = _flkeys->basic.ip_proto; + } else { + keys->addrs.v6addrs.src = key_iph->saddr; + keys->addrs.v6addrs.dst = key_iph->daddr; + keys->tags.flow_label = ip6_flowinfo(key_iph); + keys->basic.ip_proto = key_iph->nexthdr; + } } /* if skb is set it will be used and fl6 can be NULL */ -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff 
*skb) +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, + const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; + u32 mhash; - if (skb) { - ip6_multipath_l3_keys(skb, &hash_keys); - return flow_hash_from_keys(&hash_keys) >> 1; + switch (ip6_multipath_hash_policy(net)) { + case 0: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (skb) { + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); + } else { + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; + case 1: + if (skb) { + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + struct flow_keys keys; + + /* short-circuit if we already have L4 hash present */ + if (skb->l4_hash) + return skb_get_hash_raw(skb) >> 1; + + memset(&hash_keys, 0, sizeof(hash_keys)); + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, flag); + flkeys = &keys; + } + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.ports.src = fl6->fl6_sport; + hash_keys.ports.dst = fl6->fl6_dport; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; } + mhash = flow_hash_from_keys(&hash_keys); - return get_hash_from_flowi6(fl6) >> 1; + return mhash >> 1; } void ip6_route_input(struct sk_buff *skb) @@ -1847,20 +1939,29 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_mark = skb->mark, .flowi6_proto 
= iph->nexthdr, }; + struct flow_keys *flkeys = NULL, _flkeys; tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; + + if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) + flkeys = &_flkeys; + if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) - fl6.mp_hash = rt6_multipath_hash(&fl6, skb); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); + skb_dst_set(skb, + ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); } -static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) +static struct rt6_info *ip6_pol_route_output(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { - return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); + return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, @@ -1888,7 +1989,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, else if (sk) flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); + return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } EXPORT_SYMBOL_GPL(ip6_route_output_flags); @@ -2128,6 +2229,23 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); +void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, + const struct flowi6 *fl6) +{ +#ifdef CONFIG_IPV6_SUBTREES + struct ipv6_pinfo *np = inet6_sk(sk); +#endif + + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? + &sk->sk_v6_daddr : NULL, +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 
+ &np->saddr : +#endif + NULL); +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2137,6 +2255,7 @@ struct ip6rd_flowi { static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; @@ -2210,8 +2329,9 @@ out: }; static struct dst_entry *ip6_route_redirect(struct net *net, - const struct flowi6 *fl6, - const struct in6_addr *gateway) + const struct flowi6 *fl6, + const struct sk_buff *skb, + const struct in6_addr *gateway) { int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip6rd_flowi rdfl; @@ -2219,7 +2339,7 @@ static struct dst_entry *ip6_route_redirect(struct net *net, rdfl.fl6 = *fl6; rdfl.gateway = *gateway; - return fib6_rule_lookup(net, &rdfl.fl6, + return fib6_rule_lookup(net, &rdfl.fl6, skb, flags, __ip6_route_redirect); } @@ -2239,7 +2359,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_uid = uid; - dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); + dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } @@ -2261,7 +2381,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, fl6.saddr = iph->daddr; fl6.flowi6_uid = sock_net_uid(net, NULL); - dst = ip6_route_redirect(net, &fl6, &iph->saddr); + dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } @@ -2463,7 +2583,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; - rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); + rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); /* if table lookup failed, fall back to full lookup */ if (rt == net->ipv6.ip6_null_entry) { @@ -2476,7 +2596,7 @@ static struct rt6_info 
*ip6_nh_lookup_table(struct net *net, static int ip6_route_check_nh_onlink(struct net *net, struct fib6_config *cfg, - struct net_device *dev, + const struct net_device *dev, struct netlink_ext_ack *extack) { u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; @@ -2526,7 +2646,7 @@ static int ip6_route_check_nh(struct net *net, } if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); if (!grt) goto out; @@ -2552,6 +2672,79 @@ out: return err; } +static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, + struct net_device **_dev, struct inet6_dev **idev, + struct netlink_ext_ack *extack) +{ + const struct in6_addr *gw_addr = &cfg->fc_gateway; + int gwa_type = ipv6_addr_type(gw_addr); + bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; + const struct net_device *dev = *_dev; + bool need_addr_check = !dev; + int err = -EINVAL; + + /* if gw_addr is local we will fail to detect this in case + * address is still TENTATIVE (DAD in progress). rt6_lookup() + * will return already-added prefix route via interface that + * prefix route was assigned to, which might be non-loopback. + */ + if (dev && + ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { + NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); + goto out; + } + + if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { + /* IPv6 strictly inhibits using not link-local + * addresses as nexthop address. + * Otherwise, router will not able to send redirects. + * It is very good, but in some (rare!) circumstances + * (SIT, PtP, NBMA NOARP links) it is handy to allow + * some exceptions. 
--ANK + * We allow IPv4-mapped nexthops to support RFC4798-type + * addressing + */ + if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { + NL_SET_ERR_MSG(extack, "Invalid gateway address"); + goto out; + } + + if (cfg->fc_flags & RTNH_F_ONLINK) + err = ip6_route_check_nh_onlink(net, cfg, dev, extack); + else + err = ip6_route_check_nh(net, cfg, _dev, idev); + + if (err) + goto out; + } + + /* reload in case device was changed */ + dev = *_dev; + + err = -EINVAL; + if (!dev) { + NL_SET_ERR_MSG(extack, "Egress device not specified"); + goto out; + } else if (dev->flags & IFF_LOOPBACK) { + NL_SET_ERR_MSG(extack, + "Egress device can not be loopback device for this route"); + goto out; + } + + /* if we did not check gw_addr above, do so now that the + * egress device has been resolved. + */ + if (need_addr_check && + ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { + NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); + goto out; + } + + err = 0; +out: + return err; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -2671,14 +2864,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; rt->dst.lwtstate = lwtstate_get(lwtstate); - if (lwtunnel_output_redirect(rt->dst.lwtstate)) { - rt->dst.lwtstate->orig_output = rt->dst.output; - rt->dst.output = lwtunnel_output; - } - if (lwtunnel_input_redirect(rt->dst.lwtstate)) { - rt->dst.lwtstate->orig_input = rt->dst.input; - rt->dst.input = lwtunnel_input; - } + lwtunnel_set_redirect(&rt->dst); } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); @@ -2741,67 +2927,23 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, } if (cfg->fc_flags & RTF_GATEWAY) { - const struct in6_addr *gw_addr; - int gwa_type; - - gw_addr = &cfg->fc_gateway; - gwa_type = ipv6_addr_type(gw_addr); - - /* if gw_addr is local we will fail to detect this in case - * address is still 
TENTATIVE (DAD in progress). rt6_lookup() - * will return already-added prefix route via interface that - * prefix route was assigned to, which might be non-loopback. - */ - err = -EINVAL; - if (ipv6_chk_addr_and_flags(net, gw_addr, - gwa_type & IPV6_ADDR_LINKLOCAL ? - dev : NULL, 0, 0)) { - NL_SET_ERR_MSG(extack, "Invalid gateway address"); + err = ip6_validate_gw(net, cfg, &dev, &idev, extack); + if (err) goto out; - } - rt->rt6i_gateway = *gw_addr; - - if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - /* IPv6 strictly inhibits using not link-local - addresses as nexthop address. - Otherwise, router will not able to send redirects. - It is very good, but in some (rare!) circumstances - (SIT, PtP, NBMA NOARP links) it is handy to allow - some exceptions. --ANK - We allow IPv4-mapped nexthops to support RFC4798-type - addressing - */ - if (!(gwa_type & (IPV6_ADDR_UNICAST | - IPV6_ADDR_MAPPED))) { - NL_SET_ERR_MSG(extack, - "Invalid gateway address"); - goto out; - } - if (cfg->fc_flags & RTNH_F_ONLINK) { - err = ip6_route_check_nh_onlink(net, cfg, dev, - extack); - } else { - err = ip6_route_check_nh(net, cfg, &dev, &idev); - } - if (err) - goto out; - } - err = -EINVAL; - if (!dev) { - NL_SET_ERR_MSG(extack, "Egress device not specified"); - goto out; - } else if (dev->flags & IFF_LOOPBACK) { - NL_SET_ERR_MSG(extack, - "Egress device can not be loopback device for this route"); - goto out; - } + rt->rt6i_gateway = cfg->fc_gateway; } err = -ENODEV; if (!dev) goto out; + if (idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); + err = -EACCES; + goto out; + } + if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; @@ -3809,25 +3951,13 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) Since RFC 1981 doesn't include administrative MTU increase update PMTU increase is a MUST. (i.e. 
jumbo frame) */ - /* - If new MTU is less than route PMTU, this new MTU will be the - lowest MTU in the path, update the route PMTU to reflect PMTU - decreases; if new MTU is greater than route PMTU, and the - old MTU is the lowest MTU in the path, update the route PMTU - to reflect the increase. In this case if the other nodes' MTU - also have the lowest MTU, TOO BIG MESSAGE will be lead to - PMTU discovery. - */ if (rt->dst.dev == arg->dev && - dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { spin_lock_bh(&rt6_exception_lock); - if (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + if (dst_metric_raw(&rt->dst, RTAX_MTU) && + rt6_mtu_change_route_allowed(idev, rt, arg->mtu)) dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); - } - rt6_exceptions_update_pmtu(rt, arg->mtu); + rt6_exceptions_update_pmtu(idev, rt, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } return 0; @@ -4099,6 +4229,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, r_cfg.fc_encap_type = nla_get_u16(nla); } + r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); rt = ip6_route_info_create(&r_cfg, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -4598,7 +4729,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - dst = ip6_route_input_lookup(net, dev, &fl6, flags); + dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); rcu_read_unlock(); } else { @@ -4963,7 +5094,7 @@ static int __net_init ip6_route_net_init_late(struct net *net) { #ifdef CONFIG_PROC_FS proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); - proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops); + proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops); #endif return 0; } diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index bd6cc688bd19..f343e6f0fc95 100644 --- 
a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -16,6 +16,7 @@ #include <linux/net.h> #include <linux/module.h> #include <net/ip.h> +#include <net/ip_tunnels.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/netns/generic.h> @@ -93,7 +94,8 @@ static void set_tun_src(struct net *net, struct net_device *dev, /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(dst->dev); struct ipv6hdr *hdr, *inner_hdr; struct ipv6_sr_hdr *isrh; int hdrlen, tot_len, err; @@ -134,7 +136,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) isrh->nexthdr = proto; hdr->daddr = isrh->segments[isrh->first_segment]; - set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, ip6_dst_idev(dst)->dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { @@ -210,11 +212,6 @@ static int seg6_do_srh(struct sk_buff *skb) tinfo = seg6_encap_lwtunnel(dst->lwtstate); - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } - switch (tinfo->mode) { case SEG6_IPTUN_MODE_INLINE: if (skb->protocol != htons(ETH_P_IPV6)) @@ -223,10 +220,12 @@ static int seg6_do_srh(struct sk_buff *skb) err = seg6_do_srh_inline(skb, tinfo->srh); if (err) return err; - - skb_reset_inner_headers(skb); break; case SEG6_IPTUN_MODE_ENCAP: + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6); + if (err) + return err; + if (skb->protocol == htons(ETH_P_IPV6)) proto = IPPROTO_IPV6; else if (skb->protocol == htons(ETH_P_IP)) @@ -238,6 +237,8 @@ static int seg6_do_srh(struct sk_buff *skb) if (err) return err; + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + skb_set_inner_protocol(skb, skb->protocol); skb->protocol = htons(ETH_P_IPV6); break; case 
SEG6_IPTUN_MODE_L2ENCAP: @@ -261,8 +262,6 @@ static int seg6_do_srh(struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - skb_set_inner_protocol(skb, skb->protocol); - return 0; } @@ -418,7 +417,7 @@ static int seg6_build_state(struct nlattr *nla, slwt = seg6_lwt_lwtunnel(newts); - err = dst_cache_init(&slwt->cache, GFP_KERNEL); + err = dst_cache_init(&slwt->cache, GFP_ATOMIC); if (err) { kfree(newts); return err; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index ba3767ef5e93..45722327375a 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -161,7 +161,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; if (!tbl_id) { - dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags); + dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags); } else { struct fib6_table *table; @@ -169,7 +169,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, if (!table) goto out; - rt = ip6_pol_route(net, table, 0, &fl6, flags); + rt = ip6_pol_route(net, table, 0, &fl6, skb, flags); dst = &rt->dst; } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 3873d3877135..1522bcfd253f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -182,7 +182,7 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel *t = netdev_priv(dev); - if (t->dev == sitn->fb_tunnel_dev) { + if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) { ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); t->ip6rd.relay_prefix = 0; t->ip6rd.prefixlen = 16; @@ -1578,6 +1578,13 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; + if (tb[IFLA_MTU]) { + u32 mtu = nla_get_u32(tb[IFLA_MTU]); + + if (mtu >= IPV6_MIN_MTU && mtu <= 0xFFF8 - dev->hard_header_len) + dev->mtu = mtu; + } + #ifdef 
CONFIG_IPV6_SIT_6RD if (ipip6_netlink_6rd_parms(data, &ip6rd)) err = ipip6_tunnel_update_6rd(nt, &ip6rd); @@ -1828,6 +1835,9 @@ static int __net_init sit_init_net(struct net *net) sitn->tunnels[2] = sitn->tunnels_r; sitn->tunnels[3] = sitn->tunnels_r_l; + if (!net_has_fallback_tunnels(net)) + return 0; + sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", NET_NAME_UNKNOWN, ipip6_tunnel_setup); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index e7a3a6b6cf56..e997141aed8c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -217,6 +217,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq->snt_isn = cookie; treq->ts_off = 0; treq->txhash = net_tx_rndhash(); + if (IS_ENABLED(CONFIG_SMC)) + ireq->smc_ok = 0; /* * We need to lookup the dst_entry to get the correct window size. diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index a789a8ac6a64..6fbdef630152 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -16,14 +16,31 @@ #include <net/ipv6.h> #include <net/addrconf.h> #include <net/inet_frag.h> +#include <net/netevent.h> #ifdef CONFIG_NETLABEL #include <net/calipso.h> #endif +static int zero; static int one = 1; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; +static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv6.sysctl.multipath_hash_policy); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); + + return ret; +} static struct ctl_table ipv6_table_template[] = { { @@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "fib_multipath_hash_policy", + .data = 
&init_net.ipv6.sysctl.multipath_hash_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_rt6_multipath_hash_policy, + .extra1 = &zero, + .extra2 = &one, + }, { } }; @@ -190,6 +216,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; + ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy, ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 412139f4eccd..6d664d83cd16 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -117,6 +117,21 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) ipv6_hdr(skb)->saddr.s6_addr32); } +static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v6_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. + */ + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); +} + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -1451,6 +1466,7 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); + bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; @@ -1470,10 +1486,20 @@ process: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); - nsk = tcp_check_req(sk, skb, req, false); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } if (!nsk) { reqsk_put(req); + if (req_stolen) { + /* Another cpu got exclusive access to req + * and created a full blown socket. + * Try to feed this packet to this socket + * instead of discarding it. 
+ */ + tcp_v6_restore_cb(skb); + sock_put(sk); + goto lookup; + } goto discard_and_relse; } if (nsk == sk) { @@ -1914,6 +1940,7 @@ struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 52e3ea0e6f50..4ec76a87aeb8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -957,6 +957,25 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } } +static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* The following checks are replicated from __ip6_datagram_connect() + * and intended to prevent BPF program called below from accessing + * bytes that are out of the bound specified by user in addr_len. + */ + if (uaddr->sa_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + return udp_pre_connect(sk, uaddr, addr_len); + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1097,10 +1116,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; + bool connected = false; int ulen = len; int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int err; - int connected = 0; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sockcm_cookie sockc; @@ -1222,7 +1241,7 @@ do_udp_sendmsg: fl6.fl6_dport = inet->inet_dport; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; - connected = 1; + connected = true; } if (!fl6.flowi6_oif) @@ -1252,7 +1271,7 @@ do_udp_sendmsg: } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; - connected = 0; + connected = false; } if (!opt) { opt = txopt_get(np); @@ -1274,11 
+1293,11 @@ do_udp_sendmsg: final_p = fl6_update_dst(&fl6, opt, &final); if (final_p) - connected = 0; + connected = false; if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) { fl6.flowi6_oif = np->mcast_oif; - connected = 0; + connected = false; } else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; @@ -1289,7 +1308,7 @@ do_udp_sendmsg: fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); - dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; @@ -1314,7 +1333,7 @@ back_from_confirm: err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_v6_send_skb(skb, &fl6); - goto release_dst; + goto out; } lock_sock(sk); @@ -1348,23 +1367,6 @@ do_append_data: err = np->recverr ? net_xmit_errno(err) : 0; release_sock(sk); -release_dst: - if (dst) { - if (connected) { - ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, -#ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_equal(&fl6.saddr, &np->saddr) ? 
- &np->saddr : -#endif - NULL); - } else { - dst_release(dst); - } - dst = NULL; - } - out: dst_release(dst); fl6_sock_release(flowlabel); @@ -1509,34 +1511,35 @@ void udp6_proc_exit(struct net *net) /* ------------------------------------------------------------------------ */ struct proto udpv6_prot = { - .name = "UDPv6", - .owner = THIS_MODULE, - .close = udp_lib_close, - .connect = ip6_datagram_connect, - .disconnect = udp_disconnect, - .ioctl = udp_ioctl, - .init = udp_init_sock, - .destroy = udpv6_destroy_sock, - .setsockopt = udpv6_setsockopt, - .getsockopt = udpv6_getsockopt, - .sendmsg = udpv6_sendmsg, - .recvmsg = udpv6_recvmsg, - .release_cb = ip6_datagram_release_cb, - .hash = udp_lib_hash, - .unhash = udp_lib_unhash, - .rehash = udp_v6_rehash, - .get_port = udp_v6_get_port, - .memory_allocated = &udp_memory_allocated, - .sysctl_mem = sysctl_udp_mem, - .sysctl_wmem = &sysctl_udp_wmem_min, - .sysctl_rmem = &sysctl_udp_rmem_min, - .obj_size = sizeof(struct udp6_sock), - .h.udp_table = &udp_table, + .name = "UDPv6", + .owner = THIS_MODULE, + .close = udp_lib_close, + .pre_connect = udpv6_pre_connect, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .init = udp_init_sock, + .destroy = udpv6_destroy_sock, + .setsockopt = udpv6_setsockopt, + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, + .release_cb = ip6_datagram_release_cb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .rehash = udp_v6_rehash, + .get_port = udp_v6_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), + .obj_size = sizeof(struct udp6_sock), + .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udpv6_setsockopt, - .compat_getsockopt = compat_udpv6_getsockopt, + .compat_setsockopt = 
compat_udpv6_setsockopt, + .compat_getsockopt = compat_udpv6_getsockopt, #endif - .diag_destroy = udp_abort, + .diag_destroy = udp_abort, }; static struct inet_protosw udpv6_protosw = { diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index bb935a3b7fea..de1b0b8c53b0 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -92,7 +92,8 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb_reset_network_header(skb); skb_mac_header_rebuild(skb); - eth_hdr(skb)->h_proto = skb->protocol; + if (skb->mac_len) + eth_hdr(skb)->h_proto = skb->protocol; err = 0; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8ae87d4ec5ff..5959ce9620eb 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -82,7 +82,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && - skb_gso_network_seglen(skb) > ip6_skb_dst_mtu(skb))) { + !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) { skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 09fb44ee3b45..416fe67271a9 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -113,6 +113,9 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; + INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached); + rt6_uncached_list_add(&xdst->u.rt6); + atomic_inc(&dev_net(dev)->ipv6.rt6_stats->fib_rt_uncache); return 0; } @@ -244,6 +247,8 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); dst_destroy_metrics_generic(dst); + if (xdst->u.rt6.rt6i_uncached_list) + rt6_uncached_list_del(&xdst->u.rt6); xfrm_dst_destroy(xdst); } diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 
b15075a5c227..16f434791763 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -196,4 +196,3 @@ void xfrm6_state_fini(void) { xfrm_state_unregister_afinfo(&xfrm6_state_afinfo); } - diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 1e8cc7bcbca3..893a022f9620 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -989,14 +989,13 @@ done: } static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr, - int *len, int peer) + int peer) { struct sockaddr_iucv *siucv = (struct sockaddr_iucv *) addr; struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); addr->sa_family = AF_IUCV; - *len = sizeof(struct sockaddr_iucv); if (peer) { memcpy(siucv->siucv_user_id, iucv->dst_user_id, 8); @@ -1009,7 +1008,7 @@ static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr, memset(&siucv->siucv_addr, 0, sizeof(siucv->siucv_addr)); memset(&siucv->siucv_nodeid, 0, sizeof(siucv->siucv_nodeid)); - return 0; + return sizeof(struct sockaddr_iucv); } /** @@ -2433,9 +2432,11 @@ static int afiucv_iucv_init(void) af_iucv_dev->driver = &af_iucv_driver; err = device_register(af_iucv_dev); if (err) - goto out_driver; + goto out_iucv_dev; return 0; +out_iucv_dev: + put_device(af_iucv_dev); out_driver: driver_unregister(&af_iucv_driver); out_iucv: diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index 9d5649e4e8b7..1fac92543094 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -269,7 +269,7 @@ static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo) struct proc_dir_entry *p; int rc = 0; - p = proc_create_data(muxinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(muxinfo->name, 0444, net->proc_net, muxinfo->seq_fops, muxinfo); if (!p) rc = -ENOMEM; @@ -406,7 +406,7 @@ static int kcm_proc_init_net(struct net *net) { int err; - if (!proc_create("kcm_stats", S_IRUGO, net->proc_net, + if (!proc_create("kcm_stats", 0444, net->proc_net, &kcm_stats_seq_fops)) { err = -ENOMEM; goto out_kcm_stats; diff 
--git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index f297d53a11aa..dc76bc346829 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1381,24 +1381,32 @@ static int kcm_attach(struct socket *sock, struct socket *csock, .parse_msg = kcm_parse_func_strparser, .read_sock_done = kcm_read_sock_done, }; - int err; + int err = 0; csk = csock->sk; if (!csk) return -EINVAL; + lock_sock(csk); + /* Only allow TCP sockets to be attached for now */ if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) || - csk->sk_protocol != IPPROTO_TCP) - return -EOPNOTSUPP; + csk->sk_protocol != IPPROTO_TCP) { + err = -EOPNOTSUPP; + goto out; + } /* Don't allow listeners or closed sockets */ - if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) - return -EOPNOTSUPP; + if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) { + err = -EOPNOTSUPP; + goto out; + } psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); - if (!psock) - return -ENOMEM; + if (!psock) { + err = -ENOMEM; + goto out; + } psock->mux = mux; psock->sk = csk; @@ -1407,7 +1415,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock, err = strp_init(&psock->strp, csk, &cb); if (err) { kmem_cache_free(kcm_psockp, psock); - return err; + goto out; } write_lock_bh(&csk->sk_callback_lock); @@ -1417,9 +1425,11 @@ static int kcm_attach(struct socket *sock, struct socket *csock, */ if (csk->sk_user_data) { write_unlock_bh(&csk->sk_callback_lock); + strp_stop(&psock->strp); strp_done(&psock->strp); kmem_cache_free(kcm_psockp, psock); - return -EALREADY; + err = -EALREADY; + goto out; } psock->save_data_ready = csk->sk_data_ready; @@ -1455,7 +1465,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock, /* Schedule RX work in case there are already bytes queued */ strp_check_rcv(&psock->strp); - return 0; +out: + release_sock(csk); + + return err; } static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) @@ -1507,6 +1520,7 @@ static void kcm_unattach(struct 
kcm_psock *psock) if (WARN_ON(psock->rx_kcm)) { write_unlock_bh(&csk->sk_callback_lock); + release_sock(csk); return; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 194a7483bb93..14b67dfacc4b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -111,6 +111,13 @@ struct l2tp_net { spinlock_t l2tp_session_hlist_lock; }; +#if IS_ENABLED(CONFIG_IPV6) +static bool l2tp_sk_is_v6(struct sock *sk) +{ + return sk->sk_family == PF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_daddr); +} +#endif static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) { @@ -136,51 +143,6 @@ l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id) } -/* Lookup the tunnel socket, possibly involving the fs code if the socket is - * owned by userspace. A struct sock returned from this function must be - * released using l2tp_tunnel_sock_put once you're done with it. - */ -static struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel) -{ - int err = 0; - struct socket *sock = NULL; - struct sock *sk = NULL; - - if (!tunnel) - goto out; - - if (tunnel->fd >= 0) { - /* Socket is owned by userspace, who might be in the process - * of closing it. Look the socket up using the fd to ensure - * consistency. - */ - sock = sockfd_lookup(tunnel->fd, &err); - if (sock) - sk = sock->sk; - } else { - /* Socket is owned by kernelspace */ - sk = tunnel->sock; - sock_hold(sk); - } - -out: - return sk; -} - -/* Drop a reference to a tunnel socket obtained via. l2tp_tunnel_sock_put */ -static void l2tp_tunnel_sock_put(struct sock *sk) -{ - struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); - if (tunnel) { - if (tunnel->fd >= 0) { - /* Socket is owned by userspace */ - sockfd_put(sk->sk_socket); - } - sock_put(sk); - } - sock_put(sk); -} - /* Session hash list. 
* The session_id SHOULD be random according to RFC2661, but several * L2TP implementations (Cisco and Microsoft) use incrementing @@ -193,6 +155,13 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; } +void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) +{ + sock_put(tunnel->sock); + /* the tunnel is freed in the socket destructor */ +} +EXPORT_SYMBOL(l2tp_tunnel_free); + /* Lookup a tunnel. A new reference is held on the returned tunnel. */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) { @@ -345,13 +314,11 @@ int l2tp_session_register(struct l2tp_session *session, } l2tp_tunnel_inc_refcount(tunnel); - sock_hold(tunnel->sock); hlist_add_head_rcu(&session->global_hlist, g_head); spin_unlock_bh(&pn->l2tp_session_hlist_lock); } else { l2tp_tunnel_inc_refcount(tunnel); - sock_hold(tunnel->sock); } hlist_add_head(&session->hlist, head); @@ -969,7 +936,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct l2tp_tunnel *tunnel; - tunnel = l2tp_sock_to_tunnel(sk); + tunnel = l2tp_tunnel(sk); if (tunnel == NULL) goto pass_up; @@ -977,13 +944,10 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) tunnel->name, skb->len); if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook)) - goto pass_up_put; + goto pass_up; - sock_put(sk); return 0; -pass_up_put: - sock_put(sk); pass_up: return 1; } @@ -1092,7 +1056,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, /* Queue the packet to IP for output */ skb->ignore_df = 1; #if IS_ENABLED(CONFIG_IPV6) - if (tunnel->sock->sk_family == PF_INET6 && !tunnel->v4mapped) + if (l2tp_sk_is_v6(tunnel->sock)) error = inet6_csk_xmit(tunnel->sock, skb, NULL); else #endif @@ -1155,6 +1119,15 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len goto out_unlock; } + /* The user-space may change the connection status for the user-space + * 
provided socket at run time: we must check it under the socket lock + */ + if (tunnel->fd >= 0 && sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + ret = NET_XMIT_DROP; + goto out_unlock; + } + /* Get routing info from the tunnel socket */ skb_dst_drop(skb); skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0))); @@ -1174,7 +1147,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len /* Calculate UDP checksum if configured to do so */ #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) + if (l2tp_sk_is_v6(sk)) udp6_set_csum(udp_get_no_check6_tx(sk), skb, &inet6_sk(sk)->saddr, &sk->sk_v6_daddr, udp_len); @@ -1207,14 +1180,12 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb); static void l2tp_tunnel_destruct(struct sock *sk) { struct l2tp_tunnel *tunnel = l2tp_tunnel(sk); - struct l2tp_net *pn; if (tunnel == NULL) goto end; l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name); - /* Disable udp encapsulation */ switch (tunnel->encap) { case L2TP_ENCAPTYPE_UDP: @@ -1231,18 +1202,11 @@ static void l2tp_tunnel_destruct(struct sock *sk) sk->sk_destruct = tunnel->old_sk_destruct; sk->sk_user_data = NULL; - /* Remove the tunnel struct from the tunnel list */ - pn = l2tp_pernet(tunnel->l2tp_net); - spin_lock_bh(&pn->l2tp_tunnel_list_lock); - list_del_rcu(&tunnel->list); - spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - - tunnel->sock = NULL; - l2tp_tunnel_dec_refcount(tunnel); - /* Call the original destructor */ if (sk->sk_destruct) (*sk->sk_destruct)(sk); + + kfree_rcu(tunnel, rcu); end: return; } @@ -1303,49 +1267,43 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall); /* Tunnel socket destroy hook for UDP encapsulation */ static void l2tp_udp_encap_destroy(struct sock *sk) { - struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); - if (tunnel) { - l2tp_tunnel_closeall(tunnel); - sock_put(sk); - } + struct l2tp_tunnel *tunnel = l2tp_tunnel(sk); + + if (tunnel) + l2tp_tunnel_delete(tunnel); } /* Workqueue tunnel 
deletion function */ static void l2tp_tunnel_del_work(struct work_struct *work) { - struct l2tp_tunnel *tunnel = NULL; - struct socket *sock = NULL; - struct sock *sk = NULL; - - tunnel = container_of(work, struct l2tp_tunnel, del_work); + struct l2tp_tunnel *tunnel = container_of(work, struct l2tp_tunnel, + del_work); + struct sock *sk = tunnel->sock; + struct socket *sock = sk->sk_socket; + struct l2tp_net *pn; l2tp_tunnel_closeall(tunnel); - sk = l2tp_tunnel_sock_lookup(tunnel); - if (!sk) - goto out; - - sock = sk->sk_socket; - - /* If the tunnel socket was created by userspace, then go through the - * inet layer to shut the socket down, and let userspace close it. - * Otherwise, if we created the socket directly within the kernel, use + /* If the tunnel socket was created within the kernel, use * the sk API to release it here. - * In either case the tunnel resources are freed in the socket - * destructor when the tunnel socket goes away. */ - if (tunnel->fd >= 0) { - if (sock) - inet_shutdown(sock, 2); - } else { + if (tunnel->fd < 0) { if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } } - l2tp_tunnel_sock_put(sk); -out: + /* Remove the tunnel struct from the tunnel list */ + pn = l2tp_pernet(tunnel->l2tp_net); + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_del_rcu(&tunnel->list); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + + /* drop initial ref */ + l2tp_tunnel_dec_refcount(tunnel); + + /* drop workqueue ref */ l2tp_tunnel_dec_refcount(tunnel); } @@ -1515,9 +1473,14 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 encap = cfg->encap; /* Quick sanity checks */ + err = -EPROTONOSUPPORT; + if (sk->sk_type != SOCK_DGRAM) { + pr_debug("tunl %hu: fd %d wrong socket type\n", + tunnel_id, fd); + goto err; + } switch (encap) { case L2TP_ENCAPTYPE_UDP: - err = -EPROTONOSUPPORT; if (sk->sk_protocol != IPPROTO_UDP) { pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n", tunnel_id, fd, 
sk->sk_protocol, IPPROTO_UDP); @@ -1525,7 +1488,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 } break; case L2TP_ENCAPTYPE_IP: - err = -EPROTONOSUPPORT; if (sk->sk_protocol != IPPROTO_L2TP) { pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n", tunnel_id, fd, sk->sk_protocol, IPPROTO_L2TP); @@ -1565,24 +1527,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 if (cfg != NULL) tunnel->debug = cfg->debug; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - if (ipv6_addr_v4mapped(&np->saddr) && - ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { - struct inet_sock *inet = inet_sk(sk); - - tunnel->v4mapped = true; - inet->inet_saddr = np->saddr.s6_addr32[3]; - inet->inet_rcv_saddr = sk->sk_v6_rcv_saddr.s6_addr32[3]; - inet->inet_daddr = sk->sk_v6_daddr.s6_addr32[3]; - } else { - tunnel->v4mapped = false; - } - } -#endif - /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ tunnel->encap = encap; if (encap == L2TP_ENCAPTYPE_UDP) { @@ -1598,13 +1542,22 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 sk->sk_user_data = tunnel; } + /* Bump the reference count. The tunnel context is deleted + * only when this drops to zero. A reference is also held on + * the tunnel socket to ensure that it is not released while + * the tunnel is extant. Must be done before sk_destruct is + * set. + */ + refcount_set(&tunnel->ref_count, 1); + sock_hold(sk); + tunnel->sock = sk; + tunnel->fd = fd; + /* Hook on the tunnel socket destructor so that we can cleanup * if the tunnel socket goes away. 
*/ tunnel->old_sk_destruct = sk->sk_destruct; sk->sk_destruct = &l2tp_tunnel_destruct; - tunnel->sock = sk; - tunnel->fd = fd; lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock"); sk->sk_allocation = GFP_ATOMIC; @@ -1614,11 +1567,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 /* Add tunnel to our list */ INIT_LIST_HEAD(&tunnel->list); - - /* Bump the reference count. The tunnel context is deleted - * only when this drops to zero. Must be done before list insertion - */ - refcount_set(&tunnel->ref_count, 1); spin_lock_bh(&pn->l2tp_tunnel_list_lock); list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); @@ -1659,8 +1607,6 @@ void l2tp_session_free(struct l2tp_session *session) if (tunnel) { BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); - sock_put(tunnel->sock); - session->tunnel = NULL; l2tp_tunnel_dec_refcount(tunnel); } diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 9bbee90e9963..2718d0b284d0 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -188,9 +188,6 @@ struct l2tp_tunnel { struct sock *sock; /* Parent socket */ int fd; /* Parent fd, if tunnel socket * was created by userspace */ -#if IS_ENABLED(CONFIG_IPV6) - bool v4mapped; -#endif struct work_struct del_work; @@ -214,27 +211,8 @@ static inline void *l2tp_session_priv(struct l2tp_session *session) return &session->priv[0]; } -static inline struct l2tp_tunnel *l2tp_sock_to_tunnel(struct sock *sk) -{ - struct l2tp_tunnel *tunnel; - - if (sk == NULL) - return NULL; - - sock_hold(sk); - tunnel = (struct l2tp_tunnel *)(sk->sk_user_data); - if (tunnel == NULL) { - sock_put(sk); - goto out; - } - - BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); - -out: - return tunnel; -} - struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); +void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_tunnel *tunnel, 
@@ -283,7 +261,7 @@ static inline void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) { if (refcount_dec_and_test(&tunnel->ref_count)) - kfree_rcu(tunnel, rcu); + l2tp_tunnel_free(tunnel); } /* Session reference counts. Incremented when code obtains a reference diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index ff61124fdf59..a9c05b2bc1b0 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -234,17 +234,13 @@ static void l2tp_ip_close(struct sock *sk, long timeout) static void l2tp_ip_destroy_sock(struct sock *sk) { struct sk_buff *skb; - struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); + struct l2tp_tunnel *tunnel = sk->sk_user_data; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); - if (tunnel) { - l2tp_tunnel_closeall(tunnel); - sock_put(sk); - } - - sk_refcnt_debug_dec(sk); + if (tunnel) + l2tp_tunnel_delete(tunnel); } static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -349,7 +345,7 @@ static int l2tp_ip_disconnect(struct sock *sk, int flags) } static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -370,8 +366,7 @@ static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, lsa->l2tp_conn_id = lsk->conn_id; lsa->l2tp_addr.s_addr = addr; } - *uaddr_len = sizeof(*lsa); - return 0; + return sizeof(*lsa); } static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 192344688c06..957369192ca1 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -248,16 +248,14 @@ static void l2tp_ip6_close(struct sock *sk, long timeout) static void l2tp_ip6_destroy_sock(struct sock *sk) { - struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); + struct l2tp_tunnel *tunnel = sk->sk_user_data; lock_sock(sk); 
ip6_flush_pending_frames(sk); release_sock(sk); - if (tunnel) { - l2tp_tunnel_closeall(tunnel); - sock_put(sk); - } + if (tunnel) + l2tp_tunnel_delete(tunnel); inet6_destroy_sock(sk); } @@ -421,7 +419,7 @@ static int l2tp_ip6_disconnect(struct sock *sk, int flags) } static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sock *sk = sock->sk; @@ -449,8 +447,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, } if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = sk->sk_bound_dev_if; - *uaddr_len = sizeof(*lsa); - return 0; + return sizeof(*lsa); } static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 59f246d7b290..d6deca11da19 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -416,20 +416,28 @@ abort: * Session (and tunnel control) socket create/destroy. *****************************************************************************/ +static void pppol2tp_put_sk(struct rcu_head *head) +{ + struct pppol2tp_session *ps; + + ps = container_of(head, typeof(*ps), rcu); + sock_put(ps->__sk); +} + /* Called by l2tp_core when a session socket is being closed. */ static void pppol2tp_session_close(struct l2tp_session *session) { - struct sock *sk; - - BUG_ON(session->magic != L2TP_SESSION_MAGIC); + struct pppol2tp_session *ps; - sk = pppol2tp_session_get_sock(session); - if (sk) { - if (sk->sk_socket) - inet_shutdown(sk->sk_socket, SEND_SHUTDOWN); - sock_put(sk); - } + ps = l2tp_session_priv(session); + mutex_lock(&ps->sk_lock); + ps->__sk = rcu_dereference_protected(ps->sk, + lockdep_is_held(&ps->sk_lock)); + RCU_INIT_POINTER(ps->sk, NULL); + if (ps->__sk) + call_rcu(&ps->rcu, pppol2tp_put_sk); + mutex_unlock(&ps->sk_lock); } /* Really kill the session socket. 
(Called from sock_put() if @@ -449,14 +457,6 @@ static void pppol2tp_session_destruct(struct sock *sk) } } -static void pppol2tp_put_sk(struct rcu_head *head) -{ - struct pppol2tp_session *ps; - - ps = container_of(head, typeof(*ps), rcu); - sock_put(ps->__sk); -} - /* Called when the PPPoX socket (session) is closed. */ static int pppol2tp_release(struct socket *sock) @@ -480,26 +480,17 @@ static int pppol2tp_release(struct socket *sock) sock_orphan(sk); sock->sk = NULL; + /* If the socket is associated with a session, + * l2tp_session_delete will call pppol2tp_session_close which + * will drop the session's ref on the socket. + */ session = pppol2tp_sock_to_session(sk); - - if (session != NULL) { - struct pppol2tp_session *ps; - + if (session) { l2tp_session_delete(session); - - ps = l2tp_session_priv(session); - mutex_lock(&ps->sk_lock); - ps->__sk = rcu_dereference_protected(ps->sk, - lockdep_is_held(&ps->sk_lock)); - RCU_INIT_POINTER(ps->sk, NULL); - mutex_unlock(&ps->sk_lock); - call_rcu(&ps->rcu, pppol2tp_put_sk); - - /* Rely on the sock_put() call at the end of the function for - * dropping the reference held by pppol2tp_sock_to_session(). - * The last reference will be dropped by pppol2tp_put_sk(). - */ + /* drop the ref obtained by pppol2tp_sock_to_session */ + sock_put(sk); } + release_sock(sk); /* This will delete the session context via @@ -796,6 +787,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, out_no_ppp: /* This is how we get the session context from the socket. */ + sock_hold(sk); sk->sk_user_data = session; rcu_assign_pointer(ps->sk, sk); mutex_unlock(&ps->sk_lock); @@ -870,7 +862,7 @@ err: /* getname() support. 
*/ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, - int *usockaddr_len, int peer) + int peer) { int len = 0; int error = 0; @@ -969,8 +961,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, memcpy(uaddr, &sp, len); } - *usockaddr_len = len; - error = 0; + error = len; sock_put(sk); end: @@ -1751,7 +1742,7 @@ static __net_init int pppol2tp_init_net(struct net *net) struct proc_dir_entry *pde; int err = 0; - pde = proc_create("pppol2tp", S_IRUGO, net->proc_net, + pde = proc_create("pppol2tp", 0444, net->proc_net, &pppol2tp_proc_fops); if (!pde) { err = -ENOMEM; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c38d16f22d2a..01dcc0823d1f 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -971,7 +971,7 @@ release: * Return the address information of a socket. */ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddrlen, int peer) + int peer) { struct sockaddr_llc sllc; struct sock *sk = sock->sk; @@ -982,7 +982,6 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; - *uaddrlen = sizeof(sllc); if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) @@ -1003,9 +1002,9 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, IFHWADDRLEN); } } - rc = 0; sllc.sllc_family = AF_LLC; memcpy(uaddr, &sllc, sizeof(sllc)); + rc = sizeof(sllc); out: release_sock(sk); return rc; diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index f59648018060..163121192aca 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -389,7 +389,7 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - llc_conn_send_pdu(sk, skb); + rc = llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; @@ -916,7 +916,7 @@ static int 
llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - llc_conn_send_pdu(sk, skb); + rc = llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; @@ -935,14 +935,17 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); + int ret; if (llc->ack_must_be_send) { - llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb); + ret = llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb); llc->ack_must_be_send = 0 ; llc->ack_pf = 0; - } else - llc_conn_ac_send_i_cmd_p_set_0(sk, skb); - return 0; + } else { + ret = llc_conn_ac_send_i_cmd_p_set_0(sk, skb); + } + + return ret; } /** diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 9177dbb16dce..110e32bcb399 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -30,7 +30,7 @@ #endif static int llc_find_offset(int state, int ev_type); -static void llc_conn_send_pdus(struct sock *sk); +static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *skb); static int llc_conn_service(struct sock *sk, struct sk_buff *skb); static int llc_exec_conn_trans_actions(struct sock *sk, struct llc_conn_state_trans *trans, @@ -193,11 +193,11 @@ out_skb_put: return rc; } -void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) +int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) { /* queue PDU to send to MAC layer */ skb_queue_tail(&sk->sk_write_queue, skb); - llc_conn_send_pdus(sk); + return llc_conn_send_pdus(sk, skb); } /** @@ -255,7 +255,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk); + llc_conn_send_pdus(sk, NULL); out:; } @@ -296,7 +296,7 @@ void 
llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk); + llc_conn_send_pdus(sk, NULL); out:; } @@ -340,12 +340,16 @@ out: /** * llc_conn_send_pdus - Sends queued PDUs * @sk: active connection + * @hold_skb: the skb held by caller, or NULL if does not care * - * Sends queued pdus to MAC layer for transmission. + * Sends queued pdus to MAC layer for transmission. When @hold_skb is + * NULL, always return 0. Otherwise, return 0 if @hold_skb is sent + * successfully, or 1 for failure. */ -static void llc_conn_send_pdus(struct sock *sk) +static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) { struct sk_buff *skb; + int ret = 0; while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); @@ -357,10 +361,20 @@ static void llc_conn_send_pdus(struct sock *sk) skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); if (!skb2) break; - skb = skb2; + dev_queue_xmit(skb2); + } else { + bool is_target = skb == hold_skb; + int rc; + + if (is_target) + skb_get(skb); + rc = dev_queue_xmit(skb); + if (is_target) + ret = rc; } - dev_queue_xmit(skb); } + + return ret; } /** diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index 66821e8a2b7a..62ea0aed94b4 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -249,11 +249,11 @@ int __init llc_proc_init(void) if (!llc_proc_dir) goto out; - p = proc_create("socket", S_IRUGO, llc_proc_dir, &llc_seq_socket_fops); + p = proc_create("socket", 0444, llc_proc_dir, &llc_seq_socket_fops); if (!p) goto out_socket; - p = proc_create("core", S_IRUGO, llc_proc_dir, &llc_seq_core_fops); + p = proc_create("core", 0444, llc_proc_dir, &llc_seq_core_fops); if (!p) goto out_core; diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index d90928f50226..a7f7b8ff4729 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ 
-394,8 +394,9 @@ static void llc_sap_mcast(struct llc_sap *sap, const struct llc_addr *laddr, struct sk_buff *skb) { - int i = 0, count = 256 / sizeof(struct sock *); - struct sock *sk, *stack[count]; + int i = 0; + struct sock *sk; + struct sock *stack[256 / sizeof(struct sock *)]; struct llc_sock *llc; struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex); @@ -408,7 +409,7 @@ static void llc_sap_mcast(struct llc_sap *sap, continue; sock_hold(sk); - if (i < count) + if (i < ARRAY_SIZE(stack)) stack[i++] = sk; else { llc_do_mcast(sap, skb, stack, i); diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index a8b1616cec41..e83c19d4c292 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -8,6 +8,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -297,16 +298,23 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) { + struct tid_ampdu_rx *tid_rx; + ht_dbg_ratelimited(sta->sdata, "updated AddBA Req from %pM on tid %u\n", sta->sta.addr, tid); /* We have no API to update the timeout value in the - * driver so reject the timeout update. + * driver so reject the timeout update if the timeout + * changed. If if did not change, i.e., no real update, + * just reply with success. 
*/ - status = WLAN_STATUS_REQUEST_DECLINED; - ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, - tid, dialog_token, status, - 1, buf_size, timeout); + rcu_read_lock(); + tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); + if (tid_rx && tid_rx->timeout == timeout) + status = WLAN_STATUS_SUCCESS; + else + status = WLAN_STATUS_REQUEST_DECLINED; + rcu_read_unlock(); goto end; } diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 46028e12e216..85dbaa891059 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -4,6 +4,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This file is GPLv2 as found in COPYING. */ @@ -925,6 +926,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, */ sdata->control_port_protocol = params->crypto.control_port_ethertype; sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt; + sdata->control_port_over_nl80211 = + params->crypto.control_port_over_nl80211; sdata->encrypt_headroom = ieee80211_cs_headroom(sdata->local, &params->crypto, sdata->vif.type); @@ -934,6 +937,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, params->crypto.control_port_ethertype; vlan->control_port_no_encrypt = params->crypto.control_port_no_encrypt; + vlan->control_port_over_nl80211 = + params->crypto.control_port_over_nl80211; vlan->encrypt_headroom = ieee80211_cs_headroom(sdata->local, &params->crypto, @@ -2019,6 +2024,8 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, if (err) return err; + sdata->control_port_over_nl80211 = setup->control_port_over_nl80211; + /* can mesh use other SMPS modes? 
*/ sdata->smps_mode = IEEE80211_SMPS_OFF; sdata->needed_rx_chains = sdata->local->rx_chains; @@ -2156,6 +2163,8 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy, */ p.uapsd = false; + ieee80211_regulatory_limit_wmm_params(sdata, &p, params->ac); + sdata->tx_conf[params->ac] = p; if (drv_conf_tx(local, sdata, params->ac, &p)) { wiphy_debug(local->hw.wiphy, @@ -2313,6 +2322,8 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, memcpy(sdata->vif.bss_conf.mcast_rate, rate, sizeof(int) * NUM_NL80211_BANDS); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MCAST_RATE); + return 0; } @@ -2685,6 +2696,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); + ieee80211_check_fast_rx_iface(sdata); return 0; } @@ -2892,7 +2904,7 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) } if (beacon->probe_resp_len) { new_beacon->probe_resp_len = beacon->probe_resp_len; - beacon->probe_resp = pos; + new_beacon->probe_resp = pos; memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } @@ -3785,4 +3797,5 @@ const struct cfg80211_ops mac80211_config_ops = { .add_nan_func = ieee80211_add_nan_func, .del_nan_func = ieee80211_del_nan_func, .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, + .tx_control_port = ieee80211_tx_control_port, }; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 1f466d12a6bc..b5adf3625d16 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -212,6 +212,8 @@ static const char *hw_flag_names[] = { FLAG(REPORTS_LOW_ACK), FLAG(SUPPORTS_TX_FRAG), FLAG(SUPPORTS_TDLS_BUFFER_STA), + FLAG(DEAUTH_NEED_MGD_TX_PREP), + FLAG(DOESNT_SUPPORT_QOS_NDP), #undef FLAG }; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 444ea8d127fe..4105081dc1df 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -160,12 +160,12 
@@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, sta->cparams.ecn ? "yes" : "no"); p += scnprintf(p, bufsz+buf-p, - "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n"); + "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { txqi = to_txq_info(sta->sta.txq[i]); p += scnprintf(p, bufsz+buf-p, - "%d %d %u %u %u %u %u %u %u %u %u\n", + "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n", txqi->txq.tid, txqi->txq.ac, txqi->tin.backlog_bytes, @@ -176,7 +176,11 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, txqi->tin.overlimit, txqi->tin.collisions, txqi->tin.tx_bytes, - txqi->tin.tx_packets); + txqi->tin.tx_packets, + txqi->flags, + txqi->flags & (1<<IEEE80211_TXQ_STOP) ? "STOP" : "RUN", + txqi->flags & (1<<IEEE80211_TXQ_AMPDU) ? " AMPDU" : "", + txqi->flags & (1<<IEEE80211_TXQ_NO_AMSDU) ? " NO-AMSDU" : ""); } rcu_read_unlock(); diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index d7523530d3f8..c78036a0ac94 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -466,6 +466,21 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_PEER_REQUEST); } +enum nl80211_smps_mode +ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps) +{ + switch (smps) { + case IEEE80211_SMPS_OFF: + return NL80211_SMPS_OFF; + case IEEE80211_SMPS_STATIC: + return NL80211_SMPS_STATIC; + case IEEE80211_SMPS_DYNAMIC: + return NL80211_SMPS_DYNAMIC; + default: + return NL80211_SMPS_OFF; + } +} + int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index db07e0de9a03..6449a1c2283b 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1839,11 +1839,12 @@ int ieee80211_ibss_join(struct 
ieee80211_sub_if_data *sdata, IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | IEEE80211_HT_PARAM_RIFS_MODE; - changed |= BSS_CHANGED_HT; + changed |= BSS_CHANGED_HT | BSS_CHANGED_MCAST_RATE; ieee80211_bss_info_change_notify(sdata, changed); sdata->smps_mode = IEEE80211_SMPS_OFF; sdata->needed_rx_chains = local->rx_chains; + sdata->control_port_over_nl80211 = params->control_port_over_nl80211; ieee80211_queue_work(&local->hw, &sdata->work); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 26900025de2f..6372dbdadf53 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -4,6 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -899,6 +900,7 @@ struct ieee80211_sub_if_data { u16 sequence_number; __be16 control_port_protocol; bool control_port_no_encrypt; + bool control_port_over_nl80211; int encrypt_headroom; atomic_t num_tx_queued; @@ -1467,7 +1469,7 @@ struct ieee802_11_elems { const struct ieee80211_timeout_interval_ie *timeout_int; const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; - const struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie; + struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie; const struct ieee80211_bss_max_idle_period_ie *max_idle_period_ie; /* length of them, respectively */ @@ -1734,6 +1736,9 @@ void ieee80211_check_fast_xmit(struct sta_info *sta); void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); +int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, + const u8 *buf, size_t len, + 
const u8 *dest, __be16 proto, bool unencrypted); /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, @@ -1788,6 +1793,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs); +enum nl80211_smps_mode +ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps); /* VHT */ void @@ -1814,6 +1821,8 @@ void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, u16 vht_mask[NL80211_VHT_NSS_MAX]); +enum nl80211_chan_width +ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, @@ -1865,6 +1874,9 @@ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble, int shift); +void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, + struct ieee80211_tx_queue_params *qparam, + int ac); void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5fe01f82df12..555e389b7dfa 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -519,6 +519,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) master->control_port_protocol; sdata->control_port_no_encrypt = master->control_port_no_encrypt; + sdata->control_port_over_nl80211 = + master->control_port_over_nl80211; sdata->vif.cab_queue = master->vif.cab_queue; memcpy(sdata->vif.hw_queue, master->vif.hw_queue, sizeof(sdata->vif.hw_queue)); @@ -1324,8 +1326,7 @@ static void ieee80211_iface_work(struct work_struct *work) 
mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, mgmt->sa); if (sta) { - u16 tid = *ieee80211_get_qos_ctl(hdr) & - IEEE80211_QOS_CTL_TID_MASK; + u16 tid = ieee80211_get_tid(hdr); __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, diff --git a/net/mac80211/key.c b/net/mac80211/key.c index aee05ec3f7ea..ee0d0cc8dc3b 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -126,7 +126,7 @@ static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata, static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { - struct ieee80211_sub_if_data *sdata; + struct ieee80211_sub_if_data *sdata = key->sdata; struct sta_info *sta; int ret = -EOPNOTSUPP; @@ -162,7 +162,6 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (sta && !sta->uploaded) goto out_unsupported; - sdata = key->sdata; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { /* * The driver doesn't know anything about VLAN interfaces. @@ -214,8 +213,11 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) /* all of these we can do in software - if driver can */ if (ret == 1) return 0; - if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) + if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) { + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + return 0; return -EINVAL; + } return 0; default: return -EINVAL; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0785d04a80bc..9ea17afaa237 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -554,6 +554,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_FEATURE_USERSPACE_MPM | NL80211_FEATURE_FULL_AP_CLIENT_STATE; wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_STA); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); if (!ops->hw_scan) wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | @@ -930,8 +932,12 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) IEEE80211_HT_CAP_SM_PS_SHIFT; 
} - /* if low-level driver supports AP, we also support VLAN */ - if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP)) { + /* if low-level driver supports AP, we also support VLAN. + * drivers advertising SW_CRYPTO_CONTROL should enable AP_VLAN + * based on their support to transmit SW encrypted packets. + */ + if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP) && + !ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL)) { hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN); hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_AP_VLAN); } diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 73ac607beb5d..d51da26e9c18 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -880,7 +880,8 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_HT | BSS_CHANGED_BASIC_RATES | - BSS_CHANGED_BEACON_INT; + BSS_CHANGED_BEACON_INT | + BSS_CHANGED_MCAST_RATE; local->fif_other_bss++; /* mesh ifaces must set allmulti to forward mcast traffic */ @@ -1255,13 +1256,12 @@ int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, } static int mesh_fwd_csa_frame(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, size_t len) + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee802_11_elems *elems) { struct ieee80211_mgmt *mgmt_fwd; struct sk_buff *skb; struct ieee80211_local *local = sdata->local; - u8 *pos = mgmt->u.action.u.chan_switch.variable; - size_t offset_ttl; skb = dev_alloc_skb(local->tx_headroom + len); if (!skb) @@ -1269,13 +1269,9 @@ static int mesh_fwd_csa_frame(struct ieee80211_sub_if_data *sdata, skb_reserve(skb, local->tx_headroom); mgmt_fwd = skb_put(skb, len); - /* offset_ttl is based on whether the secondary channel - * offset is available or not. Subtract 1 from the mesh TTL - * and disable the initiator flag before forwarding. - */ - offset_ttl = (len < 42) ? 
7 : 10; - *(pos + offset_ttl) -= 1; - *(pos + offset_ttl + 1) &= ~WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR; + elems->mesh_chansw_params_ie->mesh_ttl--; + elems->mesh_chansw_params_ie->mesh_flags &= + ~WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR; memcpy(mgmt_fwd, mgmt, len); eth_broadcast_addr(mgmt_fwd->da); @@ -1323,7 +1319,7 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata, /* forward or re-broadcast the CSA frame */ if (fwd_csa) { - if (mesh_fwd_csa_frame(sdata, mgmt, len) < 0) + if (mesh_fwd_csa_frame(sdata, mgmt, len, &elems) < 0) mcsa_dbg(sdata, "Failed to forward the CSA frame"); } } diff --git a/net/mac80211/michael.c b/net/mac80211/michael.c index 408649bd4702..37e172701a63 100644 --- a/net/mac80211/michael.c +++ b/net/mac80211/michael.c @@ -35,7 +35,7 @@ static void michael_mic_hdr(struct michael_mic_ctx *mctx, const u8 *key, da = ieee80211_get_DA(hdr); sa = ieee80211_get_SA(hdr); if (ieee80211_is_data_qos(hdr->frame_control)) - tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); else tid = 0; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 39b660b9a908..69449db7e283 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7,6 +7,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -896,7 +897,8 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, true); + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, + !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) return; @@ -1785,12 +1787,14 @@ static bool 
ieee80211_sta_wmm_params(struct ieee80211_local *local, params[ac].acm = acm; params[ac].uapsd = uapsd; - if (params[ac].cw_min > params[ac].cw_max) { + if (params->cw_min == 0 || + params[ac].cw_min > params[ac].cw_max) { sdata_info(sdata, "AP has invalid WMM params (CWmin/max=%d/%d for ACI %d), using defaults\n", params[ac].cw_min, params[ac].cw_max, aci); return false; } + ieee80211_regulatory_limit_wmm_params(sdata, &params[ac], ac); } for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { @@ -2008,9 +2012,20 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_flush_queues(local, sdata, true); /* deauthenticate/disassociate now */ - if (tx || frame_buf) + if (tx || frame_buf) { + /* + * In multi channel scenarios guarantee that the virtual + * interface is granted immediate airtime to transmit the + * deauthentication frame by calling mgd_prepare_tx, if the + * driver requested so. + */ + if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) && + !ifmgd->have_beacon) + drv_mgd_prepare_tx(sdata->local, sdata); + ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype, reason, tx, frame_buf); + } /* flush out frame - make sure the deauth was actually sent */ if (tx) @@ -2151,7 +2166,7 @@ static void ieee80211_sta_tx_wmm_ac_notify(struct ieee80211_sub_if_data *sdata, u16 tx_time) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - u16 tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + u16 tid = ieee80211_get_tid(hdr); int ac = ieee80211_ac_from_tid(tid); struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac]; unsigned long now = jiffies; @@ -3292,82 +3307,14 @@ static const u64 care_about_ies = (1ULL << WLAN_EID_HT_OPERATION) | (1ULL << WLAN_EID_EXT_CHANSWITCH_ANN); -static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, size_t len, - struct ieee80211_rx_status *rx_status) +static void ieee80211_handle_beacon_sig(struct ieee80211_sub_if_data *sdata, + struct 
ieee80211_if_managed *ifmgd, + struct ieee80211_bss_conf *bss_conf, + struct ieee80211_local *local, + struct ieee80211_rx_status *rx_status) { - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; - size_t baselen; - struct ieee802_11_elems elems; - struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx_conf *chanctx_conf; - struct ieee80211_channel *chan; - struct sta_info *sta; - u32 changed = 0; - bool erp_valid; - u8 erp_value = 0; - u32 ncrc; - u8 *bssid; - u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN]; - - sdata_assert_lock(sdata); - - /* Process beacon from the current BSS */ - baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; - if (baselen > len) - return; - - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (!chanctx_conf) { - rcu_read_unlock(); - return; - } - - if (rx_status->freq != chanctx_conf->def.chan->center_freq) { - rcu_read_unlock(); - return; - } - chan = chanctx_conf->def.chan; - rcu_read_unlock(); - - if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon && - ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) { - ieee802_11_parse_elems(mgmt->u.beacon.variable, - len - baselen, false, &elems); - - ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems); - if (elems.tim && !elems.parse_error) { - const struct ieee80211_tim_ie *tim_ie = elems.tim; - ifmgd->dtim_period = tim_ie->dtim_period; - } - ifmgd->have_beacon = true; - ifmgd->assoc_data->need_beacon = false; - if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { - sdata->vif.bss_conf.sync_tsf = - le64_to_cpu(mgmt->u.beacon.timestamp); - sdata->vif.bss_conf.sync_device_ts = - rx_status->device_timestamp; - if (elems.tim) - sdata->vif.bss_conf.sync_dtim_count = - elems.tim->dtim_count; - else - sdata->vif.bss_conf.sync_dtim_count = 0; - } - /* continue assoc process */ - ifmgd->assoc_data->timeout = jiffies; - ifmgd->assoc_data->timeout_started = true; - 
run_again(sdata, ifmgd->assoc_data->timeout); - return; - } - - if (!ifmgd->associated || - !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) - return; - bssid = ifmgd->associated->bssid; - /* Track average RSSI from the Beacon frames of the current AP */ + if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) { ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; ewma_beacon_signal_init(&ifmgd->ave_beacon_signal); @@ -3454,6 +3401,86 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, sig, GFP_KERNEL); } } +} + +static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee80211_rx_status *rx_status) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; + size_t baselen; + struct ieee802_11_elems elems; + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *chanctx_conf; + struct ieee80211_channel *chan; + struct sta_info *sta; + u32 changed = 0; + bool erp_valid; + u8 erp_value = 0; + u32 ncrc; + u8 *bssid; + u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN]; + + sdata_assert_lock(sdata); + + /* Process beacon from the current BSS */ + baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; + if (baselen > len) + return; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (!chanctx_conf) { + rcu_read_unlock(); + return; + } + + if (rx_status->freq != chanctx_conf->def.chan->center_freq) { + rcu_read_unlock(); + return; + } + chan = chanctx_conf->def.chan; + rcu_read_unlock(); + + if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon && + ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) { + ieee802_11_parse_elems(mgmt->u.beacon.variable, + len - baselen, false, &elems); + + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems); + if (elems.tim && !elems.parse_error) { + const struct ieee80211_tim_ie *tim_ie = elems.tim; + 
ifmgd->dtim_period = tim_ie->dtim_period; + } + ifmgd->have_beacon = true; + ifmgd->assoc_data->need_beacon = false; + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { + sdata->vif.bss_conf.sync_tsf = + le64_to_cpu(mgmt->u.beacon.timestamp); + sdata->vif.bss_conf.sync_device_ts = + rx_status->device_timestamp; + if (elems.tim) + sdata->vif.bss_conf.sync_dtim_count = + elems.tim->dtim_count; + else + sdata->vif.bss_conf.sync_dtim_count = 0; + } + /* continue assoc process */ + ifmgd->assoc_data->timeout = jiffies; + ifmgd->assoc_data->timeout_started = true; + run_again(sdata, ifmgd->assoc_data->timeout); + return; + } + + if (!ifmgd->associated || + !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) + return; + bssid = ifmgd->associated->bssid; + + if (!(rx_status->flag & RX_FLAG_NO_SIGNAL_VAL)) + ieee80211_handle_beacon_sig(sdata, ifmgd, bss_conf, + local, rx_status); if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) { mlme_dbg_ratelimited(sdata, @@ -4830,6 +4857,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, sdata->control_port_protocol = req->crypto.control_port_ethertype; sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt; + sdata->control_port_over_nl80211 = + req->crypto.control_port_over_nl80211; sdata->encrypt_headroom = ieee80211_cs_headroom(local, &req->crypto, sdata->vif.type); diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 9766c1cc4b0a..8221bc5582ab 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -690,7 +690,7 @@ minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) #ifdef CONFIG_MAC80211_DEBUGFS mp->fixed_rate_idx = (u32) -1; mp->dbg_fixed_rate = debugfs_create_u32("fixed_rate_idx", - S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx); + 0666, debugfsdir, &mp->fixed_rate_idx); #endif minstrel_init_cck_rates(mp); diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index 
36fc971deb86..9ad7d63d3e5b 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -214,11 +214,11 @@ minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) { struct minstrel_sta_info *mi = priv_sta; - mi->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, mi, - &minstrel_stat_fops); + mi->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, mi, + &minstrel_stat_fops); - mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO, dir, - mi, &minstrel_stat_csv_fops); + mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, mi, + &minstrel_stat_csv_fops); } void diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 4a5bdad9f303..fb586b6e5d49 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -669,7 +669,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE))) return; - tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); if (likely(sta->ampdu_mlme.tid_tx[tid])) return; diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 7d969e300fb3..bfcc03152dc6 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -303,10 +303,10 @@ minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) { struct minstrel_ht_sta_priv *msp = priv_sta; - msp->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, msp, - &minstrel_ht_stat_fops); - msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO, - dir, msp, &minstrel_ht_stat_csv_fops); + msp->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, msp, + &minstrel_ht_stat_fops); + msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, msp, + &minstrel_ht_stat_csv_fops); } void diff --git a/net/mac80211/rx.c 
b/net/mac80211/rx.c index fd580614085b..03102aff0953 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -439,6 +439,10 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_ERR; if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN) flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_KNOWN; + if (status->flag & RX_FLAG_AMPDU_EOF_BIT_KNOWN) + flags |= IEEE80211_RADIOTAP_AMPDU_EOF_KNOWN; + if (status->flag & RX_FLAG_AMPDU_EOF_BIT) + flags |= IEEE80211_RADIOTAP_AMPDU_EOF; put_unaligned_le16(flags, pos); pos += 2; if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN) @@ -1185,7 +1189,7 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, ack_policy = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_ACK_POLICY_MASK; - tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) { @@ -1524,9 +1528,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) ieee80211_has_pm(hdr->frame_control) && (ieee80211_is_data_qos(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control))) { - u8 tid; - - tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + u8 tid = ieee80211_get_tid(hdr); ieee80211_sta_uapsd_trigger(&rx->sta->sta, tid); } @@ -2243,6 +2245,32 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc) return true; } +static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, + struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct net_device *dev = sdata->dev; + + if (unlikely((skb->protocol == sdata->control_port_protocol || + skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) && + sdata->control_port_over_nl80211)) { + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + bool noencrypt = status->flag & RX_FLAG_DECRYPTED; + struct ethhdr *ehdr = eth_hdr(skb); + + 
cfg80211_rx_control_port(dev, skb->data, skb->len, + ehdr->h_source, + be16_to_cpu(skb->protocol), noencrypt); + dev_kfree_skb(skb); + } else { + /* deliver to local stack */ + if (rx->napi) + napi_gro_receive(rx->napi, skb); + else + netif_receive_skb(skb); + } +} + /* * requires that rx->skb is a frame with ethernet header */ @@ -2327,13 +2355,10 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) #endif if (skb) { - /* deliver to local stack */ skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->napi) - napi_gro_receive(rx->napi, skb); - else - netif_receive_skb(skb); + + ieee80211_deliver_skb_to_local_stack(skb, rx); } if (xmit_skb) { @@ -2351,39 +2376,17 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) } static ieee80211_rx_result debug_noinline -ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) +__ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset) { struct net_device *dev = rx->sdata->dev; struct sk_buff *skb = rx->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc = hdr->frame_control; struct sk_buff_head frame_list; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); struct ethhdr ethhdr; const u8 *check_da = ethhdr.h_dest, *check_sa = ethhdr.h_source; - if (unlikely(!ieee80211_is_data(fc))) - return RX_CONTINUE; - - if (unlikely(!ieee80211_is_data_present(fc))) - return RX_DROP_MONITOR; - - if (!(status->rx_flags & IEEE80211_RX_AMSDU)) - return RX_CONTINUE; - if (unlikely(ieee80211_has_a4(hdr->frame_control))) { - switch (rx->sdata->vif.type) { - case NL80211_IFTYPE_AP_VLAN: - if (!rx->sdata->u.vlan.sta) - return RX_DROP_UNUSABLE; - break; - case NL80211_IFTYPE_STATION: - if (!rx->sdata->u.mgd.use_4addr) - return RX_DROP_UNUSABLE; - break; - default: - return RX_DROP_UNUSABLE; - } check_da = NULL; check_sa = NULL; } else switch (rx->sdata->vif.type) { @@ -2403,15 +2406,13 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) break; } - if 
(is_multicast_ether_addr(hdr->addr1)) - return RX_DROP_UNUSABLE; - skb->dev = dev; __skb_queue_head_init(&frame_list); if (ieee80211_data_to_8023_exthdr(skb, &ethhdr, rx->sdata->vif.addr, - rx->sdata->vif.type)) + rx->sdata->vif.type, + data_offset)) return RX_DROP_UNUSABLE; ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr, @@ -2433,6 +2434,44 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) return RX_QUEUED; } +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + __le16 fc = hdr->frame_control; + + if (!(status->rx_flags & IEEE80211_RX_AMSDU)) + return RX_CONTINUE; + + if (unlikely(!ieee80211_is_data(fc))) + return RX_CONTINUE; + + if (unlikely(!ieee80211_is_data_present(fc))) + return RX_DROP_MONITOR; + + if (unlikely(ieee80211_has_a4(hdr->frame_control))) { + switch (rx->sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + if (!rx->sdata->u.vlan.sta) + return RX_DROP_UNUSABLE; + break; + case NL80211_IFTYPE_STATION: + if (!rx->sdata->u.mgd.use_4addr) + return RX_DROP_UNUSABLE; + break; + default: + return RX_DROP_UNUSABLE; + } + } + + if (is_multicast_ether_addr(hdr->addr1)) + return RX_DROP_UNUSABLE; + + return __ieee80211_rx_h_amsdu(rx, 0); +} + #ifdef CONFIG_MAC80211_MESH static ieee80211_rx_result ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) @@ -2533,11 +2572,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) fwd_skb = skb_copy_expand(skb, local->tx_headroom + sdata->encrypt_headroom, 0, GFP_ATOMIC); - if (!fwd_skb) { - net_info_ratelimited("%s: failed to clone mesh frame\n", - sdata->name); + if (!fwd_skb) goto out; - } fwd_hdr = (struct ieee80211_hdr *) fwd_skb->data; fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY); @@ -2791,7 +2827,8 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(rx->flags &
IEEE80211_RX_BEACON_REPORTED)) { int sig = 0; - if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM) && + !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; cfg80211_report_obss_beacon(rx->local->hw.wiphy, @@ -2848,6 +2885,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) case WLAN_HT_ACTION_SMPS: { struct ieee80211_supported_band *sband; enum ieee80211_smps_mode smps_mode; + struct sta_opmode_info sta_opmode = {}; /* convert to HT capability */ switch (mgmt->u.action.u.ht_smps.smps_control) { @@ -2868,17 +2906,25 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) if (rx->sta->sta.smps_mode == smps_mode) goto handled; rx->sta->sta.smps_mode = smps_mode; + sta_opmode.smps_mode = + ieee80211_smps_mode_to_smps_mode(smps_mode); + sta_opmode.changed = STA_OPMODE_SMPS_MODE_CHANGED; sband = rx->local->hw.wiphy->bands[status->band]; rate_control_rate_update(local, sband, rx->sta, IEEE80211_RC_SMPS_CHANGED); + cfg80211_sta_opmode_change_notify(sdata->dev, + rx->sta->addr, + &sta_opmode, + GFP_KERNEL); goto handled; } case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { struct ieee80211_supported_band *sband; u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth; enum ieee80211_sta_rx_bandwidth max_bw, new_bw; + struct sta_opmode_info sta_opmode = {}; /* If it doesn't support 40 MHz it can't change ... 
*/ if (!(rx->sta->sta.ht_cap.cap & @@ -2899,9 +2945,16 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) rx->sta->sta.bandwidth = new_bw; sband = rx->local->hw.wiphy->bands[status->band]; + sta_opmode.bw = + ieee80211_sta_rx_bw_to_chan_width(rx->sta); + sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED; rate_control_rate_update(local, sband, rx->sta, IEEE80211_RC_BW_CHANGED); + cfg80211_sta_opmode_change_notify(sdata->dev, + rx->sta->addr, + &sta_opmode, + GFP_KERNEL); goto handled; } default: @@ -3118,7 +3171,8 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) * it transmitted were processed or returned. */ - if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM) && + !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, @@ -3731,15 +3785,6 @@ void ieee80211_check_fast_rx(struct sta_info *sta) switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: - /* 4-addr is harder to deal with, later maybe */ - if (sdata->u.mgd.use_4addr) - goto clear; - /* software powersave is a huge mess, avoid all of it */ - if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) - goto clear; - if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && - !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) - goto clear; if (sta->sta.tdls) { fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1); fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2); @@ -3751,6 +3796,23 @@ void ieee80211_check_fast_rx(struct sta_info *sta) fastrx.expected_ds_bits = cpu_to_le16(IEEE80211_FCTL_FROMDS); } + + if (sdata->u.mgd.use_4addr && !sta->sta.tdls) { + fastrx.expected_ds_bits |= + cpu_to_le16(IEEE80211_FCTL_TODS); + fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3); + fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr4); + } + + if (!sdata->u.mgd.powersave) + break; + + /* software powersave is a huge mess, avoid all of it */ + if (ieee80211_hw_check(&local->hw, 
PS_NULLFUNC_STACK)) + goto clear; + if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && + !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) + goto clear; break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: @@ -3767,6 +3829,15 @@ void ieee80211_check_fast_rx(struct sta_info *sta) !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta); + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + sdata->u.vlan.sta) { + fastrx.expected_ds_bits |= + cpu_to_le16(IEEE80211_FCTL_FROMDS); + fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr4); + fastrx.internal_forward = 0; + } + break; default: goto clear; @@ -3865,7 +3936,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct sta_info *sta = rx->sta; int orig_len = skb->len; - int snap_offs = ieee80211_hdrlen(hdr->frame_control); + int hdrlen = ieee80211_hdrlen(hdr->frame_control); + int snap_offs = hdrlen; struct { u8 snap[sizeof(rfc1042_header)]; __be16 proto; @@ -3896,10 +3968,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, (status->flag & FAST_RX_CRYPT_FLAGS) != FAST_RX_CRYPT_FLAGS) return false; - /* we don't deal with A-MSDU deaggregation here */ - if (status->rx_flags & IEEE80211_RX_AMSDU) - return false; - if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return false; @@ -3921,7 +3989,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, if ((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) != fast_rx->expected_ds_bits) - goto drop; + return false; /* assign the key to drop unencrypted frames (later) * and strip the IV/MIC if necessary @@ -3931,21 +3999,24 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, snap_offs += IEEE80211_CCMP_HDR_LEN; } - if (!pskb_may_pull(skb, snap_offs + sizeof(*payload))) - goto drop; - payload = (void *)(skb->data + 
snap_offs); + if (!(status->rx_flags & IEEE80211_RX_AMSDU)) { + if (!pskb_may_pull(skb, snap_offs + sizeof(*payload))) + goto drop; - if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr)) - return false; + payload = (void *)(skb->data + snap_offs); - /* Don't handle these here since they require special code. - * Accept AARP and IPX even though they should come with a - * bridge-tunnel header - but if we get them this way then - * there's little point in discarding them. - */ - if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) || - payload->proto == fast_rx->control_port_protocol)) - return false; + if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr)) + return false; + + /* Don't handle these here since they require special code. + * Accept AARP and IPX even though they should come with a + * bridge-tunnel header - but if we get them this way then + * there's little point in discarding them. + */ + if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) || + payload->proto == fast_rx->control_port_protocol)) + return false; + } /* after this point, don't punt to the slowpath! 
*/ @@ -3959,12 +4030,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, } /* statistics part of ieee80211_rx_h_sta_process() */ - stats->last_rx = jiffies; - stats->last_rate = sta_stats_encode_rate(status); - - stats->fragments++; - stats->packets++; - if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { stats->last_signal = status->signal; if (!fast_rx->uses_rss) @@ -3993,6 +4058,20 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, if (rx->key && !ieee80211_has_protected(hdr->frame_control)) goto drop; + if (status->rx_flags & IEEE80211_RX_AMSDU) { + if (__ieee80211_rx_h_amsdu(rx, snap_offs - hdrlen) != + RX_QUEUED) + goto drop; + + return true; + } + + stats->last_rx = jiffies; + stats->last_rate = sta_stats_encode_rate(status); + + stats->fragments++; + stats->packets++; + /* do the header conversion - first grab the addresses */ ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs); ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index ef2becaade50..a3b1bcc2b461 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -73,7 +73,9 @@ ieee80211_bss_info_update(struct ieee80211_local *local, bool signal_valid; struct ieee80211_sub_if_data *scan_sdata; - if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) + if (rx_status->flag & RX_FLAG_NO_SIGNAL_VAL) + bss_meta.signal = 0; /* invalid signal indication */ + else if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index ee0181778a42..029334835747 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -8,6 +8,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> + * 
Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -27,7 +28,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie) { - enum nl80211_band new_band; + enum nl80211_band new_band = current_band; int new_freq; u8 new_chan_no; struct ieee80211_channel *new_chan; @@ -55,15 +56,13 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, elems->ext_chansw_ie->new_operating_class, &new_band)) { sdata_info(sdata, - "cannot understand ECSA IE operating class %d, disconnecting\n", + "cannot understand ECSA IE operating class, %d, ignoring\n", elems->ext_chansw_ie->new_operating_class); - return -EINVAL; } new_chan_no = elems->ext_chansw_ie->new_ch_num; csa_ie->count = elems->ext_chansw_ie->count; csa_ie->mode = elems->ext_chansw_ie->mode; } else if (elems->ch_switch_ie) { - new_band = current_band; new_chan_no = elems->ch_switch_ie->new_ch_num; csa_ie->count = elems->ch_switch_ie->count; csa_ie->mode = elems->ch_switch_ie->mode; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 0c5627f8a104..655c3d8b0d80 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -314,7 +314,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, if (ieee80211_hw_check(hw, USES_RSS)) { sta->pcpu_rx_stats = - alloc_percpu(struct ieee80211_sta_rx_stats); + alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp); if (!sta->pcpu_rx_stats) goto free; } @@ -433,6 +433,7 @@ free_txq: if (sta->sta.txq[0]) kfree(to_txq_info(sta->sta.txq[0])); free: + free_percpu(sta->pcpu_rx_stats); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif @@ -2287,6 +2288,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } + + if 
(!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && + sta->status_stats.ack_signal_filled) { + sinfo->ack_signal = sta->status_stats.last_ack_signal; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); + } } u32 sta_get_expected_throughput(struct sta_info *sta) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index cd53619435b6..f64eb86ca64b 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -548,6 +548,8 @@ struct sta_info { u64 msdu_retries[IEEE80211_NUM_TIDS + 1]; u64 msdu_failed[IEEE80211_NUM_TIDS + 1]; unsigned long last_ack; + s8 last_ack_signal; + bool ack_signal_filled; } status_stats; /* Updated from TX path only, no locking requirements */ diff --git a/net/mac80211/status.c b/net/mac80211/status.c index da7427a41529..743e89c5926c 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -187,9 +187,16 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) struct ieee80211_mgmt *mgmt = (void *) skb->data; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb); - if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { sta->status_stats.last_ack = jiffies; + if (txinfo->status.is_valid_ack_signal) { + sta->status_stats.last_ack_signal = + (s8)txinfo->status.ack_signal; + sta->status_stats.ack_signal_filled = true; + } + } if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; @@ -487,6 +494,8 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, ieee80211_is_qos_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, + info->status.ack_signal, + info->status.is_valid_ack_signal, GFP_ATOMIC); else cfg80211_mgmt_tx_status(&sdata->wdev, cookie, diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 25904af38839..535de3161a78 
100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -797,7 +797,6 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; - u8 *qc; int tid; /* @@ -844,9 +843,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) return TX_CONTINUE; /* include per-STA, per-TID sequence counter */ - - qc = ieee80211_get_qos_ctl(hdr); - tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); tx->sta->tx_stats.msdu[tid]++; hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); @@ -1158,7 +1155,6 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tid; - u8 *qc; memset(tx, 0, sizeof(*tx)); tx->skb = skb; @@ -1198,8 +1194,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) { struct tid_ampdu_tx *tid_tx; - qc = ieee80211_get_qos_ctl(hdr); - tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); if (tid_tx) { @@ -1921,7 +1916,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_hdr *hdr; int headroom; bool may_encrypt; @@ -3574,6 +3569,14 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, if (!IS_ERR_OR_NULL(sta)) { struct ieee80211_fast_tx *fast_tx; + /* We need a bit of data queued to build aggregates properly, so + * instruct the TCP stack to allow more than a single ms of data + * to be queued in the stack. The value is a bit-shift of 1 + * second, so 8 is ~4ms of queued data. Only affects local TCP + * sockets. 
+ */ + sk_pacing_shift_update(skb->sk, 8); + fast_tx = rcu_dereference(sta->fast_tx); if (fast_tx && @@ -4754,3 +4757,49 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, ieee80211_xmit(sdata, NULL, skb); local_bh_enable(); } + +int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, + const u8 *buf, size_t len, + const u8 *dest, __be16 proto, bool unencrypted) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ethhdr *ehdr; + u32 flags; + + /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE + * or Pre-Authentication + */ + if (proto != sdata->control_port_protocol && + proto != cpu_to_be16(ETH_P_PREAUTH)) + return -EINVAL; + + if (unencrypted) + flags = IEEE80211_TX_INTFL_DONT_ENCRYPT; + else + flags = 0; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + sizeof(struct ethhdr) + len); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, local->hw.extra_tx_headroom + sizeof(struct ethhdr)); + + skb_put_data(skb, buf, len); + + ehdr = skb_push(skb, sizeof(struct ethhdr)); + memcpy(ehdr->h_dest, dest, ETH_ALEN); + memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); + ehdr->h_proto = proto; + + skb->dev = dev; + skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(skb); + skb_reset_mac_header(skb); + + __ieee80211_subif_start_xmit(skb, skb->dev, flags); + + return 0; +} diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 1f82191ce601..11f9cfc016d9 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -5,6 +5,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1113,6 +1114,48 @@ u32 
ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, return crc; } +void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, + struct ieee80211_tx_queue_params + *qparam, int ac) +{ + struct ieee80211_chanctx_conf *chanctx_conf; + const struct ieee80211_reg_rule *rrule; + struct ieee80211_wmm_ac *wmm_ac; + u16 center_freq = 0; + + if (sdata->vif.type != NL80211_IFTYPE_AP && + sdata->vif.type != NL80211_IFTYPE_STATION) + return; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (chanctx_conf) + center_freq = chanctx_conf->def.chan->center_freq; + + if (!center_freq) { + rcu_read_unlock(); + return; + } + + rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq)); + + if (IS_ERR_OR_NULL(rrule) || !rrule->wmm_rule) { + rcu_read_unlock(); + return; + } + + if (sdata->vif.type == NL80211_IFTYPE_AP) + wmm_ac = &rrule->wmm_rule->ap[ac]; + else + wmm_ac = &rrule->wmm_rule->client[ac]; + qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min); + qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max); + qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn); + qparam->txop = !qparam->txop ? 
wmm_ac->cot / 32 : + min_t(u16, qparam->txop, wmm_ac->cot / 32); + rcu_read_unlock(); +} + void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos) { @@ -1206,6 +1249,7 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, break; } } + ieee80211_regulatory_limit_wmm_params(sdata, &qparam, ac); qparam.uapsd = false; @@ -1968,7 +2012,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) BSS_CHANGED_CQM | BSS_CHANGED_QOS | BSS_CHANGED_IDLE | - BSS_CHANGED_TXPOWER; + BSS_CHANGED_TXPOWER | + BSS_CHANGED_MCAST_RATE; if (sdata->vif.mu_mimo_owner) changed |= BSS_CHANGED_MU_GROUPS; diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index b9276ac849fa..259325cbcc31 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -358,6 +358,36 @@ enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta) return NL80211_CHAN_WIDTH_80; } +enum nl80211_chan_width +ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta) +{ + enum ieee80211_sta_rx_bandwidth cur_bw = sta->sta.bandwidth; + struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; + u32 cap_width; + + switch (cur_bw) { + case IEEE80211_STA_RX_BW_20: + if (!sta->sta.ht_cap.ht_supported) + return NL80211_CHAN_WIDTH_20_NOHT; + else + return NL80211_CHAN_WIDTH_20; + case IEEE80211_STA_RX_BW_40: + return NL80211_CHAN_WIDTH_40; + case IEEE80211_STA_RX_BW_80: + return NL80211_CHAN_WIDTH_80; + case IEEE80211_STA_RX_BW_160: + cap_width = + vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; + + if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) + return NL80211_CHAN_WIDTH_160; + + return NL80211_CHAN_WIDTH_80P80; + default: + return NL80211_CHAN_WIDTH_20; + } +} + enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) { @@ -447,6 +477,7 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, enum nl80211_band band) { enum ieee80211_sta_rx_bandwidth new_bw; + struct sta_opmode_info 
sta_opmode = {}; u32 changed = 0; u8 nss; @@ -460,7 +491,9 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, if (sta->sta.rx_nss != nss) { sta->sta.rx_nss = nss; + sta_opmode.rx_nss = nss; changed |= IEEE80211_RC_NSS_CHANGED; + sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; } switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { @@ -481,9 +514,15 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, new_bw = ieee80211_sta_cur_vht_bw(sta); if (new_bw != sta->sta.bandwidth) { sta->sta.bandwidth = new_bw; + sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(sta); changed |= IEEE80211_RC_BW_CHANGED; + sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED; } + if (sta_opmode.changed) + cfg80211_sta_opmode_change_notify(sdata->dev, sta->addr, + &sta_opmode, GFP_KERNEL); + return changed; } diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 785056cb76f6..58d0b258b684 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -340,7 +340,7 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad) a4_included = ieee80211_has_a4(hdr->frame_control); if (ieee80211_is_data_qos(hdr->frame_control)) - qos_tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + qos_tid = ieee80211_get_tid(hdr); else qos_tid = 0; @@ -601,8 +601,7 @@ static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad) aad[23] = 0; if (ieee80211_is_data_qos(hdr->frame_control)) - qos_tid = *ieee80211_get_qos_ctl(hdr) & - IEEE80211_QOS_CTL_TID_MASK; + qos_tid = ieee80211_get_tid(hdr); else qos_tid = 0; @@ -867,8 +866,7 @@ ieee80211_crypto_cs_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; if (ieee80211_is_data_qos(hdr->frame_control)) - qos_tid = *ieee80211_get_qos_ctl(hdr) & - IEEE80211_QOS_CTL_TID_MASK; + qos_tid = ieee80211_get_tid(hdr); else qos_tid = 0; diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h index 2c8a43d3607f..df855c33daf2 100644 --- a/net/mac802154/trace.h +++ 
b/net/mac802154/trace.h @@ -33,7 +33,7 @@ /* Tracing for driver callbacks */ -DECLARE_EVENT_CLASS(local_only_evt, +DECLARE_EVENT_CLASS(local_only_evt4, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local), TP_STRUCT__entry( @@ -45,7 +45,7 @@ DECLARE_EVENT_CLASS(local_only_evt, TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) ); -DEFINE_EVENT(local_only_evt, 802154_drv_return_void, +DEFINE_EVENT(local_only_evt4, 802154_drv_return_void, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); @@ -65,12 +65,12 @@ TRACE_EVENT(802154_drv_return_int, __entry->ret) ); -DEFINE_EVENT(local_only_evt, 802154_drv_start, +DEFINE_EVENT(local_only_evt4, 802154_drv_start, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); -DEFINE_EVENT(local_only_evt, 802154_drv_stop, +DEFINE_EVENT(local_only_evt4, 802154_drv_stop, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index e545a3c9365f..7a4de6d618b1 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -122,7 +122,7 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->len <= mtu) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile index dd12b564f2e7..436ef68331f2 100644 --- a/net/ncsi/Makefile +++ b/net/ncsi/Makefile @@ -1,4 +1,4 @@ # # Makefile for NCSI API # -obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o +obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o ncsi-netlink.o diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index d30f7bd741d0..8da84312cd3b 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -276,6 +276,8 @@ struct ncsi_dev_priv { unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ struct ncsi_channel *hot_channel; /* Channel was ever active */ + struct 
ncsi_package *force_package; /* Force a specific package */ + struct ncsi_channel *force_channel; /* Force a specific channel */ struct ncsi_request requests[256]; /* Request table */ unsigned int request_id; /* Last used request ID */ #define NCSI_REQ_START_IDX 1 @@ -318,6 +320,7 @@ extern spinlock_t ncsi_dev_lock; list_for_each_entry_rcu(nc, &np->channels, node) /* Resources */ +u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index); int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index c989211bbabc..c3695ba0cf94 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/netlink.h> #include <net/ncsi.h> #include <net/net_namespace.h> @@ -23,6 +22,7 @@ #include "internal.h" #include "ncsi-pkt.h" +#include "ncsi-netlink.h" LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); @@ -38,7 +38,7 @@ static inline int ncsi_filter_size(int table) return sizes[table]; } -static u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) +u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) { struct ncsi_channel_filter *ncf; int size; @@ -965,20 +965,37 @@ error: static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) { - struct ncsi_package *np; - struct ncsi_channel *nc, *found, *hot_nc; + struct ncsi_package *np, *force_package; + struct ncsi_channel *nc, *found, *hot_nc, *force_channel; struct ncsi_channel_mode *ncm; unsigned long flags; spin_lock_irqsave(&ndp->lock, flags); hot_nc = ndp->hot_channel; + force_channel = ndp->force_channel; + force_package = ndp->force_package; spin_unlock_irqrestore(&ndp->lock, flags); + /* Force a specific channel whether or not it has link if we have 
been + * configured to do so + */ + if (force_package && force_channel) { + found = force_channel; + ncm = &found->modes[NCSI_MODE_LINK]; + if (!(ncm->data[2] & 0x1)) + netdev_info(ndp->ndev.dev, + "NCSI: Channel %u forced, but it is link down\n", + found->id); + goto out; + } + /* The search is done once an inactive channel with up * link is found. */ found = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (ndp->force_package && np != ndp->force_package) + continue; NCSI_FOR_EACH_CHANNEL(np, nc) { spin_lock_irqsave(&nc->lock, flags); @@ -1594,6 +1611,9 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, ndp->ptype.dev = dev; dev_add_pack(&ndp->ptype); + /* Set up generic netlink interface */ + ncsi_init_netlink(dev); + return nd; } EXPORT_SYMBOL_GPL(ncsi_register_dev); @@ -1673,6 +1693,8 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) #endif spin_unlock_irqrestore(&ncsi_dev_lock, flags); + ncsi_unregister_netlink(nd->dev); + kfree(ndp); } EXPORT_SYMBOL_GPL(ncsi_unregister_dev); diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c new file mode 100644 index 000000000000..8d7e849d4825 --- /dev/null +++ b/net/ncsi/ncsi-netlink.c @@ -0,0 +1,427 @@ +/* + * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/if_arp.h> +#include <linux/rtnetlink.h> +#include <linux/etherdevice.h> +#include <linux/module.h> +#include <net/genetlink.h> +#include <net/ncsi.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <uapi/linux/ncsi.h> + +#include "internal.h" +#include "ncsi-netlink.h" + +static struct genl_family ncsi_genl_family; + +static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { + [NCSI_ATTR_IFINDEX] = { .type = NLA_U32 }, + [NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED }, + [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, + [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, +}; + +static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) +{ + struct ncsi_dev_priv *ndp; + struct net_device *dev; + struct ncsi_dev *nd; + struct ncsi_dev; + + if (!net) + return NULL; + + dev = dev_get_by_index(net, ifindex); + if (!dev) { + pr_err("NCSI netlink: No device for ifindex %u\n", ifindex); + return NULL; + } + + nd = ncsi_find_dev(dev); + ndp = nd ? 
TO_NCSI_DEV_PRIV(nd) : NULL; + + dev_put(dev); + return ndp; +} + +static int ncsi_write_channel_info(struct sk_buff *skb, + struct ncsi_dev_priv *ndp, + struct ncsi_channel *nc) +{ + struct nlattr *vid_nest; + struct ncsi_channel_filter *ncf; + struct ncsi_channel_mode *m; + u32 *data; + int i; + + nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id); + m = &nc->modes[NCSI_MODE_LINK]; + nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]); + if (nc->state == NCSI_CHANNEL_ACTIVE) + nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE); + if (ndp->force_channel == nc) + nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED); + + nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version); + nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MINOR, nc->version.alpha2); + nla_put_string(skb, NCSI_CHANNEL_ATTR_VERSION_STR, nc->version.fw_name); + + vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); + if (!vid_nest) + return -ENOMEM; + ncf = nc->filters[NCSI_FILTER_VLAN]; + i = -1; + if (ncf) { + while ((i = find_next_bit((void *)&ncf->bitmap, ncf->total, + i + 1)) < ncf->total) { + data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, i); + /* Uninitialised channels will have 'zero' vlan ids */ + if (!data || !*data) + continue; + nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID, + *(u16 *)data); + } + } + nla_nest_end(skb, vid_nest); + + return 0; +} + +static int ncsi_write_package_info(struct sk_buff *skb, + struct ncsi_dev_priv *ndp, unsigned int id) +{ + struct nlattr *pnest, *cnest, *nest; + struct ncsi_package *np; + struct ncsi_channel *nc; + bool found; + int rc; + + if (id > ndp->package_num) { + netdev_info(ndp->ndev.dev, "NCSI: No package with id %u\n", id); + return -ENODEV; + } + + found = false; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (np->id != id) + continue; + pnest = nla_nest_start(skb, NCSI_PKG_ATTR); + if (!pnest) + return -ENOMEM; + nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id); + if (ndp->force_package == np) + nla_put_flag(skb, NCSI_PKG_ATTR_FORCED); + cnest = 
nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST); + if (!cnest) { + nla_nest_cancel(skb, pnest); + return -ENOMEM; + } + NCSI_FOR_EACH_CHANNEL(np, nc) { + nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR); + if (!nest) { + nla_nest_cancel(skb, cnest); + nla_nest_cancel(skb, pnest); + return -ENOMEM; + } + rc = ncsi_write_channel_info(skb, ndp, nc); + if (rc) { + nla_nest_cancel(skb, nest); + nla_nest_cancel(skb, cnest); + nla_nest_cancel(skb, pnest); + return rc; + } + nla_nest_end(skb, nest); + } + nla_nest_end(skb, cnest); + nla_nest_end(skb, pnest); + found = true; + } + + if (!found) + return -ENODEV; + + return 0; +} + +static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + unsigned int package_id; + struct sk_buff *skb; + struct nlattr *attr; + void *hdr; + int rc; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) + return -EINVAL; + + ndp = ndp_from_ifindex(genl_info_net(info), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); + if (!hdr) { + kfree_skb(skb); + return -EMSGSIZE; + } + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + + attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST); + if (!attr) { + kfree_skb(skb); + return -EMSGSIZE; + } + rc = ncsi_write_package_info(skb, ndp, package_id); + + if (rc) { + nla_nest_cancel(skb, attr); + goto err; + } + + nla_nest_end(skb, attr); + + genlmsg_end(skb, hdr); + return genlmsg_reply(skb, info); + +err: + genlmsg_cancel(skb, hdr); + kfree_skb(skb); + return rc; +} + +static int ncsi_pkg_info_all_nl(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *attrs[NCSI_ATTR_MAX]; + struct ncsi_package *np, *package; + struct 
ncsi_dev_priv *ndp; + unsigned int package_id; + struct nlattr *attr; + void *hdr; + int rc; + + rc = genlmsg_parse(cb->nlh, &ncsi_genl_family, attrs, NCSI_ATTR_MAX, + ncsi_genl_policy, NULL); + if (rc) + return rc; + + if (!attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(skb->sk)), + nla_get_u32(attrs[NCSI_ATTR_IFINDEX])); + + if (!ndp) + return -ENODEV; + + package_id = cb->args[0]; + package = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) + if (np->id == package_id) + package = np; + + if (!package) + return 0; /* done */ + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); + if (!hdr) { + rc = -EMSGSIZE; + goto err; + } + + attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST); + rc = ncsi_write_package_info(skb, ndp, package->id); + if (rc) { + nla_nest_cancel(skb, attr); + goto err; + } + + nla_nest_end(skb, attr); + genlmsg_end(skb, hdr); + + cb->args[0] = package_id + 1; + + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return rc; +} + +static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_package *np, *package; + struct ncsi_channel *nc, *channel; + u32 package_id, channel_id; + struct ncsi_dev_priv *ndp; + unsigned long flags; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + package = NULL; + + spin_lock_irqsave(&ndp->lock, flags); + + NCSI_FOR_EACH_PACKAGE(ndp, np) + if (np->id == package_id) + package = np; + if (!package) { + /* The user has set a package that does not exist */ + spin_unlock_irqrestore(&ndp->lock, flags); + return -ERANGE; + } + + channel = NULL; + if 
(!info->attrs[NCSI_ATTR_CHANNEL_ID]) { + /* Allow any channel */ + channel_id = NCSI_RESERVED_CHANNEL; + } else { + channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); + NCSI_FOR_EACH_CHANNEL(package, nc) + if (nc->id == channel_id) + channel = nc; + } + + if (channel_id != NCSI_RESERVED_CHANNEL && !channel) { + /* The user has set a channel that does not exist on this + * package + */ + spin_unlock_irqrestore(&ndp->lock, flags); + netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n", + channel_id); + return -ERANGE; + } + + ndp->force_package = package; + ndp->force_channel = channel; + spin_unlock_irqrestore(&ndp->lock, flags); + + netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n", + package_id, channel_id, + channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : ""); + + /* Bounce the NCSI channel to set changes */ + ncsi_stop_dev(&ndp->ndev); + ncsi_start_dev(&ndp->ndev); + + return 0; +} + +static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + unsigned long flags; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + /* Clear any override */ + spin_lock_irqsave(&ndp->lock, flags); + ndp->force_package = NULL; + ndp->force_channel = NULL; + spin_unlock_irqrestore(&ndp->lock, flags); + netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n"); + + /* Bounce the NCSI channel to set changes */ + ncsi_stop_dev(&ndp->ndev); + ncsi_start_dev(&ndp->ndev); + + return 0; +} + +static const struct genl_ops ncsi_ops[] = { + { + .cmd = NCSI_CMD_PKG_INFO, + .policy = ncsi_genl_policy, + .doit = ncsi_pkg_info_nl, + .dumpit = ncsi_pkg_info_all_nl, + .flags = 0, + }, + { + .cmd = NCSI_CMD_SET_INTERFACE, + .policy = ncsi_genl_policy, + .doit = ncsi_set_interface_nl, + .flags = 
GENL_ADMIN_PERM, + }, + { + .cmd = NCSI_CMD_CLEAR_INTERFACE, + .policy = ncsi_genl_policy, + .doit = ncsi_clear_interface_nl, + .flags = GENL_ADMIN_PERM, + }, +}; + +static struct genl_family ncsi_genl_family __ro_after_init = { + .name = "NCSI", + .version = 0, + .maxattr = NCSI_ATTR_MAX, + .module = THIS_MODULE, + .ops = ncsi_ops, + .n_ops = ARRAY_SIZE(ncsi_ops), +}; + +int ncsi_init_netlink(struct net_device *dev) +{ + int rc; + + rc = genl_register_family(&ncsi_genl_family); + if (rc) + netdev_err(dev, "ncsi: failed to register netlink family\n"); + + return rc; +} + +int ncsi_unregister_netlink(struct net_device *dev) +{ + int rc; + + rc = genl_unregister_family(&ncsi_genl_family); + if (rc) + netdev_err(dev, "ncsi: failed to unregister netlink family\n"); + + return rc; +} diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h new file mode 100644 index 000000000000..91a5c256f8c4 --- /dev/null +++ b/net/ncsi/ncsi-netlink.h @@ -0,0 +1,20 @@ +/* + * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef __NCSI_NETLINK_H__ +#define __NCSI_NETLINK_H__ + +#include <linux/netdevice.h> + +#include "internal.h" + +int ncsi_init_netlink(struct net_device *dev); +int ncsi_unregister_netlink(struct net_device *dev); + +#endif /* __NCSI_NETLINK_H__ */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index d3220b43c832..704b3832dbad 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -465,12 +465,12 @@ config NF_TABLES_INET depends on IPV6 select NF_TABLES_IPV4 select NF_TABLES_IPV6 - tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support" + bool "Netfilter nf_tables mixed IPv4/IPv6 tables support" help This option enables support for a mixed IPv4/IPv6 "inet" table. config NF_TABLES_NETDEV - tristate "Netfilter nf_tables netdev tables support" + bool "Netfilter nf_tables netdev tables support" help This option enables support for the "netdev" table. diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5d9b8b959e58..fd32bd2c9521 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -73,13 +73,12 @@ obj-$(CONFIG_NETFILTER_CONNCOUNT) += nf_conncount.o obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o # nf_tables -nf_tables-objs := nf_tables_core.o nf_tables_api.o nf_tables_trace.o \ - nft_immediate.o nft_cmp.o nft_range.o nft_bitwise.o \ - nft_byteorder.o nft_payload.o nft_lookup.o nft_dynset.o +nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ + nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ + nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ + nft_dynset.o obj-$(CONFIG_NF_TABLES) += nf_tables.o -obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o -obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 975a85a48d39..bc4bd247bb7d 100644 --- 
a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -2094,7 +2094,7 @@ static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, .exit = ip_set_net_exit, .id = &ip_set_net_id, - .size = sizeof(struct ip_set_net) + .size = sizeof(struct ip_set_net), }; static int __init diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 8f004edad396..f9d5a2a1e3d0 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -72,9 +72,6 @@ hash_mac4_data_next(struct hash_mac4_elem *next, #define IP_SET_PROTO_UNDEF #include "ip_set_hash_gen.h" -/* Zero valued element is not supported */ -static const unsigned char invalid_ether[ETH_ALEN] = { 0 }; - static int hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -93,7 +90,7 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, return -EINVAL; ether_addr_copy(e.ether, eth_hdr(skb)->h_source); - if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -118,7 +115,7 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER])); - if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 3e17d32b629d..58d5d05aec24 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -260,7 +260,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, buf_len = strlen(buf); ct = nf_ct_get(skb, &ctinfo); - if (ct && (ct->status & IPS_NAT_MASK)) { + if (ct) { bool mangled; /* If mangling fails this function will return 0 diff --git a/net/netfilter/ipvs/ip_vs_lblc.c 
b/net/netfilter/ipvs/ip_vs_lblc.c index d625179de485..3057e453bf31 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -238,7 +238,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc) int i; spin_lock_bh(&svc->sched_lock); - tbl->dead = 1; + tbl->dead = true; for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblc_del(en); @@ -369,7 +369,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; - tbl->dead = 0; + tbl->dead = false; tbl->svc = svc; /* diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 84c57b62a588..92adc04557ed 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -404,7 +404,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc) struct hlist_node *next; spin_lock_bh(&svc->sched_lock); - tbl->dead = 1; + tbl->dead = true; for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); @@ -532,7 +532,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; - tbl->dead = 0; + tbl->dead = false; tbl->svc = svc; /* diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 6d65389e308f..153e690e2893 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -104,7 +104,7 @@ static unsigned int check_hlist(struct net *net, struct nf_conn *found_ct; unsigned int length = 0; - *addit = true; + *addit = tuple ? 
true : false; /* check the saved connections */ hlist_for_each_entry_safe(conn, n, head, node) { @@ -117,7 +117,7 @@ static unsigned int check_hlist(struct net *net, found_ct = nf_ct_tuplehash_to_ctrack(found); - if (nf_ct_tuple_equal(&conn->tuple, tuple)) { + if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple)) { /* * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks @@ -158,7 +158,6 @@ static void tree_nodes_free(struct rb_root *root, static unsigned int count_tree(struct net *net, struct rb_root *root, const u32 *key, u8 keylen, - u8 family, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { @@ -221,6 +220,9 @@ count_tree(struct net *net, struct rb_root *root, goto restart; } + if (!tuple) + return 0; + /* no match, need to insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) @@ -243,10 +245,12 @@ count_tree(struct net *net, struct rb_root *root, return 1; } +/* Count and return number of conntrack entries in 'net' with particular 'key'. + * If 'tuple' is not null, insert it into the accounting data structure. + */ unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, const u32 *key, - unsigned int family, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { @@ -259,7 +263,7 @@ unsigned int nf_conncount_count(struct net *net, spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - count = count_tree(net, root, key, data->keylen, family, tuple, zone); + count = count_tree(net, root, key, data->keylen, tuple, zone); spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 866916712905..1d66de5151b2 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -8,6 +8,8 @@ * published by the Free Software Foundation. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/kernel.h> @@ -80,7 +82,7 @@ static int nf_conntrack_acct_init_sysctl(struct net *net) net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.acct_sysctl_header) { - printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n"); + pr_err("can't register to sysctl\n"); goto out_register; } return 0; @@ -125,7 +127,7 @@ int nf_conntrack_acct_init(void) { int ret = nf_ct_extend_register(&acct_extend); if (ret < 0) - pr_err("nf_conntrack_acct: Unable to register extension\n"); + pr_err("Unable to register extension\n"); return ret; } diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index ecc3ab784633..a1086bdec242 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -20,7 +20,6 @@ #include <net/netfilter/nf_conntrack_expect.h> int nf_conntrack_broadcast_help(struct sk_buff *skb, - unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int timeout) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 705198de671d..41ff04ee2554 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1763,14 +1763,14 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) { struct net *net; - rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) { if (atomic_read(&net->ct.count) == 0) continue; __nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); } - rtnl_unlock(); + up_read(&net_rwsem); /* Need to wait for netns cleanup worker to finish, if its * running -- it might have deleted a net namespace from diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index caac41ad9483..c11822a7d2bf 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ 
-11,6 +11,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> @@ -372,7 +374,7 @@ static int nf_conntrack_event_init_sysctl(struct net *net) net->ct.event_sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.event_sysctl_header) { - printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n"); + pr_err("can't register to sysctl\n"); goto out_register; } return 0; @@ -419,7 +421,7 @@ int nf_conntrack_ecache_init(void) { int ret = nf_ct_extend_register(&event_extend); if (ret < 0) - pr_err("nf_ct_event: Unable to register event extension.\n"); + pr_err("Unable to register event extension\n"); BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */ diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 496ce173f0c1..bac5848f1c8e 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -33,7 +33,7 @@ MODULE_ALIAS("ip_conntrack_netbios_ns"); MODULE_ALIAS_NFCT_HELPER("netbios_ns"); static unsigned int timeout __read_mostly = 3; -module_param(timeout, uint, S_IRUSR); +module_param(timeout, uint, 0400); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); static struct nf_conntrack_expect_policy exp_policy = { @@ -41,9 +41,10 @@ static struct nf_conntrack_expect_policy exp_policy = { }; static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) { - return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + return nf_conntrack_broadcast_help(skb, ct, ctinfo, timeout); } static struct nf_conntrack_helper helper __read_mostly = { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index dd177ebee9aa..4c1d0c5bc268 100644 --- 
a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -440,6 +440,31 @@ err: return -1; } +static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct) +{ + struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); + struct nlattr *nest_parms; + + if (!synproxy) + return 0; + + nest_parms = nla_nest_start(skb, CTA_SYNPROXY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (nla_put_be32(skb, CTA_SYNPROXY_ISN, htonl(synproxy->isn)) || + nla_put_be32(skb, CTA_SYNPROXY_ITS, htonl(synproxy->its)) || + nla_put_be32(skb, CTA_SYNPROXY_TSOFF, htonl(synproxy->tsoff))) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) @@ -518,7 +543,8 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_ct_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || + ctnetlink_dump_ct_synproxy(skb, ct) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -730,6 +756,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (events & (1 << IPCT_SEQADJ) && ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; + + if (events & (1 << IPCT_SYNPROXY) && + ctnetlink_dump_ct_synproxy(skb, ct) < 0) + goto nla_put_failure; } #ifdef CONFIG_NF_CONNTRACK_MARK @@ -1497,9 +1527,8 @@ ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) if (ret < 0) return ret; - ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, - cda[CTA_NAT_SRC]); - return ret; + return ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, + cda[CTA_NAT_SRC]); #else if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) return 0; @@ -1689,6 +1718,39 @@ err: return ret; } +static const struct 
nla_policy synproxy_policy[CTA_SYNPROXY_MAX + 1] = { + [CTA_SYNPROXY_ISN] = { .type = NLA_U32 }, + [CTA_SYNPROXY_ITS] = { .type = NLA_U32 }, + [CTA_SYNPROXY_TSOFF] = { .type = NLA_U32 }, +}; + +static int ctnetlink_change_synproxy(struct nf_conn *ct, + const struct nlattr * const cda[]) +{ + struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); + struct nlattr *tb[CTA_SYNPROXY_MAX + 1]; + int err; + + if (!synproxy) + return 0; + + err = nla_parse_nested(tb, CTA_SYNPROXY_MAX, cda[CTA_SYNPROXY], + synproxy_policy, NULL); + if (err < 0) + return err; + + if (!tb[CTA_SYNPROXY_ISN] || + !tb[CTA_SYNPROXY_ITS] || + !tb[CTA_SYNPROXY_TSOFF]) + return -EINVAL; + + synproxy->isn = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ISN])); + synproxy->its = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ITS])); + synproxy->tsoff = ntohl(nla_get_be32(tb[CTA_SYNPROXY_TSOFF])); + + return 0; +} + static int ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) { @@ -1759,6 +1821,12 @@ ctnetlink_change_conntrack(struct nf_conn *ct, return err; } + if (cda[CTA_SYNPROXY]) { + err = ctnetlink_change_synproxy(ct, cda); + if (err < 0) + return err; + } + if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) @@ -1880,6 +1948,12 @@ ctnetlink_create_conntrack(struct net *net, goto err2; } + if (cda[CTA_SYNPROXY]) { + err = ctnetlink_change_synproxy(ct, cda); + if (err < 0) + goto err2; + } + #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); @@ -1991,7 +2065,9 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_HELPER) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | - (1 << IPCT_MARK) | events, + (1 << IPCT_MARK) | + (1 << IPCT_SYNPROXY) | + events, ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_put(ct); @@ -2012,7 +2088,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_LABEL) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | - (1 << 
IPCT_MARK), + (1 << IPCT_MARK) | + (1 << IPCT_SYNPROXY), ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); } @@ -2282,6 +2359,9 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; + if (ctnetlink_dump_ct_synproxy(skb, ct) < 0) + goto nla_put_failure; + #ifdef CONFIG_NF_CONNTRACK_MARK if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) goto nla_put_failure; diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c index 87b95a2c270c..b8e0a22ca1a9 100644 --- a/net/netfilter/nf_conntrack_snmp.c +++ b/net/netfilter/nf_conntrack_snmp.c @@ -26,7 +26,7 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_NFCT_HELPER("snmp"); static unsigned int timeout __read_mostly = 30; -module_param(timeout, uint, S_IRUSR); +module_param(timeout, uint, 0400); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); int (*nf_nat_snmp_hook)(struct sk_buff *skb, @@ -36,11 +36,12 @@ int (*nf_nat_snmp_hook)(struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) { typeof(nf_nat_snmp_hook) nf_nat_snmp; - nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + nf_conntrack_broadcast_help(skb, ct, ctinfo, timeout); nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook); if (nf_nat_snmp && ct->status & IPS_NAT_MASK) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9123fdec5e14..037fec54c850 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -495,7 +495,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net) if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(pde, root_uid, root_gid); - pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat, + pde = 
proc_create("nf_conntrack", 0444, net->proc_net_stat, &ct_cpu_seq_fops); if (!pde) goto out_stat_nf_conntrack; diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c index 4c4734b78318..56766cb26e40 100644 --- a/net/netfilter/nf_conntrack_timestamp.c +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -6,6 +6,8 @@ * published by the Free Software Foundation (or any later at your option). */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/kernel.h> @@ -58,7 +60,7 @@ static int nf_conntrack_tstamp_init_sysctl(struct net *net) net->ct.tstamp_sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.tstamp_sysctl_header) { - printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n"); + pr_err("can't register to sysctl\n"); goto out_register; } return 0; @@ -104,7 +106,7 @@ int nf_conntrack_tstamp_init(void) int ret; ret = nf_ct_extend_register(&tstamp_extend); if (ret < 0) - pr_err("nf_ct_tstamp: Unable to register extension\n"); + pr_err("Unable to register extension\n"); return ret; } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index c2c1b16b7538..6d0357817cda 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -549,7 +549,7 @@ static int __net_init nf_log_net_init(struct net *net) int ret = -ENOMEM; #ifdef CONFIG_PROC_FS - if (!proc_create("nf_log", S_IRUGO, + if (!proc_create("nf_log", 0444, net->nf.proc_netfilter, &nflog_file_ops)) return ret; #endif diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6c38421e31f9..617693ff9f4c 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -8,6 +8,8 @@ * published by the Free Software Foundation. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> @@ -814,7 +816,7 @@ static int __init nf_nat_init(void) ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); - printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); + pr_err("Unable to register extension\n"); return ret; } diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index d76afafdc699..5063cbf1689c 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -8,6 +8,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/inet.h> @@ -71,7 +73,7 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN]; unsigned int buflen; - pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); + pr_debug("type %i, off %u len %u\n", type, matchoff, matchlen); /* Connection will come from wherever this packet goes, hence !dir */ newaddr = ct->tuplehash[!dir].tuple.dst.u3; @@ -136,8 +138,7 @@ static int __init nf_nat_ftp_init(void) /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ static int warn_set(const char *val, const struct kernel_param *kp) { - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + pr_info("kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); return 0; } module_param_call(ports, warn_set, NULL, NULL, 0); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index dcb5f6375d9d..3aa35a43100d 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -10,6 +10,8 @@ * 2 of the License, or (at your option) any later version. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/tcp.h> @@ -79,7 +81,7 @@ static unsigned int help(struct sk_buff *skb, */ /* AAA = "us", ie. where server normally talks to. */ snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port); - pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", + pr_debug("inserting '%s' == %pI4, port %u\n", buffer, &newaddr.ip, port); if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, @@ -108,8 +110,7 @@ static int __init nf_nat_irc_init(void) /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ static int warn_set(const char *val, const struct kernel_param *kp) { - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + pr_info("kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); return 0; } module_param_call(ports, warn_set, NULL, NULL, 0); diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c index fbce552a796e..7d7466dbf663 100644 --- a/net/netfilter/nf_nat_proto_common.c +++ b/net/netfilter/nf_nat_proto_common.c @@ -41,7 +41,7 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, const struct nf_conn *ct, u16 *rover) { - unsigned int range_size, min, i; + unsigned int range_size, min, max, i; __be16 *portptr; u_int16_t off; @@ -71,7 +71,10 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, } } else { min = ntohs(range->min_proto.all); - range_size = ntohs(range->max_proto.all) - min + 1; + max = ntohs(range->max_proto.all); + if (unlikely(max < min)) + swap(max, min); + range_size = max - min + 1; } if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) { diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 92139a087260..6039b350abbe 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -325,7 +325,7 @@ static const 
struct file_operations synproxy_cpu_seq_fops = { static int __net_init synproxy_proc_init(struct net *net) { - if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat, + if (!proc_create("synproxy", 0444, net->proc_net_stat, &synproxy_cpu_seq_fops)) return -ENOMEM; return 0; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8b9fe30de0cd..9134cc429ad4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -74,15 +74,77 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } +/* removal requests are queued in the commit_list, but not acted upon + * until after all new rules are in place. + * + * Therefore, nf_register_net_hook(net, &nat_hook) runs before pending + * nf_unregister_net_hook(). + * + * nf_register_net_hook thus fails if a nat hook is already in place + * even if the conflicting hook is about to be removed. + * + * If collision is detected, search commit_log for DELCHAIN matching + * the new nat hooknum; if we find one collision is temporary: + * + * Either transaction is aborted (new/colliding hook is removed), or + * transaction is committed (old hook is removed). + */ +static bool nf_tables_allow_nat_conflict(const struct net *net, + const struct nf_hook_ops *ops) +{ + const struct nft_trans *trans; + bool ret = false; + + if (!ops->nat_hook) + return false; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + const struct nf_hook_ops *pending_ops; + const struct nft_chain *pending; + + if (trans->msg_type != NFT_MSG_NEWCHAIN && + trans->msg_type != NFT_MSG_DELCHAIN) + continue; + + pending = trans->ctx.chain; + if (!nft_is_base_chain(pending)) + continue; + + pending_ops = &nft_base_chain(pending)->ops; + if (pending_ops->nat_hook && + pending_ops->pf == ops->pf && + pending_ops->hooknum == ops->hooknum) { + /* other hook registration already pending? 
*/ + if (trans->msg_type == NFT_MSG_NEWCHAIN) + return false; + + ret = true; + } + } + + return ret; +} + static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { + struct nf_hook_ops *ops; + int ret; + if (table->flags & NFT_TABLE_F_DORMANT || !nft_is_base_chain(chain)) return 0; - return nf_register_net_hook(net, &nft_base_chain(chain)->ops); + ops = &nft_base_chain(chain)->ops; + ret = nf_register_net_hook(net, ops); + if (ret == -EBUSY && nf_tables_allow_nat_conflict(net, ops)) { + ops->nat_hook = false; + ret = nf_register_net_hook(net, ops); + ops->nat_hook = true; + } + + return ret; } static void nf_tables_unregister_hook(struct net *net, @@ -384,9 +446,9 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) return ++table->hgenerator; } -static const struct nf_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; +static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; -static const struct nf_chain_type * +static const struct nft_chain_type * __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { int i; @@ -399,10 +461,10 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) return NULL; } -static const struct nf_chain_type * +static const struct nft_chain_type * nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload) { - const struct nf_chain_type *type; + const struct nft_chain_type *type; type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) @@ -859,26 +921,22 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) kfree(ctx->table); } -int nft_register_chain_type(const struct nf_chain_type *ctype) +void nft_register_chain_type(const struct nft_chain_type *ctype) { - int err = 0; - if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO)) - return -EINVAL; + return; nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (chain_type[ctype->family][ctype->type] != NULL) { - err = -EBUSY; - goto out; + if 
(WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return; } chain_type[ctype->family][ctype->type] = ctype; -out: nfnl_unlock(NFNL_SUBSYS_NFTABLES); - return err; } EXPORT_SYMBOL_GPL(nft_register_chain_type); -void nft_unregister_chain_type(const struct nf_chain_type *ctype) +void nft_unregister_chain_type(const struct nft_chain_type *ctype) { nfnl_lock(NFNL_SUBSYS_NFTABLES); chain_type[ctype->family][ctype->type] = NULL; @@ -1215,19 +1273,21 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain, rcu_assign_pointer(chain->stats, newstats); } -static void nf_tables_chain_destroy(struct nft_chain *chain) +static void nf_tables_chain_destroy(struct nft_ctx *ctx) { + struct nft_chain *chain = ctx->chain; + BUG_ON(chain->use > 0); if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); + if (basechain->type->free) + basechain->type->free(ctx); module_put(basechain->type->owner); free_percpu(basechain->stats); if (basechain->stats) static_branch_dec(&nft_counters_enabled); - if (basechain->ops.dev != NULL) - dev_put(basechain->ops.dev); kfree(chain->name); kfree(basechain); } else { @@ -1239,7 +1299,7 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) struct nft_chain_hook { u32 num; s32 priority; - const struct nf_chain_type *type; + const struct nft_chain_type *type; struct net_device *dev; }; @@ -1249,7 +1309,7 @@ static int nft_chain_parse_hook(struct net *net, bool create) { struct nlattr *ha[NFTA_HOOK_MAX + 1]; - const struct nf_chain_type *type; + const struct nft_chain_type *type; struct net_device *dev; int err; @@ -1294,7 +1354,7 @@ static int nft_chain_parse_hook(struct net *net, } nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); - dev = dev_get_by_name(net, ifname); + dev = __dev_get_by_name(net, ifname); if (!dev) { module_put(type->owner); return -ENOENT; @@ -1311,8 +1371,6 @@ static int nft_chain_parse_hook(struct net *net, static void 
nft_chain_release_hook(struct nft_chain_hook *hook) { module_put(hook->type->owner); - if (hook->dev != NULL) - dev_put(hook->dev); } static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, @@ -1358,6 +1416,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, } basechain->type = hook.type; + if (basechain->type->init) + basechain->type->init(ctx); + chain = &basechain->chain; ops = &basechain->ops; @@ -1378,6 +1439,8 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (chain == NULL) return -ENOMEM; } + ctx->chain = chain; + INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; @@ -1391,7 +1454,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err < 0) goto err1; - ctx->chain = chain; err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (err < 0) goto err2; @@ -1403,7 +1465,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, err2: nf_tables_unregister_hook(net, table, chain); err1: - nf_tables_chain_destroy(chain); + nf_tables_chain_destroy(ctx); return err; } @@ -1911,6 +1973,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { [NFTA_RULE_POSITION] = { .type = NLA_U64 }, [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, + [NFTA_RULE_ID] = { .type = NLA_U32 }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, @@ -2446,6 +2509,9 @@ EXPORT_SYMBOL_GPL(nft_unregister_set); static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags) { + if ((flags & NFT_SET_EVAL) && !ops->update) + return false; + return (flags & ops->features) == (flags & NFT_SET_FEATURES); } @@ -2510,7 +2576,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, if (est.space == best.space && est.lookup < best.lookup) break; - } else if (est.size < best.size) { + } else if (est.size < best.size || !bops) { break; } continue; @@ -2629,11 
+2695,11 @@ static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, return ERR_PTR(-ENOENT); } -struct nft_set *nft_set_lookup(const struct net *net, - const struct nft_table *table, - const struct nlattr *nla_set_name, - const struct nlattr *nla_set_id, - u8 genmask) +struct nft_set *nft_set_lookup_global(const struct net *net, + const struct nft_table *table, + const struct nlattr *nla_set_name, + const struct nlattr *nla_set_id, + u8 genmask) { struct nft_set *set; @@ -2646,7 +2712,7 @@ struct nft_set *nft_set_lookup(const struct net *net, } return set; } -EXPORT_SYMBOL_GPL(nft_set_lookup); +EXPORT_SYMBOL_GPL(nft_set_lookup_global); static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) @@ -3315,6 +3381,8 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, + [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -4028,17 +4096,10 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); - if (IS_ERR(set)) { - if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { - set = nf_tables_set_lookup_byid(net, - nla[NFTA_SET_ELEM_LIST_SET_ID], - genmask); - } - if (IS_ERR(set)) - return PTR_ERR(set); - } + set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + nla[NFTA_SET_ELEM_LIST_SET_ID], genmask); + if (IS_ERR(set)) + return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; @@ -4328,9 +4389,9 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); -struct nft_object 
*nf_tables_obj_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) { struct nft_object *obj; @@ -4357,16 +4418,20 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, const struct nft_object_type *type, const struct nlattr *attr) { - struct nlattr *tb[type->maxattr + 1]; + struct nlattr **tb; const struct nft_object_ops *ops; struct nft_object *obj; - int err; + int err = -ENOMEM; + + tb = kmalloc_array(type->maxattr + 1, sizeof(*tb), GFP_KERNEL); + if (!tb) + goto err1; if (attr) { err = nla_parse_nested(tb, type->maxattr, attr, type->policy, NULL); if (err < 0) - goto err1; + goto err2; } else { memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1)); } @@ -4375,7 +4440,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, ops = type->select_ops(ctx, (const struct nlattr * const *)tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); - goto err1; + goto err2; } } else { ops = type->ops; @@ -4383,18 +4448,21 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, err = -ENOMEM; obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL); - if (obj == NULL) - goto err1; + if (!obj) + goto err2; err = ops->init(ctx, (const struct nlattr * const *)tb, obj); if (err < 0) - goto err2; + goto err3; obj->ops = ops; + kfree(tb); return obj; -err2: +err3: kfree(obj); +err2: + kfree(tb); err1: return ERR_PTR(err); } @@ -4850,7 +4918,7 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); -struct nft_flowtable * +static struct nft_flowtable * nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) { @@ -4864,8 +4932,6 @@ nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } -#define 
NFT_FLOWTABLE_DEVICE_MAX 8 - static int nf_tables_parse_devices(const struct nft_ctx *ctx, const struct nlattr *attr, struct net_device *dev_array[], int *len) @@ -4882,7 +4948,7 @@ static int nf_tables_parse_devices(const struct nft_ctx *ctx, } nla_strlcpy(ifname, tmp, IFNAMSIZ); - dev = dev_get_by_name(ctx->net, ifname); + dev = __dev_get_by_name(ctx->net, ifname); if (!dev) { err = -ENOENT; goto err1; @@ -4938,13 +5004,11 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS], dev_array, &n); if (err < 0) - goto err1; + return err; ops = kzalloc(sizeof(struct nf_hook_ops) * n, GFP_KERNEL); - if (!ops) { - err = -ENOMEM; - goto err1; - } + if (!ops) + return -ENOMEM; flowtable->hooknum = hooknum; flowtable->priority = priority; @@ -4958,13 +5022,10 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, flowtable->ops[i].priv = &flowtable->data.rhashtable; flowtable->ops[i].hook = flowtable->data.type->hook; flowtable->ops[i].dev = dev_array[i]; + flowtable->dev_name[i] = kstrdup(dev_array[i]->name, + GFP_KERNEL); } - err = 0; -err1: - for (i = 0; i < n; i++) - dev_put(dev_array[i]); - return err; } @@ -5037,9 +5098,9 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nf_flowtable_type *type; + struct nft_flowtable *flowtable, *ft; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; - struct nft_flowtable *flowtable; struct nft_table *table; struct nft_ctx ctx; int err, i, k; @@ -5099,6 +5160,22 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err3; for (i = 0; i < flowtable->ops_len; i++) { + if (!flowtable->ops[i].dev) + continue; + + list_for_each_entry(ft, &table->flowtables, list) { + for (k = 0; k < ft->ops_len; k++) { + if (!ft->ops[k].dev) + continue; + + if (flowtable->ops[i].dev == ft->ops[k].dev && + flowtable->ops[i].pf == 
ft->ops[k].pf) { + err = -EBUSY; + goto err4; + } + } + } + err = nf_register_net_hook(net, &flowtable->ops[i]); if (err < 0) goto err4; @@ -5119,8 +5196,10 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, err5: i = flowtable->ops_len; err4: - for (k = i - 1; k >= 0; k--) - nf_unregister_net_hook(net, &flowtable->ops[i]); + for (k = i - 1; k >= 0; k--) { + kfree(flowtable->dev_name[k]); + nf_unregister_net_hook(net, &flowtable->ops[k]); + } kfree(flowtable->ops); err3: @@ -5145,6 +5224,11 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_ctx ctx; + if (!nla[NFTA_FLOWTABLE_TABLE] || + (!nla[NFTA_FLOWTABLE_NAME] && + !nla[NFTA_FLOWTABLE_HANDLE])) + return -EINVAL; + table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, genmask); if (IS_ERR(table)) @@ -5205,9 +5289,9 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; for (i = 0; i < flowtable->ops_len; i++) { - if (flowtable->ops[i].dev && + if (flowtable->dev_name[i][0] && nla_put_string(skb, NFTA_DEVICE_NAME, - flowtable->ops[i].dev->name)) + flowtable->dev_name[i])) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -5402,6 +5486,7 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { cancel_delayed_work_sync(&flowtable->data.gc_work); + kfree(flowtable->ops); kfree(flowtable->name); flowtable->data.type->free(&flowtable->data); rhashtable_destroy(&flowtable->data.rhashtable); @@ -5448,6 +5533,7 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev, continue; nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); + flowtable->dev_name[i][0] = '\0'; flowtable->ops[i].dev = NULL; break; } @@ -5675,7 +5761,7 @@ static void nf_tables_commit_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_DELCHAIN: - nf_tables_chain_destroy(trans->ctx.chain); + 
nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_DELRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); @@ -5846,7 +5932,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: - nf_tables_chain_destroy(trans->ctx.chain); + nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_NEWRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); @@ -5993,7 +6079,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { }; int nft_chain_validate_dependency(const struct nft_chain *chain, - enum nft_chain_type type) + enum nft_chain_types type) { const struct nft_base_chain *basechain; @@ -6496,7 +6582,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) } list_del(&ctx->chain->list); ctx->table->use--; - nf_tables_chain_destroy(ctx->chain); + nf_tables_chain_destroy(ctx); return 0; } @@ -6512,6 +6598,7 @@ static void __nft_release_tables(struct net *net) struct nft_set *set, *ns; struct nft_ctx ctx = { .net = net, + .family = NFPROTO_NETDEV, }; list_for_each_entry_safe(table, nt, &net->nft.tables, list) { @@ -6548,9 +6635,10 @@ static void __nft_release_tables(struct net *net) nft_obj_destroy(obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { + ctx.chain = chain; list_del(&chain->list); table->use--; - nf_tables_chain_destroy(chain); + nf_tables_chain_destroy(&ctx); } list_del(&table->list); nf_tables_table_destroy(&ctx); @@ -6581,6 +6669,8 @@ static int __init nf_tables_module_init(void) { int err; + nft_chain_filter_init(); + info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS, GFP_KERNEL); if (info == NULL) { @@ -6615,6 +6705,7 @@ static void __exit nf_tables_module_exit(void) rcu_barrier(); nf_tables_core_module_exit(); kfree(info); + nft_chain_filter_fini(); } module_init(nf_tables_module_init); diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c deleted file mode 100644 index 
e30c7da09d0d..000000000000 --- a/net/netfilter/nf_tables_inet.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv6.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_ipv4.h> -#include <net/netfilter/nf_tables_ipv6.h> -#include <net/ip.h> - -static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - - switch (state->pf) { - case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); - break; - case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); - break; - default: - break; - } - - return nft_do_chain(&pkt, priv); -} - -static const struct nf_chain_type filter_inet = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_INET, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING), - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_inet, - [NF_INET_LOCAL_OUT] = nft_do_chain_inet, - [NF_INET_FORWARD] = nft_do_chain_inet, - [NF_INET_PRE_ROUTING] = nft_do_chain_inet, - [NF_INET_POST_ROUTING] = nft_do_chain_inet, - }, -}; - -static int __init nf_tables_inet_init(void) -{ - return nft_register_chain_type(&filter_inet); -} - -static void __exit nf_tables_inet_exit(void) -{ - nft_unregister_chain_type(&filter_inet); -} - -module_init(nf_tables_inet_init); -module_exit(nf_tables_inet_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 
-MODULE_ALIAS_NFT_CHAIN(1, "filter"); diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c deleted file mode 100644 index 4041fafca934..000000000000 --- a/net/netfilter/nf_tables_netdev.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <net/netfilter/nf_tables.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <net/netfilter/nf_tables_ipv4.h> -#include <net/netfilter/nf_tables_ipv6.h> - -static unsigned int -nft_do_chain_netdev(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - - switch (skb->protocol) { - case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); - break; - case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); - break; - default: - nft_set_pktinfo_unspec(&pkt, skb); - break; - } - - return nft_do_chain(&pkt, priv); -} - -static const struct nf_chain_type nft_filter_chain_netdev = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_NETDEV, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_NETDEV_INGRESS), - .hooks = { - [NF_NETDEV_INGRESS] = nft_do_chain_netdev, - }, -}; - -static void nft_netdev_event(unsigned long event, struct net_device *dev, - struct nft_ctx *ctx) -{ - struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - - switch (event) { - case NETDEV_UNREGISTER: - if (strcmp(basechain->dev_name, dev->name) != 0) - return; - - __nft_release_basechain(ctx); - break; - case NETDEV_CHANGENAME: - if (dev->ifindex != basechain->ops.dev->ifindex) - return; - - strncpy(basechain->dev_name, dev->name, IFNAMSIZ); - break; - } -} - 
-static int nf_tables_netdev_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_table *table; - struct nft_chain *chain, *nr; - struct nft_ctx ctx = { - .net = dev_net(dev), - }; - - if (event != NETDEV_UNREGISTER && - event != NETDEV_CHANGENAME) - return NOTIFY_DONE; - - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(table, &ctx.net->nft.tables, list) { - if (table->family != NFPROTO_NETDEV) - continue; - - ctx.family = table->family; - ctx.table = table; - list_for_each_entry_safe(chain, nr, &table->chains, list) { - if (!nft_is_base_chain(chain)) - continue; - - ctx.chain = chain; - nft_netdev_event(event, dev, &ctx); - } - } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - - return NOTIFY_DONE; -} - -static struct notifier_block nf_tables_netdev_notifier = { - .notifier_call = nf_tables_netdev_event, -}; - -static int __init nf_tables_netdev_init(void) -{ - int ret; - - ret = nft_register_chain_type(&nft_filter_chain_netdev); - if (ret) - return ret; - - ret = register_netdevice_notifier(&nf_tables_netdev_notifier); - if (ret) - goto err_register_netdevice_notifier; - - return 0; - -err_register_netdevice_notifier: - nft_unregister_chain_type(&nft_filter_chain_netdev); - - return ret; -} - -static void __exit nf_tables_netdev_exit(void) -{ - unregister_netdevice_notifier(&nf_tables_netdev_notifier); - nft_unregister_chain_type(&nft_filter_chain_netdev); -} - -module_init(nf_tables_netdev_init); -module_exit(nf_tables_netdev_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_ALIAS_NFT_CHAIN(5, "filter"); /* NFPROTO_NETDEV */ diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 88d427f9f9e6..b9505bcd3827 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -467,8 +467,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct) GFP_ATOMIC); } 
-int nfnl_acct_overquota(struct net *net, const struct sk_buff *skb, - struct nf_acct *nfacct) +int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct) { u64 now; u64 *quota; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index d33ce6d5ebce..4a4b293fb2e5 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -314,23 +314,30 @@ nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy, static int nfnl_cthelper_update_policy_all(struct nlattr *tb[], struct nf_conntrack_helper *helper) { - struct nf_conntrack_expect_policy new_policy[helper->expect_class_max + 1]; + struct nf_conntrack_expect_policy *new_policy; struct nf_conntrack_expect_policy *policy; - int i, err; + int i, ret = 0; + + new_policy = kmalloc_array(helper->expect_class_max + 1, + sizeof(*new_policy), GFP_KERNEL); + if (!new_policy) + return -ENOMEM; /* Check first that all policy attributes are well-formed, so we don't * leave things in inconsistent state on errors. */ for (i = 0; i < helper->expect_class_max + 1; i++) { - if (!tb[NFCTH_POLICY_SET + i]) - return -EINVAL; + if (!tb[NFCTH_POLICY_SET + i]) { + ret = -EINVAL; + goto err; + } - err = nfnl_cthelper_update_policy_one(&helper->expect_policy[i], + ret = nfnl_cthelper_update_policy_one(&helper->expect_policy[i], &new_policy[i], tb[NFCTH_POLICY_SET + i]); - if (err < 0) - return err; + if (ret < 0) + goto err; } /* Now we can safely update them. 
*/ for (i = 0; i < helper->expect_class_max + 1; i++) { @@ -340,7 +347,9 @@ static int nfnl_cthelper_update_policy_all(struct nlattr *tb[], policy->timeout = new_policy->timeout; } - return 0; +err: + kfree(new_policy); + return ret; } static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper, diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 95b04702a655..9ee5fa551fa6 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -51,19 +51,27 @@ ctnl_timeout_parse_policy(void *timeouts, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { + struct nlattr **tb; int ret = 0; - if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) { - struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1]; + if (!l4proto->ctnl_timeout.nlattr_to_obj) + return 0; - ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, - attr, l4proto->ctnl_timeout.nla_policy, - NULL); - if (ret < 0) - return ret; + tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), + GFP_KERNEL); - ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); - } + if (!tb) + return -ENOMEM; + + ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, attr, + l4proto->ctnl_timeout.nla_policy, NULL); + if (ret < 0) + goto err; + + ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); + +err: + kfree(tb); return ret; } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 8bba23160a68..74a04638ef03 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -14,6 +14,9 @@ * published by the Free Software Foundation. 
* */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/skbuff.h> #include <linux/init.h> @@ -833,11 +836,8 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) if (diff > skb_tailroom(e->skb)) { nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), diff, GFP_ATOMIC); - if (!nskb) { - printk(KERN_WARNING "nf_queue: OOM " - "in mangle, dropping packet\n"); + if (!nskb) return -ENOMEM; - } kfree_skb(e->skb); e->skb = nskb; } @@ -1536,20 +1536,20 @@ static int __init nfnetlink_queue_init(void) status = register_pernet_subsys(&nfnl_queue_net_ops); if (status < 0) { - pr_err("nf_queue: failed to register pernet ops\n"); + pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { - pr_err("nf_queue: failed to create netlink socket\n"); + pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = register_netdevice_notifier(&nfqnl_dev_notifier); if (status < 0) { - pr_err("nf_queue: failed to register netdevice notifier\n"); + pr_err("failed to register netdevice notifier\n"); goto cleanup_netlink_subsys; } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c new file mode 100644 index 000000000000..84c902477a91 --- /dev/null +++ b/net/netfilter/nft_chain_filter.c @@ -0,0 +1,398 @@ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <net/net_namespace.h> +#include <net/netfilter/nf_tables.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_bridge.h> +#include <linux/netfilter_arp.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_tables_ipv6.h> + +#ifdef CONFIG_NF_TABLES_IPV4 +static unsigned int nft_do_chain_ipv4(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + 
nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_ipv4 = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_IPV4, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, + }, +}; + +static void nft_chain_filter_ipv4_init(void) +{ + nft_register_chain_type(&nft_chain_filter_ipv4); +} +static void nft_chain_filter_ipv4_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_ipv4); +} + +#else +static inline void nft_chain_filter_ipv4_init(void) {} +static inline void nft_chain_filter_ipv4_fini(void) {} +#endif /* CONFIG_NF_TABLES_IPV4 */ + +#ifdef CONFIG_NF_TABLES_ARP +static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb); + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_arp = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_ARP, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_ARP_IN) | + (1 << NF_ARP_OUT), + .hooks = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + }, +}; + +static void nft_chain_filter_arp_init(void) +{ + nft_register_chain_type(&nft_chain_filter_arp); +} + +static void nft_chain_filter_arp_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_arp); +} +#else +static inline void nft_chain_filter_arp_init(void) {} +static inline void nft_chain_filter_arp_fini(void) {} +#endif /* CONFIG_NF_TABLES_ARP */ + +#ifdef 
CONFIG_NF_TABLES_IPV6 +static unsigned int nft_do_chain_ipv6(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_ipv6 = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_IPV6, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv6, + [NF_INET_FORWARD] = nft_do_chain_ipv6, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, + }, +}; + +static void nft_chain_filter_ipv6_init(void) +{ + nft_register_chain_type(&nft_chain_filter_ipv6); +} + +static void nft_chain_filter_ipv6_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_ipv6); +} +#else +static inline void nft_chain_filter_ipv6_init(void) {} +static inline void nft_chain_filter_ipv6_fini(void) {} +#endif /* CONFIG_NF_TABLES_IPV6 */ + +#ifdef CONFIG_NF_TABLES_INET +static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + + switch (state->pf) { + case NFPROTO_IPV4: + nft_set_pktinfo_ipv4(&pkt, skb); + break; + case NFPROTO_IPV6: + nft_set_pktinfo_ipv6(&pkt, skb); + break; + default: + break; + } + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_inet = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_INET, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_inet, + [NF_INET_LOCAL_OUT] = 
nft_do_chain_inet, + [NF_INET_FORWARD] = nft_do_chain_inet, + [NF_INET_PRE_ROUTING] = nft_do_chain_inet, + [NF_INET_POST_ROUTING] = nft_do_chain_inet, + }, +}; + +static void nft_chain_filter_inet_init(void) +{ + nft_register_chain_type(&nft_chain_filter_inet); +} + +static void nft_chain_filter_inet_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_inet); +} +#else +static inline void nft_chain_filter_inet_init(void) {} +static inline void nft_chain_filter_inet_fini(void) {} +#endif /* CONFIG_NF_TABLES_IPV6 */ + +#ifdef CONFIG_NF_TABLES_BRIDGE +static unsigned int +nft_do_chain_bridge(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + nft_set_pktinfo_ipv4_validate(&pkt, skb); + break; + case htons(ETH_P_IPV6): + nft_set_pktinfo_ipv6_validate(&pkt, skb); + break; + default: + nft_set_pktinfo_unspec(&pkt, skb); + break; + } + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_bridge = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_BRIDGE, + .hook_mask = (1 << NF_BR_PRE_ROUTING) | + (1 << NF_BR_LOCAL_IN) | + (1 << NF_BR_FORWARD) | + (1 << NF_BR_LOCAL_OUT) | + (1 << NF_BR_POST_ROUTING), + .hooks = { + [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, + [NF_BR_LOCAL_IN] = nft_do_chain_bridge, + [NF_BR_FORWARD] = nft_do_chain_bridge, + [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, + [NF_BR_POST_ROUTING] = nft_do_chain_bridge, + }, +}; + +static void nft_chain_filter_bridge_init(void) +{ + nft_register_chain_type(&nft_chain_filter_bridge); +} + +static void nft_chain_filter_bridge_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_bridge); +} +#else +static inline void nft_chain_filter_bridge_init(void) {} +static inline void nft_chain_filter_bridge_fini(void) {} +#endif /* CONFIG_NF_TABLES_BRIDGE */ + +#ifdef CONFIG_NF_TABLES_NETDEV +static 
unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + + switch (skb->protocol) { + case htons(ETH_P_IP): + nft_set_pktinfo_ipv4_validate(&pkt, skb); + break; + case htons(ETH_P_IPV6): + nft_set_pktinfo_ipv6_validate(&pkt, skb); + break; + default: + nft_set_pktinfo_unspec(&pkt, skb); + break; + } + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_netdev = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_NETDEV, + .hook_mask = (1 << NF_NETDEV_INGRESS), + .hooks = { + [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + }, +}; + +static void nft_netdev_event(unsigned long event, struct net_device *dev, + struct nft_ctx *ctx) +{ + struct nft_base_chain *basechain = nft_base_chain(ctx->chain); + + switch (event) { + case NETDEV_UNREGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + __nft_release_basechain(ctx); + break; + case NETDEV_CHANGENAME: + if (dev->ifindex != basechain->ops.dev->ifindex) + return; + + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + break; + } +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nft_table *table; + struct nft_chain *chain, *nr; + struct nft_ctx ctx = { + .net = dev_net(dev), + }; + + if (event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_for_each_entry(table, &ctx.net->nft.tables, list) { + if (table->family != NFPROTO_NETDEV) + continue; + + ctx.family = table->family; + ctx.table = table; + list_for_each_entry_safe(chain, nr, &table->chains, list) { + if (!nft_is_base_chain(chain)) + continue; + + ctx.chain = chain; + nft_netdev_event(event, dev, &ctx); + } + } + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + + return NOTIFY_DONE; +} + 
+static struct notifier_block nf_tables_netdev_notifier = { + .notifier_call = nf_tables_netdev_event, +}; + +static int nft_chain_filter_netdev_init(void) +{ + int err; + + nft_register_chain_type(&nft_chain_filter_netdev); + + err = register_netdevice_notifier(&nf_tables_netdev_notifier); + if (err) + goto err_register_netdevice_notifier; + + return 0; + +err_register_netdevice_notifier: + nft_unregister_chain_type(&nft_chain_filter_netdev); + + return err; +} + +static void nft_chain_filter_netdev_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_netdev); + unregister_netdevice_notifier(&nf_tables_netdev_notifier); +} +#else +static inline int nft_chain_filter_netdev_init(void) { return 0; } +static inline void nft_chain_filter_netdev_fini(void) {} +#endif /* CONFIG_NF_TABLES_NETDEV */ + +int __init nft_chain_filter_init(void) +{ + int err; + + err = nft_chain_filter_netdev_init(); + if (err < 0) + return err; + + nft_chain_filter_ipv4_init(); + nft_chain_filter_ipv6_init(); + nft_chain_filter_arp_init(); + nft_chain_filter_inet_init(); + nft_chain_filter_bridge_init(); + + return 0; +} + +void __exit nft_chain_filter_fini(void) +{ + nft_chain_filter_bridge_fini(); + nft_chain_filter_inet_fini(); + nft_chain_filter_arp_fini(); + nft_chain_filter_ipv6_fini(); + nft_chain_filter_ipv4_fini(); + nft_chain_filter_netdev_fini(); +} diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 6ab274b14484..ea737fd789e8 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -196,6 +196,26 @@ static void nft_ct_get_eval(const struct nft_expr *expr, case NFT_CT_PROTO_DST: nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; + case NFT_CT_SRC_IP: + if (nf_ct_l3num(ct) != NFPROTO_IPV4) + goto err; + *dest = tuple->src.u3.ip; + return; + case NFT_CT_DST_IP: + if (nf_ct_l3num(ct) != NFPROTO_IPV4) + goto err; + *dest = tuple->dst.u3.ip; + return; + case NFT_CT_SRC_IP6: + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + goto err; + memcpy(dest, 
tuple->src.u3.ip6, sizeof(struct in6_addr)); + return; + case NFT_CT_DST_IP6: + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + goto err; + memcpy(dest, tuple->dst.u3.ip6, sizeof(struct in6_addr)); + return; default: break; } @@ -419,6 +439,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, return -EAFNOSUPPORT; } break; + case NFT_CT_SRC_IP: + case NFT_CT_DST_IP: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + + len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip); + break; + case NFT_CT_SRC_IP6: + case NFT_CT_DST_IP6: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + + len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip6); + break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) @@ -588,6 +622,10 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) switch (priv->key) { case NFT_CT_SRC: case NFT_CT_DST: + case NFT_CT_SRC_IP: + case NFT_CT_DST_IP: + case NFT_CT_SRC_IP6: + case NFT_CT_DST_IP6: case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index fc83e29d6634..04863fad05dd 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -132,8 +132,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, priv->invert = true; } - set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_DYNSET_SET_NAME], - tb[NFTA_DYNSET_SET_ID], genmask); + set = nft_set_lookup_global(ctx->net, ctx->table, + tb[NFTA_DYNSET_SET_NAME], + tb[NFTA_DYNSET_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 475570e89ede..f52da5e2199f 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -71,8 +71,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, tb[NFTA_LOOKUP_SREG] == NULL) return -EINVAL; - set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET], - 
tb[NFTA_LOOKUP_SET_ID], genmask); + set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET], + tb[NFTA_LOOKUP_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 7bcdc48f3d73..0b02407773ad 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -117,8 +117,9 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, struct nft_set *set; int err; - set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_OBJREF_SET_NAME], - tb[NFTA_OBJREF_SET_ID], genmask); + set = nft_set_lookup_global(ctx->net, ctx->table, + tb[NFTA_OBJREF_SET_NAME], + tb[NFTA_OBJREF_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 3f1624ee056f..fc9c6d5d64cd 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -674,7 +674,7 @@ static const struct nft_set_ops * nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, u32 flags) { - if (desc->size) { + if (desc->size && !(flags & (NFT_SET_EVAL | NFT_SET_TIMEOUT))) { switch (desc->klen) { case 4: return &nft_hash_fast_ops; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2f685ee1f9c8..71325fef647d 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -40,6 +40,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define XT_PCPU_BLOCK_SIZE 4096 +#define XT_MAX_TABLE_SIZE (512 * 1024 * 1024) struct compat_delta { unsigned int offset; /* offset in kernel */ @@ -423,6 +424,36 @@ textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto) return buf; } +/** + * xt_check_proc_name - check that name is suitable for /proc file creation + * + * @name: file name candidate + * @size: length of buffer + * + * some x_tables modules wish to create a file in /proc. 
+ * This function makes sure that the name is suitable for this + * purpose, it checks that name is NUL terminated and isn't a 'special' + * name, like "..". + * + * returns negative number on error or 0 if name is useable. + */ +int xt_check_proc_name(const char *name, unsigned int size) +{ + if (name[0] == '\0') + return -EINVAL; + + if (strnlen(name, size) == size) + return -ENAMETOOLONG; + + if (strcmp(name, ".") == 0 || + strcmp(name, "..") == 0 || + strchr(name, '/')) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(xt_check_proc_name); + int xt_check_match(struct xt_mtchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { @@ -434,36 +465,35 @@ int xt_check_match(struct xt_mtchk_param *par, * ebt_among is exempt from centralized matchsize checking * because it uses a dynamic-size data set. */ - pr_err("%s_tables: %s.%u match: invalid size " - "%u (kernel) != (user) %u\n", - xt_prefix[par->family], par->match->name, - par->match->revision, - XT_ALIGN(par->match->matchsize), size); + pr_err_ratelimited("%s_tables: %s.%u match: invalid size %u (kernel) != (user) %u\n", + xt_prefix[par->family], par->match->name, + par->match->revision, + XT_ALIGN(par->match->matchsize), size); return -EINVAL; } if (par->match->table != NULL && strcmp(par->match->table, par->table) != 0) { - pr_err("%s_tables: %s match: only valid in %s table, not %s\n", - xt_prefix[par->family], par->match->name, - par->match->table, par->table); + pr_info_ratelimited("%s_tables: %s match: only valid in %s table, not %s\n", + xt_prefix[par->family], par->match->name, + par->match->table, par->table); return -EINVAL; } if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { char used[64], allow[64]; - pr_err("%s_tables: %s match: used from hooks %s, but only " - "valid from %s\n", - xt_prefix[par->family], par->match->name, - textify_hooks(used, sizeof(used), par->hook_mask, - par->family), - textify_hooks(allow, sizeof(allow), par->match->hooks, - par->family)); + 
pr_info_ratelimited("%s_tables: %s match: used from hooks %s, but only valid from %s\n", + xt_prefix[par->family], par->match->name, + textify_hooks(used, sizeof(used), + par->hook_mask, par->family), + textify_hooks(allow, sizeof(allow), + par->match->hooks, + par->family)); return -EINVAL; } if (par->match->proto && (par->match->proto != proto || inv_proto)) { - pr_err("%s_tables: %s match: only valid for protocol %u\n", - xt_prefix[par->family], par->match->name, - par->match->proto); + pr_info_ratelimited("%s_tables: %s match: only valid for protocol %u\n", + xt_prefix[par->family], par->match->name, + par->match->proto); return -EINVAL; } if (par->match->checkentry != NULL) { @@ -519,19 +549,104 @@ static int xt_check_entry_match(const char *match, const char *target, return 0; } +/** xt_check_table_hooks - check hook entry points are sane + * + * @info xt_table_info to check + * @valid_hooks - hook entry points that we can enter from + * + * Validates that the hook entry and underflows points are set up. + * + * Return: 0 on success, negative errno on failure. 
+ */ +int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks) +{ + const char *err = "unsorted underflow"; + unsigned int i, max_uflow, max_entry; + bool check_hooks = false; + + BUILD_BUG_ON(ARRAY_SIZE(info->hook_entry) != ARRAY_SIZE(info->underflow)); + + max_entry = 0; + max_uflow = 0; + + for (i = 0; i < ARRAY_SIZE(info->hook_entry); i++) { + if (!(valid_hooks & (1 << i))) + continue; + + if (info->hook_entry[i] == 0xFFFFFFFF) + return -EINVAL; + if (info->underflow[i] == 0xFFFFFFFF) + return -EINVAL; + + if (check_hooks) { + if (max_uflow > info->underflow[i]) + goto error; + + if (max_uflow == info->underflow[i]) { + err = "duplicate underflow"; + goto error; + } + if (max_entry > info->hook_entry[i]) { + err = "unsorted entry"; + goto error; + } + if (max_entry == info->hook_entry[i]) { + err = "duplicate entry"; + goto error; + } + } + max_entry = info->hook_entry[i]; + max_uflow = info->underflow[i]; + check_hooks = true; + } + + return 0; +error: + pr_err_ratelimited("%s at hook %d\n", err, i); + return -EINVAL; +} +EXPORT_SYMBOL(xt_check_table_hooks); + +static bool verdict_ok(int verdict) +{ + if (verdict > 0) + return true; + + if (verdict < 0) { + int v = -verdict - 1; + + if (verdict == XT_RETURN) + return true; + + switch (v) { + case NF_ACCEPT: return true; + case NF_DROP: return true; + case NF_QUEUE: return true; + default: + break; + } + + return false; + } + + return false; +} + +static bool error_tg_ok(unsigned int usersize, unsigned int kernsize, + const char *msg, unsigned int msglen) +{ + return usersize == kernsize && strnlen(msg, msglen) < msglen; +} + #ifdef CONFIG_COMPAT int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { struct xt_af *xp = &xt[af]; - if (!xp->compat_tab) { - if (!xp->number) - return -EINVAL; - xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number); - if (!xp->compat_tab) - return -ENOMEM; - xp->cur = 0; - } + WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); 
+ + if (WARN_ON(!xp->compat_tab)) + return -ENOMEM; if (xp->cur >= xp->number) return -EINVAL; @@ -547,6 +662,8 @@ EXPORT_SYMBOL_GPL(xt_compat_add_offset); void xt_compat_flush_offsets(u_int8_t af) { + WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); + if (xt[af].compat_tab) { vfree(xt[af].compat_tab); xt[af].compat_tab = NULL; @@ -574,10 +691,30 @@ int xt_compat_calc_jump(u_int8_t af, unsigned int offset) } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); -void xt_compat_init_offsets(u_int8_t af, unsigned int number) +int xt_compat_init_offsets(u8 af, unsigned int number) { + size_t mem; + + WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); + + if (!number || number > (INT_MAX / sizeof(struct compat_delta))) + return -EINVAL; + + if (WARN_ON(xt[af].compat_tab)) + return -EINVAL; + + mem = sizeof(struct compat_delta) * number; + if (mem > XT_MAX_TABLE_SIZE) + return -ENOMEM; + + xt[af].compat_tab = vmalloc(mem); + if (!xt[af].compat_tab) + return -ENOMEM; + xt[af].number = number; xt[af].cur = 0; + + return 0; } EXPORT_SYMBOL(xt_compat_init_offsets); @@ -655,6 +792,11 @@ struct compat_xt_standard_target { compat_uint_t verdict; }; +struct compat_xt_error_target { + struct compat_xt_entry_target t; + char errorname[XT_FUNCTION_MAXNAMELEN]; +}; + int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset) @@ -676,9 +818,21 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems, if (target_offset + t->u.target_size > next_offset) return -EINVAL; - if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 && - COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset) - return -EINVAL; + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) { + const struct compat_xt_standard_target *st = (const void *)t; + + if (COMPAT_XT_ALIGN(target_offset + sizeof(*st)) != next_offset) + return -EINVAL; + + if (!verdict_ok(st->verdict)) + return -EINVAL; + } else if 
(strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) { + const struct compat_xt_error_target *et = (const void *)t; + + if (!error_tg_ok(t->u.target_size, sizeof(*et), + et->errorname, sizeof(et->errorname))) + return -EINVAL; + } /* compat_xt_entry match has less strict alignment requirements, * otherwise they are identical. In case of padding differences @@ -758,9 +912,21 @@ int xt_check_entry_offsets(const void *base, if (target_offset + t->u.target_size > next_offset) return -EINVAL; - if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 && - XT_ALIGN(target_offset + sizeof(struct xt_standard_target)) != next_offset) - return -EINVAL; + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) { + const struct xt_standard_target *st = (const void *)t; + + if (XT_ALIGN(target_offset + sizeof(*st)) != next_offset) + return -EINVAL; + + if (!verdict_ok(st->verdict)) + return -EINVAL; + } else if (strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) { + const struct xt_error_target *et = (const void *)t; + + if (!error_tg_ok(t->u.target_size, sizeof(*et), + et->errorname, sizeof(et->errorname))) + return -EINVAL; + } return xt_check_entry_match(elems, base + target_offset, __alignof__(struct xt_entry_match)); @@ -776,6 +942,9 @@ EXPORT_SYMBOL(xt_check_entry_offsets); */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { + if (size > XT_MAX_TABLE_SIZE / sizeof(unsigned int)) + return NULL; + return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO); } @@ -814,36 +983,35 @@ int xt_check_target(struct xt_tgchk_param *par, int ret; if (XT_ALIGN(par->target->targetsize) != size) { - pr_err("%s_tables: %s.%u target: invalid size " - "%u (kernel) != (user) %u\n", - xt_prefix[par->family], par->target->name, - par->target->revision, - XT_ALIGN(par->target->targetsize), size); + pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n", + xt_prefix[par->family], par->target->name, + par->target->revision, + XT_ALIGN(par->target->targetsize), 
size); return -EINVAL; } if (par->target->table != NULL && strcmp(par->target->table, par->table) != 0) { - pr_err("%s_tables: %s target: only valid in %s table, not %s\n", - xt_prefix[par->family], par->target->name, - par->target->table, par->table); + pr_info_ratelimited("%s_tables: %s target: only valid in %s table, not %s\n", + xt_prefix[par->family], par->target->name, + par->target->table, par->table); return -EINVAL; } if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { char used[64], allow[64]; - pr_err("%s_tables: %s target: used from hooks %s, but only " - "usable from %s\n", - xt_prefix[par->family], par->target->name, - textify_hooks(used, sizeof(used), par->hook_mask, - par->family), - textify_hooks(allow, sizeof(allow), par->target->hooks, - par->family)); + pr_info_ratelimited("%s_tables: %s target: used from hooks %s, but only usable from %s\n", + xt_prefix[par->family], par->target->name, + textify_hooks(used, sizeof(used), + par->hook_mask, par->family), + textify_hooks(allow, sizeof(allow), + par->target->hooks, + par->family)); return -EINVAL; } if (par->target->proto && (par->target->proto != proto || inv_proto)) { - pr_err("%s_tables: %s target: only valid for protocol %u\n", - xt_prefix[par->family], par->target->name, - par->target->proto); + pr_info_ratelimited("%s_tables: %s target: only valid for protocol %u\n", + xt_prefix[par->family], par->target->name, + par->target->proto); return -EINVAL; } if (par->target->checkentry != NULL) { @@ -1001,11 +1169,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) struct xt_table_info *info = NULL; size_t sz = sizeof(*info) + size; - if (sz < sizeof(*info)) - return NULL; - - /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((size >> PAGE_SHIFT) + 2 > totalram_pages) + if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE) return NULL; /* __GFP_NORETRY is not fully supported by kvmalloc but it should @@ -1174,6 +1338,21 @@ static int 
xt_jumpstack_alloc(struct xt_table_info *i) return 0; } +struct xt_counters *xt_counters_alloc(unsigned int counters) +{ + struct xt_counters *mem; + + if (counters == 0 || counters > INT_MAX / sizeof(*mem)) + return NULL; + + counters *= sizeof(*mem); + if (counters > XT_MAX_TABLE_SIZE) + return NULL; + + return vzalloc(counters); +} +EXPORT_SYMBOL(xt_counters_alloc); + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, @@ -1705,7 +1884,9 @@ EXPORT_SYMBOL_GPL(xt_proto_fini); * to fetch the real percpu counter. * * To speed up allocation and improve data locality, a 4kb block is - * allocated. + * allocated. Freeing any counter may free an entire block, so all + * counters allocated using the same state must be freed at the same + * time. * * xt_percpu_counter_alloc_state contains the base address of the * allocated page and the current sub-offset. diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index c502419d6306..f368ee6741db 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -120,8 +120,8 @@ static int audit_tg_check(const struct xt_tgchk_param *par) const struct xt_audit_info *info = par->targinfo; if (info->type > XT_AUDIT_TYPE_MAX) { - pr_info("Audit type out of range (valid range: 0..%hhu)\n", - XT_AUDIT_TYPE_MAX); + pr_info_ratelimited("Audit type out of range (valid range: 0..%hhu)\n", + XT_AUDIT_TYPE_MAX); return -ERANGE; } diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c index 0f642ef8cd26..9f4151ec3e06 100644 --- a/net/netfilter/xt_CHECKSUM.c +++ b/net/netfilter/xt_CHECKSUM.c @@ -36,13 +36,13 @@ static int checksum_tg_check(const struct xt_tgchk_param *par) const struct xt_CHECKSUM_info *einfo = par->targinfo; if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { - pr_info("unsupported CHECKSUM operation %x\n", einfo->operation); + pr_info_ratelimited("unsupported CHECKSUM operation %x\n", + einfo->operation); return -EINVAL; } - if (!einfo->operation) { - 
pr_info("no CHECKSUM operation enabled\n"); + if (!einfo->operation) return -EINVAL; - } + return 0; } diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index da56c06a443c..f3f1caac949b 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -91,8 +91,8 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par) if (strcmp(par->table, "mangle") != 0 && strcmp(par->table, "security") != 0) { - pr_info("target only valid in the \'mangle\' " - "or \'security\' tables, not \'%s\'.\n", par->table); + pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n", + par->table); return -EINVAL; } @@ -102,14 +102,14 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par) break; default: - pr_info("invalid mode: %hu\n", info->mode); + pr_info_ratelimited("invalid mode: %hu\n", info->mode); return -EINVAL; } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) - pr_info("cannot load conntrack support for proto=%u\n", - par->family); + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); return ret; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 5a152e2acfd5..8790190c6feb 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -82,15 +82,14 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, proto = xt_ct_find_proto(par); if (!proto) { - pr_info("You must specify a L4 protocol, and not use " - "inversions on it.\n"); + pr_info_ratelimited("You must specify a L4 protocol and not use inversions on it\n"); return -ENOENT; } helper = nf_conntrack_helper_try_module_get(helper_name, par->family, proto); if (helper == NULL) { - pr_info("No such helper \"%s\"\n", helper_name); + pr_info_ratelimited("No such helper \"%s\"\n", helper_name); return -ENOENT; } @@ -124,6 +123,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, const struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout 
*timeout; struct nf_conn_timeout *timeout_ext; + const char *errmsg = NULL; int ret = 0; u8 proto; @@ -131,29 +131,29 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook); if (timeout_find_get == NULL) { ret = -ENOENT; - pr_info("Timeout policy base is empty\n"); + errmsg = "Timeout policy base is empty"; goto out; } proto = xt_ct_find_proto(par); if (!proto) { ret = -EINVAL; - pr_info("You must specify a L4 protocol, and not use " - "inversions on it.\n"); + errmsg = "You must specify a L4 protocol and not use inversions on it"; goto out; } timeout = timeout_find_get(par->net, timeout_name); if (timeout == NULL) { ret = -ENOENT; - pr_info("No such timeout policy \"%s\"\n", timeout_name); + pr_info_ratelimited("No such timeout policy \"%s\"\n", + timeout_name); goto out; } if (timeout->l3num != par->family) { ret = -EINVAL; - pr_info("Timeout policy `%s' can only be used by L3 protocol " - "number %d\n", timeout_name, timeout->l3num); + pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n", + timeout_name, 3, timeout->l3num); goto err_put_timeout; } /* Make sure the timeout policy matches any existing protocol tracker, @@ -162,9 +162,8 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, l4proto = __nf_ct_l4proto_find(par->family, proto); if (timeout->l4proto->l4proto != l4proto->l4proto) { ret = -EINVAL; - pr_info("Timeout policy `%s' can only be used by L4 protocol " - "number %d\n", - timeout_name, timeout->l4proto->l4proto); + pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n", + timeout_name, 4, timeout->l4proto->l4proto); goto err_put_timeout; } timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); @@ -180,6 +179,8 @@ err_put_timeout: __xt_ct_tg_timeout_put(timeout); out: rcu_read_unlock(); + if (errmsg) + pr_info_ratelimited("%s\n", errmsg); return ret; #else return 
-EOPNOTSUPP; diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 3f83d38c4e5b..098ed851b7a7 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -66,10 +66,8 @@ static int dscp_tg_check(const struct xt_tgchk_param *par) { const struct xt_DSCP_info *info = par->targinfo; - if (info->dscp > XT_DSCP_MAX) { - pr_info("dscp %x out of range\n", info->dscp); + if (info->dscp > XT_DSCP_MAX) return -EDOM; - } return 0; } diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c index 1535e87ed9bd..4653b071bed4 100644 --- a/net/netfilter/xt_HL.c +++ b/net/netfilter/xt_HL.c @@ -105,10 +105,8 @@ static int ttl_tg_check(const struct xt_tgchk_param *par) { const struct ipt_TTL_info *info = par->targinfo; - if (info->mode > IPT_TTL_MAXMODE) { - pr_info("TTL: invalid or unknown mode %u\n", info->mode); + if (info->mode > IPT_TTL_MAXMODE) return -EINVAL; - } if (info->mode != IPT_TTL_SET && info->ttl == 0) return -EINVAL; return 0; @@ -118,15 +116,10 @@ static int hl_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_HL_info *info = par->targinfo; - if (info->mode > IP6T_HL_MAXMODE) { - pr_info("invalid or unknown mode %u\n", info->mode); + if (info->mode > IP6T_HL_MAXMODE) return -EINVAL; - } - if (info->mode != IP6T_HL_SET && info->hop_limit == 0) { - pr_info("increment/decrement does not " - "make sense with value 0\n"); + if (info->mode != IP6T_HL_SET && info->hop_limit == 0) return -EINVAL; - } return 0; } diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index 60e6dbe12460..9c75f419cd80 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -9,6 +9,8 @@ * the Free Software Foundation. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> @@ -312,29 +314,30 @@ hmark_tg_v4(struct sk_buff *skb, const struct xt_action_param *par) static int hmark_tg_check(const struct xt_tgchk_param *par) { const struct xt_hmark_info *info = par->targinfo; + const char *errmsg = "proto mask must be zero with L3 mode"; - if (!info->hmodulus) { - pr_info("xt_HMARK: hash modulus can't be zero\n"); + if (!info->hmodulus) return -EINVAL; - } + if (info->proto_mask && - (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))) { - pr_info("xt_HMARK: proto mask must be zero with L3 mode\n"); - return -EINVAL; - } + (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))) + goto err; + if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI_MASK) && (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT_MASK) | - XT_HMARK_FLAG(XT_HMARK_DPORT_MASK)))) { - pr_info("xt_HMARK: spi-mask and port-mask can't be combined\n"); + XT_HMARK_FLAG(XT_HMARK_DPORT_MASK)))) return -EINVAL; - } + if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI) && (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT) | XT_HMARK_FLAG(XT_HMARK_DPORT)))) { - pr_info("xt_HMARK: spi-set and port-set can't be combined\n"); - return -EINVAL; + errmsg = "spi-set and port-set can't be combined"; + goto err; } return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; } static struct xt_target hmark_tg_reg[] __read_mostly = { diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 6c2482b709b1..5ee859193783 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -132,7 +132,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) ret = -ENOMEM; goto out_free_timer; } - info->timer->attr.attr.mode = S_IRUGO; + info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); @@ -146,11 +146,11 @@ static int idletimer_tg_create(struct 
idletimer_tg_info *info) timer_setup(&info->timer->timer, idletimer_tg_expired, 0); info->timer->refcnt = 1; + INIT_WORK(&info->timer->work, idletimer_tg_work); + mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); - INIT_WORK(&info->timer->work, idletimer_tg_work); - return 0; out_free_attr: @@ -191,7 +191,10 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) pr_debug("timeout value is zero\n"); return -EINVAL; } - + if (info->timeout >= INT_MAX / 1000) { + pr_debug("timeout value is too big\n"); + return -EINVAL; + } if (info->label[0] == '\0' || strnlen(info->label, MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 1dcad893df78..19846445504d 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -111,10 +111,8 @@ static int led_tg_check(const struct xt_tgchk_param *par) struct xt_led_info_internal *ledinternal; int err; - if (ledinfo->id[0] == '\0') { - pr_info("No 'id' parameter given.\n"); + if (ledinfo->id[0] == '\0') return -EINVAL; - } mutex_lock(&xt_led_mutex); @@ -138,13 +136,14 @@ static int led_tg_check(const struct xt_tgchk_param *par) err = led_trigger_register(&ledinternal->netfilter_led_trigger); if (err) { - pr_err("Trigger name is already in use.\n"); + pr_info_ratelimited("Trigger name is already in use.\n"); goto exit_alloc; } - /* See if we need to set up a timer */ - if (ledinfo->delay > 0) - timer_setup(&ledinternal->timer, led_timeout_callback, 0); + /* Since the letinternal timer can be shared between multiple targets, + * always set it up, even if the current target does not need it + */ + timer_setup(&ledinternal->timer, led_timeout_callback, 0); list_add_tail(&ledinternal->list, &xt_led_triggers); @@ -181,8 +180,7 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par) list_del(&ledinternal->list); - if (ledinfo->delay > 0) - del_timer_sync(&ledinternal->timer); + 
del_timer_sync(&ledinternal->timer); led_trigger_unregister(&ledinternal->netfilter_led_trigger); diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index a360b99a958a..a9aca80a32ae 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -8,6 +8,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/skbuff.h> @@ -67,13 +69,13 @@ static int nfqueue_tg_check(const struct xt_tgchk_param *par) init_hashrandom(&jhash_initval); if (info->queues_total == 0) { - pr_err("NFQUEUE: number of total queues is 0\n"); + pr_info_ratelimited("number of total queues is 0\n"); return -EINVAL; } maxid = info->queues_total - 1 + info->queuenum; if (maxid > 0xffff) { - pr_err("NFQUEUE: number of queues (%u) out of range (got %u)\n", - info->queues_total, maxid); + pr_info_ratelimited("number of queues (%u) out of range (got %u)\n", + info->queues_total, maxid); return -ERANGE; } if (par->target->revision == 2 && info->flags > 1) diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 141c295191f6..dec843cadf46 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -14,15 +14,21 @@ #include <linux/slab.h> #include <net/gen_stats.h> #include <net/netlink.h> +#include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_RATEEST.h> #include <net/netfilter/xt_rateest.h> -static DEFINE_MUTEX(xt_rateest_mutex); - #define RATEEST_HSIZE 16 -static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly; + +struct xt_rateest_net { + struct mutex hash_lock; + struct hlist_head hash[RATEEST_HSIZE]; +}; + +static unsigned int xt_rateest_id; + static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) @@ -31,21 +37,23 @@ static unsigned int xt_rateest_hash(const char *name) (RATEEST_HSIZE - 1); } -static void xt_rateest_hash_insert(struct xt_rateest *est) +static void xt_rateest_hash_insert(struct 
xt_rateest_net *xn, + struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); - hlist_add_head(&est->list, &rateest_hash[h]); + hlist_add_head(&est->list, &xn->hash[h]); } -static struct xt_rateest *__xt_rateest_lookup(const char *name) +static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn, + const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); - hlist_for_each_entry(est, &rateest_hash[h], list) { + hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; @@ -55,20 +63,23 @@ static struct xt_rateest *__xt_rateest_lookup(const char *name) return NULL; } -struct xt_rateest *xt_rateest_lookup(const char *name) +struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { + struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; - mutex_lock(&xt_rateest_mutex); - est = __xt_rateest_lookup(name); - mutex_unlock(&xt_rateest_mutex); + mutex_lock(&xn->hash_lock); + est = __xt_rateest_lookup(xn, name); + mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); -void xt_rateest_put(struct xt_rateest *est) +void xt_rateest_put(struct net *net, struct xt_rateest *est) { - mutex_lock(&xt_rateest_mutex); + struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); + + mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); @@ -78,7 +89,7 @@ void xt_rateest_put(struct xt_rateest *est) */ kfree_rcu(est, rcu); } - mutex_unlock(&xt_rateest_mutex); + mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); @@ -98,6 +109,7 @@ xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { + struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { @@ -108,10 
+120,10 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); - mutex_lock(&xt_rateest_mutex); - est = __xt_rateest_lookup(info->name); + mutex_lock(&xn->hash_lock); + est = __xt_rateest_lookup(xn, info->name); if (est) { - mutex_unlock(&xt_rateest_mutex); + mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. @@ -119,7 +131,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { - xt_rateest_put(est); + xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; @@ -148,14 +160,14 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) goto err2; info->est = est; - xt_rateest_hash_insert(est); - mutex_unlock(&xt_rateest_mutex); + xt_rateest_hash_insert(xn, est); + mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: - mutex_unlock(&xt_rateest_mutex); + mutex_unlock(&xn->hash_lock); return ret; } @@ -163,7 +175,7 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; - xt_rateest_put(info->est); + xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg __read_mostly = { @@ -178,19 +190,46 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = { .me = THIS_MODULE, }; -static int __init xt_rateest_tg_init(void) +static __net_init int xt_rateest_net_init(struct net *net) +{ + struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); + int i; + + mutex_init(&xn->hash_lock); + for (i = 0; i < ARRAY_SIZE(xn->hash); i++) + INIT_HLIST_HEAD(&xn->hash[i]); + return 0; +} + +static void __net_exit xt_rateest_net_exit(struct net *net) { - unsigned int i; + struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); + int i; + + for (i = 0; i < 
ARRAY_SIZE(xn->hash); i++) + WARN_ON_ONCE(!hlist_empty(&xn->hash[i])); +} - for (i = 0; i < ARRAY_SIZE(rateest_hash); i++) - INIT_HLIST_HEAD(&rateest_hash[i]); +static struct pernet_operations xt_rateest_net_ops = { + .init = xt_rateest_net_init, + .exit = xt_rateest_net_exit, + .id = &xt_rateest_id, + .size = sizeof(struct xt_rateest_net), +}; + +static int __init xt_rateest_tg_init(void) +{ + int err = register_pernet_subsys(&xt_rateest_net_ops); + if (err) + return err; return xt_register_target(&xt_rateest_tg_reg); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_target(&xt_rateest_tg_reg); + unregister_pernet_subsys(&xt_rateest_net_ops); } diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 9faf5e050b79..4ad5fe27e08b 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -60,18 +60,20 @@ static int checkentry_lsm(struct xt_secmark_target_info *info) &info->secid); if (err) { if (err == -EINVAL) - pr_info("invalid security context \'%s\'\n", info->secctx); + pr_info_ratelimited("invalid security context \'%s\'\n", + info->secctx); return err; } if (!info->secid) { - pr_info("unable to map security context \'%s\'\n", info->secctx); + pr_info_ratelimited("unable to map security context \'%s\'\n", + info->secctx); return -ENOENT; } err = security_secmark_relabel_packet(info->secid); if (err) { - pr_info("unable to obtain relabeling permission\n"); + pr_info_ratelimited("unable to obtain relabeling permission\n"); return err; } @@ -86,14 +88,14 @@ static int secmark_tg_check(const struct xt_tgchk_param *par) if (strcmp(par->table, "mangle") != 0 && strcmp(par->table, "security") != 0) { - pr_info("target only valid in the \'mangle\' " - "or \'security\' tables, not \'%s\'.\n", par->table); + pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n", + par->table); return -EINVAL; } if (mode && mode != info->mode) { - pr_info("mode already set to %hu cannot mix with " - "rules for mode 
%hu\n", mode, info->mode); + pr_info_ratelimited("mode already set to %hu cannot mix with rules for mode %hu\n", + mode, info->mode); return -EINVAL; } @@ -101,7 +103,7 @@ static int secmark_tg_check(const struct xt_tgchk_param *par) case SECMARK_MODE_SEL: break; default: - pr_info("invalid mode: %hu\n", info->mode); + pr_info_ratelimited("invalid mode: %hu\n", info->mode); return -EINVAL; } diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 99bb8e410f22..98efb202f8b4 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -273,8 +273,7 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { - pr_info("path-MTU clamping only supported in " - "FORWARD, OUTPUT and POSTROUTING hooks\n"); + pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) @@ -283,7 +282,7 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; - pr_info("Only works on TCP SYN packets\n"); + pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } @@ -298,8 +297,7 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par) (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { - pr_info("path-MTU clamping only supported in " - "FORWARD, OUTPUT and POSTROUTING hooks\n"); + pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) @@ -308,7 +306,7 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par) xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; - pr_info("Only works on TCP SYN packets\n"); + pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #endif diff --git 
a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 86b0580b2216..475957cfcf50 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -20,7 +20,7 @@ #include <linux/netfilter/xt_TEE.h> struct xt_tee_priv { - struct notifier_block notifier; + struct list_head list; struct xt_tee_tginfo *tginfo; int oif; }; @@ -51,29 +51,35 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) } #endif +static DEFINE_MUTEX(priv_list_mutex); +static LIST_HEAD(priv_list); + static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct xt_tee_priv *priv; - priv = container_of(this, struct xt_tee_priv, notifier); - switch (event) { - case NETDEV_REGISTER: - if (!strcmp(dev->name, priv->tginfo->oif)) - priv->oif = dev->ifindex; - break; - case NETDEV_UNREGISTER: - if (dev->ifindex == priv->oif) - priv->oif = -1; - break; - case NETDEV_CHANGENAME: - if (!strcmp(dev->name, priv->tginfo->oif)) - priv->oif = dev->ifindex; - else if (dev->ifindex == priv->oif) - priv->oif = -1; - break; + mutex_lock(&priv_list_mutex); + list_for_each_entry(priv, &priv_list, list) { + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + else if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + } } + mutex_unlock(&priv_list_mutex); return NOTIFY_DONE; } @@ -89,8 +95,6 @@ static int tee_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->oif[0]) { - int ret; - if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; @@ -100,14 +104,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par) priv->tginfo = info; priv->oif = -1; - priv->notifier.notifier_call = tee_netdev_event; info->priv = priv; - ret = 
register_netdevice_notifier(&priv->notifier); - if (ret) { - kfree(priv); - return ret; - } + mutex_lock(&priv_list_mutex); + list_add(&priv->list, &priv_list); + mutex_unlock(&priv_list_mutex); } else info->priv = NULL; @@ -120,7 +121,9 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par) struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { - unregister_netdevice_notifier(&info->priv->notifier); + mutex_lock(&priv_list_mutex); + list_del(&info->priv->list); + mutex_unlock(&priv_list_mutex); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); @@ -153,13 +156,29 @@ static struct xt_target tee_tg_reg[] __read_mostly = { #endif }; +static struct notifier_block tee_netdev_notifier = { + .notifier_call = tee_netdev_event, +}; + static int __init tee_tg_init(void) { - return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + int ret; + + ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + if (ret) + return ret; + ret = register_netdevice_notifier(&tee_netdev_notifier); + if (ret) { + xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + return ret; + } + + return 0; } static void __exit tee_tg_exit(void) { + unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); } diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 17d7705e3bd4..8c89323c06af 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -540,8 +540,7 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par) !(i->invflags & IP6T_INV_PROTO)) return 0; - pr_info("Can be used only in combination with " - "either -p tcp or -p udp\n"); + pr_info_ratelimited("Can be used only with -p tcp or -p udp\n"); return -EINVAL; } #endif @@ -559,8 +558,7 @@ static int tproxy_tg4_check(const struct xt_tgchk_param *par) && !(i->invflags & IPT_INV_PROTO)) return 0; - pr_info("Can be used only in combination with " - "either -p tcp or -p udp\n"); + pr_info_ratelimited("Can be 
used only with -p tcp or -p udp\n"); return -EINVAL; } diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index 911a7c0da504..89e281b3bfc2 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -164,48 +164,47 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) { + const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; struct xt_addrtype_info_v1 *info = par->matchinfo; if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && - info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { - pr_info("both incoming and outgoing " - "interface limitation cannot be selected\n"); - return -EINVAL; - } + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) + goto err; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { - pr_info("output interface limitation " - "not valid in PREROUTING and INPUT\n"); - return -EINVAL; + errmsg = "output interface limitation not valid in PREROUTING and INPUT"; + goto err; } if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) { - pr_info("input interface limitation " - "not valid in POSTROUTING and OUTPUT\n"); - return -EINVAL; + errmsg = "input interface limitation not valid in POSTROUTING and OUTPUT"; + goto err; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) if (par->family == NFPROTO_IPV6) { if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { - pr_err("ipv6 BLACKHOLE matching not supported\n"); - return -EINVAL; + errmsg = "ipv6 BLACKHOLE matching not supported"; + goto err; } if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { - pr_err("ipv6 PROHIBIT (THROW, NAT ..) matching not supported\n"); - return -EINVAL; + errmsg = "ipv6 PROHIBIT (THROW, NAT ..) 
matching not supported"; + goto err; } if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { - pr_err("ipv6 does not support BROADCAST matching\n"); - return -EINVAL; + errmsg = "ipv6 does not support BROADCAST matching"; + goto err; } } #endif return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; } static struct xt_match addrtype_mt_reg[] __read_mostly = { diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 06b090d8e901..a2cf8a6236d6 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -7,6 +7,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/syscalls.h> #include <linux/skbuff.h> @@ -34,7 +36,7 @@ static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len, program.filter = insns; if (bpf_prog_create(ret, &program)) { - pr_info("bpf: check failed: parse error\n"); + pr_info_ratelimited("check failed: parse error\n"); return -EINVAL; } diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 891f4e7e8ea7..7df2dece57d3 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -12,6 +12,8 @@ * published by the Free Software Foundation. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/skbuff.h> #include <linux/module.h> #include <linux/netfilter/x_tables.h> @@ -48,7 +50,7 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) } if (info->has_path && info->has_classid) { - pr_info("xt_cgroup: both path and classid specified\n"); + pr_info_ratelimited("path and classid specified\n"); return -EINVAL; } @@ -56,8 +58,8 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) if (info->has_path) { cgrp = cgroup_get_from_path(info->path); if (IS_ERR(cgrp)) { - pr_info("xt_cgroup: invalid path, errno=%ld\n", - PTR_ERR(cgrp)); + pr_info_ratelimited("invalid path, errno=%ld\n", + PTR_ERR(cgrp)); return -EINVAL; } info->priv = cgrp; diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index 57ef175dfbfa..dfbdbb2fc0ed 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -60,13 +60,6 @@ xt_cluster_hash(const struct nf_conn *ct, } static inline bool -xt_cluster_ipv6_is_multicast(const struct in6_addr *addr) -{ - __be32 st = addr->s6_addr32[0]; - return ((st & htonl(0xFF000000)) == htonl(0xFF000000)); -} - -static inline bool xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) { bool is_multicast = false; @@ -76,8 +69,7 @@ xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr); break; case NFPROTO_IPV6: - is_multicast = - xt_cluster_ipv6_is_multicast(&ipv6_hdr(skb)->daddr); + is_multicast = ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr); break; default: WARN_ON(1); @@ -135,14 +127,12 @@ static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) struct xt_cluster_match_info *info = par->matchinfo; if (info->total_nodes > XT_CLUSTER_NODES_MAX) { - pr_info("you have exceeded the maximum " - "number of cluster nodes (%u > %u)\n", - info->total_nodes, XT_CLUSTER_NODES_MAX); + pr_info_ratelimited("you have exceeded the maximum number of 
cluster nodes (%u > %u)\n", + info->total_nodes, XT_CLUSTER_NODES_MAX); return -EINVAL; } if (info->node_mask >= (1ULL << info->total_nodes)) { - pr_info("this node mask cannot be " - "higher than the total number of nodes\n"); + pr_info_ratelimited("node mask cannot exceed total number of nodes\n"); return -EDOM; } return 0; diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index cad0b7b5eb35..93cb018c3055 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -112,8 +112,8 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par) ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) - pr_info("cannot load conntrack support for proto=%u\n", - par->family); + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); /* * This filter cannot function correctly unless connection tracking diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index 23372879e6e3..4fa4efd24353 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -57,14 +57,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) int ret; if (info->options & ~options) { - pr_err("Unknown options in mask %x\n", info->options); + pr_info_ratelimited("Unknown options in mask %x\n", + info->options); return -EINVAL; } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { - pr_info("cannot load conntrack support for proto=%u\n", - par->family); + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); return ret; } diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index b1b17b9353e1..6275106ccf50 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -67,8 +67,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) key[1] = zone->id; } - connections = nf_conncount_count(net, info->data, key, - xt_family(par), tuple_ptr, zone); + connections = nf_conncount_count(net, 
info->data, key, tuple_ptr, + zone); if (connections == 0) /* kmalloc failed, drop it entirely */ goto hotdrop; diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index ec377cc6a369..773da82190dc 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -36,9 +36,10 @@ MODULE_ALIAS("ipt_connmark"); MODULE_ALIAS("ip6t_connmark"); static unsigned int -connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +connmark_tg_shift(struct sk_buff *skb, + const struct xt_connmark_tginfo1 *info, + u8 shift_bits, u8 shift_dir) { - const struct xt_connmark_tginfo1 *info = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; u_int32_t newmark; @@ -50,6 +51,10 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (info->mode) { case XT_CONNMARK_SET: newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + if (shift_dir == D_SHIFT_RIGHT) + newmark >>= shift_bits; + else + newmark <<= shift_bits; if (ct->mark != newmark) { ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, ct); @@ -57,7 +62,11 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) break; case XT_CONNMARK_SAVE: newmark = (ct->mark & ~info->ctmask) ^ - (skb->mark & info->nfmask); + (skb->mark & info->nfmask); + if (shift_dir == D_SHIFT_RIGHT) + newmark >>= shift_bits; + else + newmark <<= shift_bits; if (ct->mark != newmark) { ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, ct); @@ -65,22 +74,42 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) break; case XT_CONNMARK_RESTORE: newmark = (skb->mark & ~info->nfmask) ^ - (ct->mark & info->ctmask); + (ct->mark & info->ctmask); + if (shift_dir == D_SHIFT_RIGHT) + newmark >>= shift_bits; + else + newmark <<= shift_bits; skb->mark = newmark; break; } - return XT_CONTINUE; } +static unsigned int +connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connmark_tginfo1 *info = par->targinfo; + + return 
connmark_tg_shift(skb, info, 0, 0); +} + +static unsigned int +connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connmark_tginfo2 *info = par->targinfo; + + return connmark_tg_shift(skb, (const struct xt_connmark_tginfo1 *)info, + info->shift_bits, info->shift_dir); +} + static int connmark_tg_check(const struct xt_tgchk_param *par) { int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) - pr_info("cannot load conntrack support for proto=%u\n", - par->family); + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); return ret; } @@ -109,8 +138,8 @@ static int connmark_mt_check(const struct xt_mtchk_param *par) ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) - pr_info("cannot load conntrack support for proto=%u\n", - par->family); + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); return ret; } @@ -119,15 +148,27 @@ static void connmark_mt_destroy(const struct xt_mtdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_target connmark_tg_reg __read_mostly = { - .name = "CONNMARK", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = connmark_tg_check, - .target = connmark_tg, - .targetsize = sizeof(struct xt_connmark_tginfo1), - .destroy = connmark_tg_destroy, - .me = THIS_MODULE, +static struct xt_target connmark_tg_reg[] __read_mostly = { + { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "CONNMARK", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg_v2, + .targetsize = sizeof(struct xt_connmark_tginfo2), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + } }; static struct xt_match connmark_mt_reg __read_mostly = { @@ -145,12 +186,14 @@ static 
int __init connmark_mt_init(void) { int ret; - ret = xt_register_target(&connmark_tg_reg); + ret = xt_register_targets(connmark_tg_reg, + ARRAY_SIZE(connmark_tg_reg)); if (ret < 0) return ret; ret = xt_register_match(&connmark_mt_reg); if (ret < 0) { - xt_unregister_target(&connmark_tg_reg); + xt_unregister_targets(connmark_tg_reg, + ARRAY_SIZE(connmark_tg_reg)); return ret; } return 0; @@ -159,7 +202,7 @@ static int __init connmark_mt_init(void) static void __exit connmark_mt_exit(void) { xt_unregister_match(&connmark_mt_reg); - xt_unregister_target(&connmark_tg_reg); + xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); } module_init(connmark_mt_init); diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 39cf1d019240..df80fe7d391c 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -272,8 +272,8 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par) ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) - pr_info("cannot load conntrack support for proto=%u\n", - par->family); + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); return ret; } diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index 236ac8008909..a4c2b862f820 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -46,10 +46,8 @@ static int dscp_mt_check(const struct xt_mtchk_param *par) { const struct xt_dscp_info *info = par->matchinfo; - if (info->dscp > XT_DSCP_MAX) { - pr_info("dscp %x out of range\n", info->dscp); + if (info->dscp > XT_DSCP_MAX) return -EDOM; - } return 0; } diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c index 3c831a8efebc..c7ad4afa5fb8 100644 --- a/net/netfilter/xt_ecn.c +++ b/net/netfilter/xt_ecn.c @@ -97,7 +97,7 @@ static int ecn_mt_check4(const struct xt_mtchk_param *par) if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) { - pr_info("cannot match TCP bits in rule for 
non-tcp packets\n"); + pr_info_ratelimited("cannot match TCP bits for non-tcp packets\n"); return -EINVAL; } @@ -139,7 +139,7 @@ static int ecn_mt_check6(const struct xt_mtchk_param *par) if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && (ip->proto != IPPROTO_TCP || ip->invflags & IP6T_INV_PROTO)) { - pr_info("cannot match TCP bits in rule for non-tcp packets\n"); + pr_info_ratelimited("cannot match TCP bits for non-tcp packets\n"); return -EINVAL; } diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index ca6847403ca2..0cd73567e7ff 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -523,7 +523,8 @@ static u64 user2rate(u64 user) if (user != 0) { return div64_u64(XT_HASHLIMIT_SCALE_v2, user); } else { - pr_warn("invalid rate from userspace: %llu\n", user); + pr_info_ratelimited("invalid rate from userspace: %llu\n", + user); return 0; } } @@ -533,8 +534,7 @@ static u64 user2rate_bytes(u32 user) u64 r; r = user ? U32_MAX / user : U32_MAX; - r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT; - return r; + return (r - 1) << XT_HASHLIMIT_BYTE_SHIFT; } static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, @@ -774,7 +774,7 @@ hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, if (!dh->rateinfo.prev_window && (dh->rateinfo.current_rate <= dh->rateinfo.burst)) { spin_unlock(&dh->lock); - rcu_read_unlock_bh(); + local_bh_enable(); return !(cfg->mode & XT_HASHLIMIT_INVERT); } else { goto overlimit; @@ -865,33 +865,34 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, } if (cfg->mode & ~XT_HASHLIMIT_ALL) { - pr_info("Unknown mode mask %X, kernel too old?\n", - cfg->mode); + pr_info_ratelimited("Unknown mode mask %X, kernel too old?\n", + cfg->mode); return -EINVAL; } /* Check for overflow. 
*/ if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) { if (cfg->avg == 0 || cfg->avg > U32_MAX) { - pr_info("hashlimit invalid rate\n"); + pr_info_ratelimited("invalid rate\n"); return -ERANGE; } if (cfg->interval == 0) { - pr_info("hashlimit invalid interval\n"); + pr_info_ratelimited("invalid interval\n"); return -EINVAL; } } else if (cfg->mode & XT_HASHLIMIT_BYTES) { if (user2credits_byte(cfg->avg) == 0) { - pr_info("overflow, rate too high: %llu\n", cfg->avg); + pr_info_ratelimited("overflow, rate too high: %llu\n", + cfg->avg); return -EINVAL; } } else if (cfg->burst == 0 || - user2credits(cfg->avg * cfg->burst, revision) < - user2credits(cfg->avg, revision)) { - pr_info("overflow, try lower: %llu/%llu\n", - cfg->avg, cfg->burst); - return -ERANGE; + user2credits(cfg->avg * cfg->burst, revision) < + user2credits(cfg->avg, revision)) { + pr_info_ratelimited("overflow, try lower: %llu/%llu\n", + cfg->avg, cfg->burst); + return -ERANGE; } mutex_lock(&hashlimit_mutex); @@ -915,8 +916,9 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) struct hashlimit_cfg3 cfg = {}; int ret; - if (info->name[sizeof(info->name) - 1] != '\0') - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); @@ -933,8 +935,9 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) struct hashlimit_cfg3 cfg = {}; int ret; - if (info->name[sizeof(info->name) - 1] != '\0') - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); @@ -948,9 +951,11 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) static int hashlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo3 *info = par->matchinfo; + int ret; - if (info->name[sizeof(info->name) - 1] != '\0') - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if 
(ret) + return ret; return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg, info->name, 3); diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 38a78151c0e9..fd077aeaaed9 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -61,8 +61,8 @@ static int helper_mt_check(const struct xt_mtchk_param *par) ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { - pr_info("cannot load conntrack support for proto=%u\n", - par->family); + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); return ret; } info->name[sizeof(info->name) - 1] = '\0'; diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c index 7ca64a50db04..57f1df575701 100644 --- a/net/netfilter/xt_ipcomp.c +++ b/net/netfilter/xt_ipcomp.c @@ -72,7 +72,7 @@ static int comp_mt_check(const struct xt_mtchk_param *par) /* Must specify no unknown invflags */ if (compinfo->invflags & ~XT_IPCOMP_INV_MASK) { - pr_err("unknown flags %X\n", compinfo->invflags); + pr_info_ratelimited("unknown flags %X\n", compinfo->invflags); return -EINVAL; } return 0; diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c index 42540d26c2b8..1d950a6100af 100644 --- a/net/netfilter/xt_ipvs.c +++ b/net/netfilter/xt_ipvs.c @@ -158,7 +158,8 @@ static int ipvs_mt_check(const struct xt_mtchk_param *par) && par->family != NFPROTO_IPV6 #endif ) { - pr_info("protocol family %u not supported\n", par->family); + pr_info_ratelimited("protocol family %u not supported\n", + par->family); return -EINVAL; } diff --git a/net/netfilter/xt_l2tp.c b/net/netfilter/xt_l2tp.c index 8aee572771f2..c43482bf48e6 100644 --- a/net/netfilter/xt_l2tp.c +++ b/net/netfilter/xt_l2tp.c @@ -216,7 +216,7 @@ static int l2tp_mt_check(const struct xt_mtchk_param *par) /* Check for invalid flags */ if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION | XT_L2TP_TYPE)) { - pr_info("unknown flags: %x\n", info->flags); + pr_info_ratelimited("unknown flags: %x\n", 
info->flags); return -EINVAL; } @@ -225,7 +225,8 @@ static int l2tp_mt_check(const struct xt_mtchk_param *par) (!(info->flags & XT_L2TP_SID)) && ((!(info->flags & XT_L2TP_TYPE)) || (info->type != XT_L2TP_TYPE_CONTROL))) { - pr_info("invalid flags combination: %x\n", info->flags); + pr_info_ratelimited("invalid flags combination: %x\n", + info->flags); return -EINVAL; } @@ -234,19 +235,22 @@ static int l2tp_mt_check(const struct xt_mtchk_param *par) */ if (info->flags & XT_L2TP_VERSION) { if ((info->version < 2) || (info->version > 3)) { - pr_info("wrong L2TP version: %u\n", info->version); + pr_info_ratelimited("wrong L2TP version: %u\n", + info->version); return -EINVAL; } if (info->version == 2) { if ((info->flags & XT_L2TP_TID) && (info->tid > 0xffff)) { - pr_info("v2 tid > 0xffff: %u\n", info->tid); + pr_info_ratelimited("v2 tid > 0xffff: %u\n", + info->tid); return -EINVAL; } if ((info->flags & XT_L2TP_SID) && (info->sid > 0xffff)) { - pr_info("v2 sid > 0xffff: %u\n", info->sid); + pr_info_ratelimited("v2 sid > 0xffff: %u\n", + info->sid); return -EINVAL; } } @@ -268,13 +272,13 @@ static int l2tp_mt_check4(const struct xt_mtchk_param *par) if ((ip->proto != IPPROTO_UDP) && (ip->proto != IPPROTO_L2TP)) { - pr_info("missing protocol rule (udp|l2tpip)\n"); + pr_info_ratelimited("missing protocol rule (udp|l2tpip)\n"); return -EINVAL; } if ((ip->proto == IPPROTO_L2TP) && (info->version == 2)) { - pr_info("v2 doesn't support IP mode\n"); + pr_info_ratelimited("v2 doesn't support IP mode\n"); return -EINVAL; } @@ -295,13 +299,13 @@ static int l2tp_mt_check6(const struct xt_mtchk_param *par) if ((ip->proto != IPPROTO_UDP) && (ip->proto != IPPROTO_L2TP)) { - pr_info("missing protocol rule (udp|l2tpip)\n"); + pr_info_ratelimited("missing protocol rule (udp|l2tpip)\n"); return -EINVAL; } if ((ip->proto == IPPROTO_L2TP) && (info->version == 2)) { - pr_info("v2 doesn't support IP mode\n"); + pr_info_ratelimited("v2 doesn't support IP mode\n"); return -EINVAL; } diff --git 
a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 61403b77361c..9f098ecb2449 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -46,7 +46,7 @@ MODULE_ALIAS("ip6t_limit"); See Alexey's formal explanation in net/sched/sch_tbf.c. - To get the maxmum range, we multiply by this factor (ie. you get N + To get the maximum range, we multiply by this factor (ie. you get N credits per jiffy). We want to allow a rate as low as 1 per day (slowest userspace tool allows), which means CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */ @@ -106,8 +106,8 @@ static int limit_mt_check(const struct xt_mtchk_param *par) /* Check for overflow. */ if (r->burst == 0 || user2credits(r->avg * r->burst) < user2credits(r->avg)) { - pr_info("Overflow, try lower: %u/%u\n", - r->avg, r->burst); + pr_info_ratelimited("Overflow, try lower: %u/%u\n", + r->avg, r->burst); return -ERANGE; } diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index 0fd14d1eb09d..bdb689cdc829 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -8,6 +8,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> @@ -19,8 +21,7 @@ static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par) const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (mr->rangesize != 1) { - pr_info("%s: multiple ranges no longer supported\n", - par->target->name); + pr_info_ratelimited("multiple ranges no longer supported\n"); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 6f92d25590a8..6b56f4170860 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -6,6 +6,8 @@ * it under the terms of the GNU General Public License version 2 (or any * later at your option) as published by the Free Software Foundation. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/skbuff.h> @@ -26,7 +28,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) nfnl_acct_update(skb, info->nfacct); - overquota = nfnl_acct_overquota(xt_net(par), skb, info->nfacct); + overquota = nfnl_acct_overquota(xt_net(par), info->nfacct); return overquota == NFACCT_UNDERQUOTA ? false : true; } @@ -39,8 +41,8 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par) nfacct = nfnl_acct_find_get(par->net, info->name); if (nfacct == NULL) { - pr_info("xt_nfacct: accounting object with name `%s' " - "does not exists\n", info->name); + pr_info_ratelimited("accounting object `%s' does not exists\n", + info->name); return -ENOENT; } info->nfacct = nfacct; diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index bb33598e4530..9d6d67b953ac 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -107,9 +107,7 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) info->invert & XT_PHYSDEV_OP_BRIDGED) && par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { - pr_info("using --physdev-out and --physdev-is-out are only " - "supported in the FORWARD and POSTROUTING chains with " - "bridged traffic.\n"); + pr_info_ratelimited("--physdev-out and --physdev-is-out only supported in the FORWARD and POSTROUTING chains with bridged traffic\n"); if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) return -EINVAL; } diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 5639fb03bdd9..13f8ccf946d6 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -132,26 +132,29 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par) static int policy_mt_check(const struct xt_mtchk_param *par) { const struct xt_policy_info *info = par->matchinfo; + const char *errmsg = "neither incoming nor outgoing policy selected"; + + if (!(info->flags & 
(XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) + goto err; - if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) { - pr_info("neither incoming nor outgoing policy selected\n"); - return -EINVAL; - } if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { - pr_info("output policy not valid in PREROUTING and INPUT\n"); - return -EINVAL; + errmsg = "output policy not valid in PREROUTING and INPUT"; + goto err; } if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) { - pr_info("input policy not valid in POSTROUTING and OUTPUT\n"); - return -EINVAL; + errmsg = "input policy not valid in POSTROUTING and OUTPUT"; + goto err; } if (info->len > XT_POLICY_MAX_ELEM) { - pr_info("too many policy elements\n"); - return -EINVAL; + errmsg = "too many policy elements"; + goto err; } return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; } static struct xt_match policy_mt_reg[] __read_mostly = { diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 755d2f6693a2..bf77326861af 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -95,13 +95,13 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) } ret = -ENOENT; - est1 = xt_rateest_lookup(info->name1); + est1 = xt_rateest_lookup(par->net, info->name1); if (!est1) goto err1; est2 = NULL; if (info->flags & XT_RATEEST_MATCH_REL) { - est2 = xt_rateest_lookup(info->name2); + est2 = xt_rateest_lookup(par->net, info->name2); if (!est2) goto err2; } @@ -111,7 +111,7 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) return 0; err2: - xt_rateest_put(est1); + xt_rateest_put(par->net, est1); err1: return ret; } @@ -120,9 +120,9 @@ static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) { struct xt_rateest_match_info *info = par->matchinfo; - xt_rateest_put(info->est1); + xt_rateest_put(par->net, 
info->est1); if (info->est2) - xt_rateest_put(info->est2); + xt_rateest_put(par->net, info->est2); } static struct xt_match xt_rateest_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 245fa350a7a8..9bbfc17ce3ec 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -51,8 +51,8 @@ static unsigned int ip_list_gid __read_mostly; module_param(ip_list_tot, uint, 0400); module_param(ip_list_hash_size, uint, 0400); module_param(ip_list_perms, uint, 0400); -module_param(ip_list_uid, uint, S_IRUGO | S_IWUSR); -module_param(ip_list_gid, uint, S_IRUGO | S_IWUSR); +module_param(ip_list_uid, uint, 0644); +module_param(ip_list_gid, uint, 0644); MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files"); @@ -342,8 +342,8 @@ static int recent_mt_check(const struct xt_mtchk_param *par, net_get_random_once(&hash_rnd, sizeof(hash_rnd)); if (info->check_set & ~XT_RECENT_VALID_FLAGS) { - pr_info("Unsupported user space flags (%08x)\n", - info->check_set); + pr_info_ratelimited("Unsupported userspace flags (%08x)\n", + info->check_set); return -EINVAL; } if (hweight8(info->check_set & @@ -357,13 +357,13 @@ static int recent_mt_check(const struct xt_mtchk_param *par, if ((info->check_set & XT_RECENT_REAP) && !info->seconds) return -EINVAL; if (info->hit_count >= XT_RECENT_MAX_NSTAMPS) { - pr_info("hitcount (%u) is larger than allowed maximum (%u)\n", - info->hit_count, XT_RECENT_MAX_NSTAMPS - 1); + pr_info_ratelimited("hitcount (%u) is larger than allowed maximum (%u)\n", + info->hit_count, XT_RECENT_MAX_NSTAMPS - 1); return -EINVAL; } - if (info->name[0] == '\0' || - strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN) - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; if (ip_pkt_list_tot && 
info->hit_count < ip_pkt_list_tot) nstamp_mask = roundup_pow_of_two(ip_pkt_list_tot) - 1; @@ -587,7 +587,7 @@ recent_mt_proc_write(struct file *file, const char __user *input, add = true; break; default: - pr_info("Need \"+ip\", \"-ip\" or \"/\"\n"); + pr_info_ratelimited("Need \"+ip\", \"-ip\" or \"/\"\n"); return -EINVAL; } @@ -601,10 +601,8 @@ recent_mt_proc_write(struct file *file, const char __user *input, succ = in4_pton(c, size, (void *)&addr, '\n', NULL); } - if (!succ) { - pr_info("illegal address written to procfs\n"); + if (!succ) return -EINVAL; - } spin_lock_bh(&recent_lock); e = recent_entry_lookup(t, &addr, family, 0); diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 16b6b11ee83f..6f4c5217d835 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -92,12 +92,12 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par) index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); if (index == IPSET_INVALID_ID) { - pr_warn("Cannot find set identified by id %u to match\n", - info->match_set.index); + pr_info_ratelimited("Cannot find set identified by id %u to match\n", + info->match_set.index); return -ENOENT; } if (info->match_set.u.flags[IPSET_DIM_MAX - 1] != 0) { - pr_warn("Protocol error: set match dimension is over the limit!\n"); + pr_info_ratelimited("set match dimension is over the limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; } @@ -143,12 +143,12 @@ set_match_v1_checkentry(const struct xt_mtchk_param *par) index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); if (index == IPSET_INVALID_ID) { - pr_warn("Cannot find set identified by id %u to match\n", - info->match_set.index); + pr_info_ratelimited("Cannot find set identified by id %u to match\n", + info->match_set.index); return -ENOENT; } if (info->match_set.dim > IPSET_DIM_MAX) { - pr_warn("Protocol error: set match dimension is over the limit!\n"); + pr_info_ratelimited("set match dimension is over the 
limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; } @@ -241,8 +241,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) if (info->add_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); if (index == IPSET_INVALID_ID) { - pr_warn("Cannot find add_set index %u as target\n", - info->add_set.index); + pr_info_ratelimited("Cannot find add_set index %u as target\n", + info->add_set.index); return -ENOENT; } } @@ -250,8 +250,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) if (info->del_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); if (index == IPSET_INVALID_ID) { - pr_warn("Cannot find del_set index %u as target\n", - info->del_set.index); + pr_info_ratelimited("Cannot find del_set index %u as target\n", + info->del_set.index); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); return -ENOENT; @@ -259,7 +259,7 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) } if (info->add_set.u.flags[IPSET_DIM_MAX - 1] != 0 || info->del_set.u.flags[IPSET_DIM_MAX - 1] != 0) { - pr_warn("Protocol error: SET target dimension is over the limit!\n"); + pr_info_ratelimited("SET target dimension over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) @@ -316,8 +316,8 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) if (info->add_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); if (index == IPSET_INVALID_ID) { - pr_warn("Cannot find add_set index %u as target\n", - info->add_set.index); + pr_info_ratelimited("Cannot find add_set index %u as target\n", + info->add_set.index); return -ENOENT; } } @@ -325,8 +325,8 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) if (info->del_set.index != IPSET_INVALID_ID) { index = 
ip_set_nfnl_get_byindex(par->net, info->del_set.index); if (index == IPSET_INVALID_ID) { - pr_warn("Cannot find del_set index %u as target\n", - info->del_set.index); + pr_info_ratelimited("Cannot find del_set index %u as target\n", + info->del_set.index); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); return -ENOENT; @@ -334,7 +334,7 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) } if (info->add_set.dim > IPSET_DIM_MAX || info->del_set.dim > IPSET_DIM_MAX) { - pr_warn("Protocol error: SET target dimension is over the limit!\n"); + pr_info_ratelimited("SET target dimension over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) @@ -444,8 +444,8 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); if (index == IPSET_INVALID_ID) { - pr_warn("Cannot find add_set index %u as target\n", - info->add_set.index); + pr_info_ratelimited("Cannot find add_set index %u as target\n", + info->add_set.index); return -ENOENT; } } @@ -454,8 +454,8 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); if (index == IPSET_INVALID_ID) { - pr_warn("Cannot find del_set index %u as target\n", - info->del_set.index); + pr_info_ratelimited("Cannot find del_set index %u as target\n", + info->del_set.index); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); @@ -465,7 +465,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (info->map_set.index != IPSET_INVALID_ID) { if (strncmp(par->table, "mangle", 7)) { - pr_warn("--map-set only usable from mangle table\n"); + pr_info_ratelimited("--map-set only usable from mangle table\n"); return -EINVAL; } if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | @@ -473,14 +473,14 @@ 
set_target_v3_checkentry(const struct xt_tgchk_param *par) !(par->hook_mask & (1 << NF_INET_FORWARD | 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { - pr_warn("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); + pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); return -EINVAL; } index = ip_set_nfnl_get_byindex(par->net, info->map_set.index); if (index == IPSET_INVALID_ID) { - pr_warn("Cannot find map_set index %u as target\n", - info->map_set.index); + pr_info_ratelimited("Cannot find map_set index %u as target\n", + info->map_set.index); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); @@ -494,7 +494,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (info->add_set.dim > IPSET_DIM_MAX || info->del_set.dim > IPSET_DIM_MAX || info->map_set.dim > IPSET_DIM_MAX) { - pr_warn("Protocol error: SET target dimension is over the limit!\n"); + pr_info_ratelimited("SET target dimension over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 575d2153e3b8..2ac7f674d19b 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -171,7 +171,8 @@ static int socket_mt_v1_check(const struct xt_mtchk_param *par) return err; if (info->flags & ~XT_SOCKET_FLAGS_V1) { - pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1); + pr_info_ratelimited("unknown flags 0x%x\n", + info->flags & ~XT_SOCKET_FLAGS_V1); return -EINVAL; } return 0; @@ -187,7 +188,8 @@ static int socket_mt_v2_check(const struct xt_mtchk_param *par) return err; if (info->flags & ~XT_SOCKET_FLAGS_V2) { - pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2); + pr_info_ratelimited("unknown flags 0x%x\n", + info->flags & ~XT_SOCKET_FLAGS_V2); 
return -EINVAL; } return 0; @@ -203,8 +205,8 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par) if (err) return err; if (info->flags & ~XT_SOCKET_FLAGS_V3) { - pr_info("unknown flags 0x%x\n", - info->flags & ~XT_SOCKET_FLAGS_V3); + pr_info_ratelimited("unknown flags 0x%x\n", + info->flags & ~XT_SOCKET_FLAGS_V3); return -EINVAL; } return 0; diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index 5fbd79194d21..0b41c0befe3c 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -44,8 +44,8 @@ static int state_mt_check(const struct xt_mtchk_param *par) ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) - pr_info("cannot load conntrack support for proto=%u\n", - par->family); + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); return ret; } diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 423293ee57c2..be1feddadcf0 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -21,6 +21,7 @@ MODULE_DESCRIPTION("Xtables: string-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_string"); MODULE_ALIAS("ip6t_string"); +MODULE_ALIAS("ebt_string"); static bool string_mt(const struct sk_buff *skb, struct xt_action_param *par) diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 1b01eec1fbda..c13bcd0ab491 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -9,6 +9,9 @@ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. 
*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/ktime.h> #include <linux/module.h> #include <linux/skbuff.h> @@ -235,13 +238,13 @@ static int time_mt_check(const struct xt_mtchk_param *par) if (info->daytime_start > XT_TIME_MAX_DAYTIME || info->daytime_stop > XT_TIME_MAX_DAYTIME) { - pr_info("invalid argument - start or " - "stop time greater than 23:59:59\n"); + pr_info_ratelimited("invalid argument - start or stop time greater than 23:59:59\n"); return -EDOM; } if (info->flags & ~XT_TIME_ALL_FLAGS) { - pr_info("unknown flags 0x%x\n", info->flags & ~XT_TIME_ALL_FLAGS); + pr_info_ratelimited("unknown flags 0x%x\n", + info->flags & ~XT_TIME_ALL_FLAGS); return -EINVAL; } @@ -266,13 +269,11 @@ static int __init time_mt_init(void) int minutes = sys_tz.tz_minuteswest; if (minutes < 0) /* east of Greenwich */ - printk(KERN_INFO KBUILD_MODNAME - ": kernel timezone is +%02d%02d\n", - -minutes / 60, -minutes % 60); + pr_info("kernel timezone is +%02d%02d\n", + -minutes / 60, -minutes % 60); else /* west of Greenwich */ - printk(KERN_INFO KBUILD_MODNAME - ": kernel timezone is -%02d%02d\n", - minutes / 60, minutes % 60); + pr_info("kernel timezone is -%02d%02d\n", + minutes / 60, minutes % 60); return xt_register_match(&xt_time_mt_reg); } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2ad445c1d27c..fa556fdef57d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1085,6 +1085,9 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, if (addr->sa_family != AF_NETLINK) return -EINVAL; + if (alen < sizeof(struct sockaddr_nl)) + return -EINVAL; + if ((nladdr->nl_groups || nladdr->nl_pid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; @@ -1105,7 +1108,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, } static int netlink_getname(struct socket *sock, struct sockaddr *addr, - int *addr_len, int peer) + int peer) { struct sock *sk = sock->sk; struct 
netlink_sock *nlk = nlk_sk(sk); @@ -1113,7 +1116,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; - *addr_len = sizeof(*nladdr); if (peer) { nladdr->nl_pid = nlk->dst_portid; @@ -1124,7 +1126,7 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); } - return 0; + return sizeof(*nladdr); } static int netlink_ioctl(struct socket *sock, unsigned int cmd, @@ -2308,7 +2310,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, if (cb->start) { ret = cb->start(cb); if (ret) - goto error_unlock; + goto error_put; } nlk->cb_running = true; @@ -2328,6 +2330,8 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, */ return -EINTR; +error_put: + module_put(control->module); error_unlock: sock_put(sk); mutex_unlock(nlk->cb_mutex); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 6f02499ef007..b9ce82c9440f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1106,7 +1106,7 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, if (!err) delivered = true; else if (err != -ESRCH) - goto error; + return err; return delivered ? 
0 : -ESRCH; error: kfree_skb(skb); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 9ba30c63be3d..4221d98a314b 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -829,11 +829,12 @@ out_release: } static int nr_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr; struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); + int uaddr_len; memset(&sax->fsa_ax25, 0, sizeof(struct sockaddr_ax25)); @@ -848,16 +849,16 @@ static int nr_getname(struct socket *sock, struct sockaddr *uaddr, sax->fsa_ax25.sax25_call = nr->user_addr; memset(sax->fsa_digipeater, 0, sizeof(sax->fsa_digipeater)); sax->fsa_digipeater[0] = nr->dest_addr; - *uaddr_len = sizeof(struct full_sockaddr_ax25); + uaddr_len = sizeof(struct full_sockaddr_ax25); } else { sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 0; sax->fsa_ax25.sax25_call = nr->source_addr; - *uaddr_len = sizeof(struct sockaddr_ax25); + uaddr_len = sizeof(struct sockaddr_ax25); } release_sock(sk); - return 0; + return uaddr_len; } int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) @@ -1449,9 +1450,9 @@ static int __init nr_proto_init(void) nr_loopback_init(); - proc_create("nr", S_IRUGO, init_net.proc_net, &nr_info_fops); - proc_create("nr_neigh", S_IRUGO, init_net.proc_net, &nr_neigh_fops); - proc_create("nr_nodes", S_IRUGO, init_net.proc_net, &nr_nodes_fops); + proc_create("nr", 0444, init_net.proc_net, &nr_info_fops); + proc_create("nr_neigh", 0444, init_net.proc_net, &nr_neigh_fops); + proc_create("nr_nodes", 0444, init_net.proc_net, &nr_nodes_fops); out: return rc; fail: diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 376040092142..ea0c0c6f1874 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -497,7 +497,7 @@ error: } static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, - int *len, int peer) + int peer) { 
struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -510,7 +510,6 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, llcp_sock->dsap, llcp_sock->ssap); memset(llcp_addr, 0, sizeof(*llcp_addr)); - *len = sizeof(struct sockaddr_nfc_llcp); lock_sock(sk); if (!llcp_sock->dev) { @@ -528,7 +527,7 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, llcp_addr->service_name_len); release_sock(sk); - return 0; + return sizeof(struct sockaddr_nfc_llcp); } static inline __poll_t llcp_accept_poll(struct sock *parent) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ef38e5aecd28..015e24e08909 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2363,10 +2363,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); - rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); - rtnl_unlock(); + up_read(&net_rwsem); /* Detach all vports from given namespace. */ list_for_each_entry_safe(vport, vport_next, &head, detach_list) { diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 04b94281a30b..b891a91577f8 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -242,14 +242,20 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); + if (band->rate == 0) { + err = -EINVAL; + goto exit_free_meter; + } + band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]); /* Figure out max delta_t that is enough to fill any bucket. * Keep max_delta_t size to the bucket units: * pkts => 1/1000 packets, kilobits => bits. + * + * Start with a full bucket. */ - band_max_delta_t = (band->burst_size + band->rate) * 1000; - /* Start with a full bucket. 
*/ - band->bucket = band_max_delta_t; + band->bucket = (band->burst_size + band->rate) * 1000; + band_max_delta_t = band->bucket / band->rate; if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; band++; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index b6c8524032a0..f81c1d0ddff4 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -464,10 +464,10 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, return 0; } -static unsigned int packet_length(const struct sk_buff *skb, - struct net_device *dev) +static int packet_length(const struct sk_buff *skb, + struct net_device *dev) { - unsigned int length = skb->len - dev->hard_header_len; + int length = skb->len - dev->hard_header_len; if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) @@ -478,7 +478,7 @@ static unsigned int packet_length(const struct sk_buff *skb, * account for 802.1ad. e.g. is_skb_forwardable(). */ - return length; + return length > 0 ? 
length : 0; } void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e0f3f4aeeb4f..616cb9c18f88 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3409,7 +3409,7 @@ out: } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct net_device *dev; struct sock *sk = sock->sk; @@ -3424,13 +3424,12 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, if (dev) strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); - *uaddr_len = sizeof(*uaddr); - return 0; + return sizeof(*uaddr); } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct net_device *dev; struct sock *sk = sock->sk; @@ -3455,9 +3454,8 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, sll->sll_halen = 0; } rcu_read_unlock(); - *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; - return 0; + return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index fffcd69f63ff..f9b40e6a18a5 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -326,7 +326,7 @@ static int pn_socket_accept(struct socket *sock, struct socket *newsock, } static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, - int *sockaddr_len, int peer) + int peer) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); @@ -337,8 +337,7 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, pn_sockaddr_set_object((struct sockaddr_pn *)addr, pn->sobject); - *sockaddr_len = sizeof(struct sockaddr_pn); - return 0; + return sizeof(struct sockaddr_pn); } static __poll_t pn_socket_poll(struct file *file, struct socket *sock, diff --git 
a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 5fb3929e3d7d..b33e5aeb4c06 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -893,7 +893,7 @@ static int qrtr_connect(struct socket *sock, struct sockaddr *saddr, } static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, - int *len, int peer) + int peer) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sockaddr_qrtr qaddr; @@ -912,12 +912,11 @@ static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, } release_sock(sk); - *len = sizeof(qaddr); qaddr.sq_family = AF_QIPCRTR; memcpy(saddr, &qaddr, sizeof(qaddr)); - return 0; + return sizeof(qaddr); } static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) diff --git a/net/qrtr/smd.c b/net/qrtr/smd.c index 50615d5efac1..9cf089b9754e 100644 --- a/net/qrtr/smd.c +++ b/net/qrtr/smd.c @@ -114,5 +114,6 @@ static struct rpmsg_driver qcom_smd_qrtr_driver = { module_rpmsg_driver(qcom_smd_qrtr_driver); +MODULE_ALIAS("rpmsg:IPCRTR"); MODULE_DESCRIPTION("Qualcomm IPC-Router SMD interface driver"); MODULE_LICENSE("GPL v2"); diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 744c637c86b0..ab751a150f70 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -77,6 +77,7 @@ static int rds_release(struct socket *sock) rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); + rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue); spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); @@ -110,7 +111,7 @@ void rds_wake_sk_sleep(struct rds_sock *rs) } static int rds_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; struct rds_sock *rs = rds_sk_to_rs(sock->sk); @@ -131,8 +132,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_family = AF_INET; - *uaddr_len = sizeof(*sin); - return 0; + return sizeof(*sin); } /* @@ -145,7 +145,7 @@ static int rds_getname(struct socket *sock, struct 
sockaddr *uaddr, * - to signal that a previously congested destination may have become * uncongested * - A notification has been queued to the socket (this can be a congestion - * update, or a RDMA completion). + * update, or a RDMA completion, or a MSG_ZEROCOPY completion). * * EPOLLOUT is asserted if there is room on the send queue. This does not mean * however, that the next sendmsg() call will succeed. If the application tries @@ -179,10 +179,13 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, spin_unlock(&rs->rs_lock); } if (!list_empty(&rs->rs_recv_queue) || - !list_empty(&rs->rs_notify_queue)) + !list_empty(&rs->rs_notify_queue) || + !list_empty(&rs->rs_zcookie_queue.zcookie_head)) mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ @@ -512,6 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_recv_queue); INIT_LIST_HEAD(&rs->rs_notify_queue); INIT_LIST_HEAD(&rs->rs_cong_list); + rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; diff --git a/net/rds/connection.c b/net/rds/connection.c index 2da3176bf792..abef75da89a7 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -540,9 +540,9 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), + u64 *buffer, size_t item_len) { - uint64_t buffer[(item_len + 7) / 8]; struct hlist_head *head; struct rds_connection *conn; size_t i; @@ -578,9 +578,9 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths 
*lens, int (*visitor)(struct rds_conn_path *, void *), + u64 *buffer, size_t item_len) { - u64 buffer[(item_len + 7) / 8]; struct hlist_head *head; struct rds_connection *conn; size_t i; @@ -649,8 +649,11 @@ static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { + u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; + rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor, + buffer, sizeof(struct rds_info_connection)); } diff --git a/net/rds/ib.c b/net/rds/ib.c index 50a88f3e7e39..02deee29e7f1 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -321,8 +321,11 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { + u64 buffer[(sizeof(struct rds_info_rdma_connection) + 7) / 8]; + rds_for_each_conn_info(sock, len, iter, lens, rds_ib_conn_info_visitor, + buffer, sizeof(struct rds_info_rdma_connection)); } diff --git a/net/rds/message.c b/net/rds/message.c index 4318cc9b78f7..a35f76971984 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -33,6 +33,9 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/skbuff.h> +#include <linux/list.h> +#include <linux/errqueue.h> #include "rds.h" @@ -45,7 +48,6 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_GEN_NUM] = sizeof(u32), }; - void rds_message_addref(struct rds_message *rm) { rdsdebug("addref rm %p ref %d\n", rm, refcount_read(&rm->m_refcount)); @@ -53,20 +55,107 @@ void rds_message_addref(struct rds_message *rm) } EXPORT_SYMBOL_GPL(rds_message_addref); +static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie) +{ + struct rds_zcopy_cookies *ck = &info->zcookies; + int ncookies = ck->num; + + if (ncookies == RDS_MAX_ZCOOKIES) + return false; + ck->cookies[ncookies] = cookie; + ck->num = ++ncookies; + return true; +} + +static struct rds_msg_zcopy_info 
*rds_info_from_znotifier(struct rds_znotifier *znotif) +{ + return container_of(znotif, struct rds_msg_zcopy_info, znotif); +} + +void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *q) +{ + unsigned long flags; + LIST_HEAD(copy); + struct rds_msg_zcopy_info *info, *tmp; + + spin_lock_irqsave(&q->lock, flags); + list_splice(&q->zcookie_head, &copy); + INIT_LIST_HEAD(&q->zcookie_head); + spin_unlock_irqrestore(&q->lock, flags); + + list_for_each_entry_safe(info, tmp, &copy, rs_zcookie_next) { + list_del(&info->rs_zcookie_next); + kfree(info); + } +} + +static void rds_rm_zerocopy_callback(struct rds_sock *rs, + struct rds_znotifier *znotif) +{ + struct rds_msg_zcopy_info *info; + struct rds_msg_zcopy_queue *q; + u32 cookie = znotif->z_cookie; + struct rds_zcopy_cookies *ck; + struct list_head *head; + unsigned long flags; + + mm_unaccount_pinned_pages(&znotif->z_mmp); + q = &rs->rs_zcookie_queue; + spin_lock_irqsave(&q->lock, flags); + head = &q->zcookie_head; + if (!list_empty(head)) { + info = list_entry(head, struct rds_msg_zcopy_info, + rs_zcookie_next); + if (info && rds_zcookie_add(info, cookie)) { + spin_unlock_irqrestore(&q->lock, flags); + kfree(rds_info_from_znotifier(znotif)); + /* caller invokes rds_wake_sk_sleep() */ + return; + } + } + + info = rds_info_from_znotifier(znotif); + ck = &info->zcookies; + memset(ck, 0, sizeof(*ck)); + WARN_ON(!rds_zcookie_add(info, cookie)); + list_add_tail(&q->zcookie_head, &info->rs_zcookie_next); + + spin_unlock_irqrestore(&q->lock, flags); + /* caller invokes rds_wake_sk_sleep() */ +} + /* * This relies on dma_map_sg() not touching sg[].page during merging. 
*/ static void rds_message_purge(struct rds_message *rm) { - unsigned long i; + unsigned long i, flags; + bool zcopy = false; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; + spin_lock_irqsave(&rm->m_rs_lock, flags); + if (rm->m_rs) { + struct rds_sock *rs = rm->m_rs; + + if (rm->data.op_mmp_znotifier) { + zcopy = true; + rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rds_wake_sk_sleep(rs); + rm->data.op_mmp_znotifier = NULL; + } + sock_put(rds_rs_to_sk(rs)); + rm->m_rs = NULL; + } + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + for (i = 0; i < rm->data.op_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->data.op_sg[i])); + if (!zcopy) + __free_page(sg_page(&rm->data.op_sg[i])); + else + put_page(sg_page(&rm->data.op_sg[i])); } rm->data.op_nents = 0; @@ -266,12 +355,13 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) +static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) { - unsigned long to_copy, nbytes; - unsigned long sg_off; struct scatterlist *sg; int ret = 0; + int length = iov_iter_count(from); + int total_copied = 0; + struct rds_msg_zcopy_info *info; rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); @@ -279,8 +369,67 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) * now allocate and copy in the data payload. 
*/ sg = rm->data.op_sg; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + INIT_LIST_HEAD(&info->rs_zcookie_next); + rm->data.op_mmp_znotifier = &info->znotif; + if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, + length)) { + ret = -ENOMEM; + goto err; + } + while (iov_iter_count(from)) { + struct page *pages; + size_t start; + ssize_t copied; + + copied = iov_iter_get_pages(from, &pages, PAGE_SIZE, + 1, &start); + if (copied < 0) { + struct mmpin *mmp; + int i; + + for (i = 0; i < rm->data.op_nents; i++) + put_page(sg_page(&rm->data.op_sg[i])); + mmp = &rm->data.op_mmp_znotifier->z_mmp; + mm_unaccount_pinned_pages(mmp); + ret = -EFAULT; + goto err; + } + total_copied += copied; + iov_iter_advance(from, copied); + length -= copied; + sg_set_page(sg, pages, copied, start); + rm->data.op_nents++; + sg++; + } + WARN_ON_ONCE(length != 0); + return ret; +err: + kfree(info); + rm->data.op_mmp_znotifier = NULL; + return ret; +} + +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy) +{ + unsigned long to_copy, nbytes; + unsigned long sg_off; + struct scatterlist *sg; + int ret = 0; + + rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); + + /* now allocate and copy in the data payload. */ + sg = rm->data.op_sg; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. 
*/ + if (zcopy) + return rds_message_zcopy_from_user(rm, from); + while (iov_iter_count(from)) { if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, iov_iter_count(from), diff --git a/net/rds/rds.h b/net/rds/rds.h index 7301b9b01890..b04c333d9d1c 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -356,6 +356,30 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) #define RDS_MSG_PAGEVEC 7 #define RDS_MSG_FLUSH 8 +struct rds_znotifier { + struct mmpin z_mmp; + u32 z_cookie; +}; + +struct rds_msg_zcopy_info { + struct list_head rs_zcookie_next; + union { + struct rds_znotifier znotif; + struct rds_zcopy_cookies zcookies; + }; +}; + +struct rds_msg_zcopy_queue { + struct list_head zcookie_head; + spinlock_t lock; /* protects zcookie_head queue */ +}; + +static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q) +{ + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->zcookie_head); +} + struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; @@ -436,6 +460,7 @@ struct rds_message { unsigned int op_count; unsigned int op_dmasg; unsigned int op_dmaoff; + struct rds_znotifier *op_mmp_znotifier; struct scatterlist *op_sg; } data; }; @@ -589,6 +614,7 @@ struct rds_sock { /* Socket receive path trace points*/ u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; + struct rds_msg_zcopy_queue rs_zcookie_queue; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) @@ -709,6 +735,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), + u64 *buffer, size_t item_len); __printf(2, 3) @@ -771,7 +798,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); -int rds_message_copy_from_user(struct rds_message *rm, struct 
iov_iter *from); +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); @@ -786,6 +814,7 @@ void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); void rds_message_wait(struct rds_message *rm); void rds_message_unmapped(struct rds_message *rm); +void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info); static inline void rds_message_make_checksum(struct rds_header *hdr) { diff --git a/net/rds/recv.c b/net/rds/recv.c index b25bcfe411ca..de50e2126e40 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -577,6 +577,41 @@ out: return ret; } +static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) +{ + struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue; + struct rds_msg_zcopy_info *info = NULL; + struct rds_zcopy_cookies *done; + unsigned long flags; + + if (!msg->msg_control) + return false; + + if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || + msg->msg_controllen < CMSG_SPACE(sizeof(*done))) + return false; + + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&q->zcookie_head)) { + info = list_entry(q->zcookie_head.next, + struct rds_msg_zcopy_info, rs_zcookie_next); + list_del(&info->rs_zcookie_next); + } + spin_unlock_irqrestore(&q->lock, flags); + if (!info) + return false; + done = &info->zcookies; + if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), + done)) { + spin_lock_irqsave(&q->lock, flags); + list_add(&info->rs_zcookie_next, &q->zcookie_head); + spin_unlock_irqrestore(&q->lock, flags); + return false; + } + kfree(info); + return true; +} + int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags) { @@ -594,6 +629,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (msg_flags & MSG_OOB) 
goto out; + if (msg_flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); while (1) { /* If there are pending notifications, do those - and nothing else */ @@ -609,7 +646,9 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (!rds_next_incoming(rs, &inc)) { if (nonblock) { - ret = -EAGAIN; + bool reaped = rds_recvmsg_zcookie(rs, msg); + + ret = reaped ? 0 : -EAGAIN; break; } @@ -658,6 +697,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, ret = -EFAULT; goto out; } + rds_recvmsg_zcookie(rs, msg); rds_stats_inc(s_recv_delivered); diff --git a/net/rds/send.c b/net/rds/send.c index b1b0022b8370..acad04243b41 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -649,7 +649,6 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status) rm->rdma.op_notifier = NULL; } was_on_sock = 1; - rm->m_rs = NULL; } spin_unlock(&rs->rs_lock); @@ -756,9 +755,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { spin_unlock_irqrestore(&cp->cp_lock, flags); - spin_lock_irqsave(&rm->m_rs_lock, flags); - rm->m_rs = NULL; - spin_unlock_irqrestore(&rm->m_rs_lock, flags); continue; } list_del_init(&rm->m_conn_item); @@ -774,7 +770,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); @@ -798,7 +793,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); @@ -849,6 +843,7 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); 
rds_message_addref(rm); + sock_hold(rds_rs_to_sk(rs)); rm->m_rs = rs; /* The code ordering is a little weird, but we're @@ -880,12 +875,13 @@ out: * rds_message is getting to be quite complicated, and we'd like to allocate * it all in one go. This figures out how big it needs to be up front. */ -static int rds_rm_size(struct msghdr *msg, int data_len) +static int rds_rm_size(struct msghdr *msg, int num_sgs) { struct cmsghdr *cmsg; int size = 0; int cmsg_groups = 0; int retval; + bool zcopy_cookie = false; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -904,6 +900,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len) break; + case RDS_CMSG_ZCOPY_COOKIE: + zcopy_cookie = true; + /* fall through */ + case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: cmsg_groups |= 2; @@ -924,7 +924,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len) } - size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) + return -EINVAL; + + size += num_sgs * sizeof(struct scatterlist); /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ if (cmsg_groups == 3) @@ -933,6 +936,19 @@ static int rds_rm_size(struct msghdr *msg, int data_len) return size; } +static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + u32 *cookie; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) || + !rm->data.op_mmp_znotifier) + return -EINVAL; + cookie = CMSG_DATA(cmsg); + rm->data.op_mmp_znotifier->z_cookie = *cookie; + return 0; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -975,6 +991,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ret = rds_cmsg_atomic(rs, rm, cmsg); break; + case RDS_CMSG_ZCOPY_COOKIE: + ret = rds_cmsg_zcopy(rs, rm, cmsg); + break; + default: return -EINVAL; } @@ -1045,10 +1065,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t 
payload_len) long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; size_t total_payload_len = payload_len, rdma_payload_len = 0; + bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && + sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); + int num_sgs = ceil(payload_len, PAGE_SIZE); /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ - if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { ret = -EOPNOTSUPP; goto out; } @@ -1092,8 +1115,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } + if (zcopy) { + if (rs->rs_transport->t_type != RDS_TRANS_TCP) { + ret = -EOPNOTSUPP; + goto out; + } + num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); + } /* size of rm including all sgs */ - ret = rds_rm_size(msg, payload_len); + ret = rds_rm_size(msg, num_sgs); if (ret < 0) goto out; @@ -1105,12 +1135,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Attach data to the rm */ if (payload_len) { - rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); if (!rm->data.op_sg) { ret = -ENOMEM; goto out; } - ret = rds_message_copy_from_user(rm, &msg->msg_iter); + ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); if (ret) goto out; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 44c4652721af..351a28474667 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -227,7 +227,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_tcp_connection *tc; unsigned long flags; struct sockaddr_in sin; - int sinlen; struct socket *sock; spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); @@ -239,12 +238,10 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, sock = tc->t_sock; if (sock) { - sock->ops->getname(sock, (struct sockaddr *)&sin, - &sinlen, 0); + 
sock->ops->getname(sock, (struct sockaddr *)&sin, 0); tsinfo.local_addr = sin.sin_addr.s_addr; tsinfo.local_port = sin.sin_port; - sock->ops->getname(sock, (struct sockaddr *)&sin, - &sinlen, 1); + sock->ops->getname(sock, (struct sockaddr *)&sin, 1); tsinfo.peer_addr = sin.sin_addr.s_addr; tsinfo.peer_port = sin.sin_port; } @@ -275,13 +272,14 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr) static void rds_tcp_conn_free(void *arg) { struct rds_tcp_connection *tc = arg; + unsigned long flags; rdsdebug("freeing tc %p\n", tc); - spin_lock_bh(&rds_tcp_conn_lock); + spin_lock_irqsave(&rds_tcp_conn_lock, flags); if (!tc->t_tcp_node_detached) list_del(&tc->t_tcp_node); - spin_unlock_bh(&rds_tcp_conn_lock); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); kmem_cache_free(rds_tcp_conn_slab, tc); } @@ -311,13 +309,13 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) rdsdebug("rds_conn_path [%d] tc %p\n", i, conn->c_path[i].cp_transport_data); } - spin_lock_bh(&rds_tcp_conn_lock); + spin_lock_irq(&rds_tcp_conn_lock); for (i = 0; i < RDS_MPATH_WORKERS; i++) { tc = conn->c_path[i].cp_transport_data; tc->t_tcp_node_detached = false; list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); } - spin_unlock_bh(&rds_tcp_conn_lock); + spin_unlock_irq(&rds_tcp_conn_lock); fail: if (ret) { for (j = 0; j < i; j++) @@ -487,39 +485,6 @@ fail: return err; } -static void __net_exit rds_tcp_exit_net(struct net *net) -{ - struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - - if (rtn->rds_tcp_sysctl) - unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - - if (net != &init_net && rtn->ctl_table) - kfree(rtn->ctl_table); - - /* If rds_tcp_exit_net() is called as a result of netns deletion, - * the rds_tcp_kill_sock() device notifier would already have cleaned - * up the listen socket, thus there is no work to do in this function. 
- * - * If rds_tcp_exit_net() is called as a result of module unload, - * i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then - * we do need to clean up the listen socket here. - */ - if (rtn->rds_tcp_listen_sock) { - struct socket *lsock = rtn->rds_tcp_listen_sock; - - rtn->rds_tcp_listen_sock = NULL; - rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); - } -} - -static struct pernet_operations rds_tcp_net_ops = { - .init = rds_tcp_init_net, - .exit = rds_tcp_exit_net, - .id = &rds_tcp_netid, - .size = sizeof(struct rds_tcp_net), -}; - static void rds_tcp_kill_sock(struct net *net) { struct rds_tcp_connection *tc, *_tc; @@ -529,7 +494,7 @@ static void rds_tcp_kill_sock(struct net *net) rtn->rds_tcp_listen_sock = NULL; rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); - spin_lock_bh(&rds_tcp_conn_lock); + spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); @@ -542,45 +507,42 @@ static void rds_tcp_kill_sock(struct net *net) tc->t_tcp_node_detached = true; } } - spin_unlock_bh(&rds_tcp_conn_lock); + spin_unlock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) rds_conn_destroy(tc->t_cpath->cp_conn); } -void *rds_tcp_listen_sock_def_readable(struct net *net) +static void __net_exit rds_tcp_exit_net(struct net *net) { struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - struct socket *lsock = rtn->rds_tcp_listen_sock; - if (!lsock) - return NULL; + rds_tcp_kill_sock(net); - return lsock->sk->sk_user_data; + if (rtn->rds_tcp_sysctl) + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + + if (net != &init_net && rtn->ctl_table) + kfree(rtn->ctl_table); } -static int rds_tcp_dev_event(struct notifier_block *this, - unsigned long event, void *ptr) +static struct pernet_operations rds_tcp_net_ops = { + .init = rds_tcp_init_net, + .exit = rds_tcp_exit_net, + .id = &rds_tcp_netid, + .size = sizeof(struct 
rds_tcp_net), +}; + +void *rds_tcp_listen_sock_def_readable(struct net *net) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct socket *lsock = rtn->rds_tcp_listen_sock; - /* rds-tcp registers as a pernet subys, so the ->exit will only - * get invoked after network acitivity has quiesced. We need to - * clean up all sockets to quiesce network activity, and use - * the unregistration of the per-net loopback device as a trigger - * to start that cleanup. - */ - if (event == NETDEV_UNREGISTER_FINAL && - dev->ifindex == LOOPBACK_IFINDEX) - rds_tcp_kill_sock(dev_net(dev)); + if (!lsock) + return NULL; - return NOTIFY_DONE; + return lsock->sk->sk_user_data; } -static struct notifier_block rds_tcp_dev_notifier = { - .notifier_call = rds_tcp_dev_event, - .priority = -10, /* must be called after other network notifiers */ -}; - /* when sysctl is used to modify some kernel socket parameters,this * function resets the RDS connections in that netns so that we can * restart with new parameters. 
The assumption is that such reset @@ -590,7 +552,7 @@ static void rds_tcp_sysctl_reset(struct net *net) { struct rds_tcp_connection *tc, *_tc; - spin_lock_bh(&rds_tcp_conn_lock); + spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); @@ -600,7 +562,7 @@ static void rds_tcp_sysctl_reset(struct net *net) /* reconnect with new parameters */ rds_conn_path_drop(tc->t_cpath, false); } - spin_unlock_bh(&rds_tcp_conn_lock); + spin_unlock_irq(&rds_tcp_conn_lock); } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, @@ -626,9 +588,7 @@ static void rds_tcp_exit(void) rds_tcp_set_unloading(); synchronize_rcu(); rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); - unregister_pernet_subsys(&rds_tcp_net_ops); - if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) - pr_warn("could not unregister rds_tcp_dev_notifier\n"); + unregister_pernet_device(&rds_tcp_net_ops); rds_tcp_destroy_conns(); rds_trans_unregister(&rds_tcp_transport); rds_tcp_recv_exit(); @@ -652,24 +612,15 @@ static int rds_tcp_init(void) if (ret) goto out_slab; - ret = register_pernet_subsys(&rds_tcp_net_ops); + ret = register_pernet_device(&rds_tcp_net_ops); if (ret) goto out_recv; - ret = register_netdevice_notifier(&rds_tcp_dev_notifier); - if (ret) { - pr_warn("could not register rds_tcp_dev_notifier\n"); - goto out_pernet; - } - rds_trans_register(&rds_tcp_transport); rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); goto out; - -out_pernet: - unregister_pernet_subsys(&rds_tcp_net_ops); out_recv: rds_tcp_recv_exit(); out_slab: diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index c061d6eb465d..22571189f21e 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle. All rights reserved. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -142,12 +142,20 @@ int rds_tcp_accept_one(struct socket *sock) if (ret) goto out; - new_sock->type = sock->type; - new_sock->ops = sock->ops; ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true); if (ret < 0) goto out; + /* sock_create_lite() does not get a hold on the owner module so we + * need to do it here. Note that sock_release() uses sock->ops to + * determine if it needs to decrement the reference count. So set + * sock->ops after calling accept() in case that fails. And there's + * no need to do try_module_get() as the listener should have a hold + * already. + */ + new_sock->ops = sock->ops; + __module_get(new_sock->ops->owner); + ret = rds_tcp_keepalive(new_sock); if (ret < 0) goto out; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 083bd251406f..9ff5e0a76593 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -938,7 +938,7 @@ out_release: } static int rose_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr; struct sock *sk = sock->sk; @@ -964,8 +964,7 @@ static int rose_getname(struct socket *sock, struct sockaddr *uaddr, srose->srose_digis[n] = rose->source_digis[n]; } - *uaddr_len = sizeof(struct full_sockaddr_rose); - return 0; + return sizeof(struct full_sockaddr_rose); } int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci) @@ -1568,12 +1567,12 @@ static int __init rose_proto_init(void) rose_add_loopback_neigh(); - proc_create("rose", S_IRUGO, init_net.proc_net, &rose_info_fops); - proc_create("rose_neigh", S_IRUGO, init_net.proc_net, + proc_create("rose", 0444, init_net.proc_net, &rose_info_fops); + proc_create("rose_neigh", 0444, init_net.proc_net, &rose_neigh_fops); - proc_create("rose_nodes", S_IRUGO, 
init_net.proc_net, + proc_create("rose_nodes", 0444, init_net.proc_net, &rose_nodes_fops); - proc_create("rose_routes", S_IRUGO, init_net.proc_net, + proc_create("rose_routes", 0444, init_net.proc_net, &rose_routes_fops); out: return rc; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0c9c18aa7c77..9a2c8e7c000e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -32,7 +32,7 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_RXRPC); unsigned int rxrpc_debug; // = RXRPC_DEBUG_KPROTO; -module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO); +module_param_named(debug, rxrpc_debug, uint, 0644); MODULE_PARM_DESC(debug, "RxRPC debugging mask"); static struct proto rxrpc_proto; @@ -40,6 +40,7 @@ static const struct proto_ops rxrpc_rpc_ops; /* current debugging ID */ atomic_t rxrpc_debug_id; +EXPORT_SYMBOL(rxrpc_debug_id); /* count of skbs currently in use */ atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; @@ -267,6 +268,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @gfp: The allocation constraints * @notify_rx: Where to send notifications instead of socket queue * @upgrade: Request service upgrade for call + * @debug_id: The debug ID for tracing to be assigned to the call * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and @@ -282,7 +284,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, s64 tx_total_len, gfp_t gfp, rxrpc_notify_rx_t notify_rx, - bool upgrade) + bool upgrade, + unsigned int debug_id) { struct rxrpc_conn_parameters cp; struct rxrpc_call_params p; @@ -314,13 +317,14 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.exclusive = false; cp.upgrade = upgrade; cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp); + call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp, debug_id); /* The socket has been unlocked. 
*/ if (!IS_ERR(call)) { call->notify_rx = notify_rx; mutex_unlock(&call->user_mutex); } + rxrpc_put_peer(cp.peer); _leave(" = %p", call); return call; } @@ -444,6 +448,7 @@ int rxrpc_kernel_retry_call(struct socket *sock, struct rxrpc_call *call, ret = rxrpc_retry_client_call(rx, call, &cp, srx, GFP_KERNEL); mutex_unlock(&call->user_mutex); + rxrpc_put_peer(cp.peer); _leave(" = %d", ret); return ret; } @@ -759,6 +764,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock, static int rxrpc_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct rxrpc_net *rxnet; struct rxrpc_sock *rx; struct sock *sk; @@ -798,6 +804,9 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rwlock_init(&rx->call_lock); memset(&rx->srx, 0, sizeof(rx->srx)); + rxnet = rxrpc_net(sock_net(&rx->sk)); + timer_reduce(&rxnet->peer_keepalive_timer, jiffies + 1); + _leave(" = 0 [%p]", rx); return 0; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 416688381eb7..90d7079e0aa9 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -75,7 +75,9 @@ struct rxrpc_net { u32 epoch; /* Local epoch for detecting local-end reset */ struct list_head calls; /* List of calls active in this namespace */ rwlock_t call_lock; /* Lock for ->calls */ + atomic_t nr_calls; /* Count of allocated calls */ + atomic_t nr_conns; struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ @@ -97,8 +99,16 @@ struct rxrpc_net { struct list_head local_endpoints; struct mutex local_mutex; /* Lock for ->local_endpoints */ - spinlock_t peer_hash_lock; /* Lock for ->peer_hash */ DECLARE_HASHTABLE (peer_hash, 10); + spinlock_t peer_hash_lock; /* Lock for ->peer_hash */ + +#define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */ + u8 peer_keepalive_cursor; + ktime_t 
peer_keepalive_base; + struct hlist_head peer_keepalive[RXRPC_KEEPALIVE_TIME + 1]; + struct hlist_head peer_keepalive_new; + struct timer_list peer_keepalive_timer; + struct work_struct peer_keepalive_work; }; /* @@ -285,6 +295,8 @@ struct rxrpc_peer { struct hlist_head error_targets; /* targets for net error distribution */ struct work_struct error_distributor; struct rb_root service_conns; /* Service connections */ + struct hlist_node keepalive_link; /* Link in net->peer_keepalive[] */ + time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ unsigned int if_mtu; /* interface MTU for this peer */ @@ -518,6 +530,7 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ + struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ struct mutex user_mutex; /* User access mutex */ unsigned long ack_at; /* When deferred ACK needs to happen */ unsigned long ack_lost_at; /* When ACK is figured as lost */ @@ -691,7 +704,6 @@ struct rxrpc_send_params { * af_rxrpc.c */ extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; -extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; /* @@ -732,11 +744,12 @@ extern unsigned int rxrpc_max_call_lifetime; extern struct kmem_cache *rxrpc_call_jar; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); -struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t); +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, - struct rxrpc_call_params *, gfp_t); + struct rxrpc_call_params *, gfp_t, + unsigned int); int rxrpc_retry_client_call(struct rxrpc_sock *, struct rxrpc_call *, struct rxrpc_conn_parameters *, @@ -778,6 
+791,7 @@ static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call, call->error = error; call->completion = compl, call->state = RXRPC_CALL_COMPLETE; + trace_rxrpc_call_complete(call); wake_up(&call->waitq); return true; } @@ -822,7 +836,7 @@ static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, rxrpc_seq_t seq, u32 abort_code, int error) { - trace_rxrpc_abort(why, call->cid, call->call_id, seq, + trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, abort_code, error); return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error); @@ -968,31 +982,12 @@ extern void rxrpc_process_local_events(struct rxrpc_local *); * local_object.c */ struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *); -void __rxrpc_put_local(struct rxrpc_local *); +struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *); +struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *); +void rxrpc_put_local(struct rxrpc_local *); +void rxrpc_queue_local(struct rxrpc_local *); void rxrpc_destroy_all_locals(struct rxrpc_net *); -static inline void rxrpc_get_local(struct rxrpc_local *local) -{ - atomic_inc(&local->usage); -} - -static inline -struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) -{ - return atomic_inc_not_zero(&local->usage) ? 
local : NULL; -} - -static inline void rxrpc_put_local(struct rxrpc_local *local) -{ - if (local && atomic_dec_and_test(&local->usage)) - __rxrpc_put_local(local); -} - -static inline void rxrpc_queue_local(struct rxrpc_local *local) -{ - rxrpc_queue_work(&local->processor); -} - /* * misc.c */ @@ -1025,6 +1020,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); void rxrpc_reject_packets(struct rxrpc_local *); +void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c @@ -1033,6 +1029,7 @@ void rxrpc_error_report(struct sock *); void rxrpc_peer_error_distributor(struct work_struct *); void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); +void rxrpc_peer_keepalive_worker(struct work_struct *); /* * peer_object.c @@ -1044,25 +1041,11 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, struct rxrpc_peer *); - -static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) -{ - atomic_inc(&peer->usage); - return peer; -} - -static inline -struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) -{ - return atomic_inc_not_zero(&peer->usage) ? 
peer : NULL; -} - -extern void __rxrpc_put_peer(struct rxrpc_peer *peer); -static inline void rxrpc_put_peer(struct rxrpc_peer *peer) -{ - if (peer && atomic_dec_and_test(&peer->usage)) - __rxrpc_put_peer(peer); -} +void rxrpc_destroy_all_peers(struct rxrpc_net *); +struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); +struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); +void rxrpc_put_peer(struct rxrpc_peer *); +void __rxrpc_queue_peer_error(struct rxrpc_peer *); /* * proc.c diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3028298ca561..a9a9be5519b9 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -34,7 +34,8 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, rxrpc_user_attach_call_t user_attach_call, - unsigned long user_call_ID, gfp_t gfp) + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id) { const void *here = __builtin_return_address(0); struct rxrpc_call *call; @@ -94,7 +95,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, /* Now it gets complicated, because calls get registered with the * socket here, particularly if a user ID is preassigned by the user. 
*/ - call = rxrpc_alloc_call(rx, gfp); + call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); @@ -137,6 +138,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); + rxnet = call->rxnet; write_lock(&rxnet->call_lock); list_add_tail(&call->link, &rxnet->calls); write_unlock(&rxnet->call_lock); @@ -174,7 +176,8 @@ int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) if (rx->discard_new_call) return 0; - while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0) + while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp, + atomic_inc_return(&rxrpc_debug_id)) == 0) ; return 0; @@ -216,6 +219,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) list_del(&conn->proc_link); write_unlock(&rxnet->conn_lock); kfree(conn); + if (atomic_dec_and_test(&rxnet->nr_conns)) + wake_up_var(&rxnet->nr_conns); tail = (tail + 1) & (size - 1); } @@ -223,7 +228,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->call_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; - call->socket = rx; + rcu_assign_pointer(call->socket, rx); if (rx->discard_new_call) { _debug("discard %lx", call->user_call_ID); rx->discard_new_call(call, call->user_call_ID); @@ -293,8 +298,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, b->conn_backlog[conn_tail] = NULL; smp_store_release(&b->conn_backlog_tail, (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); - rxrpc_get_local(local); - conn->params.local = local; + conn->params.local = rxrpc_get_local(local); conn->params.peer = peer; rxrpc_see_connection(conn); rxrpc_new_incoming_connection(rx, conn, skb); @@ -347,7 +351,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, service_id == rx->second_service)) goto found_service; - trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + trace_rxrpc_abort(0, "INV", 
sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, EOPNOTSUPP); skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; skb->priority = RX_INVALID_OPERATION; @@ -358,7 +362,7 @@ found_service: spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { - trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, + trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; skb->priority = RX_INVALID_OPERATION; @@ -454,6 +458,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, unsigned long user_call_ID, rxrpc_notify_rx_t notify_rx) __releases(&rx->sk.sk_lock.slock) + __acquires(call->user_mutex) { struct rxrpc_call *call; struct rb_node *parent, **pp; @@ -635,6 +640,7 @@ out_discard: * @user_attach_call: Func to attach call to user_call_ID * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. + * @debug_id: The tracing debug ID. * * Charge up the socket with preallocated calls, each with a user ID. A * function should be provided to effect the attachment from the user's side. 
@@ -645,7 +651,8 @@ out_discard: int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, rxrpc_user_attach_call_t user_attach_call, - unsigned long user_call_ID, gfp_t gfp) + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct rxrpc_backlog *b = rx->backlog; @@ -655,6 +662,6 @@ int rxrpc_kernel_charge_accept(struct socket *sock, return rxrpc_service_prealloc_one(rx, b, notify_rx, user_attach_call, user_call_ID, - gfp); + gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index ad2ab1103189..6e0d788b4dc4 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -195,6 +195,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) * the packets in the Tx buffer we're going to resend and what the new * resend timeout will be. */ + trace_rxrpc_resend(call, (cursor + 1) & RXRPC_RXTX_BUFF_MASK); oldest = now; for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; @@ -225,7 +226,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } - resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(oldest, now))); + resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); resend_at += jiffies + rxrpc_resend_timeout; WRITE_ONCE(call->resend_at, resend_at); @@ -237,7 +238,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) * retransmitting data. 
*/ if (!retrans) { - rxrpc_reduce_call_timer(call, resend_at, now, + rxrpc_reduce_call_timer(call, resend_at, now_j, rxrpc_timer_set_for_resend); spin_unlock_bh(&call->lock); ack_ts = ktime_sub(now, call->acks_latest_ts); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 0b2db38dd32d..f6734d8cb01a 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -99,9 +99,11 @@ found_extant_call: /* * allocate a new call */ -struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, + unsigned int debug_id) { struct rxrpc_call *call; + struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); call = kmem_cache_zalloc(rxrpc_call_jar, gfp); if (!call) @@ -138,7 +140,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) spin_lock_init(&call->notify_lock); rwlock_init(&call->state_lock); atomic_set(&call->usage, 1); - call->debug_id = atomic_inc_return(&rxrpc_debug_id); + call->debug_id = debug_id; call->tx_total_len = -1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; @@ -152,6 +154,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) call->cong_cwnd = 2; call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; + + call->rxnet = rxnet; + atomic_inc(&rxnet->nr_calls); return call; nomem_2: @@ -166,14 +171,15 @@ nomem: */ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, struct sockaddr_rxrpc *srx, - gfp_t gfp) + gfp_t gfp, + unsigned int debug_id) { struct rxrpc_call *call; ktime_t now; _enter(""); - call = rxrpc_alloc_call(rx, gfp); + call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; @@ -214,18 +220,20 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, struct rxrpc_call_params *p, - gfp_t gfp) + gfp_t gfp, + unsigned int debug_id) 
__releases(&rx->sk.sk_lock.slock) + __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; - struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); + struct rxrpc_net *rxnet; struct rb_node *parent, **pp; const void *here = __builtin_return_address(0); int ret; _enter("%p,%lx", rx, p->user_call_ID); - call = rxrpc_alloc_client_call(rx, srx, gfp); + call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id); if (IS_ERR(call)) { release_sock(&rx->sk); _leave(" = %ld", PTR_ERR(call)); @@ -268,6 +276,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); + rxnet = call->rxnet; write_lock(&rxnet->call_lock); list_add_tail(&call->link, &rxnet->calls); write_unlock(&rxnet->call_lock); @@ -613,7 +622,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) */ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { - struct rxrpc_net *rxnet; + struct rxrpc_net *rxnet = call->rxnet; const void *here = __builtin_return_address(0); int n; @@ -627,7 +636,6 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { - rxnet = rxrpc_net(sock_net(&call->socket->sk)); write_lock(&rxnet->call_lock); list_del_init(&call->link); write_unlock(&rxnet->call_lock); @@ -643,11 +651,14 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) { struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + struct rxrpc_net *rxnet = call->rxnet; rxrpc_put_peer(call->peer); kfree(call->rxtx_buffer); kfree(call->rxtx_annotations); kmem_cache_free(rxrpc_call_jar, call); + if (atomic_dec_and_test(&rxnet->nr_calls)) + wake_up_var(&rxnet->nr_calls); } /* @@ -712,4 +723,7 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) } write_unlock(&rxnet->call_lock); + + atomic_dec(&rxnet->nr_calls); + wait_var_event(&rxnet->nr_calls, 
!atomic_read(&rxnet->nr_calls)); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 064175068059..5736f643c516 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -207,6 +207,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) if (ret < 0) goto error_2; + atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); @@ -776,7 +777,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) unsigned int channel = call->cid & RXRPC_CHANNELMASK; struct rxrpc_connection *conn = call->conn; struct rxrpc_channel *chan = &conn->channels[channel]; - struct rxrpc_net *rxnet = rxrpc_net(sock_net(&call->socket->sk)); + struct rxrpc_net *rxnet = conn->params.local->rxnet; trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); call->conn = NULL; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index b1dfae107431..c717152070df 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -136,6 +136,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, } kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); + conn->params.peer->last_tx_at = ktime_get_real(); _leave(""); return; } @@ -160,7 +161,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, lockdep_is_held(&conn->channel_lock)); if (call) { if (compl == RXRPC_CALL_LOCALLY_ABORTED) - trace_rxrpc_abort("CON", call->cid, + trace_rxrpc_abort(call->debug_id, + "CON", call->cid, call->call_id, 0, abort_code, error); if (rxrpc_set_call_completion(call, compl, @@ -238,6 +240,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, return -EAGAIN; } + conn->params.peer->last_tx_at = ktime_get_real(); + _leave(" = 0"); return 0; } diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ccbac190add1..4c77a78a252a 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c 
@@ -365,6 +365,9 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) key_put(conn->params.key); key_put(conn->server_key); rxrpc_put_peer(conn->params.peer); + + if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns)) + wake_up_var(&conn->params.local->rxnet->nr_conns); rxrpc_put_local(conn->params.local); kfree(conn); @@ -418,7 +421,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) continue; - trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, 0); + trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, NULL); if (rxrpc_conn_is_client(conn)) BUG(); @@ -458,6 +461,7 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) _enter(""); + atomic_dec(&rxnet->nr_conns); rxrpc_destroy_all_client_connections(rxnet); del_timer_sync(&rxnet->service_conn_reap_timer); @@ -475,5 +479,9 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) ASSERT(list_empty(&rxnet->conn_proc_list)); + /* We need to wait for the connections to be destroyed by RCU as they + * pin things that we still need to get rid of. 
+ */ + wait_var_event(&rxnet->nr_conns, !atomic_read(&rxnet->nr_conns)); _leave(""); } diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index f6fcdb3130a1..80773a50c755 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -132,6 +132,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn conn->state = RXRPC_CONN_SERVICE_PREALLOC; atomic_set(&conn->usage, 2); + atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->link, &rxnet->service_conns); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 6fc61400337f..0410d2277ca2 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1183,6 +1183,8 @@ void rxrpc_data_ready(struct sock *udp_sk) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: + if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) + goto discard; rxrpc_post_packet_to_local(local, skb); goto out; @@ -1198,6 +1200,12 @@ void rxrpc_data_ready(struct sock *udp_sk) !rxrpc_validate_jumbo(skb)) goto bad_message; break; + + /* Packet types 9-11 should just be ignored. */ + case RXRPC_PACKET_TYPE_PARAMS: + case RXRPC_PACKET_TYPE_10: + case RXRPC_PACKET_TYPE_11: + goto discard; } rcu_read_lock(); @@ -1240,16 +1248,19 @@ void rxrpc_data_ready(struct sock *udp_sk) goto discard_unlock; if (sp->hdr.callNumber == chan->last_call) { - /* For the previous service call, if completed successfully, we - * discard all further packets. + if (chan->call || + sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) + goto discard_unlock; + + /* For the previous service call, if completed + * successfully, we discard all further packets. */ if (rxrpc_conn_is_service(conn) && - (chan->last_type == RXRPC_PACKET_TYPE_ACK || - sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)) + chan->last_type == RXRPC_PACKET_TYPE_ACK) goto discard_unlock; - /* But otherwise we need to retransmit the final packet from - * data cached in the connection record. 
+ /* But otherwise we need to retransmit the final packet + * from data cached in the connection record. */ rxrpc_post_packet_to_conn(conn, skb); goto out_unlock; @@ -1307,21 +1318,21 @@ out_unlock: wrong_security: rcu_read_unlock(); - trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RXKADINCONSISTENCY, EBADMSG); skb->priority = RXKADINCONSISTENCY; goto post_abort; reupgrade: rcu_read_unlock(); - trace_rxrpc_abort("UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_PROTOCOL_ERROR, EBADMSG); goto protocol_error; bad_message_unlock: rcu_read_unlock(); bad_message: - trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_PROTOCOL_ERROR, EBADMSG); protocol_error: skb->priority = RX_PROTOCOL_ERROR; diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 38b99db30e54..8b54e9531d52 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -95,6 +95,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; + trace_rxrpc_local(local, rxrpc_local_new, 1, NULL); } _leave(" = %p", local); @@ -257,15 +258,74 @@ addr_in_use: } /* + * Get a ref on a local endpoint. + */ +struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_inc_return(&local->usage); + trace_rxrpc_local(local, rxrpc_local_got, n, here); + return local; +} + +/* + * Get a ref on a local endpoint unless its usage has already reached 0. 
+ */ +struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) +{ + const void *here = __builtin_return_address(0); + + if (local) { + int n = __atomic_add_unless(&local->usage, 1, 0); + if (n > 0) + trace_rxrpc_local(local, rxrpc_local_got, n + 1, here); + else + local = NULL; + } + return local; +} + +/* + * Queue a local endpoint. + */ +void rxrpc_queue_local(struct rxrpc_local *local) +{ + const void *here = __builtin_return_address(0); + + if (rxrpc_queue_work(&local->processor)) + trace_rxrpc_local(local, rxrpc_local_queued, + atomic_read(&local->usage), here); +} + +/* * A local endpoint reached its end of life. */ -void __rxrpc_put_local(struct rxrpc_local *local) +static void __rxrpc_put_local(struct rxrpc_local *local) { _enter("%d", local->debug_id); rxrpc_queue_work(&local->processor); } /* + * Drop a ref on a local endpoint. + */ +void rxrpc_put_local(struct rxrpc_local *local) +{ + const void *here = __builtin_return_address(0); + int n; + + if (local) { + n = atomic_dec_return(&local->usage); + trace_rxrpc_local(local, rxrpc_local_put, n, here); + + if (n == 0) + __rxrpc_put_local(local); + } +} + +/* * Destroy a local endpoint's socket and then hand the record to RCU to dispose * of. 
* @@ -322,7 +382,8 @@ static void rxrpc_local_processor(struct work_struct *work) container_of(work, struct rxrpc_local, processor); bool again; - _enter("%d", local->debug_id); + trace_rxrpc_local(local, rxrpc_local_processing, + atomic_read(&local->usage), NULL); do { again = false; diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index f18c9248e0d4..c7a023fb22d0 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -32,13 +32,22 @@ static void rxrpc_service_conn_reap_timeout(struct timer_list *timer) rxrpc_queue_work(&rxnet->service_conn_reaper); } +static void rxrpc_peer_keepalive_timeout(struct timer_list *timer) +{ + struct rxrpc_net *rxnet = + container_of(timer, struct rxrpc_net, peer_keepalive_timer); + + if (rxnet->live) + rxrpc_queue_work(&rxnet->peer_keepalive_work); +} + /* * Initialise a per-network namespace record. */ static __net_init int rxrpc_init_net(struct net *net) { struct rxrpc_net *rxnet = rxrpc_net(net); - int ret; + int ret, i; rxnet->live = true; get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch)); @@ -46,7 +55,9 @@ static __net_init int rxrpc_init_net(struct net *net) INIT_LIST_HEAD(&rxnet->calls); rwlock_init(&rxnet->call_lock); + atomic_set(&rxnet->nr_calls, 1); + atomic_set(&rxnet->nr_conns, 1); INIT_LIST_HEAD(&rxnet->conn_proc_list); INIT_LIST_HEAD(&rxnet->service_conns); rwlock_init(&rxnet->conn_lock); @@ -70,8 +81,16 @@ static __net_init int rxrpc_init_net(struct net *net) INIT_LIST_HEAD(&rxnet->local_endpoints); mutex_init(&rxnet->local_mutex); + hash_init(rxnet->peer_hash); spin_lock_init(&rxnet->peer_hash_lock); + for (i = 0; i < ARRAY_SIZE(rxnet->peer_keepalive); i++) + INIT_HLIST_HEAD(&rxnet->peer_keepalive[i]); + INIT_HLIST_HEAD(&rxnet->peer_keepalive_new); + timer_setup(&rxnet->peer_keepalive_timer, + rxrpc_peer_keepalive_timeout, 0); + INIT_WORK(&rxnet->peer_keepalive_work, rxrpc_peer_keepalive_worker); + rxnet->peer_keepalive_base = ktime_add(ktime_get_real(), NSEC_PER_SEC); ret = -ENOMEM; rxnet->proc_net = 
proc_net_mkdir(net, "rxrpc", net->proc_net); @@ -95,8 +114,11 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; + del_timer_sync(&rxnet->peer_keepalive_timer); + cancel_work_sync(&rxnet->peer_keepalive_work); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); + rxrpc_destroy_all_peers(rxnet); rxrpc_destroy_all_locals(rxnet); proc_remove(rxnet->proc_net); } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 42410e910aff..7f1fc04775b3 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -32,6 +32,8 @@ struct rxrpc_abort_buffer { __be32 abort_code; }; +static const char rxrpc_keepalive_string[] = ""; + /* * Arrange for a keepalive ping a certain time after we last transmitted. This * lets the far side know we're still interested in this call and helps keep @@ -122,6 +124,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, struct kvec iov[2]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; + ktime_t now; size_t len, n; int ret; u8 reason; @@ -203,8 +206,10 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + now = ktime_get_real(); if (ping) - call->ping_time = ktime_get_real(); + call->ping_time = now; + conn->params.peer->last_tx_at = ktime_get_real(); if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { @@ -288,6 +293,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, sizeof(pkt)); + conn->params.peer->last_tx_at = ktime_get_real(); rxrpc_put_connection(conn); return ret; @@ -378,6 +384,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, * message and update the peer record */ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + conn->params.peer->last_tx_at = ktime_get_real(); up_read(&conn->params.local->defrag_sem); if (ret == -EMSGSIZE) @@ 
-429,6 +436,7 @@ send_fragmentable: if (ret == 0) { ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + conn->params.peer->last_tx_at = ktime_get_real(); opt = IP_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, SOL_IP, @@ -445,7 +453,8 @@ send_fragmentable: (char *)&opt, sizeof(opt)); if (ret == 0) { ret = kernel_sendmsg(conn->params.local->socket, &msg, - iov, 1, iov[0].iov_len); + iov, 2, len); + conn->params.peer->last_tx_at = ktime_get_real(); opt = IPV6_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, @@ -515,3 +524,51 @@ void rxrpc_reject_packets(struct rxrpc_local *local) _leave(""); } + +/* + * Send a VERSION reply to a peer as a keepalive. + */ +void rxrpc_send_keepalive(struct rxrpc_peer *peer) +{ + struct rxrpc_wire_header whdr; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter(""); + + msg.msg_name = &peer->srx.transport; + msg.msg_namelen = peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(peer->local->rxnet->epoch); + whdr.cid = 0; + whdr.callNumber = 0; + whdr.seq = 0; + whdr.serial = 0; + whdr.type = RXRPC_PACKET_TYPE_VERSION; /* Not client-initiated */ + whdr.flags = RXRPC_LAST_PACKET; + whdr.userStatus = 0; + whdr.securityIndex = 0; + whdr._rsvd = 0; + whdr.serviceId = 0; + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = (char *)rxrpc_keepalive_string; + iov[1].iov_len = sizeof(rxrpc_keepalive_string); + + len = iov[0].iov_len + iov[1].iov_len; + + _proto("Tx VERSION (keepalive)"); + + ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len); + if (ret < 0) + _debug("sendmsg failed: %d", ret); + + peer->last_tx_at = ktime_get_real(); + _leave(""); +} diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 7f749505e699..78c2f95d1f22 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -192,7 +192,7 @@ void rxrpc_error_report(struct sock *sk) 
rxrpc_free_skb(skb, rxrpc_skb_rx_freed); /* The ref we obtained is passed off to the work item */ - rxrpc_queue_work(&peer->error_distributor); + __rxrpc_queue_peer_error(peer); _leave(""); } @@ -348,3 +348,99 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, usage, avg); } + +/* + * Perform keep-alive pings with VERSION packets to keep any NAT alive. + */ +void rxrpc_peer_keepalive_worker(struct work_struct *work) +{ + struct rxrpc_net *rxnet = + container_of(work, struct rxrpc_net, peer_keepalive_work); + struct rxrpc_peer *peer; + unsigned long delay; + ktime_t base, now = ktime_get_real(); + s64 diff; + u8 cursor, slot; + + base = rxnet->peer_keepalive_base; + cursor = rxnet->peer_keepalive_cursor; + + _enter("%u,%lld", cursor, ktime_sub(now, base)); + +next_bucket: + diff = ktime_to_ns(ktime_sub(now, base)); + if (diff < 0) + goto resched; + + _debug("at %u", cursor); + spin_lock_bh(&rxnet->peer_hash_lock); +next_peer: + if (!rxnet->live) { + spin_unlock_bh(&rxnet->peer_hash_lock); + goto out; + } + + /* Everything in the bucket at the cursor is processed this second; the + * bucket at cursor + 1 goes now + 1s and so on... 
+ */ + if (hlist_empty(&rxnet->peer_keepalive[cursor])) { + if (hlist_empty(&rxnet->peer_keepalive_new)) { + spin_unlock_bh(&rxnet->peer_hash_lock); + goto emptied_bucket; + } + + hlist_move_list(&rxnet->peer_keepalive_new, + &rxnet->peer_keepalive[cursor]); + } + + peer = hlist_entry(rxnet->peer_keepalive[cursor].first, + struct rxrpc_peer, keepalive_link); + hlist_del_init(&peer->keepalive_link); + if (!rxrpc_get_peer_maybe(peer)) + goto next_peer; + + spin_unlock_bh(&rxnet->peer_hash_lock); + + _debug("peer %u {%pISp}", peer->debug_id, &peer->srx.transport); + +recalc: + diff = ktime_divns(ktime_sub(peer->last_tx_at, base), NSEC_PER_SEC); + if (diff < -30 || diff > 30) + goto send; /* LSW of 64-bit time probably wrapped on 32-bit */ + diff += RXRPC_KEEPALIVE_TIME - 1; + if (diff < 0) + goto send; + + slot = (diff > RXRPC_KEEPALIVE_TIME - 1) ? RXRPC_KEEPALIVE_TIME - 1 : diff; + if (slot == 0) + goto send; + + /* A transmission to this peer occurred since last we examined it so + * put it into the appropriate future bucket. 
+ */ + slot = (slot + cursor) % ARRAY_SIZE(rxnet->peer_keepalive); + spin_lock_bh(&rxnet->peer_hash_lock); + hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive[slot]); + rxrpc_put_peer(peer); + goto next_peer; + +send: + rxrpc_send_keepalive(peer); + now = ktime_get_real(); + goto recalc; + +emptied_bucket: + cursor++; + if (cursor >= ARRAY_SIZE(rxnet->peer_keepalive)) + cursor = 0; + base = ktime_add_ns(base, NSEC_PER_SEC); + goto next_bucket; + +resched: + rxnet->peer_keepalive_base = base; + rxnet->peer_keepalive_cursor = cursor; + delay = nsecs_to_jiffies(-diff) + 1; + timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay); +out: + _leave(""); +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index d02a99f37f5f..1b7e8107b3ae 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -322,6 +322,7 @@ struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, if (!peer) { peer = prealloc; hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); + hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive_new); } spin_unlock(&rxnet->peer_hash_lock); @@ -363,9 +364,12 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; - if (!peer) + if (!peer) { hash_add_rcu(rxnet->peer_hash, &candidate->hash_link, hash_key); + hlist_add_head(&candidate->keepalive_link, + &rxnet->peer_keepalive_new); + } spin_unlock_bh(&rxnet->peer_hash_lock); @@ -382,9 +386,54 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, } /* - * Discard a ref on a remote peer record. + * Get a ref on a peer record. 
+ */ +struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_inc_return(&peer->usage); + trace_rxrpc_peer(peer, rxrpc_peer_got, n, here); + return peer; +} + +/* + * Get a ref on a peer record unless its usage has already reached 0. + */ +struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) +{ + const void *here = __builtin_return_address(0); + + if (peer) { + int n = __atomic_add_unless(&peer->usage, 1, 0); + if (n > 0) + trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here); + else + peer = NULL; + } + return peer; +} + +/* + * Queue a peer record. This passes the caller's ref to the workqueue. + */ +void __rxrpc_queue_peer_error(struct rxrpc_peer *peer) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_read(&peer->usage); + if (rxrpc_queue_work(&peer->error_distributor)) + trace_rxrpc_peer(peer, rxrpc_peer_queued_error, n, here); + else + rxrpc_put_peer(peer); +} + +/* + * Discard a peer record. */ -void __rxrpc_put_peer(struct rxrpc_peer *peer) +static void __rxrpc_put_peer(struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = peer->local->rxnet; @@ -392,11 +441,49 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer) spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); + hlist_del_init(&peer->keepalive_link); spin_unlock_bh(&rxnet->peer_hash_lock); kfree_rcu(peer, rcu); } +/* + * Drop a ref on a peer record. + */ +void rxrpc_put_peer(struct rxrpc_peer *peer) +{ + const void *here = __builtin_return_address(0); + int n; + + if (peer) { + n = atomic_dec_return(&peer->usage); + trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + if (n == 0) + __rxrpc_put_peer(peer); + } +} + +/* + * Make sure all peer records have been discarded. 
+ */ +void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) +{ + struct rxrpc_peer *peer; + int i; + + for (i = 0; i < HASH_SIZE(rxnet->peer_hash); i++) { + if (hlist_empty(&rxnet->peer_hash[i])) + continue; + + hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) { + pr_err("Leaked peer %u {%u} %pISp\n", + peer->debug_id, + atomic_read(&peer->usage), + &peer->srx.transport); + } + } +} + /** * rxrpc_kernel_get_peer - Get the peer address of a call * @sock: The socket on which the call is in progress. diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index f79f260c6ddc..7e45db058823 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -29,6 +29,8 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { * generate a list of extant and dead calls in /proc/net/rxrpc_calls */ static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) + __acquires(rcu) + __acquires(rxnet->call_lock) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); @@ -45,6 +47,8 @@ static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) + __releases(rxnet->call_lock) + __releases(rcu) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); @@ -135,6 +139,7 @@ const struct file_operations rxrpc_call_seq_fops = { * generate a list of extant virtual connections in /proc/net/rxrpc_conns */ static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) + __acquires(rxnet->conn_lock) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); @@ -151,6 +156,7 @@ static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, } static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) + __releases(rxnet->conn_lock) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 4bddcf3face3..93da73bf7098 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -46,6 +46,9 @@ struct 
rxrpc_wire_header { #define RXRPC_PACKET_TYPE_CHALLENGE 6 /* connection security challenge (SRVR->CLNT) */ #define RXRPC_PACKET_TYPE_RESPONSE 7 /* connection secutity response (CLNT->SRVR) */ #define RXRPC_PACKET_TYPE_DEBUG 8 /* debug info request */ +#define RXRPC_PACKET_TYPE_PARAMS 9 /* Parameter negotiation (unspec'd, ignore) */ +#define RXRPC_PACKET_TYPE_10 10 /* Ignored */ +#define RXRPC_PACKET_TYPE_11 11 /* Ignored */ #define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ #define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ @@ -78,6 +81,9 @@ struct rxrpc_wire_header { (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ + (1 << RXRPC_PACKET_TYPE_PARAMS) | \ + (1 << RXRPC_PACKET_TYPE_10) | \ + (1 << RXRPC_PACKET_TYPE_11) | \ (1 << RXRPC_PACKET_TYPE_VERSION)) /*****************************************************************************/ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 9d45d8b56744..7bff716e911e 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -272,7 +272,7 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, unsigned int *_offset, unsigned int *_len) { unsigned int offset = sizeof(struct rxrpc_wire_header); - unsigned int len = *_len; + unsigned int len; int ret; u8 annotation = *_annotation; diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 77cb23c7bd0a..588fea0dd362 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -668,6 +668,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) return -EAGAIN; } + conn->params.peer->last_tx_at = ktime_get_real(); _leave(" = 0"); return 0; } @@ -722,6 +723,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn, return -EAGAIN; } + conn->params.peer->last_tx_at = ktime_get_real(); _leave(" = 0"); return 0; } diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index e9f428351293..c4479afe8ae7 100644 --- 
a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -19,9 +19,6 @@ #include <keys/rxrpc-type.h> #include "ar-internal.h" -static LIST_HEAD(rxrpc_security_methods); -static DECLARE_RWSEM(rxrpc_security_sem); - static const struct rxrpc_security *rxrpc_security_types[] = { [RXRPC_SECURITY_NONE] = &rxrpc_no_security, #ifdef CONFIG_RXKAD diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 09f2a3e05221..206e802ccbdc 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -130,7 +130,9 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix) spin_lock_bh(&call->lock); if (call->state < RXRPC_CALL_COMPLETE) { - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + call->rxtx_annotations[ix] = + (call->rxtx_annotations[ix] & RXRPC_TX_ANNO_LAST) | + RXRPC_TX_ANNO_RETRANS; if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_queue_call(call); } @@ -554,6 +556,7 @@ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) __releases(&rx->sk.sk_lock.slock) + __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; @@ -579,9 +582,11 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL); + call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL, + atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ + rxrpc_put_peer(cp.peer); _leave(" = %p\n", call); return call; } @@ -593,6 +598,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) + __releases(&call->user_mutex) { enum rxrpc_call_state state; struct rxrpc_call *call; diff --git a/net/sched/Kconfig b/net/sched/Kconfig 
index f24a6ae6819a..a01169fb5325 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -658,6 +658,18 @@ config NET_EMATCH_IPSET To compile this code as a module, choose M here: the module will be called em_ipset. +config NET_EMATCH_IPT + tristate "IPtables Matches" + depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES + ---help--- + Say Y here to be able to classify packets based on iptables + matches. + Current supported match is "policy" which allows packet classification + based on IPsec policy that was used during decapsulation + + To compile this code as a module, choose M here: the + module will be called em_ipt. + config NET_CLS_ACT bool "Actions" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 5b635447e3f8..8811d3804878 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -75,3 +75,4 @@ obj-$(CONFIG_NET_EMATCH_META) += em_meta.o obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o +obj-$(CONFIG_NET_EMATCH_IPT) += em_ipt.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index eba6682727dd..72251241665a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -109,6 +109,42 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) } EXPORT_SYMBOL(__tcf_idr_release); +static size_t tcf_action_shared_attrs_size(const struct tc_action *act) +{ + u32 cookie_len = 0; + + if (act->act_cookie) + cookie_len = nla_total_size(act->act_cookie->len); + + return nla_total_size(0) /* action number nested */ + + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + + cookie_len /* TCA_ACT_COOKIE */ + + nla_total_size(0) /* TCA_ACT_STATS nested */ + /* TCA_STATS_BASIC */ + + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) + /* TCA_STATS_QUEUE */ + + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + + nla_total_size(0) /* TCA_OPTIONS nested */ + + nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */ +} + +static size_t 
tcf_action_full_attrs_size(size_t sz) +{ + return NLMSG_HDRLEN /* struct nlmsghdr */ + + sizeof(struct tcamsg) + + nla_total_size(0) /* TCA_ACT_TAB nested */ + + sz; +} + +static size_t tcf_action_fill_size(const struct tc_action *act) +{ + size_t sz = tcf_action_shared_attrs_size(act); + + if (act->ops->get_fill_size) + return act->ops->get_fill_size(act) + sz; + return sz; +} + static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { @@ -135,8 +171,10 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, continue; nest = nla_nest_start(skb, n_i); - if (!nest) + if (!nest) { + index--; goto nla_put_failure; + } err = tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; @@ -202,7 +240,8 @@ nla_put_failure: int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -211,7 +250,8 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } else if (type == RTM_GETACTION) { return tcf_dump_walker(idrinfo, skb, cb); } else { - WARN(1, "tcf_generic_walker: unknown action %d\n", type); + WARN(1, "tcf_generic_walker: unknown command %d\n", type); + NL_SET_ERR_MSG(extack, "tcf_generic_walker: unknown command"); return -EINVAL; } } @@ -258,14 +298,6 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, } EXPORT_SYMBOL(tcf_idr_check); -void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est) -{ - if (est) - gen_kill_estimator(&a->tcfa_rate_est); - free_tcf(a); -} -EXPORT_SYMBOL(tcf_idr_cleanup); - int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats) @@ -605,7 +637,8 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) struct tc_action 
*tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind) + char *name, int ovr, int bind, + struct netlink_ext_ack *extack) { struct tc_action *a; struct tc_action_ops *a_o; @@ -616,31 +649,40 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; if (name == NULL) { - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; - if (kind == NULL) + if (!kind) { + NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; - if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) + } + if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { + NL_SET_ERR_MSG(extack, "TC action name too long"); goto err_out; + } if (tb[TCA_ACT_COOKIE]) { int cklen = nla_len(tb[TCA_ACT_COOKIE]); - if (cklen > TC_COOKIE_MAX_SIZE) + if (cklen > TC_COOKIE_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "TC cookie size above the maximum"); goto err_out; + } cookie = nla_memdup_cookie(tb); if (!cookie) { + NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); err = -ENOMEM; goto err_out; } } } else { - err = -EINVAL; - if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) + if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { + NL_SET_ERR_MSG(extack, "TC action name too long"); + err = -EINVAL; goto err_out; + } } a_o = tc_lookup_action_n(act_name); @@ -663,15 +705,17 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_mod; } #endif + NL_SET_ERR_MSG(extack, "Failed to load TC action module"); err = -ENOENT; goto err_out; } /* backward compatibility for policer */ if (name == NULL) - err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, + extack); else - err = a_o->init(net, nla, est, &a, ovr, bind); + err = a_o->init(net, nla, est, &a, ovr, bind, 
extack); if (err < 0) goto err_mod; @@ -697,6 +741,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, list_add_tail(&a->list, &actions); tcf_action_destroy(&actions, bind); + NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); return ERR_PTR(err); } } @@ -726,29 +771,35 @@ static void cleanup_a(struct list_head *actions, int ovr) int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions) + struct list_head *actions, size_t *attr_size, + struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; + size_t sz = 0; int err; int i; - err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (err < 0) return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind); + act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, + extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; } act->order = i; + sz += tcf_action_fill_size(act); if (ovr) act->tcfa_refcnt++; list_add_tail(&act->list, actions); } + *attr_size = tcf_action_full_attrs_size(sz); + /* Remove the temp refcnt which was necessary to protect against * destroying an existing action which was being replaced */ @@ -822,7 +873,7 @@ static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); - if (nest == NULL) + if (!nest) goto out_nlmsg_trim; if (tcf_action_dump(skb, actions, bind, ref) < 0) @@ -840,7 +891,8 @@ out_nlmsg_trim: static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, - struct list_head *actions, int event) + struct list_head *actions, int event, + struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -849,6 +901,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, return 
-ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; } @@ -857,7 +910,8 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, } static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid) + struct nlmsghdr *n, u32 portid, + struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; @@ -865,22 +919,26 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, int index; int err; - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; err = -EINVAL; if (tb[TCA_ACT_INDEX] == NULL || - nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) + nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) { + NL_SET_ERR_MSG(extack, "Invalid TC action index value"); goto err_out; + } index = nla_get_u32(tb[TCA_ACT_INDEX]); err = -EINVAL; ops = tc_lookup_action(tb[TCA_ACT_KIND]); - if (!ops) /* could happen in batch of actions */ + if (!ops) { /* could happen in batch of actions */ + NL_SET_ERR_MSG(extack, "Specified TC action not found"); goto err_out; + } err = -ENOENT; - if (ops->lookup(net, &a, index) == 0) + if (ops->lookup(net, &a, index, extack) == 0) goto err_mod; module_put(ops->owner); @@ -893,7 +951,8 @@ err_out: } static int tca_action_flush(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid) + struct nlmsghdr *n, u32 portid, + struct netlink_ext_ack *extack) { struct sk_buff *skb; unsigned char *b; @@ -907,39 +966,45 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) { - pr_debug("tca_action_flush: failed skb alloc\n"); + if (!skb) return err; - } b = skb_tail_pointer(skb); - err = nla_parse_nested(tb, 
TCA_ACT_MAX, nla, NULL, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; ops = tc_lookup_action(kind); - if (!ops) /*some idjot trying to flush unknown action */ + if (!ops) { /*some idjot trying to flush unknown action */ + NL_SET_ERR_MSG(extack, "Cannot flush unknown TC action"); goto err_out; + } nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); - if (!nlh) + if (!nlh) { + NL_SET_ERR_MSG(extack, "Failed to create TC action flush notification"); goto out_module_put; + } t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); - if (nest == NULL) + if (!nest) { + NL_SET_ERR_MSG(extack, "Failed to add new netlink message"); goto out_module_put; + } - err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); - if (err <= 0) + err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops, extack); + if (err <= 0) { + nla_nest_cancel(skb, nest); goto out_module_put; + } nla_nest_end(skb, nest); @@ -950,6 +1015,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) return 0; + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification"); return err; @@ -962,17 +1029,19 @@ err_out: static int tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid) + u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { int ret; struct sk_buff *skb; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? 
NLMSG_GOODSIZE : attr_size, + GFP_KERNEL); if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, 0, 1) <= 0) { + NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; } @@ -980,6 +1049,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, /* now do the delete */ ret = tcf_action_destroy(actions, 0); if (ret < 0) { + NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); return ret; } @@ -993,38 +1063,43 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, static int tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, - u32 portid, int event) + u32 portid, int event, struct netlink_ext_ack *extack) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; + size_t attr_size = 0; LIST_HEAD(actions); - ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, NULL); + ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) return ret; if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { - if (tb[1] != NULL) - return tca_action_flush(net, tb[1], n, portid); - else - return -EINVAL; + if (tb[1]) + return tca_action_flush(net, tb[1], n, portid, extack); + + NL_SET_ERR_MSG(extack, "Invalid netlink attributes while flushing TC action"); + return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(net, tb[i], n, portid); + act = tcf_action_get_1(net, tb[i], n, portid, extack); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; } act->order = i; + attr_size += tcf_action_fill_size(act); list_add_tail(&act->list, &actions); } + attr_size = tcf_action_full_attrs_size(attr_size); + if (event == RTM_GETACTION) - ret = tcf_get_notify(net, portid, n, &actions, event); + ret = tcf_get_notify(net, portid, n, &actions, event, extack); else { /* delete */ - ret = tcf_del_notify(net, n, &actions, portid); + ret = 
tcf_del_notify(net, n, &actions, portid, attr_size, extack); if (ret) goto err; return ret; @@ -1037,17 +1112,19 @@ err: static int tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid) + u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; int err = 0; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, + GFP_KERNEL); if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_NEWACTION, 0, 0) <= 0) { + NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; } @@ -1060,16 +1137,19 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, } static int tcf_action_add(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid, int ovr) + struct nlmsghdr *n, u32 portid, int ovr, + struct netlink_ext_ack *extack) { + size_t attr_size = 0; int ret = 0; LIST_HEAD(actions); - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions); + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, + &attr_size, extack); if (ret) return ret; - return tcf_add_notify(net, n, &actions, portid); + return tcf_add_notify(net, n, &actions, portid, attr_size, extack); } static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; @@ -1097,7 +1177,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, return ret; if (tca[TCA_ACT_TAB] == NULL) { - pr_notice("tc_ctl_action: received NO action attribs\n"); + NL_SET_ERR_MSG(extack, "Netlink action attributes missing"); return -EINVAL; } @@ -1113,17 +1193,18 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr); + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, + extack); if (ret == -EAGAIN) 
goto replay; break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, - portid, RTM_DELACTION); + portid, RTM_DELACTION, extack); break; case RTM_GETACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, - portid, RTM_GETACTION); + portid, RTM_GETACTION, extack); break; default: BUG(); @@ -1218,7 +1299,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto out_module_put; - ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o); + ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o, NULL); if (ret < 0) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index b3f2c15affa7..9092531d45d8 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -272,7 +272,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind) + int replace, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; @@ -352,7 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return res; out: if (res == ACT_P_CREATED) - tcf_idr_cleanup(*act, est); + tcf_idr_release(*act, bind); return ret; } @@ -367,14 +367,16 @@ static void tcf_bpf_cleanup(struct tc_action *act) static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, 
bpf_net_id); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2b15ba84e0c8..e4b880fa51fe 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,7 +96,8 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; @@ -176,14 +177,16 @@ nla_put_failure: static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b7ba9b06b147..7e28b2ce1437 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -46,7 +46,7 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_old, *params_new; @@ -350,7 +350,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl, { struct sctphdr *sctph; - if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) + if (skb_is_gso(skb) && skb_is_gso_sctp(skb)) return 1; sctph = 
tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph)); @@ -626,19 +626,22 @@ static void tcf_csum_cleanup(struct tc_action *a) struct tcf_csum_params *params; params = rcu_dereference_protected(p->params, 1); - kfree_rcu(params, rcu); + if (params) + kfree_rcu(params, rcu); } static int tcf_csum_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b56986d41c87..4dc4f153cad8 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -56,7 +56,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; @@ -201,20 +201,35 @@ nla_put_failure: static int tcf_gact_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index, 
+ struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); return tcf_idr_search(tn, a, index); } +static size_t tcf_gact_get_fill_size(const struct tc_action *act) +{ + size_t sz = nla_total_size(sizeof(struct tc_gact)); /* TCA_GACT_PARMS */ + +#ifdef CONFIG_GACT_PROB + if (to_gact(act)->tcfg_ptype) + /* TCA_GACT_PROB */ + sz += nla_total_size(sizeof(struct tc_gact_p)); +#endif + + return sz; +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -225,6 +240,7 @@ static struct tc_action_ops act_gact_ops = { .init = tcf_gact_init, .walk = tcf_gact_walker, .lookup = tcf_gact_search, + .get_fill_size = tcf_gact_get_fill_size, .size = sizeof(struct tcf_gact), }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 5954e992685a..a5994cf0512b 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -447,7 +447,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; @@ -824,14 +824,16 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, static int tcf_ife_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); diff --git 
a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 06e380ae0928..14c312d7908f 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -80,9 +80,12 @@ static void ipt_destroy_target(struct xt_entry_target *t) static void tcf_ipt_release(struct tc_action *a) { struct tcf_ipt *ipt = to_ipt(a); - ipt_destroy_target(ipt->tcfi_t); + + if (ipt->tcfi_t) { + ipt_destroy_target(ipt->tcfi_t); + kfree(ipt->tcfi_t); + } kfree(ipt->tcfi_tname); - kfree(ipt->tcfi_t); } static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { @@ -187,13 +190,13 @@ err2: kfree(tname); err1: if (ret == ACT_P_CREATED) - tcf_idr_cleanup(*a, est); + tcf_idr_release(*a, bind); return err; } static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, bind); @@ -201,7 +204,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, bind); @@ -303,14 +306,16 @@ nla_put_failure: static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ipt_net_id); @@ -351,14 +356,16 @@ static struct pernet_operations ipt_net_ops = { static int 
tcf_xt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, xt_net_id); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e6ff88f72900..fd34015331ab 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -69,7 +69,7 @@ static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; @@ -80,13 +80,17 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, bool exists = false; int ret; - if (nla == NULL) + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); return -EINVAL; - ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL); + } + ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack); if (ret < 0) return ret; - if (tb[TCA_MIRRED_PARMS] == NULL) + if (!tb[TCA_MIRRED_PARMS]) { + NL_SET_ERR_MSG_MOD(extack, "Missing required mirred parameters"); return -EINVAL; + } parm = nla_data(tb[TCA_MIRRED_PARMS]); exists = tcf_idr_check(tn, parm->index, a, bind); @@ -102,6 +106,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } if (parm->ifindex) { @@ -117,8 
+122,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } if (!exists) { - if (dev == NULL) + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; + } ret = tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); if (ret) @@ -265,14 +272,16 @@ nla_put_failure: static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 98c6a4b2f523..4b5848b6c252 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -37,7 +37,8 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { }; static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action **a, int ovr, int bind) + struct tc_action **a, int ovr, int bind, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -277,14 +278,16 @@ nla_put_failure: static int tcf_nat_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_nat_search(struct net *net, struct tc_action **a, u32 
index) +static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 349beaffb29e..8a925c72db5f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -132,7 +132,7 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb, static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; @@ -176,7 +176,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - tcf_idr_cleanup(*a, est); + tcf_idr_release(*a, bind); kfree(keys_ex); return -ENOMEM; } @@ -419,14 +419,16 @@ nla_put_failure: static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 95d3c9097b25..4e72bc2a0dfb 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -58,11 +58,12 @@ static struct tc_action_ops act_police_ops; static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct 
tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, police_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { @@ -74,7 +75,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_act_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, + struct netlink_ext_ack *extack) { int ret = 0, err; struct nlattr *tb[TCA_POLICE_MAX + 1]; @@ -194,7 +196,7 @@ failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) - tcf_idr_cleanup(*a, est); + tcf_idr_release(*a, bind); return err; } @@ -304,7 +306,8 @@ nla_put_failure: return -1; } -static int tcf_police_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_police_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, police_net_id); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 1ba0df238756..5db358497c9e 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -37,7 +37,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; @@ -103,7 +103,8 @@ static void tcf_sample_cleanup(struct tc_action *a) psample_group = rtnl_dereference(s->psample_group); RCU_INIT_POINTER(s->psample_group, NULL); - psample_group_put(psample_group); + if (psample_group) + psample_group_put(psample_group); } static bool tcf_sample_dev_ok_push(struct net_device *dev) @@ -202,14 +203,16 @@ nla_put_failure: static int 
tcf_sample_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 425eac11f6da..9618b4a83cee 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -79,7 +79,7 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; @@ -121,7 +121,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, d = to_defact(*a); ret = alloc_defdata(d, defdata); if (ret < 0) { - tcf_idr_cleanup(*a, est); + tcf_idr_release(*a, bind); return ret; } d->tcf_action = parm->action; @@ -170,14 +170,16 @@ nla_put_failure: static int tcf_simp_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, + struct 
netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 5a3f691bb545..ddf69fc01bdf 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -66,7 +66,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; @@ -208,14 +208,16 @@ nla_put_failure: static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index fa975262dbac..bbcbdce732cc 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -84,7 +84,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); struct nlattr *tb[TCA_SKBMOD_MAX + 1]; @@ -152,7 +152,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if 
(unlikely(!p)) { - if (ovr) + if (ret == ACT_P_CREATED) tcf_idr_release(*a, bind); return -ENOMEM; } @@ -190,7 +190,8 @@ static void tcf_skbmod_cleanup(struct tc_action *a) struct tcf_skbmod_params *p; p = rcu_dereference_protected(d->skbmod_p, 1); - kfree_rcu(p, rcu); + if (p) + kfree_rcu(p, rcu); } static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, @@ -232,14 +233,16 @@ nla_put_failure: static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 0e23aac09ad6..626dac81a48a 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -70,7 +70,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; @@ -153,6 +153,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; break; default: + ret = -EINVAL; goto err_out; } @@ -207,11 +208,12 @@ static void tunnel_key_release(struct tc_action *a) struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, 1); + if (params) { + if (params->tcft_action 
== TCA_TUNNEL_KEY_ACT_SET) + dst_release(&params->tcft_enc_metadata->dst); - if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) - dst_release(&params->tcft_enc_metadata->dst); - - kfree_rcu(params, rcu); + kfree_rcu(params, rcu); + } } static int tunnel_key_dump_addresses(struct sk_buff *skb, @@ -291,14 +293,16 @@ nla_put_failure: static int tunnel_key_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) +static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index e1a1b3f3983a..853604685965 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; @@ -117,7 +117,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct tc_vlan *parm; struct tcf_vlan *v; int action; - __be16 push_vid = 0; + u16 push_vid = 0; __be16 push_proto = 0; u8 push_prio = 0; bool exists = false; @@ -195,7 +195,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { - if (ovr) + if (ret == ACT_P_CREATED) tcf_idr_release(*a, bind); return -ENOMEM; } @@ -225,7 +225,8 @@ static void
tcf_vlan_cleanup(struct tc_action *a) struct tcf_vlan_params *p; p = rcu_dereference_protected(v->vlan_p, 1); - kfree_rcu(p, rcu); + if (p) + kfree_rcu(p, rcu); } static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, @@ -267,14 +268,16 @@ nla_put_failure: static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a7dc7271042a..b66754f52a9f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1397,13 +1397,18 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; if (!tcf_chain_dump(chain, q, parent, skb, cb, - index_start, &index)) + index_start, &index)) { + err = -EMSGSIZE; break; + } } cb->args[0] = index; out: + /* If we did no progress, the error (EMSGSIZE) is real */ + if (skb->len == 0 && err) + return err; return skb->len; } @@ -1428,11 +1433,12 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, #ifdef CONFIG_NET_CLS_ACT { struct tc_action *act; + size_t attr_size = 0; if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND); + TCA_ACT_BIND, extack); if (IS_ERR(act)) return PTR_ERR(act); @@ -1445,7 +1451,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, err = tcf_action_init(net, tp, 
tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions); + &actions, &attr_size, extack); if (err) return err; list_for_each_entry(act, &actions, list) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 7d0ce2c40f93..d964e60c730e 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -511,6 +511,9 @@ static int fl_set_key_flags(struct nlattr **tb, fl_set_key_flag(key, mask, flags_key, flags_mask, TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, + FLOW_DIS_FIRST_FRAG); return 0; } @@ -1130,6 +1133,9 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) fl_get_key_flag(flags_key, flags_mask, &key, &mask, TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, + FLOW_DIS_FIRST_FRAG); _key = cpu_to_be32(key); _mask = cpu_to_be32(mask); diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c new file mode 100644 index 000000000000..a5f34e930eff --- /dev/null +++ b/net/sched/em_ipt.c @@ -0,0 +1,257 @@ +/* + * net/sched/em_ipt.c IPtables matches Ematch + * + * (c) 2018 Eyal Birger <eyal.birger@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/gfp.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/tc_ematch/tc_em_ipt.h> +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/pkt_cls.h> + +struct em_ipt_match { + const struct xt_match *match; + u32 hook; + u8 match_data[0] __aligned(8); +}; + +struct em_ipt_xt_match { + char *match_name; + int (*validate_match_data)(struct nlattr **tb, u8 mrev); +}; + +static const struct nla_policy em_ipt_policy[TCA_EM_IPT_MAX + 1] = { + [TCA_EM_IPT_MATCH_NAME] = { .type = NLA_STRING, + .len = XT_EXTENSION_MAXNAMELEN }, + [TCA_EM_IPT_MATCH_REVISION] = { .type = NLA_U8 }, + [TCA_EM_IPT_HOOK] = { .type = NLA_U32 }, + [TCA_EM_IPT_NFPROTO] = { .type = NLA_U8 }, + [TCA_EM_IPT_MATCH_DATA] = { .type = NLA_UNSPEC }, +}; + +static int check_match(struct net *net, struct em_ipt_match *im, int mdata_len) +{ + struct xt_mtchk_param mtpar = {}; + union { + struct ipt_entry e4; + struct ip6t_entry e6; + } e = {}; + + mtpar.net = net; + mtpar.table = "filter"; + mtpar.hook_mask = 1 << im->hook; + mtpar.family = im->match->family; + mtpar.match = im->match; + mtpar.entryinfo = &e; + mtpar.matchinfo = (void *)im->match_data; + return xt_check_match(&mtpar, mdata_len, 0, 0); +} + +static int policy_validate_match_data(struct nlattr **tb, u8 mrev) +{ + if (mrev != 0) { + pr_err("only policy match revision 0 supported"); + return -EINVAL; + } + + if (nla_get_u32(tb[TCA_EM_IPT_HOOK]) != NF_INET_PRE_ROUTING) { + pr_err("policy can only be matched on NF_INET_PRE_ROUTING"); + return -EINVAL; + } + + return 0; +} + +static const struct em_ipt_xt_match em_ipt_xt_matches[] = { + { + .match_name = "policy", + .validate_match_data = policy_validate_match_data + }, + {} +}; + +static struct xt_match *get_xt_match(struct nlattr **tb) +{ + const struct 
em_ipt_xt_match *m; + struct nlattr *mname_attr; + u8 nfproto, mrev = 0; + int ret; + + mname_attr = tb[TCA_EM_IPT_MATCH_NAME]; + for (m = em_ipt_xt_matches; m->match_name; m++) { + if (!nla_strcmp(mname_attr, m->match_name)) + break; + } + + if (!m->match_name) { + pr_err("Unsupported xt match"); + return ERR_PTR(-EINVAL); + } + + if (tb[TCA_EM_IPT_MATCH_REVISION]) + mrev = nla_get_u8(tb[TCA_EM_IPT_MATCH_REVISION]); + + ret = m->validate_match_data(tb, mrev); + if (ret < 0) + return ERR_PTR(ret); + + nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]); + return xt_request_find_match(nfproto, m->match_name, mrev); +} + +static int em_ipt_change(struct net *net, void *data, int data_len, + struct tcf_ematch *em) +{ + struct nlattr *tb[TCA_EM_IPT_MAX + 1]; + struct em_ipt_match *im = NULL; + struct xt_match *match; + int mdata_len, ret; + + ret = nla_parse(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy, + NULL); + if (ret < 0) + return ret; + + if (!tb[TCA_EM_IPT_HOOK] || !tb[TCA_EM_IPT_MATCH_NAME] || + !tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO]) + return -EINVAL; + + match = get_xt_match(tb); + if (IS_ERR(match)) { + pr_err("unable to load match\n"); + return PTR_ERR(match); + } + + mdata_len = XT_ALIGN(nla_len(tb[TCA_EM_IPT_MATCH_DATA])); + im = kzalloc(sizeof(*im) + mdata_len, GFP_KERNEL); + if (!im) { + ret = -ENOMEM; + goto err; + } + + im->match = match; + im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]); + nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len); + + ret = check_match(net, im, mdata_len); + if (ret) + goto err; + + em->datalen = sizeof(*im) + mdata_len; + em->data = (unsigned long)im; + return 0; + +err: + kfree(im); + module_put(match->me); + return ret; +} + +static void em_ipt_destroy(struct tcf_ematch *em) +{ + struct em_ipt_match *im = (void *)em->data; + + if (!im) + return; + + if (im->match->destroy) { + struct xt_mtdtor_param par = { + .net = em->net, + .match = im->match, + .matchinfo = im->match_data, + .family = 
im->match->family + }; + im->match->destroy(&par); + } + module_put(im->match->me); + kfree((void *)im); +} + +static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + const struct em_ipt_match *im = (const void *)em->data; + struct xt_action_param acpar = {}; + struct net_device *indev = NULL; + struct nf_hook_state state; + int ret; + + rcu_read_lock(); + + if (skb->skb_iif) + indev = dev_get_by_index_rcu(em->net, skb->skb_iif); + + nf_hook_state_init(&state, im->hook, im->match->family, + indev ?: skb->dev, skb->dev, NULL, em->net, NULL); + + acpar.match = im->match; + acpar.matchinfo = im->match_data; + acpar.state = &state; + + ret = im->match->match(skb, &acpar); + + rcu_read_unlock(); + return ret; +} + +static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em) +{ + struct em_ipt_match *im = (void *)em->data; + + if (nla_put_string(skb, TCA_EM_IPT_MATCH_NAME, im->match->name) < 0) + return -EMSGSIZE; + if (nla_put_u32(skb, TCA_EM_IPT_HOOK, im->hook) < 0) + return -EMSGSIZE; + if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0) + return -EMSGSIZE; + if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0) + return -EMSGSIZE; + if (nla_put(skb, TCA_EM_IPT_MATCH_DATA, + im->match->usersize ?: im->match->matchsize, + im->match_data) < 0) + return -EMSGSIZE; + + return 0; +} + +static struct tcf_ematch_ops em_ipt_ops = { + .kind = TCF_EM_IPT, + .change = em_ipt_change, + .destroy = em_ipt_destroy, + .match = em_ipt_match, + .dump = em_ipt_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_ipt_ops.link) +}; + +static int __init init_em_ipt(void) +{ + return tcf_em_register(&em_ipt_ops); +} + +static void __exit exit_em_ipt(void) +{ + tcf_em_unregister(&em_ipt_ops); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eyal Birger <eyal.birger@gmail.com>"); +MODULE_DESCRIPTION("TC extended match for IPtables matches"); + +module_init(init_em_ipt); +module_exit(exit_em_ipt); + 
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPT); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d512f49ee83c..106dae7e4818 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -739,6 +739,7 @@ static u32 qdisc_alloc_handle(struct net_device *dev) void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, unsigned int len) { + bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; @@ -760,8 +761,12 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, * If child was empty even before update then backlog * counter is screwed and we skip notification because * parent class is already passive. + * + * If the original child was offloaded then it is allowed + * to be seem as empty, so the parent is notified anyway. */ - notify = !sch->q.qlen && !WARN_ON_ONCE(!n); + notify = !sch->q.qlen && !WARN_ON_ONCE(!n && + !qdisc_is_offloaded); /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 190570f21b20..39c144b6ff98 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -106,6 +106,14 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, __skb_queue_tail(&q->skb_bad_txq, skb); + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + qdisc_qstats_backlog_inc(q, skb); + q->q.qlen++; + } + if (lock) spin_unlock(lock); } @@ -196,14 +204,6 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { qdisc_enqueue_skb_bad_txq(q, nskb); - - if (qdisc_is_percpu_stats(q)) { - qdisc_qstats_cpu_backlog_inc(q, nskb); - qdisc_qstats_cpu_qlen_inc(q); - } else { - qdisc_qstats_backlog_inc(q, nskb); - q->q.qlen++; - } break; } skb->next = nskb; @@ -373,24 +373,33 @@ bool sch_direct_xmit(struct sk_buff *skb, 
struct Qdisc *q, */ static inline bool qdisc_restart(struct Qdisc *q, int *packets) { + bool more, validate, nolock = q->flags & TCQ_F_NOLOCK; spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; struct sk_buff *skb; - bool validate; /* Dequeue packet */ + if (nolock && test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) + return false; + skb = dequeue_skb(q, &validate, packets); - if (unlikely(!skb)) + if (unlikely(!skb)) { + if (nolock) + clear_bit(__QDISC_STATE_RUNNING, &q->state); return false; + } - if (!(q->flags & TCQ_F_NOLOCK)) + if (!nolock) root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); - return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); + more = sch_direct_xmit(skb, q, dev, txq, root_lock, validate); + if (nolock) + clear_bit(__QDISC_STATE_RUNNING, &q->state); + return more; } void __qdisc_run(struct Qdisc *q) @@ -628,6 +637,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, int band = prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct skb_array *q = band2list(priv, band); + unsigned int pkt_len = qdisc_pkt_len(skb); int err; err = skb_array_produce(q, skb); @@ -636,7 +646,10 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, return qdisc_drop_cpu(skb, qdisc, to_free); qdisc_qstats_cpu_qlen_inc(qdisc); - qdisc_qstats_cpu_backlog_inc(qdisc, skb); + /* Note: skb can not be used after skb_array_produce(), + * so we better not use qdisc_qstats_cpu_backlog_inc() + */ + this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len); return NET_XMIT_SUCCESS; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 1ea9846cc6ce..2a4ab7caf553 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1337,6 +1337,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; u64 rate64, ceil64; + int warn = 0; /* extract all subattrs 
from opt attr */ if (!opt) @@ -1499,13 +1500,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->quantum = min_t(u64, quantum, INT_MAX); if (!hopt->quantum && cl->quantum < 1000) { - pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n", - cl->common.classid); + warn = -1; cl->quantum = 1000; } if (!hopt->quantum && cl->quantum > 200000) { - pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n", - cl->common.classid); + warn = 1; cl->quantum = 200000; } if (hopt->quantum) @@ -1519,6 +1518,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, sch_tree_unlock(sch); + if (warn) + pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n", + cl->common.classid, (warn == -1 ? "small" : "big")); + qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 7c179addebcd..7d6801fc5340 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -509,7 +509,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } if (unlikely(sch->q.qlen >= sch->limit)) - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_all(skb, sch, to_free); qdisc_qstats_backlog_inc(sch, skb); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index efbf51f35778..222e53d3d27a 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -142,9 +142,8 @@ prio_reset(struct Qdisc *sch) sch->q.qlen = 0; } -static int prio_offload(struct Qdisc *sch, bool enable) +static int prio_offload(struct Qdisc *sch, struct tc_prio_qopt *qopt) { - struct prio_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_prio_qopt_offload opt = { .handle = sch->handle, @@ -154,10 +153,10 @@ static int prio_offload(struct Qdisc *sch, bool enable) if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; - if (enable) { + if (qopt) { opt.command = TC_PRIO_REPLACE; - opt.replace_params.bands = q->bands; - 
memcpy(&opt.replace_params.priomap, q->prio2band, + opt.replace_params.bands = qopt->bands; + memcpy(&opt.replace_params.priomap, qopt->priomap, TC_PRIO_MAX + 1); opt.replace_params.qstats = &sch->qstats; } else { @@ -174,7 +173,7 @@ prio_destroy(struct Qdisc *sch) struct prio_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); - prio_offload(sch, false); + prio_offload(sch, NULL); for (prio = 0; prio < q->bands; prio++) qdisc_destroy(q->queues[prio]); } @@ -211,6 +210,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, } } + prio_offload(sch, qopt); sch_tree_lock(sch); q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); @@ -230,7 +230,6 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, } sch_tree_unlock(sch); - prio_offload(sch, true); return 0; } @@ -309,12 +308,44 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); + struct tc_prio_qopt_offload graft_offload; + struct net_device *dev = qdisc_dev(sch); unsigned long band = arg - 1; + bool any_qdisc_is_offloaded; + int err; if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->queues[band]); + + if (!tc_can_offload(dev)) + return 0; + + graft_offload.handle = sch->handle; + graft_offload.parent = sch->parent; + graft_offload.graft_params.band = band; + graft_offload.graft_params.child_handle = new->handle; + graft_offload.command = TC_PRIO_GRAFT; + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, + &graft_offload); + + /* Don't report error if the graft is part of destroy operation. */ + if (err && new != &noop_qdisc) { + /* Don't report error if the parent, the old child and the new + * one are not offloaded. 
+ */ + any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; + any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED; + if (*old) + any_qdisc_is_offloaded |= (*old)->flags & + TCQ_F_OFFLOADED; + + if (any_qdisc_is_offloaded) + NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); + } + return 0; } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 229172d509cc..03225a8df973 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -188,7 +188,8 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, int ret; if (qdisc_pkt_len(skb) > q->max_size) { - if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) + if (skb_is_gso(skb) && + skb_gso_validate_mac_len(skb, q->max_size)) return tbf_segment(skb, sch, to_free); return qdisc_drop(skb, sch, to_free); } diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 6776582ec449..e845e4588535 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -15,6 +15,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ offload.o stream_sched.o stream_sched_prio.o \ stream_sched_rr.o stream_interleave.o +sctp_diag-y := diag.o + sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o sctp-$(CONFIG_PROC_FS) += proc.o sctp-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 00667c50efa7..e64630cd3331 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -101,13 +101,14 @@ struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp) return NULL; INIT_LIST_HEAD(&new->key_list); + refcount_set(&new->refcnt, 1); new->key_id = key_id; return new; } /* Free the shared key structure */ -static void sctp_auth_shkey_free(struct sctp_shared_key *sh_key) +static void sctp_auth_shkey_destroy(struct sctp_shared_key *sh_key) { BUG_ON(!list_empty(&sh_key->key_list)); sctp_auth_key_put(sh_key->key); @@ -115,6 +116,17 @@ static void sctp_auth_shkey_free(struct sctp_shared_key *sh_key) kfree(sh_key); } +void sctp_auth_shkey_release(struct sctp_shared_key *sh_key) +{ + if 
(refcount_dec_and_test(&sh_key->refcnt)) + sctp_auth_shkey_destroy(sh_key); +} + +void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key) +{ + refcount_inc(&sh_key->refcnt); +} + /* Destroy the entire key list. This is done during the * associon and endpoint free process. */ @@ -128,7 +140,7 @@ void sctp_auth_destroy_keys(struct list_head *keys) key_for_each_safe(ep_key, tmp, keys) { list_del_init(&ep_key->key_list); - sctp_auth_shkey_free(ep_key); + sctp_auth_shkey_release(ep_key); } } @@ -409,13 +421,19 @@ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) sctp_auth_key_put(asoc->asoc_shared_key); asoc->asoc_shared_key = secret; + asoc->shkey = ep_key; /* Update send queue in case any chunk already in there now * needs authenticating */ list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) { - if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc)) + if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc)) { chunk->auth = 1; + if (!chunk->shkey) { + chunk->shkey = asoc->shkey; + sctp_auth_shkey_hold(chunk->shkey); + } + } } return 0; @@ -431,8 +449,11 @@ struct sctp_shared_key *sctp_auth_get_shkey( /* First search associations set of endpoint pair shared keys */ key_for_each(key, &asoc->endpoint_shared_keys) { - if (key->key_id == key_id) - return key; + if (key->key_id == key_id) { + if (!key->deactivated) + return key; + break; + } } return NULL; @@ -703,16 +724,15 @@ int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc) * after the AUTH chunk in the SCTP packet. 
*/ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, - struct sk_buff *skb, - struct sctp_auth_chunk *auth, - gfp_t gfp) + struct sk_buff *skb, struct sctp_auth_chunk *auth, + struct sctp_shared_key *ep_key, gfp_t gfp) { - struct crypto_shash *tfm; struct sctp_auth_bytes *asoc_key; + struct crypto_shash *tfm; __u16 key_id, hmac_id; - __u8 *digest; unsigned char *end; int free_key = 0; + __u8 *digest; /* Extract the info we need: * - hmac id @@ -724,12 +744,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, if (key_id == asoc->active_key_id) asoc_key = asoc->asoc_shared_key; else { - struct sctp_shared_key *ep_key; - - ep_key = sctp_auth_get_shkey(asoc, key_id); - if (!ep_key) - return; - + /* ep_key can't be NULL here */ asoc_key = sctp_auth_asoc_create_secret(asoc, ep_key, gfp); if (!asoc_key) return; @@ -829,7 +844,7 @@ int sctp_auth_set_key(struct sctp_endpoint *ep, struct sctp_association *asoc, struct sctp_authkey *auth_key) { - struct sctp_shared_key *cur_key = NULL; + struct sctp_shared_key *cur_key, *shkey; struct sctp_auth_bytes *key; struct list_head *sh_keys; int replace = 0; @@ -842,46 +857,34 @@ int sctp_auth_set_key(struct sctp_endpoint *ep, else sh_keys = &ep->endpoint_shared_keys; - key_for_each(cur_key, sh_keys) { - if (cur_key->key_id == auth_key->sca_keynumber) { + key_for_each(shkey, sh_keys) { + if (shkey->key_id == auth_key->sca_keynumber) { replace = 1; break; } } - /* If we are not replacing a key id, we need to allocate - * a shared key. 
- */ - if (!replace) { - cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, - GFP_KERNEL); - if (!cur_key) - return -ENOMEM; - } + cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, GFP_KERNEL); + if (!cur_key) + return -ENOMEM; /* Create a new key data based on the info passed in */ key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL); - if (!key) - goto nomem; + if (!key) { + kfree(cur_key); + return -ENOMEM; + } memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength); + cur_key->key = key; - /* If we are replacing, remove the old keys data from the - * key id. If we are adding new key id, add it to the - * list. - */ - if (replace) - sctp_auth_key_put(cur_key->key); - else - list_add(&cur_key->key_list, sh_keys); + if (replace) { + list_del_init(&shkey->key_list); + sctp_auth_shkey_release(shkey); + } + list_add(&cur_key->key_list, sh_keys); - cur_key->key = key; return 0; -nomem: - if (!replace) - sctp_auth_shkey_free(cur_key); - - return -ENOMEM; } int sctp_auth_set_active_key(struct sctp_endpoint *ep, @@ -905,7 +908,7 @@ int sctp_auth_set_active_key(struct sctp_endpoint *ep, } } - if (!found) + if (!found || key->deactivated) return -EINVAL; if (asoc) { @@ -952,7 +955,58 @@ int sctp_auth_del_key_id(struct sctp_endpoint *ep, /* Delete the shared key */ list_del_init(&key->key_list); - sctp_auth_shkey_free(key); + sctp_auth_shkey_release(key); + + return 0; +} + +int sctp_auth_deact_key_id(struct sctp_endpoint *ep, + struct sctp_association *asoc, __u16 key_id) +{ + struct sctp_shared_key *key; + struct list_head *sh_keys; + int found = 0; + + /* The key identifier MUST NOT be the current active key + * The key identifier MUST correspond to an existing key + */ + if (asoc) { + if (asoc->active_key_id == key_id) + return -EINVAL; + + sh_keys = &asoc->endpoint_shared_keys; + } else { + if (ep->active_key_id == key_id) + return -EINVAL; + + sh_keys = &ep->endpoint_shared_keys; + } + + key_for_each(key, sh_keys) { + if (key->key_id 
== key_id) { + found = 1; + break; + } + } + + if (!found) + return -EINVAL; + + /* refcnt == 1 and !list_empty mean it's not being used anywhere + * and deactivated will be set, so it's time to notify userland + * that this shkey can be freed. + */ + if (asoc && !list_empty(&key->key_list) && + refcount_read(&key->refcnt) == 1) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, key->key_id, + SCTP_AUTH_FREE_KEY, GFP_KERNEL); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } + + key->deactivated = 1; return 0; } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 991a530c6b31..f889a84f264d 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -168,6 +168,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, { size_t len, first_len, max_data, remaining; size_t msg_len = iov_iter_count(from); + struct sctp_shared_key *shkey = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; @@ -204,6 +205,17 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); + + if (sinfo->sinfo_tsn && + sinfo->sinfo_ssn != asoc->active_key_id) { + shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn); + if (!shkey) { + err = -EINVAL; + goto errout; + } + } else { + shkey = asoc->shkey; + } } /* Check what's our max considering the above */ @@ -275,6 +287,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (err < 0) goto errout_chunk_free; + chunk->shkey = shkey; + /* Put the chunk->skb back into the form expected by send. */ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - chunk->skb->data); diff --git a/net/sctp/sctp_diag.c b/net/sctp/diag.c index a72a7d925d46..078f01a8d582 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/diag.c @@ -1,3 +1,34 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 
2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions implement sctp diag support. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Xin Long <lucien.xin@gmail.com> + */ + #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 8b3146816519..e2f5a3ee41a7 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -349,8 +349,8 @@ out: /* Look for any peeled off association from the endpoint that matches the * given peer address. */ -int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, - const union sctp_addr *paddr) +bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, + const union sctp_addr *paddr) { struct sctp_sockaddr_entry *addr; struct sctp_bind_addr *bp; @@ -362,10 +362,10 @@ int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, */ list_for_each_entry(addr, &bp->address_list, list) { if (sctp_has_association(net, &addr->a, paddr)) - return 1; + return true; } - return 0; + return false; } /* Do delayed input processing. This is scheduled by sctp_rcv(). 
diff --git a/net/sctp/input.c b/net/sctp/input.c index 0247cc432e02..ba8a6e6c36fa 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -106,6 +106,7 @@ int sctp_rcv(struct sk_buff *skb) int family; struct sctp_af *af; struct net *net = dev_net(skb->dev); + bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb); if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -123,8 +124,7 @@ int sctp_rcv(struct sk_buff *skb) * it's better to just linearize it otherwise crc computing * takes longer. */ - if ((!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) && - skb_linearize(skb)) || + if ((!is_gso && skb_linearize(skb)) || !pskb_may_pull(skb, sizeof(struct sctphdr))) goto discard_it; @@ -135,7 +135,7 @@ int sctp_rcv(struct sk_buff *skb) if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); else if (!sctp_checksum_disable && - !(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) && + !is_gso && sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; @@ -1010,19 +1010,18 @@ struct sctp_association *sctp_lookup_association(struct net *net, } /* Is there an association matching the given local and peer addresses? */ -int sctp_has_association(struct net *net, - const union sctp_addr *laddr, - const union sctp_addr *paddr) +bool sctp_has_association(struct net *net, + const union sctp_addr *laddr, + const union sctp_addr *paddr) { - struct sctp_association *asoc; struct sctp_transport *transport; - if ((asoc = sctp_lookup_association(net, laddr, paddr, &transport))) { + if (sctp_lookup_association(net, laddr, paddr, &transport)) { sctp_transport_put(transport); - return 1; + return true; } - return 0; + return false; } /* @@ -1218,7 +1217,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, * issue as packets hitting this are mostly INIT or INIT-ACK and * those cannot be on GSO-style anyway. 
*/ - if ((skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) + if (skb_is_gso(skb) && skb_is_gso_sctp(skb)) return NULL; ch = (struct sctp_chunkhdr *)skb->data; diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 48392552ee7c..23ebc5318edc 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -170,7 +170,7 @@ next_chunk: chunk = list_entry(entry, struct sctp_chunk, list); - if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) { + if (skb_is_gso(chunk->skb) && skb_is_gso_sctp(chunk->skb)) { /* GSO-marked skbs but without frags, handle * them normally */ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e35d4f73d2df..0d873c58e516 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -952,16 +952,16 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, /* Handle SCTP_I_WANT_MAPPED_V4_ADDR for getpeername() and getsockname() */ static int sctp_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { int rc; - rc = inet6_getname(sock, uaddr, uaddr_len, peer); + rc = inet6_getname(sock, uaddr, peer); - if (rc != 0) + if (rc < 0) return rc; - *uaddr_len = sctp_v6_addr_to_user(sctp_sk(sock->sk), + rc = sctp_v6_addr_to_user(sctp_sk(sock->sk), (union sctp_addr *)uaddr); return rc; diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index aeea6da81441..fd2684ad94c8 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -130,11 +130,3 @@ void sctp_dbg_objcnt_init(struct net *net) if (!ent) pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n"); } - -/* Cleanup the objcount entry in the proc filesystem. 
*/ -void sctp_dbg_objcnt_exit(struct net *net) -{ - remove_proc_entry("sctp_dbg_objcnt", net->sctp.proc_net_sctp); -} - - diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 35bc7106d182..123e9f2dc226 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -45,7 +45,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct sctphdr *sh; - if (!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP)) + if (!skb_is_gso_sctp(skb)) goto out; sh = sctp_hdr(skb); diff --git a/net/sctp/output.c b/net/sctp/output.c index 01a26ee051e3..d6e1c90cc09a 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -241,10 +241,13 @@ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, if (!chunk->auth) return retval; - auth = sctp_make_auth(asoc); + auth = sctp_make_auth(asoc, chunk->shkey->key_id); if (!auth) return retval; + auth->shkey = chunk->shkey; + sctp_auth_shkey_hold(auth->shkey); + retval = __sctp_packet_append_chunk(pkt, auth); if (retval != SCTP_XMIT_OK) @@ -490,7 +493,8 @@ merge: } if (auth) { - sctp_auth_calculate_hmac(tp->asoc, nskb, auth, gfp); + sctp_auth_calculate_hmac(tp->asoc, nskb, auth, + packet->auth->shkey, gfp); /* free auth if no more chunks, or add it back */ if (list_empty(&packet->chunk_list)) sctp_chunk_free(packet->auth); @@ -770,6 +774,16 @@ static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, enum sctp_xmit retval = SCTP_XMIT_OK; size_t psize, pmtu, maxsize; + /* Don't bundle in this packet if this chunk's auth key doesn't + * match other chunks already enqueued on this packet. Also, + * don't bundle the chunk with auth key if other chunks in this + * packet don't have auth key. 
+ */ + if ((packet->auth && chunk->shkey != packet->auth->shkey) || + (!packet->auth && chunk->shkey && + chunk->chunk_hdr->type != SCTP_CID_AUTH)) + return SCTP_XMIT_PMTU_FULL; + psize = packet->size; if (packet->transport->asoc) pmtu = packet->transport->asoc->pathmtu; diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 537545ebcb0e..1d9ccc6dab2b 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -101,25 +101,6 @@ static const struct file_operations sctp_snmp_seq_fops = { .release = single_release_net, }; -/* Set up the proc fs entry for 'snmp' object. */ -int __net_init sctp_snmp_proc_init(struct net *net) -{ - struct proc_dir_entry *p; - - p = proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_snmp_seq_fops); - if (!p) - return -ENOMEM; - - return 0; -} - -/* Cleanup the proc fs entry for 'snmp' object. */ -void sctp_snmp_proc_exit(struct net *net) -{ - remove_proc_entry("snmp", net->sctp.proc_net_sctp); -} - /* Dump local addresses of an association/endpoint. */ static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb) { @@ -259,25 +240,6 @@ static const struct file_operations sctp_eps_seq_fops = { .release = seq_release_net, }; -/* Set up the proc fs entry for 'eps' object. */ -int __net_init sctp_eps_proc_init(struct net *net) -{ - struct proc_dir_entry *p; - - p = proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_eps_seq_fops); - if (!p) - return -ENOMEM; - - return 0; -} - -/* Cleanup the proc fs entry for 'eps' object. */ -void sctp_eps_proc_exit(struct net *net) -{ - remove_proc_entry("eps", net->sctp.proc_net_sctp); -} - struct sctp_ht_iter { struct seq_net_private p; struct rhashtable_iter hti; @@ -390,25 +352,6 @@ static const struct file_operations sctp_assocs_seq_fops = { .release = seq_release_net, }; -/* Set up the proc fs entry for 'assocs' object. 
*/ -int __net_init sctp_assocs_proc_init(struct net *net) -{ - struct proc_dir_entry *p; - - p = proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_assocs_seq_fops); - if (!p) - return -ENOMEM; - - return 0; -} - -/* Cleanup the proc fs entry for 'assocs' object. */ -void sctp_assocs_proc_exit(struct net *net) -{ - remove_proc_entry("assocs", net->sctp.proc_net_sctp); -} - static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; @@ -488,12 +431,6 @@ static const struct seq_operations sctp_remaddr_ops = { .show = sctp_remaddr_seq_show, }; -/* Cleanup the proc fs entry for 'remaddr' object. */ -void sctp_remaddr_proc_exit(struct net *net) -{ - remove_proc_entry("remaddr", net->sctp.proc_net_sctp); -} - static int sctp_remaddr_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &sctp_remaddr_ops, @@ -507,13 +444,28 @@ static const struct file_operations sctp_remaddr_seq_fops = { .release = seq_release_net, }; -int __net_init sctp_remaddr_proc_init(struct net *net) +/* Set up the proc fs entry for the SCTP protocol. 
*/ +int __net_init sctp_proc_init(struct net *net) { - struct proc_dir_entry *p; - - p = proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_remaddr_seq_fops); - if (!p) + net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net); + if (!net->sctp.proc_net_sctp) return -ENOMEM; + if (!proc_create("snmp", 0444, net->sctp.proc_net_sctp, + &sctp_snmp_seq_fops)) + goto cleanup; + if (!proc_create("eps", 0444, net->sctp.proc_net_sctp, + &sctp_eps_seq_fops)) + goto cleanup; + if (!proc_create("assocs", 0444, net->sctp.proc_net_sctp, + &sctp_assocs_seq_fops)) + goto cleanup; + if (!proc_create("remaddr", 0444, net->sctp.proc_net_sctp, + &sctp_remaddr_seq_fops)) + goto cleanup; return 0; + +cleanup: + remove_proc_subtree("sctp", net->proc_net); + net->sctp.proc_net_sctp = NULL; + return -ENOMEM; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 91813e686c67..a24cde236330 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -80,56 +80,6 @@ long sysctl_sctp_mem[3]; int sysctl_sctp_rmem[3]; int sysctl_sctp_wmem[3]; -/* Set up the proc fs entry for the SCTP protocol. */ -static int __net_init sctp_proc_init(struct net *net) -{ -#ifdef CONFIG_PROC_FS - net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net); - if (!net->sctp.proc_net_sctp) - goto out_proc_net_sctp; - if (sctp_snmp_proc_init(net)) - goto out_snmp_proc_init; - if (sctp_eps_proc_init(net)) - goto out_eps_proc_init; - if (sctp_assocs_proc_init(net)) - goto out_assocs_proc_init; - if (sctp_remaddr_proc_init(net)) - goto out_remaddr_proc_init; - - return 0; - -out_remaddr_proc_init: - sctp_assocs_proc_exit(net); -out_assocs_proc_init: - sctp_eps_proc_exit(net); -out_eps_proc_init: - sctp_snmp_proc_exit(net); -out_snmp_proc_init: - remove_proc_entry("sctp", net->proc_net); - net->sctp.proc_net_sctp = NULL; -out_proc_net_sctp: - return -ENOMEM; -#endif /* CONFIG_PROC_FS */ - return 0; -} - -/* Clean up the proc fs entry for the SCTP protocol. 
- * Note: Do not make this __exit as it is used in the init error - * path. - */ -static void sctp_proc_exit(struct net *net) -{ -#ifdef CONFIG_PROC_FS - sctp_snmp_proc_exit(net); - sctp_eps_proc_exit(net); - sctp_assocs_proc_exit(net); - sctp_remaddr_proc_exit(net); - - remove_proc_entry("sctp", net->proc_net); - net->sctp.proc_net_sctp = NULL; -#endif -} - /* Private helper to extract ipv4 address and stash them in * the protocol structure. */ @@ -1285,10 +1235,12 @@ static int __net_init sctp_defaults_init(struct net *net) if (status) goto err_init_mibs; +#ifdef CONFIG_PROC_FS /* Initialize proc fs directory. */ status = sctp_proc_init(net); if (status) goto err_init_proc; +#endif sctp_dbg_objcnt_init(net); @@ -1306,8 +1258,10 @@ static int __net_init sctp_defaults_init(struct net *net) return 0; +#ifdef CONFIG_PROC_FS err_init_proc: cleanup_sctp_mibs(net); +#endif err_init_mibs: sctp_sysctl_net_unregister(net); err_sysctl_register: @@ -1320,9 +1274,10 @@ static void __net_exit sctp_defaults_exit(struct net *net) sctp_free_addr_wq(net); sctp_free_local_addr_list(net); - sctp_dbg_objcnt_exit(net); - - sctp_proc_exit(net); +#ifdef CONFIG_PROC_FS + remove_proc_subtree("sctp", net->proc_net); + net->sctp.proc_net_sctp = NULL; +#endif cleanup_sctp_mibs(net); sctp_sysctl_net_unregister(net); } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d01475f5f710..cc20bc39ee7c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -87,7 +87,28 @@ static void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len, /* Control chunk destructor */ static void sctp_control_release_owner(struct sk_buff *skb) { - /*TODO: do memory release */ + struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; + + if (chunk->shkey) { + struct sctp_shared_key *shkey = chunk->shkey; + struct sctp_association *asoc = chunk->asoc; + + /* refcnt == 2 and !list_empty mean after this release, it's + * not being used anywhere, and it's time to notify userland + * 
that this shkey can be freed if it's been deactivated. + */ + if (shkey->deactivated && !list_empty(&shkey->key_list) && + refcount_read(&shkey->refcnt) == 2) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, + SCTP_AUTH_FREE_KEY, + GFP_KERNEL); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } + sctp_auth_shkey_release(chunk->shkey); + } } static void sctp_control_set_owner_w(struct sctp_chunk *chunk) @@ -102,7 +123,12 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) * * For now don't do anything for now. */ + if (chunk->auth) { + chunk->shkey = asoc->shkey; + sctp_auth_shkey_hold(chunk->shkey); + } skb->sk = asoc ? asoc->base.sk : NULL; + skb_shinfo(skb)->destructor_arg = chunk; skb->destructor = sctp_control_release_owner; } @@ -1271,7 +1297,8 @@ nodata: return retval; } -struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) +struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, + __u16 key_id) { struct sctp_authhdr auth_hdr; struct sctp_hmac *hmac_desc; @@ -1289,7 +1316,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) return NULL; auth_hdr.hmac_id = htons(hmac_desc->hmac_id); - auth_hdr.shkey_id = htons(asoc->active_key_id); + auth_hdr.shkey_id = htons(key_id); retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr), &auth_hdr); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index b71e7fb0a20a..298112ca8c06 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1049,6 +1049,16 @@ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands, asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } +static void sctp_cmd_peer_no_auth(struct sctp_cmd_seq *commands, + struct sctp_association *asoc) +{ + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, GFP_ATOMIC); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); +} + /* Helper function to 
generate an adaptation indication event */ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands, struct sctp_association *asoc) @@ -1755,6 +1765,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, case SCTP_CMD_ADAPTATION_IND: sctp_cmd_adaptation_ind(commands, asoc); break; + case SCTP_CMD_PEER_NO_AUTH: + sctp_cmd_peer_no_auth(commands, asoc); + break; case SCTP_CMD_ASSOC_SHKEY: error = sctp_auth_asoc_init_active_key(asoc, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index eb7905ffe5f2..cc56a67dbb4d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -659,7 +659,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, void *arg, struct sctp_cmd_seq *commands) { - struct sctp_ulpevent *ev, *ai_ev = NULL; + struct sctp_ulpevent *ev, *ai_ev = NULL, *auth_ev = NULL; struct sctp_association *new_asoc; struct sctp_init_chunk *peer_init; struct sctp_chunk *chunk = arg; @@ -820,6 +820,14 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, goto nomem_aiev; } + if (!new_asoc->peer.auth_capable) { + auth_ev = sctp_ulpevent_make_authkey(new_asoc, 0, + SCTP_AUTH_NO_AUTH, + GFP_ATOMIC); + if (!auth_ev) + goto nomem_authev; + } + /* Add all the state machine commands now since we've created * everything. 
This way we don't introduce memory corruptions * during side-effect processing and correclty count established @@ -847,8 +855,14 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ai_ev)); + if (auth_ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(auth_ev)); + return SCTP_DISPOSITION_CONSUME; +nomem_authev: + sctp_ulpevent_free(ai_ev); nomem_aiev: sctp_ulpevent_free(ev); nomem_ev: @@ -953,6 +967,15 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, SCTP_ULPEVENT(ev)); } + if (!asoc->peer.auth_capable) { + ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, + GFP_ATOMIC); + if (!ev) + goto nomem; + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + } + return SCTP_DISPOSITION_CONSUME; nomem: return SCTP_DISPOSITION_NOMEM; @@ -1908,6 +1931,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_b( if (asoc->peer.adaptation_ind) sctp_add_cmd_sf(commands, SCTP_CMD_ADAPTATION_IND, SCTP_NULL()); + if (!asoc->peer.auth_capable) + sctp_add_cmd_sf(commands, SCTP_CMD_PEER_NO_AUTH, SCTP_NULL()); + return SCTP_DISPOSITION_CONSUME; nomem: @@ -1954,7 +1980,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( struct sctp_cmd_seq *commands, struct sctp_association *new_asoc) { - struct sctp_ulpevent *ev = NULL, *ai_ev = NULL; + struct sctp_ulpevent *ev = NULL, *ai_ev = NULL, *auth_ev = NULL; struct sctp_chunk *repl; /* Clarification from Implementor's Guide: @@ -2001,6 +2027,14 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( goto nomem; } + + if (!asoc->peer.auth_capable) { + auth_ev = sctp_ulpevent_make_authkey(asoc, 0, + SCTP_AUTH_NO_AUTH, + GFP_ATOMIC); + if (!auth_ev) + goto nomem; + } } repl = sctp_make_cookie_ack(new_asoc, chunk); @@ -2015,10 +2049,15 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( if (ai_ev) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ai_ev)); + if (auth_ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, 
+ SCTP_ULPEVENT(auth_ev)); return SCTP_DISPOSITION_CONSUME; nomem: + if (auth_ev) + sctp_ulpevent_free(auth_ev); if (ai_ev) sctp_ulpevent_free(ai_ev); if (ev) @@ -4114,6 +4153,7 @@ static enum sctp_ierror sctp_sf_authenticate( const union sctp_subtype type, struct sctp_chunk *chunk) { + struct sctp_shared_key *sh_key = NULL; struct sctp_authhdr *auth_hdr; __u8 *save_digest, *digest; struct sctp_hmac *hmac; @@ -4135,9 +4175,11 @@ static enum sctp_ierror sctp_sf_authenticate( * configured */ key_id = ntohs(auth_hdr->shkey_id); - if (key_id != asoc->active_key_id && !sctp_auth_get_shkey(asoc, key_id)) - return SCTP_IERROR_AUTH_BAD_KEYID; - + if (key_id != asoc->active_key_id) { + sh_key = sctp_auth_get_shkey(asoc, key_id); + if (!sh_key) + return SCTP_IERROR_AUTH_BAD_KEYID; + } /* Make sure that the length of the signature matches what * we expect. @@ -4166,7 +4208,7 @@ static enum sctp_ierror sctp_sf_authenticate( sctp_auth_calculate_hmac(asoc, chunk->skb, (struct sctp_auth_chunk *)chunk->chunk_hdr, - GFP_ATOMIC); + sh_key, GFP_ATOMIC); /* Discard the packet if the digests do not match */ if (memcmp(save_digest, digest, sig_len)) { @@ -4243,7 +4285,7 @@ enum sctp_disposition sctp_sf_eat_auth(struct net *net, struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, ntohs(auth_hdr->shkey_id), - SCTP_AUTH_NEWKEY, GFP_ATOMIC); + SCTP_AUTH_NEW_KEY, GFP_ATOMIC); if (!ev) return -ENOMEM; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bf271f8c2dc9..7a10ae3c3d82 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -156,6 +156,9 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) /* The sndbuf space is tracked per association. 
*/ sctp_association_hold(asoc); + if (chunk->shkey) + sctp_auth_shkey_hold(chunk->shkey); + skb_set_owner_w(chunk->skb, sk); chunk->skb->destructor = sctp_wfree; @@ -1606,396 +1609,303 @@ static int sctp_error(struct sock *sk, int flags, int err) static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs); -static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) +static int sctp_sendmsg_parse(struct sock *sk, struct sctp_cmsgs *cmsgs, + struct sctp_sndrcvinfo *srinfo, + const struct msghdr *msg, size_t msg_len) { - struct net *net = sock_net(sk); - struct sctp_sock *sp; - struct sctp_endpoint *ep; - struct sctp_association *new_asoc = NULL, *asoc = NULL; - struct sctp_transport *transport, *chunk_tp; - struct sctp_chunk *chunk; - union sctp_addr to; - struct sockaddr *msg_name = NULL; - struct sctp_sndrcvinfo default_sinfo; - struct sctp_sndrcvinfo *sinfo; - struct sctp_initmsg *sinit; - sctp_assoc_t associd = 0; - struct sctp_cmsgs cmsgs = { NULL }; - enum sctp_scope scope; - bool fill_sinfo_ttl = false, wait_connect = false; - struct sctp_datamsg *datamsg; - int msg_flags = msg->msg_flags; - __u16 sinfo_flags = 0; - long timeo; + __u16 sflags; int err; - err = 0; - sp = sctp_sk(sk); - ep = sp->ep; - - pr_debug("%s: sk:%p, msg:%p, msg_len:%zu ep:%p\n", __func__, sk, - msg, msg_len, ep); + if (sctp_sstate(sk, LISTENING) && sctp_style(sk, TCP)) + return -EPIPE; - /* We cannot send a message over a TCP-style listening socket. */ - if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) { - err = -EPIPE; - goto out_nounlock; - } + if (msg_len > sk->sk_sndbuf) + return -EMSGSIZE; - /* Parse out the SCTP CMSGs. */ - err = sctp_msghdr_parse(msg, &cmsgs); + memset(cmsgs, 0, sizeof(*cmsgs)); + err = sctp_msghdr_parse(msg, cmsgs); if (err) { pr_debug("%s: msghdr parse err:%x\n", __func__, err); - goto out_nounlock; + return err; } - /* Fetch the destination address for this packet. 
This - * address only selects the association--it is not necessarily - * the address we will send to. - * For a peeled-off socket, msg_name is ignored. - */ - if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) { - int msg_namelen = msg->msg_namelen; + memset(srinfo, 0, sizeof(*srinfo)); + if (cmsgs->srinfo) { + srinfo->sinfo_stream = cmsgs->srinfo->sinfo_stream; + srinfo->sinfo_flags = cmsgs->srinfo->sinfo_flags; + srinfo->sinfo_ppid = cmsgs->srinfo->sinfo_ppid; + srinfo->sinfo_context = cmsgs->srinfo->sinfo_context; + srinfo->sinfo_assoc_id = cmsgs->srinfo->sinfo_assoc_id; + srinfo->sinfo_timetolive = cmsgs->srinfo->sinfo_timetolive; + } - err = sctp_verify_addr(sk, (union sctp_addr *)msg->msg_name, - msg_namelen); - if (err) - return err; + if (cmsgs->sinfo) { + srinfo->sinfo_stream = cmsgs->sinfo->snd_sid; + srinfo->sinfo_flags = cmsgs->sinfo->snd_flags; + srinfo->sinfo_ppid = cmsgs->sinfo->snd_ppid; + srinfo->sinfo_context = cmsgs->sinfo->snd_context; + srinfo->sinfo_assoc_id = cmsgs->sinfo->snd_assoc_id; + } - if (msg_namelen > sizeof(to)) - msg_namelen = sizeof(to); - memcpy(&to, msg->msg_name, msg_namelen); - msg_name = msg->msg_name; + if (cmsgs->prinfo) { + srinfo->sinfo_timetolive = cmsgs->prinfo->pr_value; + SCTP_PR_SET_POLICY(srinfo->sinfo_flags, + cmsgs->prinfo->pr_policy); } - sinit = cmsgs.init; - if (cmsgs.sinfo != NULL) { - memset(&default_sinfo, 0, sizeof(default_sinfo)); - default_sinfo.sinfo_stream = cmsgs.sinfo->snd_sid; - default_sinfo.sinfo_flags = cmsgs.sinfo->snd_flags; - default_sinfo.sinfo_ppid = cmsgs.sinfo->snd_ppid; - default_sinfo.sinfo_context = cmsgs.sinfo->snd_context; - default_sinfo.sinfo_assoc_id = cmsgs.sinfo->snd_assoc_id; + sflags = srinfo->sinfo_flags; + if (!sflags && msg_len) + return 0; - sinfo = &default_sinfo; - fill_sinfo_ttl = true; - } else { - sinfo = cmsgs.srinfo; - } - /* Did the user specify SNDINFO/SNDRCVINFO? 
*/ - if (sinfo) { - sinfo_flags = sinfo->sinfo_flags; - associd = sinfo->sinfo_assoc_id; - } + if (sctp_style(sk, TCP) && (sflags & (SCTP_EOF | SCTP_ABORT))) + return -EINVAL; - pr_debug("%s: msg_len:%zu, sinfo_flags:0x%x\n", __func__, - msg_len, sinfo_flags); + if (((sflags & SCTP_EOF) && msg_len > 0) || + (!(sflags & (SCTP_EOF | SCTP_ABORT)) && msg_len == 0)) + return -EINVAL; - /* SCTP_EOF or SCTP_ABORT cannot be set on a TCP-style socket. */ - if (sctp_style(sk, TCP) && (sinfo_flags & (SCTP_EOF | SCTP_ABORT))) { - err = -EINVAL; - goto out_nounlock; - } + if ((sflags & SCTP_ADDR_OVER) && !msg->msg_name) + return -EINVAL; - /* If SCTP_EOF is set, no data can be sent. Disallow sending zero - * length messages when SCTP_EOF|SCTP_ABORT is not set. - * If SCTP_ABORT is set, the message length could be non zero with - * the msg_iov set to the user abort reason. - */ - if (((sinfo_flags & SCTP_EOF) && (msg_len > 0)) || - (!(sinfo_flags & (SCTP_EOF|SCTP_ABORT)) && (msg_len == 0))) { - err = -EINVAL; - goto out_nounlock; - } + return 0; +} - /* If SCTP_ADDR_OVER is set, there must be an address - * specified in msg_name. - */ - if ((sinfo_flags & SCTP_ADDR_OVER) && (!msg->msg_name)) { - err = -EINVAL; - goto out_nounlock; - } +static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, + struct sctp_cmsgs *cmsgs, + union sctp_addr *daddr, + struct sctp_transport **tp) +{ + struct sctp_endpoint *ep = sctp_sk(sk)->ep; + struct net *net = sock_net(sk); + struct sctp_association *asoc; + enum sctp_scope scope; + struct cmsghdr *cmsg; + int err; - transport = NULL; + *tp = NULL; - pr_debug("%s: about to look up association\n", __func__); + if (sflags & (SCTP_EOF | SCTP_ABORT)) + return -EINVAL; - lock_sock(sk); + if (sctp_style(sk, TCP) && (sctp_sstate(sk, ESTABLISHED) || + sctp_sstate(sk, CLOSING))) + return -EADDRNOTAVAIL; - /* If a msg_name has been specified, assume this is to be used. */ - if (msg_name) { - /* Look for a matching association on the endpoint. 
*/ - asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); + if (sctp_endpoint_is_peeled_off(ep, daddr)) + return -EADDRNOTAVAIL; - /* If we could not find a matching association on the - * endpoint, make sure that it is not a TCP-style - * socket that already has an association or there is - * no peeled-off association on another socket. - */ - if (!asoc && - ((sctp_style(sk, TCP) && - (sctp_sstate(sk, ESTABLISHED) || - sctp_sstate(sk, CLOSING))) || - sctp_endpoint_is_peeled_off(ep, &to))) { - err = -EADDRNOTAVAIL; - goto out_unlock; - } + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) + return -EAGAIN; } else { - asoc = sctp_id2assoc(sk, associd); - if (!asoc) { - err = -EPIPE; - goto out_unlock; - } + if (ep->base.bind_addr.port < inet_prot_sock(net) && + !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) + return -EACCES; } - if (asoc) { - pr_debug("%s: just looked up association:%p\n", __func__, asoc); + scope = sctp_scope(daddr); - /* We cannot send a message on a TCP-style SCTP_SS_ESTABLISHED - * socket that has an association in CLOSED state. This can - * happen when an accepted socket has an association that is - * already CLOSED. 
- */ - if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) { - err = -EPIPE; - goto out_unlock; - } + asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); + if (!asoc) + return -ENOMEM; + + if (sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL) < 0) { + err = -ENOMEM; + goto free; + } - if (sinfo_flags & SCTP_EOF) { - pr_debug("%s: shutting down association:%p\n", - __func__, asoc); + if (cmsgs->init) { + struct sctp_initmsg *init = cmsgs->init; - sctp_primitive_SHUTDOWN(net, asoc, NULL); - err = 0; - goto out_unlock; + if (init->sinit_num_ostreams) { + __u16 outcnt = init->sinit_num_ostreams; + + asoc->c.sinit_num_ostreams = outcnt; + /* outcnt has been changed, need to re-init stream */ + err = sctp_stream_init(&asoc->stream, outcnt, 0, + GFP_KERNEL); + if (err) + goto free; } - if (sinfo_flags & SCTP_ABORT) { - chunk = sctp_make_abort_user(asoc, msg, msg_len); - if (!chunk) { - err = -ENOMEM; - goto out_unlock; - } + if (init->sinit_max_instreams) + asoc->c.sinit_max_instreams = init->sinit_max_instreams; - pr_debug("%s: aborting association:%p\n", - __func__, asoc); + if (init->sinit_max_attempts) + asoc->max_init_attempts = init->sinit_max_attempts; - sctp_primitive_ABORT(net, asoc, chunk); - err = 0; - goto out_unlock; - } + if (init->sinit_max_init_timeo) + asoc->max_init_timeo = + msecs_to_jiffies(init->sinit_max_init_timeo); } - /* Do we need to create the association? */ - if (!asoc) { - pr_debug("%s: there is no association yet\n", __func__); + *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN); + if (!*tp) { + err = -ENOMEM; + goto free; + } - if (sinfo_flags & (SCTP_EOF | SCTP_ABORT)) { - err = -EINVAL; - goto out_unlock; - } + if (!cmsgs->addrs_msg) + return 0; - /* Check for invalid stream against the stream counts, - * either the default or the user specified stream counts. - */ - if (sinfo) { - if (!sinit || !sinit->sinit_num_ostreams) { - /* Check against the defaults. 
*/ - if (sinfo->sinfo_stream >= - sp->initmsg.sinit_num_ostreams) { - err = -EINVAL; - goto out_unlock; - } - } else { - /* Check against the requested. */ - if (sinfo->sinfo_stream >= - sinit->sinit_num_ostreams) { - err = -EINVAL; - goto out_unlock; - } - } - } + /* sendv addr list parse */ + for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { + struct sctp_transport *transport; + struct sctp_association *old; + union sctp_addr _daddr; + int dlen; - /* - * API 3.1.2 bind() - UDP Style Syntax - * If a bind() or sctp_bindx() is not called prior to a - * sendmsg() call that initiates a new association, the - * system picks an ephemeral port and will choose an address - * set equivalent to binding with a wildcard address. - */ - if (!ep->base.bind_addr.port) { - if (sctp_autobind(sk)) { - err = -EAGAIN; - goto out_unlock; + if (cmsg->cmsg_level != IPPROTO_SCTP || + (cmsg->cmsg_type != SCTP_DSTADDRV4 && + cmsg->cmsg_type != SCTP_DSTADDRV6)) + continue; + + daddr = &_daddr; + memset(daddr, 0, sizeof(*daddr)); + dlen = cmsg->cmsg_len - sizeof(struct cmsghdr); + if (cmsg->cmsg_type == SCTP_DSTADDRV4) { + if (dlen < sizeof(struct in_addr)) { + err = -EINVAL; + goto free; } + + dlen = sizeof(struct in_addr); + daddr->v4.sin_family = AF_INET; + daddr->v4.sin_port = htons(asoc->peer.port); + memcpy(&daddr->v4.sin_addr, CMSG_DATA(cmsg), dlen); } else { - /* - * If an unprivileged user inherits a one-to-many - * style socket with open associations on a privileged - * port, it MAY be permitted to accept new associations, - * but it SHOULD NOT be permitted to open new - * associations. 
- */ - if (ep->base.bind_addr.port < inet_prot_sock(net) && - !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { - err = -EACCES; - goto out_unlock; + if (dlen < sizeof(struct in6_addr)) { + err = -EINVAL; + goto free; } - } - scope = sctp_scope(&to); - new_asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); - if (!new_asoc) { - err = -ENOMEM; - goto out_unlock; + dlen = sizeof(struct in6_addr); + daddr->v6.sin6_family = AF_INET6; + daddr->v6.sin6_port = htons(asoc->peer.port); + memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); } - asoc = new_asoc; - err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL); - if (err < 0) { - err = -ENOMEM; - goto out_free; + err = sctp_verify_addr(sk, daddr, sizeof(*daddr)); + if (err) + goto free; + + old = sctp_endpoint_lookup_assoc(ep, daddr, &transport); + if (old && old != asoc) { + if (old->state >= SCTP_STATE_ESTABLISHED) + err = -EISCONN; + else + err = -EALREADY; + goto free; } - /* If the SCTP_INIT ancillary data is specified, set all - * the association init values accordingly. - */ - if (sinit) { - if (sinit->sinit_num_ostreams) { - __u16 outcnt = sinit->sinit_num_ostreams; - - asoc->c.sinit_num_ostreams = outcnt; - /* outcnt has been changed, so re-init stream */ - err = sctp_stream_init(&asoc->stream, outcnt, 0, - GFP_KERNEL); - if (err) - goto out_free; - } - if (sinit->sinit_max_instreams) { - asoc->c.sinit_max_instreams = - sinit->sinit_max_instreams; - } - if (sinit->sinit_max_attempts) { - asoc->max_init_attempts - = sinit->sinit_max_attempts; - } - if (sinit->sinit_max_init_timeo) { - asoc->max_init_timeo = - msecs_to_jiffies(sinit->sinit_max_init_timeo); - } + if (sctp_endpoint_is_peeled_off(ep, daddr)) { + err = -EADDRNOTAVAIL; + goto free; } - /* Prime the peer's transport structures. 
*/ - transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN); + transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, + SCTP_UNKNOWN); if (!transport) { err = -ENOMEM; - goto out_free; + goto free; } } - /* ASSERT: we have a valid association at this point. */ - pr_debug("%s: we have a valid association\n", __func__); + return 0; - if (!sinfo) { - /* If the user didn't specify SNDINFO/SNDRCVINFO, make up - * one with some defaults. - */ - memset(&default_sinfo, 0, sizeof(default_sinfo)); - default_sinfo.sinfo_stream = asoc->default_stream; - default_sinfo.sinfo_flags = asoc->default_flags; - default_sinfo.sinfo_ppid = asoc->default_ppid; - default_sinfo.sinfo_context = asoc->default_context; - default_sinfo.sinfo_timetolive = asoc->default_timetolive; - default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc); - - sinfo = &default_sinfo; - } else if (fill_sinfo_ttl) { - /* In case SNDINFO was specified, we still need to fill - * it with a default ttl from the assoc here. - */ - sinfo->sinfo_timetolive = asoc->default_timetolive; - } +free: + sctp_association_free(asoc); + return err; +} - /* API 7.1.7, the sndbuf size per association bounds the - * maximum size of data that can be sent in a single send call. 
- */ - if (msg_len > sk->sk_sndbuf) { - err = -EMSGSIZE; - goto out_free; +static int sctp_sendmsg_check_sflags(struct sctp_association *asoc, + __u16 sflags, struct msghdr *msg, + size_t msg_len) +{ + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); + + if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) + return -EPIPE; + + if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP) && + !sctp_state(asoc, ESTABLISHED)) + return 0; + + if (sflags & SCTP_EOF) { + pr_debug("%s: shutting down association:%p\n", __func__, asoc); + sctp_primitive_SHUTDOWN(net, asoc, NULL); + + return 0; } - if (asoc->pmtu_pending) - sctp_assoc_pending_pmtu(asoc); + if (sflags & SCTP_ABORT) { + struct sctp_chunk *chunk; - /* If fragmentation is disabled and the message length exceeds the - * association fragmentation point, return EMSGSIZE. The I-D - * does not specify what this error is, but this looks like - * a great fit. - */ - if (sctp_sk(sk)->disable_fragments && (msg_len > asoc->frag_point)) { - err = -EMSGSIZE; - goto out_free; + chunk = sctp_make_abort_user(asoc, msg, msg_len); + if (!chunk) + return -ENOMEM; + + pr_debug("%s: aborting association:%p\n", __func__, asoc); + sctp_primitive_ABORT(net, asoc, chunk); + + return 0; } - /* Check for invalid stream. 
*/ + return 1; +} + +static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, + struct msghdr *msg, size_t msg_len, + struct sctp_transport *transport, + struct sctp_sndrcvinfo *sinfo) +{ + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); + struct sctp_datamsg *datamsg; + bool wait_connect = false; + struct sctp_chunk *chunk; + long timeo; + int err; + if (sinfo->sinfo_stream >= asoc->stream.outcnt) { err = -EINVAL; - goto out_free; + goto err; } - /* Allocate sctp_stream_out_ext if not already done */ if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); if (err) - goto out_free; + goto err; } + if (sctp_sk(sk)->disable_fragments && msg_len > asoc->frag_point) { + err = -EMSGSIZE; + goto err; + } + + if (asoc->pmtu_pending) + sctp_assoc_pending_pmtu(asoc); + if (sctp_wspace(asoc) < msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { - /* sk can be changed by peel off when waiting for buf. */ + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); - if (err) { - if (err == -ESRCH) { - /* asoc is already dead. */ - new_asoc = NULL; - err = -EPIPE; - } - goto out_free; - } + if (err) + goto err; } - /* If an address is passed with the sendto/sendmsg call, it is used - * to override the primary destination address in the TCP model, or - * when SCTP_ADDR_OVER flag is set in the UDP model. - */ - if ((sctp_style(sk, TCP) && msg_name) || - (sinfo_flags & SCTP_ADDR_OVER)) { - chunk_tp = sctp_assoc_lookup_paddr(asoc, &to); - if (!chunk_tp) { - err = -EINVAL; - goto out_free; - } - } else - chunk_tp = NULL; - - /* Auto-connect, if we aren't connected already. 
*/ if (sctp_state(asoc, CLOSED)) { err = sctp_primitive_ASSOCIATE(net, asoc, NULL); - if (err < 0) - goto out_free; + if (err) + goto err; - /* If stream interleave is enabled, wait_connect has to be - * done earlier than data enqueue, as it needs to make data - * or idata according to asoc->intl_enable which is set - * after connection is done. - */ - if (sctp_sk(asoc->base.sk)->strm_interleave) { + if (sctp_sk(sk)->strm_interleave) { timeo = sock_sndtimeo(sk, 0); err = sctp_wait_for_connect(asoc, &timeo); if (err) - goto out_unlock; + goto err; } else { wait_connect = true; } @@ -2003,73 +1913,186 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) pr_debug("%s: we associated primitively\n", __func__); } - /* Break the message into multiple chunks of maximum size. */ datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter); if (IS_ERR(datamsg)) { err = PTR_ERR(datamsg); - goto out_free; + goto err; } + asoc->force_delay = !!(msg->msg_flags & MSG_MORE); - /* Now send the (possibly) fragmented message. */ list_for_each_entry(chunk, &datamsg->chunks, frag_list) { sctp_chunk_hold(chunk); - - /* Do accounting for the write space. */ sctp_set_owner_w(chunk); - - chunk->transport = chunk_tp; + chunk->transport = transport; } - /* Send it to the lower layers. Note: all chunks - * must either fail or succeed. The lower layer - * works that way today. Keep it that way or this - * breaks. - */ err = sctp_primitive_SEND(net, asoc, datamsg); - /* Did the lower layer accept the chunk? */ if (err) { sctp_datamsg_free(datamsg); - goto out_free; + goto err; } pr_debug("%s: we sent primitively\n", __func__); sctp_datamsg_put(datamsg); - err = msg_len; if (unlikely(wait_connect)) { - timeo = sock_sndtimeo(sk, msg_flags & MSG_DONTWAIT); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); sctp_wait_for_connect(asoc, &timeo); } - /* If we are already past ASSOCIATE, the lower - * layers are responsible for association cleanup. 
- */ - goto out_unlock; + err = msg_len; -out_free: - if (new_asoc) - sctp_association_free(asoc); -out_unlock: - release_sock(sk); +err: + return err; +} -out_nounlock: - return sctp_error(sk, msg_flags, err); +static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk, + const struct msghdr *msg, + struct sctp_cmsgs *cmsgs) +{ + union sctp_addr *daddr = NULL; + int err; -#if 0 -do_sock_err: - if (msg_len) - err = msg_len; - else - err = sock_error(sk); - goto out; + if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) { + int len = msg->msg_namelen; -do_interrupted: - if (msg_len) - err = msg_len; - goto out; -#endif /* 0 */ + if (len > sizeof(*daddr)) + len = sizeof(*daddr); + + daddr = (union sctp_addr *)msg->msg_name; + + err = sctp_verify_addr(sk, daddr, len); + if (err) + return ERR_PTR(err); + } + + return daddr; +} + +static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct sctp_cmsgs *cmsgs) +{ + if (!cmsgs->srinfo && !cmsgs->sinfo) { + sinfo->sinfo_stream = asoc->default_stream; + sinfo->sinfo_ppid = asoc->default_ppid; + sinfo->sinfo_context = asoc->default_context; + sinfo->sinfo_assoc_id = sctp_assoc2id(asoc); + + if (!cmsgs->prinfo) + sinfo->sinfo_flags = asoc->default_flags; + } + + if (!cmsgs->srinfo && !cmsgs->prinfo) + sinfo->sinfo_timetolive = asoc->default_timetolive; + + if (cmsgs->authinfo) { + /* Reuse sinfo_tsn to indicate that authinfo was set and + * sinfo_ssn to save the keyid on tx path. 
+ */ + sinfo->sinfo_tsn = 1; + sinfo->sinfo_ssn = cmsgs->authinfo->auth_keynumber; + } +} + +static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) +{ + struct sctp_endpoint *ep = sctp_sk(sk)->ep; + struct sctp_transport *transport = NULL; + struct sctp_sndrcvinfo _sinfo, *sinfo; + struct sctp_association *asoc; + struct sctp_cmsgs cmsgs; + union sctp_addr *daddr; + bool new = false; + __u16 sflags; + int err; + + /* Parse and get snd_info */ + err = sctp_sendmsg_parse(sk, &cmsgs, &_sinfo, msg, msg_len); + if (err) + goto out; + + sinfo = &_sinfo; + sflags = sinfo->sinfo_flags; + + /* Get daddr from msg */ + daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs); + if (IS_ERR(daddr)) { + err = PTR_ERR(daddr); + goto out; + } + + lock_sock(sk); + + /* SCTP_SENDALL process */ + if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP)) { + list_for_each_entry(asoc, &ep->asocs, asocs) { + err = sctp_sendmsg_check_sflags(asoc, sflags, msg, + msg_len); + if (err == 0) + continue; + if (err < 0) + goto out_unlock; + + sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); + + err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, + NULL, sinfo); + if (err < 0) + goto out_unlock; + + iov_iter_revert(&msg->msg_iter, err); + } + + goto out_unlock; + } + + /* Get and check or create asoc */ + if (daddr) { + asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); + if (asoc) { + err = sctp_sendmsg_check_sflags(asoc, sflags, msg, + msg_len); + if (err <= 0) + goto out_unlock; + } else { + err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr, + &transport); + if (err) + goto out_unlock; + + asoc = transport->asoc; + new = true; + } + + if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER)) + transport = NULL; + } else { + asoc = sctp_id2assoc(sk, sinfo->sinfo_assoc_id); + if (!asoc) { + err = -EPIPE; + goto out_unlock; + } + + err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); + if (err <= 0) + goto out_unlock; + } + + /* Update snd_info with the asoc */ + 
sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); + + /* Send msg to the asoc */ + err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, transport, sinfo); + if (err < 0 && err != -ESRCH && new) + sctp_association_free(asoc); + +out_unlock: + release_sock(sk); +out: + return sctp_error(sk, msg->msg_flags, err); } /* This is an extended version of skb_pull() that removes the data from the @@ -3624,6 +3647,33 @@ static int sctp_setsockopt_del_key(struct sock *sk, } /* + * 8.3.4 Deactivate a Shared Key (SCTP_AUTH_DEACTIVATE_KEY) + * + * This set option will deactivate a shared secret key. + */ +static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_endpoint *ep = sctp_sk(sk)->ep; + struct sctp_authkeyid val; + struct sctp_association *asoc; + + if (!ep->auth_enable) + return -EACCES; + + if (optlen != sizeof(struct sctp_authkeyid)) + return -EINVAL; + if (copy_from_user(&val, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, val.scact_assoc_id); + if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + return sctp_auth_deact_key_id(ep, asoc, val.scact_keynumber); +} + +/* * 8.1.23 SCTP_AUTO_ASCONF * * This option will enable or disable the use of the automatic generation of @@ -4215,6 +4265,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_AUTH_DELETE_KEY: retval = sctp_setsockopt_del_key(sk, optval, optlen); break; + case SCTP_AUTH_DEACTIVATE_KEY: + retval = sctp_setsockopt_deactivate_key(sk, optval, optlen); + break; case SCTP_AUTO_ASCONF: retval = sctp_setsockopt_auto_asconf(sk, optval, optlen); break; @@ -7189,6 +7242,7 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_AUTH_KEY: case SCTP_AUTH_CHUNK: case SCTP_AUTH_DELETE_KEY: + case SCTP_AUTH_DEACTIVATE_KEY: retval = -EOPNOTSUPP; break; case SCTP_HMAC_IDENT: @@ -7811,8 +7865,8 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) 
if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | - SCTP_ABORT | SCTP_EOF)) + SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | + SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -7835,10 +7889,60 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | - SCTP_ABORT | SCTP_EOF)) + SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | + SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; + case SCTP_PRINFO: + /* SCTP Socket API Extension + * 5.3.7 SCTP PR-SCTP Information Structure (SCTP_PRINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_PRINFO struct sctp_prinfo + */ + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_prinfo))) + return -EINVAL; + + cmsgs->prinfo = CMSG_DATA(cmsg); + if (cmsgs->prinfo->pr_policy & ~SCTP_PR_SCTP_MASK) + return -EINVAL; + + if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE) + cmsgs->prinfo->pr_value = 0; + break; + case SCTP_AUTHINFO: + /* SCTP Socket API Extension + * 5.3.8 SCTP AUTH Information Structure (SCTP_AUTHINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_AUTHINFO struct sctp_authinfo + */ + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_authinfo))) + return -EINVAL; + + cmsgs->authinfo = CMSG_DATA(cmsg); + break; + case SCTP_DSTADDRV4: + case SCTP_DSTADDRV6: + /* SCTP Socket API Extension + * 5.3.9/10 SCTP Destination IPv4/6 Address Structure (SCTP_DSTADDRV4/6) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). 
+ * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_DSTADDRV4 struct in_addr + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_DSTADDRV6 struct in6_addr + */ + cmsgs->addrs_msg = my_msg; + break; default: return -EINVAL; } @@ -8062,6 +8166,26 @@ static void sctp_wfree(struct sk_buff *skb) sk->sk_wmem_queued -= skb->truesize; sk_mem_uncharge(sk, skb->truesize); + if (chunk->shkey) { + struct sctp_shared_key *shkey = chunk->shkey; + + /* refcnt == 2 and !list_empty mean after this release, it's + * not being used anywhere, and it's time to notify userland + * that this shkey can be freed if it's been deactivated. + */ + if (shkey->deactivated && !list_empty(&shkey->key_list) && + refcount_read(&shkey->refcnt) == 2) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, + SCTP_AUTH_FREE_KEY, + GFP_KERNEL); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } + sctp_auth_shkey_release(chunk->shkey); + } + sock_wfree(skb); sctp_wake_up_waiters(sk, asoc); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da1a5cdefd13..5f8046c62d90 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -7,13 +7,11 @@ * applicable with RoCE-cards only * * Initial restrictions: - * - non-blocking connect postponed - * - IPv6 support postponed * - support for alternate links postponed * - partial support for non-blocking sockets only * - support for urgent data postponed * - * Copyright IBM Corp. 2016 + * Copyright IBM Corp. 
2016, 2018 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> * based on prototype from Frank Blaschka @@ -24,7 +22,6 @@ #include <linux/module.h> #include <linux/socket.h> -#include <linux/inetdevice.h> #include <linux/workqueue.h> #include <linux/in.h> #include <linux/sched/signal.h> @@ -66,6 +63,10 @@ static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; +static struct smc_hashinfo smc_v6_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), +}; + int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; @@ -105,6 +106,18 @@ struct proto smc_proto = { }; EXPORT_SYMBOL_GPL(smc_proto); +struct proto smc_proto6 = { + .name = "SMC6", + .owner = THIS_MODULE, + .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v6_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, +}; +EXPORT_SYMBOL_GPL(smc_proto6); + static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -161,19 +174,22 @@ static void smc_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } -static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, + int protocol) { struct smc_sock *smc; + struct proto *prot; struct sock *sk; - sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0); + prot = (protocol == SMCPROTO_SMC6) ? 
&smc_proto6 : &smc_proto; + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); if (!sk) return NULL; sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; - sk->sk_protocol = SMCPROTO_SMC; + sk->sk_protocol = protocol; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_LIST_HEAD(&smc->accept_q); @@ -200,10 +216,13 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, goto out; rc = -EAFNOSUPPORT; + if (addr->sin_family != AF_INET && + addr->sin_family != AF_INET6 && + addr->sin_family != AF_UNSPEC) + goto out; /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ - if ((addr->sin_family != AF_INET) && - ((addr->sin_family != AF_UNSPEC) || - (addr->sin_addr.s_addr != htonl(INADDR_ANY)))) + if (addr->sin_family == AF_UNSPEC && + addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; lock_sock(sk); @@ -273,47 +292,7 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* determine subnet and mask of internal TCP socket */ -int smc_netinfo_by_tcpsk(struct socket *clcsock, - __be32 *subnet, u8 *prefix_len) -{ - struct dst_entry *dst = sk_dst_get(clcsock->sk); - struct in_device *in_dev; - struct sockaddr_in addr; - int rc = -ENOENT; - int len; - - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { - rc = -ENODEV; - goto out_rel; - } - - /* get address to which the internal TCP socket is bound */ - kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); - /* analyze IPv4 specific data of net_device belonging to TCP socket */ - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dst->dev); - for_ifa(in_dev) { - if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) - continue; - *prefix_len = inet_mask_len(ifa->ifa_mask); - *subnet = ifa->ifa_address & ifa->ifa_mask; - rc = 0; - break; - } endfor_ifa(in_dev); - rcu_read_unlock(); - -out_rel: - dst_release(dst); -out: - 
return rc; -} - -static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid) +static int smc_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link_group *lgr = smc->conn.lgr; struct smc_link *link; @@ -333,6 +312,9 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid) return rc; } + if (link->llc_confirm_rc) + return SMC_CLC_DECL_RMBE_EC; + rc = smc_ib_modify_qp_rts(link); if (rc) return SMC_CLC_DECL_INTERR; @@ -347,11 +329,33 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid) /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, link->smcibdev->mac[link->ibport - 1], - gid, SMC_LLC_RESP); + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TCL; - return rc; + /* receive ADD LINK request from server over RoCE fabric */ + rest = wait_for_completion_interruptible_timeout(&link->llc_add, + SMC_LLC_WAIT_TIME); + if (rest <= 0) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE); + return rc; + } + + /* send add link reject message, only one link supported for now */ + rc = smc_llc_send_add_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_RESP); + if (rc < 0) + return SMC_CLC_DECL_TCL; + + link->state = SMC_LNK_ACTIVE; + + return 0; } static void smc_conn_save_peer_info(struct smc_sock *smc, @@ -373,19 +377,9 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->qp_mtu; } -static void smc_lgr_forget(struct smc_link_group *lgr) -{ - spin_lock_bh(&smc_lgr_list.lock); - /* do not use this link group for new connections */ - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); - spin_unlock_bh(&smc_lgr_list.lock); -} - /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc) { - struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr; struct 
smc_clc_msg_accept_confirm aclc; int local_contact = SMC_FIRST_CONTACT; struct smc_ib_device *smcibdev; @@ -439,8 +433,8 @@ static int smc_connect_rdma(struct smc_sock *smc) srv_first_contact = aclc.hdr.flag; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev, - ibport, &aclc.lcl, srv_first_contact); + local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl, + srv_first_contact); if (local_contact < 0) { rc = local_contact; if (rc == -ENOMEM) @@ -499,8 +493,7 @@ static int smc_connect_rdma(struct smc_sock *smc) if (local_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link( - smc, &smcibdev->gid[ibport - 1]); + reason_code = smc_clnt_conf_first_link(smc); if (reason_code < 0) { rc = reason_code; goto out_err_unlock; @@ -557,9 +550,8 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, /* separate smc parameter checking to be safe */ if (alen < sizeof(addr->sa_family)) goto out_err; - if (addr->sa_family != AF_INET) + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) goto out_err; - smc->addr = addr; /* needed for nonblocking connect */ lock_sock(sk); switch (sk->sk_state) { @@ -600,7 +592,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) int rc; release_sock(lsk); - new_sk = smc_sock_alloc(sock_net(lsk), NULL); + new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; @@ -749,9 +741,34 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE); + return rc; } - return rc; + if (link->llc_confirm_resp_rc) + return SMC_CLC_DECL_RMBE_EC; + + /* send ADD LINK request to client over the RoCE fabric */ + rc = smc_llc_send_add_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_REQ); + if (rc < 0) + return 
SMC_CLC_DECL_TCL; + + /* receive ADD LINK response from client over the RoCE fabric */ + rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, + SMC_LLC_WAIT_TIME); + if (rest <= 0) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE); + return rc; + } + + link->state = SMC_LNK_ACTIVE; + + return 0; } /* setup for RDMA connection of server */ @@ -767,13 +784,10 @@ static void smc_listen_work(struct work_struct *work) struct sock *newsmcsk = &new_smc->sk; struct smc_clc_msg_proposal *pclc; struct smc_ib_device *smcibdev; - struct sockaddr_in peeraddr; u8 buf[SMC_CLC_MAX_LEN]; struct smc_link *link; int reason_code = 0; - int rc = 0, len; - __be32 subnet; - u8 prefix_len; + int rc = 0; u8 ibport; /* check if peer is smc capable */ @@ -808,28 +822,19 @@ static void smc_listen_work(struct work_struct *work) goto decline_rdma; } - /* determine subnet and mask from internal TCP socket */ - rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len); - if (rc) { - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; - } - pclc = (struct smc_clc_msg_proposal *)&buf; pclc_prfx = smc_clc_proposal_get_prefix(pclc); - if (pclc_prfx->outgoing_subnet != subnet || - pclc_prfx->prefix_len != prefix_len) { + + rc = smc_clc_prfx_match(newclcsock, pclc_prfx); + if (rc) { reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; } - /* get address of the peer connected to the internal TCP socket */ - kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len); - /* allocate connection / link group */ mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr, - smcibdev, ibport, &pclc->lcl, 0); + local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl, + 0); if (local_contact < 0) { rc = local_contact; if (rc == -ENOMEM) @@ -978,10 +983,6 @@ out: lsmc->clcsock = NULL; } release_sock(lsk); - /* no 
more listening, wake up smc_close_wait_listen_clcsock and - * accept - */ - lsk->sk_state_change(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_listen */ } @@ -1075,7 +1076,7 @@ out: } static int smc_getname(struct socket *sock, struct sockaddr *addr, - int *len, int peer) + int peer) { struct smc_sock *smc; @@ -1085,7 +1086,7 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr, smc = smc_sk(sock->sk); - return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer); + return smc->clcsock->ops->getname(smc->clcsock, addr, peer); } static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) @@ -1383,6 +1384,7 @@ static const struct proto_ops smc_sock_ops = { static int smc_create(struct net *net, struct socket *sock, int protocol, int kern) { + int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; struct smc_sock *smc; struct sock *sk; int rc; @@ -1392,22 +1394,24 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -EPROTONOSUPPORT; - if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP)) + if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6) goto out; rc = -ENOBUFS; sock->ops = &smc_sock_ops; - sk = smc_sock_alloc(net, sock); + sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); smc->use_fallback = false; /* assume rdma capability first */ - rc = sock_create_kern(net, PF_INET, SOCK_STREAM, - IPPROTO_TCP, &smc->clcsock); - if (rc) + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); + if (rc) { sk_common_release(sk); + goto out; + } smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); @@ -1443,16 +1447,23 @@ static int __init smc_init(void) rc = proto_register(&smc_proto, 1); if (rc) { - pr_err("%s: proto_register fails with %d\n", __func__, rc); + pr_err("%s: 
proto_register(v4) fails with %d\n", __func__, rc); goto out_pnet; } + rc = proto_register(&smc_proto6, 1); + if (rc) { + pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc); + goto out_proto; + } + rc = sock_register(&smc_sock_family_ops); if (rc) { pr_err("%s: sock_register fails with %d\n", __func__, rc); - goto out_proto; + goto out_proto6; } INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { @@ -1465,6 +1476,8 @@ static int __init smc_init(void) out_sock: sock_unregister(PF_SMC); +out_proto6: + proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); out_pnet: @@ -1483,11 +1496,13 @@ static void __exit smc_exit(void) spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); + cancel_delayed_work_sync(&lgr->free_work); smc_lgr_free(lgr); /* free link group */ } static_branch_disable(&tcp_have_smc); smc_ib_unregister_client(); sock_unregister(PF_SMC); + proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); } diff --git a/net/smc/smc.h b/net/smc/smc.h index 9518986c97b1..e4829a2f46ba 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -18,11 +18,13 @@ #include "smc_ib.h" -#define SMCPROTO_SMC 0 /* SMC protocol */ +#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ +#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ #define SMC_MAX_PORTS 2 /* Max # of ports */ extern struct proto smc_proto; +extern struct proto smc_proto6; #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 @@ -172,7 +174,6 @@ struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ struct smc_connection conn; /* smc connection */ - struct sockaddr *addr; /* inet connect address */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ @@ -263,10 
+264,8 @@ static inline bool using_ipsec(struct smc_sock *smc) struct smc_clc_msg_local; -int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, - u8 *prefix_len); void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, +int smc_conn_create(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport, struct smc_clc_msg_local *lcl, int srv_first_contact); struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 3cd086e5bd28..b42395d24cba 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -269,7 +269,7 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved)) return; /* short message */ - if (cdc->len != sizeof(*cdc)) + if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message */ smc_cdc_msg_recv(cdc, link, wc->wr_id); } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 8ac51583a063..3a988c22f627 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -5,15 +5,17 @@ * CLC (connection layer control) handshake over initial TCP socket to * prepare for RDMA traffic * - * Copyright IBM Corp. 2016 + * Copyright IBM Corp. 
2016, 2018 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/in.h> +#include <linux/inetdevice.h> #include <linux/if_ether.h> #include <linux/sched/signal.h> +#include <net/addrconf.h> #include <net/sock.h> #include <net/tcp.h> @@ -22,6 +24,9 @@ #include "smc_clc.h" #include "smc_ib.h" +/* eye catcher "SMCR" EBCDIC for CLC messages */ +static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; + /* check if received message has a correct header length and contains valid * heading and trailing eyecatchers */ @@ -70,6 +75,172 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) return true; } +/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ +static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + + if (!in_dev) + return -ENODEV; + for_ifa(in_dev) { + if (!inet_ifa_match(ipv4, ifa)) + continue; + prop->prefix_len = inet_mask_len(ifa->ifa_mask); + prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; + /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ + return 0; + } endfor_ifa(in_dev); + return -ENOENT; +} + +/* fill CLC proposal msg with ipv6 prefixes from device */ +static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, + struct smc_clc_msg_proposal_prefix *prop, + struct smc_clc_ipv6_prefix *ipv6_prfx) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); + struct inet6_ifaddr *ifa; + int cnt = 0; + + if (!in6_dev) + return -ENODEV; + /* use a maximum of 8 IPv6 prefixes from device */ + list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { + if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) + continue; + ipv6_addr_prefix(&ipv6_prfx[cnt].prefix, + &ifa->addr, ifa->prefix_len); + ipv6_prfx[cnt].prefix_len = ifa->prefix_len; + cnt++; + if (cnt == SMC_CLC_MAX_V6_PREFIX) + break; + } + prop->ipv6_prefixes_cnt = 
cnt; + if (cnt) + return 0; +#endif + return -ENOENT; +} + +/* retrieve and set prefixes in CLC proposal msg */ +static int smc_clc_prfx_set(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop, + struct smc_clc_ipv6_prefix *ipv6_prfx) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct sockaddr_storage addrs; + struct sockaddr_in6 *addr6; + struct sockaddr_in *addr; + int rc = -ENOENT; + + memset(prop, 0, sizeof(*prop)); + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + /* get address to which the internal TCP socket is bound */ + kernel_getsockname(clcsock, (struct sockaddr *)&addrs); + /* analyze IP specific data of net_device belonging to TCP socket */ + addr6 = (struct sockaddr_in6 *)&addrs; + rcu_read_lock(); + if (addrs.ss_family == PF_INET) { + /* IPv4 */ + addr = (struct sockaddr_in *)&addrs; + rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); + } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { + /* mapped IPv4 address - peer is IPv4 only */ + rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], + prop); + } else { + /* IPv6 */ + rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); + } + rcu_read_unlock(); +out_rel: + dst_release(dst); +out: + return rc; +} + +/* match ipv4 addrs of dev against addr in CLC proposal */ +static int smc_clc_prfx_match4_rcu(struct net_device *dev, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (!in_dev) + return -ENODEV; + for_ifa(in_dev) { + if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && + inet_ifa_match(prop->outgoing_subnet, ifa)) + return 0; + } endfor_ifa(in_dev); + + return -ENOENT; +} + +/* match ipv6 addrs of dev against addrs in CLC proposal */ +static int smc_clc_prfx_match6_rcu(struct net_device *dev, + struct smc_clc_msg_proposal_prefix *prop) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev = __in6_dev_get(dev); + struct 
smc_clc_ipv6_prefix *ipv6_prfx; + struct inet6_ifaddr *ifa; + int i, max; + + if (!in6_dev) + return -ENODEV; + /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */ + ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop)); + max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX); + list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { + if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) + continue; + for (i = 0; i < max; i++) { + if (ifa->prefix_len == ipv6_prfx[i].prefix_len && + ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix, + ifa->prefix_len)) + return 0; + } + } +#endif + return -ENOENT; +} + +/* check if proposed prefixes match one of our device prefixes */ +int smc_clc_prfx_match(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + int rc; + + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + rcu_read_lock(); + if (!prop->ipv6_prefixes_cnt) + rc = smc_clc_prfx_match4_rcu(dst->dev, prop); + else + rc = smc_clc_prfx_match6_rcu(dst->dev, prop); + rcu_read_unlock(); +out_rel: + dst_release(dst); +out: + return rc; +} + /* Wait for data on the tcp-socket, analyze received data * Returns: * 0 if success and it was not a decline that we received. 
@@ -133,7 +304,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, /* receive the complete CLC message */ memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, buflen); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen); krflags = MSG_WAITALL; smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; len = sock_recvmsg(smc->clcsock, &msg, krflags); @@ -189,16 +360,24 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport) { + struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_msg_proposal pclc; struct smc_clc_msg_trail trl; + int len, i, plen, rc; int reason_code = 0; - struct kvec vec[3]; + struct kvec vec[4]; struct msghdr msg; - int len, plen, rc; + + /* retrieve ip prefixes for CLC proposal msg */ + rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx); + if (rc) + return SMC_CLC_DECL_CNFERR; /* configuration error */ /* send SMC Proposal CLC message */ - plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl); + plen = sizeof(pclc) + sizeof(pclc_prfx) + + (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) + + sizeof(trl); memset(&pclc, 0, sizeof(pclc)); memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc.hdr.type = SMC_CLC_PROPOSAL; @@ -209,23 +388,22 @@ int smc_clc_send_proposal(struct smc_sock *smc, memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); pclc.iparea_offset = htons(0); - memset(&pclc_prfx, 0, sizeof(pclc_prfx)); - /* determine subnet and mask from internal TCP socket */ - rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet, - &pclc_prfx.prefix_len); - if (rc) - return SMC_CLC_DECL_CNFERR; /* configuration error */ - pclc_prfx.ipv6_prefixes_cnt = 0; memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); - vec[0].iov_base = &pclc; - vec[0].iov_len = sizeof(pclc); - vec[1].iov_base = 
&pclc_prfx; - vec[1].iov_len = sizeof(pclc_prfx); - vec[2].iov_base = &trl; - vec[2].iov_len = sizeof(trl); + i = 0; + vec[i].iov_base = &pclc; + vec[i++].iov_len = sizeof(pclc); + vec[i].iov_base = &pclc_prfx; + vec[i++].iov_len = sizeof(pclc_prfx); + if (pclc_prfx.ipv6_prefixes_cnt > 0) { + vec[i].iov_base = &ipv6_prfx[0]; + vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt * + sizeof(ipv6_prfx[0]); + } + vec[i].iov_base = &trl; + vec[i++].iov_len = sizeof(trl); /* due to the few bytes needed for clc-handshake this cannot block */ - len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen); + len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); if (len < sizeof(pclc)) { if (len >= 0) { reason_code = -ENETUNREACH; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index c145a0f36a68..63bf1dc2c1f9 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -22,9 +22,6 @@ #define SMC_CLC_CONFIRM 0x03 #define SMC_CLC_DECLINE 0x04 -/* eye catcher "SMCR" EBCDIC for CLC messages */ -static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; - #define SMC_CLC_V1 0x1 /* SMC version */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ @@ -36,6 +33,7 @@ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; #define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ #define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */ #define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */ +#define SMC_CLC_DECL_RMBE_EC 0x08000000 /* peer has eyecatcher in RMBE */ struct smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ @@ -62,10 +60,15 @@ struct smc_clc_msg_local { /* header2 of clc messages */ u8 mac[6]; /* mac of ib_device port */ }; +#define SMC_CLC_MAX_V6_PREFIX 8 + +/* Struct would be 4 byte aligned, but it is used in an array that is sent + * to peers and must conform to RFC7609, hence we need to use packed here. 
+ */ struct smc_clc_ipv6_prefix { - u8 prefix[4]; + struct in6_addr prefix; u8 prefix_len; -} __packed; +} __packed; /* format defined in RFC7609 */ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ __be32 outgoing_subnet; /* subnet mask */ @@ -81,9 +84,11 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ } __aligned(4); #define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28 -#define SMC_CLC_PROPOSAL_MAX_PREFIX (8 * sizeof(struct smc_clc_ipv6_prefix)) +#define SMC_CLC_PROPOSAL_MAX_PREFIX (SMC_CLC_MAX_V6_PREFIX * \ + sizeof(struct smc_clc_ipv6_prefix)) #define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \ SMC_CLC_PROPOSAL_MAX_OFFSET + \ + sizeof(struct smc_clc_msg_proposal_prefix) + \ SMC_CLC_PROPOSAL_MAX_PREFIX + \ sizeof(struct smc_clc_msg_trail)) @@ -124,9 +129,8 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } -struct smc_sock; -struct smc_ib_device; - +int smc_clc_prfx_match(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index e339c0186dcf..fa41d9881741 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -30,27 +30,6 @@ static void smc_close_cleanup_listen(struct sock *parent) smc_close_non_accepted(sk); } -static void smc_close_wait_listen_clcsock(struct smc_sock *smc) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct sock *sk = &smc->sk; - signed long timeout; - - timeout = SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME; - add_wait_queue(sk_sleep(sk), &wait); - do { - release_sock(sk); - if (smc->clcsock) - timeout = wait_woken(&wait, TASK_UNINTERRUPTIBLE, - timeout); - sched_annotate_sleep(); - lock_sock(sk); - if (!smc->clcsock) - break; - } while (timeout); - remove_wait_queue(sk_sleep(sk), &wait); 
-} - /* wait for sndbuf data being transmitted */ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) { @@ -204,9 +183,11 @@ again: rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); /* wake up kernel_accept of smc_tcp_listen_worker */ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk); - smc_close_wait_listen_clcsock(smc); } smc_close_cleanup_listen(sk); + release_sock(sk); + flush_work(&smc->tcp_listen_work); + lock_sock(sk); break; case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2424c7100aaf..f44f6803f7ff 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -32,6 +32,17 @@ static u32 smc_lgr_num; /* unique link group number */ +static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) +{ + /* client link group creation always follows the server link group + * creation. For client use a somewhat higher removal delay time, + * otherwise there is a risk of out-of-sync link groups. + */ + mod_delayed_work(system_wq, &lgr->free_work, + lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); +} + /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock @@ -111,13 +122,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) write_unlock_bh(&lgr->conns_lock); if (!reduced || lgr->conns_num) return; - /* client link group creation always follows the server link group - * creation. For client use a somewhat higher removal delay time, - * otherwise there is a risk of out-of-sync link groups. - */ - mod_delayed_work(system_wq, &lgr->free_work, - lgr->role == SMC_CLNT ? 
SMC_LGR_FREE_DELAY_CLNT : - SMC_LGR_FREE_DELAY_SERV); + smc_lgr_schedule_free_work(lgr); } static void smc_lgr_free_work(struct work_struct *work) @@ -140,11 +145,12 @@ static void smc_lgr_free_work(struct work_struct *work) list_del_init(&lgr->list); /* remove from smc_lgr_list */ free: spin_unlock_bh(&smc_lgr_list.lock); - smc_lgr_free(lgr); + if (!delayed_work_pending(&lgr->free_work)) + smc_lgr_free(lgr); } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, +static int smc_lgr_create(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport, char *peer_systemid, unsigned short vlan_id) { @@ -161,7 +167,6 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, } lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; lgr->sync_err = false; - lgr->daddr = peer_in_addr; memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); @@ -177,6 +182,8 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, lnk = &lgr->lnk[SMC_SINGLE_LINK]; /* initialize link */ + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = SMC_SINGLE_LINK; lnk->smcibdev = smcibdev; lnk->ibport = ibport; lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; @@ -198,6 +205,8 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, goto destroy_qp; init_completion(&lnk->llc_confirm); init_completion(&lnk->llc_confirm_resp); + init_completion(&lnk->llc_add); + init_completion(&lnk->llc_add_resp); smc->conn.lgr = lgr; rwlock_init(&lgr->conns_lock); @@ -306,6 +315,15 @@ void smc_lgr_free(struct smc_link_group *lgr) kfree(lgr); } +void smc_lgr_forget(struct smc_link_group *lgr) +{ + spin_lock_bh(&smc_lgr_list.lock); + /* do not use this link group for new connections */ + if (!list_empty(&lgr->list)) + list_del_init(&lgr->list); + spin_unlock_bh(&smc_lgr_list.lock); +} + /* terminate linkgroup abnormally */ void smc_lgr_terminate(struct 
smc_link_group *lgr) { @@ -313,15 +331,7 @@ void smc_lgr_terminate(struct smc_link_group *lgr) struct smc_sock *smc; struct rb_node *node; - spin_lock_bh(&smc_lgr_list.lock); - if (list_empty(&lgr->list)) { - /* termination already triggered */ - spin_unlock_bh(&smc_lgr_list.lock); - return; - } - /* do not use this link group for new connections */ - list_del_init(&lgr->list); - spin_unlock_bh(&smc_lgr_list.lock); + smc_lgr_forget(lgr); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); @@ -339,6 +349,7 @@ void smc_lgr_terminate(struct smc_link_group *lgr) } write_unlock_bh(&lgr->conns_lock); wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); + smc_lgr_schedule_free_work(lgr); } /* Determine vlan of internal TCP socket. @@ -400,7 +411,7 @@ static int smc_link_determine_gid(struct smc_link_group *lgr) } /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, +int smc_conn_create(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport, struct smc_clc_msg_local *lcl, int srv_first_contact) { @@ -457,7 +468,7 @@ int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, create: if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport, + rc = smc_lgr_create(smc, smcibdev, ibport, lcl->id_for_peer, vlan_id); if (rc) goto out; @@ -465,7 +476,7 @@ create: rc = smc_link_determine_gid(conn->lgr); } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; - conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg); + conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif @@ -698,27 +709,55 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) return -ENOSPC; } -/* save rkey and dma_addr received from peer during clc handshake */ -int smc_rmb_rtoken_handling(struct smc_connection *conn, - struct smc_clc_msg_accept_confirm *clc) +/* add a new rtoken from peer */ 
+int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) { - u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr); - struct smc_link_group *lgr = conn->lgr; - u32 rkey = ntohl(clc->rmb_rkey); + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && test_bit(i, lgr->rtokens_used_mask)) { - conn->rtoken_idx = i; + /* already in list */ + return i; + } + } + i = smc_rmb_reserve_rtoken_idx(lgr); + if (i < 0) + return i; + lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; + lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; + return i; +} + +/* delete an rtoken */ +int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) +{ + u32 rkey = ntohl(nw_rkey); + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && + test_bit(i, lgr->rtokens_used_mask)) { + lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; + lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; + + clear_bit(i, lgr->rtokens_used_mask); return 0; } } - conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr); + return -ENOENT; +} + +/* save rkey and dma_addr received from peer during clc handshake */ +int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc) +{ + conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, + clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey; - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr; return 0; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fe691bf9af91..07e2a393e6d9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -32,6 +32,12 @@ enum smc_lgr_role { /* possible roles of a link group */ SMC_SERV /* server */ }; +enum smc_link_state { /* possible states of a link */ + 
SMC_LNK_INACTIVE, /* link is inactive */ + SMC_LNK_ACTIVATING, /* link is being activated */ + SMC_LNK_ACTIVE /* link is active */ +}; + #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ struct smc_wr_buf { @@ -87,8 +93,14 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + + enum smc_link_state state; /* state of link */ struct completion llc_confirm; /* wait for rx of conf link */ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ + int llc_confirm_rc; /* rc from confirm link msg */ + int llc_confirm_resp_rc; /* rc from conf_resp msg */ + struct completion llc_add; /* wait for rx of add link */ + struct completion llc_add_resp; /* wait for rx of add link rsp*/ }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -124,7 +136,6 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smc_link_group { struct list_head list; enum smc_lgr_role role; /* client or server */ - __be32 daddr; /* destination ip address */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ char peer_systemid[SMC_SYSTEMID_LEN]; /* unique system_id of peer */ @@ -186,10 +197,13 @@ struct smc_sock; struct smc_clc_msg_accept_confirm; void smc_lgr_free(struct smc_link_group *lgr); +void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); int smc_buf_create(struct smc_sock *smc); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); +int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 
2a8957bd6d38..26df554f7588 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -23,6 +23,8 @@ #include "smc_wr.h" #include "smc.h" +#define SMC_MAX_CQE 32766 /* max. # of completion queue elements */ + #define SMC_QP_MIN_RNR_TIMER 5 #define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */ #define SMC_QP_RETRY_CNT 7 /* 7: infinite */ @@ -438,9 +440,15 @@ out: long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { - .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 }; + .cqe = SMC_MAX_CQE, .comp_vector = 0 }; + int cqe_size_order, smc_order; long rc; + /* the calculated number of cq entries fits to mlx5 cq allocation */ + cqe_size_order = cache_line_size() == 128 ? 7 : 6; + smc_order = MAX_ORDER - cqe_size_order - 1; + if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) + cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, smc_wr_tx_cq_handler, NULL, smcibdev, &cqattr); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 92fe4cc8c82c..ea4b21981b4b 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -4,9 +4,6 @@ * * Link Layer Control (LLC) * - * For now, we only support the necessary "confirm link" functionality - * which happens for the first RoCE link after successful CLC handshake. - * * Copyright IBM Corp. 
2016 * * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com> @@ -21,6 +18,122 @@ #include "smc_clc.h" #include "smc_llc.h" +#define SMC_LLC_DATA_LEN 40 + +struct smc_llc_hdr { + struct smc_wr_rx_hdr common; + u8 length; /* 44 */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:4, + add_link_rej_rsn:4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 add_link_rej_rsn:4, + reserved:4; +#endif + u8 flags; +}; + +#define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03 + +struct smc_llc_msg_confirm_link { /* type 0x01 */ + struct smc_llc_hdr hd; + u8 sender_mac[ETH_ALEN]; + u8 sender_gid[SMC_GID_SIZE]; + u8 sender_qp_num[3]; + u8 link_num; + u8 link_uid[SMC_LGR_ID_SIZE]; + u8 max_links; + u8 reserved[9]; +}; + +#define SMC_LLC_FLAG_ADD_LNK_REJ 0x40 +#define SMC_LLC_REJ_RSN_NO_ALT_PATH 1 + +#define SMC_LLC_ADD_LNK_MAX_LINKS 2 + +struct smc_llc_msg_add_link { /* type 0x02 */ + struct smc_llc_hdr hd; + u8 sender_mac[ETH_ALEN]; + u8 reserved2[2]; + u8 sender_gid[SMC_GID_SIZE]; + u8 sender_qp_num[3]; + u8 link_num; + u8 flags2; /* QP mtu */ + u8 initial_psn[3]; + u8 reserved[8]; +}; + +#define SMC_LLC_FLAG_DEL_LINK_ALL 0x40 +#define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20 + +struct smc_llc_msg_del_link { /* type 0x04 */ + struct smc_llc_hdr hd; + u8 link_num; + __be32 reason; + u8 reserved[35]; +} __packed; /* format defined in RFC7609 */ + +struct smc_llc_msg_test_link { /* type 0x07 */ + struct smc_llc_hdr hd; + u8 user_data[16]; + u8 reserved[24]; +}; + +struct smc_rmb_rtoken { + union { + u8 num_rkeys; /* first rtoken byte of CONFIRM LINK msg */ + /* is actually the num of rtokens, first */ + /* rtoken is always for the current link */ + u8 link_id; /* link id of the rtoken */ + }; + __be32 rmb_key; + __be64 rmb_vaddr; +} __packed; /* format defined in RFC7609 */ + +#define SMC_LLC_RKEYS_PER_MSG 3 + +struct smc_llc_msg_confirm_rkey { /* type 0x06 */ + struct smc_llc_hdr hd; + struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; + u8 reserved; +}; + +struct smc_llc_msg_confirm_rkey_cont { /* type 
0x08 */ + struct smc_llc_hdr hd; + u8 num_rkeys; + struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; +}; + +#define SMC_LLC_DEL_RKEY_MAX 8 +#define SMC_LLC_FLAG_RKEY_NEG 0x20 + +struct smc_llc_msg_delete_rkey { /* type 0x09 */ + struct smc_llc_hdr hd; + u8 num_rkeys; + u8 err_mask; + u8 reserved[2]; + __be32 rkey[8]; + u8 reserved2[4]; +}; + +union smc_llc_msg { + struct smc_llc_msg_confirm_link confirm_link; + struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_del_link delete_link; + + struct smc_llc_msg_confirm_rkey confirm_rkey; + struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont; + struct smc_llc_msg_delete_rkey delete_rkey; + + struct smc_llc_msg_test_link test_link; + struct { + struct smc_llc_hdr hdr; + u8 data[SMC_LLC_DATA_LEN]; + } raw; +}; + +#define SMC_LLC_FLAG_RESP 0x80 + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -87,14 +200,112 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], memset(confllc, 0, sizeof(*confllc)); confllc->hd.common.type = SMC_LLC_CONFIRM_LINK; confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link); + confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC; if (reqresp == SMC_LLC_RESP) confllc->hd.flags |= SMC_LLC_FLAG_RESP; memcpy(confllc->sender_mac, mac, ETH_ALEN); memcpy(confllc->sender_gid, gid, SMC_GID_SIZE); hton24(confllc->sender_qp_num, link->roce_qp->qp_num); - /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */ + confllc->link_num = link->link_id; memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LINKS_PER_LGR_MAX; + confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* enforce peer resp. 
*/ + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* send ADD LINK request or response */ +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], + union ib_gid *gid, + enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_add_link *addllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + memset(addllc, 0, sizeof(*addllc)); + addllc->hd.common.type = SMC_LLC_ADD_LINK; + addllc->hd.length = sizeof(struct smc_llc_msg_add_link); + if (reqresp == SMC_LLC_RESP) { + addllc->hd.flags |= SMC_LLC_FLAG_RESP; + /* always reject more links for now */ + addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; + addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + } + memcpy(addllc->sender_mac, mac, ETH_ALEN); + memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* send DELETE LINK request or response */ +int smc_llc_send_delete_link(struct smc_link *link, + enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_del_link *delllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + delllc = (struct smc_llc_msg_del_link *)wr_buf; + memset(delllc, 0, sizeof(*delllc)); + delllc->hd.common.type = SMC_LLC_DELETE_LINK; + delllc->hd.length = sizeof(struct smc_llc_msg_add_link); + if (reqresp == SMC_LLC_RESP) + delllc->hd.flags |= SMC_LLC_FLAG_RESP; + /* DEL_LINK_ALL because only 1 link supported */ + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + delllc->link_num = link->link_id; + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* send LLC test link request or response */ +int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16], 
+ enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_test_link *testllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + testllc = (struct smc_llc_msg_test_link *)wr_buf; + memset(testllc, 0, sizeof(*testllc)); + testllc->hd.common.type = SMC_LLC_TEST_LINK; + testllc->hd.length = sizeof(struct smc_llc_msg_test_link); + if (reqresp == SMC_LLC_RESP) + testllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* send a prepared message */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) +{ + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + memcpy(wr_buf, llcbuf, llclen); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -106,19 +317,156 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, struct smc_llc_msg_confirm_link *llc) { struct smc_link_group *lgr; + int conf_rc; lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + + /* RMBE eyecatchers are not supported */ + if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) + conf_rc = 0; + else + conf_rc = ENOTSUPP; + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) + if (lgr->role == SMC_SERV && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_resp_rc = conf_rc; complete(&link->llc_confirm_resp); + } } else { - if (lgr->role == SMC_CLNT) { + if (lgr->role == SMC_CLNT && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_rc = conf_rc; link->link_id = llc->link_num; complete(&link->llc_confirm); } } } +static void smc_llc_rx_add_link(struct smc_link *link, + struct smc_llc_msg_add_link *llc) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + 
lnk[SMC_SINGLE_LINK]); + + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + if (link->state == SMC_LNK_ACTIVATING) + complete(&link->llc_add_resp); + } else { + if (link->state == SMC_LNK_ACTIVATING) { + complete(&link->llc_add); + return; + } + + if (lgr->role == SMC_SERV) { + smc_llc_send_add_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_REQ); + + } else { + smc_llc_send_add_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_RESP); + } + } +} + +static void smc_llc_rx_delete_link(struct smc_link *link, + struct smc_llc_msg_del_link *llc) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + if (lgr->role == SMC_SERV) + smc_lgr_terminate(lgr); + } else { + if (lgr->role == SMC_SERV) { + smc_lgr_forget(lgr); + smc_llc_send_delete_link(link, SMC_LLC_REQ); + } else { + smc_llc_send_delete_link(link, SMC_LLC_RESP); + smc_lgr_terminate(lgr); + } + } +} + +static void smc_llc_rx_test_link(struct smc_link *link, + struct smc_llc_msg_test_link *llc) +{ + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + /* unused as long as we don't send this type of msg */ + } else { + smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP); + } +} + +static void smc_llc_rx_confirm_rkey(struct smc_link *link, + struct smc_llc_msg_confirm_rkey *llc) +{ + struct smc_link_group *lgr; + int rc; + + lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + /* unused as long as we don't send this type of msg */ + } else { + rc = smc_rtoken_add(lgr, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); + + /* ignore rtokens for other links, we have only one link */ + + llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (rc < 0) + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + } +} + +static void 
smc_llc_rx_confirm_rkey_cont(struct smc_link *link, + struct smc_llc_msg_confirm_rkey_cont *llc) +{ + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + /* unused as long as we don't send this type of msg */ + } else { + /* ignore rtokens for other links, we have only one link */ + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + } +} + +static void smc_llc_rx_delete_rkey(struct smc_link *link, + struct smc_llc_msg_delete_rkey *llc) +{ + struct smc_link_group *lgr; + u8 err_mask = 0; + int i, max; + + lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + /* unused as long as we don't send this type of msg */ + } else { + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(lgr, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + } + + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; + } + + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + } +} + static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) { struct smc_link *link = (struct smc_link *)wc->qp->qp_context; @@ -128,8 +476,30 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) return; /* short message */ if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ - if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK) + + switch (llc->raw.hdr.common.type) { + case SMC_LLC_TEST_LINK: + smc_llc_rx_test_link(link, &llc->test_link); + break; + case SMC_LLC_CONFIRM_LINK: smc_llc_rx_confirm_link(link, &llc->confirm_link); + break; + case SMC_LLC_ADD_LINK: + smc_llc_rx_add_link(link, &llc->add_link); + break; + case SMC_LLC_DELETE_LINK: + smc_llc_rx_delete_link(link, &llc->delete_link); + break; + case SMC_LLC_CONFIRM_RKEY: + smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey); + break; + case SMC_LLC_CONFIRM_RKEY_CONT: + 
smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); + break; + case SMC_LLC_DELETE_RKEY: + smc_llc_rx_delete_rkey(link, &llc->delete_rkey); + break; + } } /***************************** init, exit, misc ******************************/ @@ -140,6 +510,30 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .type = SMC_LLC_CONFIRM_LINK }, { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_TEST_LINK + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_LINK + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY_CONT + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_RKEY + }, + { .handler = NULL, } }; diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 51b27ce90dbd..e4a7d5e234d5 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -18,6 +18,7 @@ #define SMC_LLC_FLAG_RESP 0x80 #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) +#define SMC_LLC_WAIT_TIME (2 * HZ) enum smc_llc_reqresp { SMC_LLC_REQ, @@ -26,39 +27,23 @@ enum smc_llc_reqresp { enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, -}; - -#define SMC_LLC_DATA_LEN 40 - -struct smc_llc_hdr { - struct smc_wr_rx_hdr common; - u8 length; /* 44 */ - u8 reserved; - u8 flags; -}; - -struct smc_llc_msg_confirm_link { /* type 0x01 */ - struct smc_llc_hdr hd; - u8 sender_mac[ETH_ALEN]; - u8 sender_gid[SMC_GID_SIZE]; - u8 sender_qp_num[3]; - u8 link_num; - u8 link_uid[SMC_LGR_ID_SIZE]; - u8 max_links; - u8 reserved[9]; -}; - -union smc_llc_msg { - struct smc_llc_msg_confirm_link confirm_link; - struct { - struct smc_llc_hdr hdr; - u8 data[SMC_LLC_DATA_LEN]; - } raw; + SMC_LLC_ADD_LINK = 0x02, + SMC_LLC_DELETE_LINK = 0x04, + SMC_LLC_CONFIRM_RKEY = 0x06, + SMC_LLC_TEST_LINK = 0x07, + SMC_LLC_CONFIRM_RKEY_CONT = 0x08, + SMC_LLC_DELETE_RKEY = 0x09, }; /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, u8 
mac[], union ib_gid *gid, enum smc_llc_reqresp reqresp); +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid, + enum smc_llc_reqresp reqresp); +int smc_llc_send_delete_link(struct smc_link *link, + enum smc_llc_reqresp reqresp); +int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16], + enum smc_llc_reqresp reqresp); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index ef0c3494c9cb..210bec3c3ebe 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,7 +19,6 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */ #define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) diff --git a/net/socket.c b/net/socket.c index a93c99b518ca..f10f1d947c78 100644 --- a/net/socket.c +++ b/net/socket.c @@ -104,7 +104,6 @@ #include <linux/ipv6_route.h> #include <linux/route.h> #include <linux/sockios.h> -#include <linux/atalk.h> #include <net/busy_poll.h> #include <linux/errqueue.h> @@ -234,7 +233,7 @@ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, return __put_user(klen, ulen); } -static struct kmem_cache *sock_inode_cachep __read_mostly; +static struct kmem_cache *sock_inode_cachep __ro_after_init; static struct inode *sock_alloc_inode(struct super_block *sb) { @@ -991,10 +990,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, * what to do with it - that's up to the protocol still. 
*/ -static struct ns_common *get_net_ns(struct ns_common *ns) +struct ns_common *get_net_ns(struct ns_common *ns) { return &get_net(container_of(ns, struct net, ns))->ns; } +EXPORT_SYMBOL_GPL(get_net_ns); static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { @@ -1332,7 +1332,7 @@ int sock_create_kern(struct net *net, int family, int type, int protocol, struct } EXPORT_SYMBOL(sock_create_kern); -SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) +int __sys_socket(int family, int type, int protocol) { int retval; struct socket *sock; @@ -1359,12 +1359,16 @@ SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } +SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) +{ + return __sys_socket(family, type, protocol); +} + /* * Create a pair of connected sockets. */ -SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, - int __user *, usockvec) +int __sys_socketpair(int family, int type, int protocol, int __user *usockvec) { struct socket *sock1, *sock2; int fd1, fd2, err; @@ -1449,6 +1453,12 @@ out: return err; } +SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, + int __user *, usockvec) +{ + return __sys_socketpair(family, type, protocol, usockvec); +} + /* * Bind a name to a socket. Nothing much to do here since it's * the protocol's responsibility to handle the local address. @@ -1457,7 +1467,7 @@ out: * the protocol layer (having also checked the address is ok). 
*/ -SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) +int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) { struct socket *sock; struct sockaddr_storage address; @@ -1480,13 +1490,18 @@ SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) return err; } +SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) +{ + return __sys_bind(fd, umyaddr, addrlen); +} + /* * Perform a listen. Basically, we allow the protocol to do anything * necessary for a listen, and if that works, we mark the socket as * ready for listening. */ -SYSCALL_DEFINE2(listen, int, fd, int, backlog) +int __sys_listen(int fd, int backlog) { struct socket *sock; int err, fput_needed; @@ -1507,6 +1522,11 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) return err; } +SYSCALL_DEFINE2(listen, int, fd, int, backlog) +{ + return __sys_listen(fd, backlog); +} + /* * For accept, we attempt to create a new socket, set up the link * with the client, wake up the client, then return the new @@ -1516,11 +1536,11 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) * * 1003.1g adds the ability to recvmsg() to query connection pending * status to recvmsg. We need to add that support in a way thats - * clean when we restucture accept also. + * clean when we restructure accept also. 
*/ -SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, - int __user *, upeer_addrlen, int, flags) +int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; @@ -1573,8 +1593,9 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, goto out_fd; if (upeer_sockaddr) { - if (newsock->ops->getname(newsock, (struct sockaddr *)&address, - &len, 2) < 0) { + len = newsock->ops->getname(newsock, + (struct sockaddr *)&address, 2); + if (len < 0) { err = -ECONNABORTED; goto out_fd; } @@ -1599,10 +1620,16 @@ out_fd: goto out_put; } +SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, + int __user *, upeer_addrlen, int, flags) +{ + return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, flags); +} + SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen) { - return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); + return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); } /* @@ -1617,8 +1644,7 @@ SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, * include the -EINPROGRESS status for such sockets. */ -SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, - int, addrlen) +int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) { struct socket *sock; struct sockaddr_storage address; @@ -1644,17 +1670,23 @@ out: return err; } +SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, + int, addrlen) +{ + return __sys_connect(fd, uservaddr, addrlen); +} + /* * Get the local address ('name') of a socket object. Move the obtained * name to user space. 
*/ -SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, - int __user *, usockaddr_len) +int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, + int __user *usockaddr_len) { struct socket *sock; struct sockaddr_storage address; - int len, err, fput_needed; + int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) @@ -1664,10 +1696,11 @@ SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, if (err) goto out_put; - err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0); - if (err) + err = sock->ops->getname(sock, (struct sockaddr *)&address, 0); + if (err < 0) goto out_put; - err = move_addr_to_user(&address, len, usockaddr, usockaddr_len); + /* "err" is actually length in this case */ + err = move_addr_to_user(&address, err, usockaddr, usockaddr_len); out_put: fput_light(sock->file, fput_needed); @@ -1675,17 +1708,23 @@ out: return err; } +SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, + int __user *, usockaddr_len) +{ + return __sys_getsockname(fd, usockaddr, usockaddr_len); +} + /* * Get the remote address ('name') of a socket object. Move the obtained * name to user space. 
*/ -SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, - int __user *, usockaddr_len) +int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, + int __user *usockaddr_len) { struct socket *sock; struct sockaddr_storage address; - int len, err, fput_needed; + int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -1695,26 +1734,29 @@ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, return err; } - err = - sock->ops->getname(sock, (struct sockaddr *)&address, &len, - 1); - if (!err) - err = move_addr_to_user(&address, len, usockaddr, + err = sock->ops->getname(sock, (struct sockaddr *)&address, 1); + if (err >= 0) + /* "err" is actually length in this case */ + err = move_addr_to_user(&address, err, usockaddr, usockaddr_len); fput_light(sock->file, fput_needed); } return err; } +SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, + int __user *, usockaddr_len) +{ + return __sys_getpeername(fd, usockaddr, usockaddr_len); +} + /* * Send a datagram to a given address. We move the address into kernel * space and check the user space data area is readable before invoking * the protocol. */ - -SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, - unsigned int, flags, struct sockaddr __user *, addr, - int, addr_len) +int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, + struct sockaddr __user *addr, int addr_len) { struct socket *sock; struct sockaddr_storage address; @@ -1752,6 +1794,13 @@ out: return err; } +SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, + unsigned int, flags, struct sockaddr __user *, addr, + int, addr_len) +{ + return __sys_sendto(fd, buff, len, flags, addr, addr_len); +} + /* * Send a datagram down a socket. 
*/ @@ -1759,7 +1808,7 @@ out: SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, unsigned int, flags) { - return sys_sendto(fd, buff, len, flags, NULL, 0); + return __sys_sendto(fd, buff, len, flags, NULL, 0); } /* @@ -1767,10 +1816,8 @@ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, * sender. We verify the buffers are writable and if needed move the * sender address from kernel to user space. */ - -SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, - unsigned int, flags, struct sockaddr __user *, addr, - int __user *, addr_len) +int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, + struct sockaddr __user *addr, int __user *addr_len) { struct socket *sock; struct iovec iov; @@ -1810,6 +1857,13 @@ out: return err; } +SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, + unsigned int, flags, struct sockaddr __user *, addr, + int __user *, addr_len) +{ + return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len); +} + /* * Receive a datagram from a socket. */ @@ -1817,7 +1871,7 @@ out: SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags) { - return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); + return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } /* @@ -1825,8 +1879,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, * to pass the user mode parameter for the protocols to sort out. */ -SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, - char __user *, optval, int, optlen) +static int __sys_setsockopt(int fd, int level, int optname, + char __user *optval, int optlen) { int err, fput_needed; struct socket *sock; @@ -1854,13 +1908,19 @@ out_put: return err; } +SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, + char __user *, optval, int, optlen) +{ + return __sys_setsockopt(fd, level, optname, optval, optlen); +} + /* * Get a socket option. 
Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ -SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, - char __user *, optval, int __user *, optlen) +static int __sys_getsockopt(int fd, int level, int optname, + char __user *optval, int __user *optlen) { int err, fput_needed; struct socket *sock; @@ -1885,11 +1945,17 @@ out_put: return err; } +SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, + char __user *, optval, int __user *, optlen) +{ + return __sys_getsockopt(fd, level, optname, optval, optlen); +} + /* * Shutdown a socket. */ -SYSCALL_DEFINE2(shutdown, int, fd, int, how) +int __sys_shutdown(int fd, int how) { int err, fput_needed; struct socket *sock; @@ -1904,6 +1970,11 @@ SYSCALL_DEFINE2(shutdown, int, fd, int, how) return err; } +SYSCALL_DEFINE2(shutdown, int, fd, int, how) +{ + return __sys_shutdown(fd, how); +} + /* A couple of helpful macros for getting the address of the 32/64 bit * fields which are the same type (int / unsigned) on our platforms. 
*/ @@ -2067,12 +2138,16 @@ out_freeiov: * BSD sendmsg interface */ -long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags) +long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, + bool forbid_cmsg_compat) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; + if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) + return -EINVAL; + sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; @@ -2086,9 +2161,7 @@ out: SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - return __sys_sendmsg(fd, msg, flags); + return __sys_sendmsg(fd, msg, flags, true); } /* @@ -2096,7 +2169,7 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int */ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, - unsigned int flags) + unsigned int flags, bool forbid_cmsg_compat) { int fput_needed, err, datagrams; struct socket *sock; @@ -2106,6 +2179,9 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, struct used_address used_address; unsigned int oflags = flags; + if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) + return -EINVAL; + if (vlen > UIO_MAXIOV) vlen = UIO_MAXIOV; @@ -2162,9 +2238,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - return __sys_sendmmsg(fd, mmsg, vlen, flags); + return __sys_sendmmsg(fd, mmsg, vlen, flags, true); } static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, @@ -2237,12 +2311,16 @@ out_freeiov: * BSD recvmsg interface */ -long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags) +long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, + bool forbid_cmsg_compat) { int 
fput_needed, err; struct msghdr msg_sys; struct socket *sock; + if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) + return -EINVAL; + sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; @@ -2257,9 +2335,7 @@ out: SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - return __sys_recvmsg(fd, msg, flags); + return __sys_recvmsg(fd, msg, flags, true); } /* @@ -2288,10 +2364,12 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, if (!sock) return err; - err = sock_error(sock->sk); - if (err) { - datagrams = err; - goto out_put; + if (likely(!(flags & MSG_ERRQUEUE))) { + err = sock_error(sock->sk); + if (err) { + datagrams = err; + goto out_put; + } } entry = mmsg; @@ -2375,9 +2453,9 @@ out_put: return datagrams; } -SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, - unsigned int, vlen, unsigned int, flags, - struct timespec __user *, timeout) +static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags, + struct timespec __user *timeout) { int datagrams; struct timespec timeout_sys; @@ -2400,6 +2478,13 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, return datagrams; } +SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct timespec __user *, timeout) +{ + return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); +} + #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ #define AL(x) ((x) * sizeof(unsigned long)) @@ -2447,76 +2532,82 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) switch (call) { case SYS_SOCKET: - err = sys_socket(a0, a1, a[2]); + err = __sys_socket(a0, a1, a[2]); break; case SYS_BIND: - err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); + err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: - 
err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); + err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: - err = sys_listen(a0, a1); + err = __sys_listen(a0, a1); break; case SYS_ACCEPT: - err = sys_accept4(a0, (struct sockaddr __user *)a1, - (int __user *)a[2], 0); + err = __sys_accept4(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = - sys_getsockname(a0, (struct sockaddr __user *)a1, - (int __user *)a[2]); + __sys_getsockname(a0, (struct sockaddr __user *)a1, + (int __user *)a[2]); break; case SYS_GETPEERNAME: err = - sys_getpeername(a0, (struct sockaddr __user *)a1, - (int __user *)a[2]); + __sys_getpeername(a0, (struct sockaddr __user *)a1, + (int __user *)a[2]); break; case SYS_SOCKETPAIR: - err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]); + err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: - err = sys_send(a0, (void __user *)a1, a[2], a[3]); + err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], + NULL, 0); break; case SYS_SENDTO: - err = sys_sendto(a0, (void __user *)a1, a[2], a[3], - (struct sockaddr __user *)a[4], a[5]); + err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], + (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: - err = sys_recv(a0, (void __user *)a1, a[2], a[3]); + err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], + NULL, NULL); break; case SYS_RECVFROM: - err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], - (struct sockaddr __user *)a[4], - (int __user *)a[5]); + err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], + (struct sockaddr __user *)a[4], + (int __user *)a[5]); break; case SYS_SHUTDOWN: - err = sys_shutdown(a0, a1); + err = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: - err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); + err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3], + a[4]); break; case SYS_GETSOCKOPT: err = - sys_getsockopt(a0, a1, a[2], (char __user 
*)a[3], - (int __user *)a[4]); + __sys_getsockopt(a0, a1, a[2], (char __user *)a[3], + (int __user *)a[4]); break; case SYS_SENDMSG: - err = sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2]); + err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1, + a[2], true); break; case SYS_SENDMMSG: - err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]); + err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], + a[3], true); break; case SYS_RECVMSG: - err = sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2]); + err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1, + a[2], true); break; case SYS_RECVMMSG: - err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], - (struct timespec __user *)a[4]); + err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], + a[3], (struct timespec __user *)a[4]); break; case SYS_ACCEPT4: - err = sys_accept4(a0, (struct sockaddr __user *)a1, - (int __user *)a[2], a[3]); + err = __sys_accept4(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], a[3]); break; default: err = -EINVAL; @@ -2587,6 +2678,11 @@ void sock_unregister(int family) } EXPORT_SYMBOL(sock_unregister); +bool sock_is_registered(int family) +{ + return family < NPROTO && rcu_access_pointer(net_families[family]); +} + static int __init sock_init(void) { int err; @@ -3166,17 +3262,15 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, } EXPORT_SYMBOL(kernel_connect); -int kernel_getsockname(struct socket *sock, struct sockaddr *addr, - int *addrlen) +int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, addrlen, 0); + return sock->ops->getname(sock, addr, 0); } EXPORT_SYMBOL(kernel_getsockname); -int kernel_getpeername(struct socket *sock, struct sockaddr *addr, - int *addrlen) +int kernel_getpeername(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, addrlen, 1); + return sock->ops->getname(sock, addr, 1); } 
EXPORT_SYMBOL(kernel_getpeername); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 1fdab5c4eda8..b9283ce5cd85 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -60,7 +60,7 @@ static void strp_abort_strp(struct strparser *strp, int err) struct sock *sk = strp->sk; /* Report an error on the lower socket */ - sk->sk_err = err; + sk->sk_err = -err; sk->sk_error_report(sk); } } @@ -458,7 +458,7 @@ static void strp_msg_timeout(struct work_struct *w) /* Message assembly timed out */ STRP_STATS_INCR(strp->stats.msg_timeouts); strp->cb.lock(strp); - strp->cb.abort_parser(strp, ETIMEDOUT); + strp->cb.abort_parser(strp, -ETIMEDOUT); strp->cb.unlock(strp); } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 26531193fce4..5089dbb96d58 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1375,7 +1375,7 @@ static int create_use_gss_proxy_proc_entry(struct net *net) struct proc_dir_entry **p = &sn->use_gssp_proc; sn->use_gss_proxy = -1; - *p = proc_create_data("use-gss-proxy", S_IFREG|S_IRUSR|S_IWUSR, + *p = proc_create_data("use-gss-proxy", S_IFREG | 0600, sn->proc_net_rpc, &use_gss_proxy_ops, net); if (!*p) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 8a7e1c774f9c..c536cc24b3d1 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1621,20 +1621,20 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) if (cd->procfs == NULL) goto out_nomem; - p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR, + p = proc_create_data("flush", S_IFREG | 0600, cd->procfs, &cache_flush_operations_procfs, cd); if (p == NULL) goto out_nomem; if (cd->cache_request || cd->cache_parse) { - p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, - cd->procfs, &cache_file_operations_procfs, cd); + p = proc_create_data("channel", S_IFREG | 0600, cd->procfs, + &cache_file_operations_procfs, cd); if (p == NULL) goto out_nomem; } if 
(cd->cache_show) { - p = proc_create_data("content", S_IFREG|S_IRUSR, - cd->procfs, &content_file_operations_procfs, cd); + p = proc_create_data("content", S_IFREG | 0400, cd->procfs, + &content_file_operations_procfs, cd); if (p == NULL) goto out_nomem; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6e432ecd7f99..806395687bb6 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1231,7 +1231,7 @@ static const struct sockaddr_in6 rpc_in6addr_loopback = { * negative errno is returned. */ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, - struct sockaddr *buf, int buflen) + struct sockaddr *buf) { struct socket *sock; int err; @@ -1269,7 +1269,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, goto out_release; } - err = kernel_getsockname(sock, buf, &buflen); + err = kernel_getsockname(sock, buf); if (err < 0) { dprintk("RPC: getsockname failed (%d)\n", err); goto out_release; @@ -1353,7 +1353,7 @@ int rpc_localaddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t buflen) rcu_read_unlock(); rpc_set_port(sap, 0); - err = rpc_sockname(net, sap, salen, buf, buflen); + err = rpc_sockname(net, sap, salen, buf); put_net(net); if (err != 0) /* Couldn't discover local address, return ANYADDR */ diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index e980d2a493de..45a033329cd4 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -139,7 +139,7 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt) return; /* make tasks file */ - if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs, + if (!debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs, clnt, &tasks_fops)) goto out_err; @@ -241,7 +241,7 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt) return; /* make tasks file */ - if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs, + if (!debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs, xprt, &xprt_info_fops)) { 
debugfs_remove_recursive(xprt->debugfs); xprt->debugfs = NULL; @@ -317,7 +317,7 @@ inject_fault_dir(struct dentry *topdir) if (!faultdir) return NULL; - if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir, + if (!debugfs_create_file("disconnect", S_IFREG | 0400, faultdir, NULL, &fault_disconnect_fops)) return NULL; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index fc97fc3ed637..0f08934b2cea 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -820,13 +820,13 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, { struct dentry *dentry; struct inode *dir = d_inode(parent); - umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR; + umode_t umode = S_IFIFO | 0600; int err; if (pipe->ops->upcall == NULL) - umode &= ~S_IRUGO; + umode &= ~0444; if (pipe->ops->downcall == NULL) - umode &= ~S_IWUGO; + umode &= ~0222; inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); @@ -1035,7 +1035,7 @@ static const struct rpc_filelist authfiles[] = { [RPCAUTH_info] = { .name = "info", .i_fop = &rpc_info_operations, - .mode = S_IFREG | S_IRUSR, + .mode = S_IFREG | 0400, }, }; @@ -1068,8 +1068,8 @@ struct dentry *rpc_create_client_dir(struct dentry *dentry, { struct dentry *ret; - ret = rpc_mkdir_populate(dentry, name, S_IRUGO | S_IXUGO, NULL, - rpc_clntdir_populate, rpc_client); + ret = rpc_mkdir_populate(dentry, name, 0555, NULL, + rpc_clntdir_populate, rpc_client); if (!IS_ERR(ret)) { rpc_client->cl_pipedir_objects.pdh_dentry = ret; rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects); @@ -1096,17 +1096,17 @@ static const struct rpc_filelist cache_pipefs_files[3] = { [0] = { .name = "channel", .i_fop = &cache_file_operations_pipefs, - .mode = S_IFREG|S_IRUSR|S_IWUSR, + .mode = S_IFREG | 0600, }, [1] = { .name = "content", .i_fop = &content_file_operations_pipefs, - .mode = S_IFREG|S_IRUSR, + .mode = S_IFREG | 0400, }, [2] = { .name = "flush", .i_fop = 
&cache_flush_operations_pipefs, - .mode = S_IFREG|S_IRUSR|S_IWUSR, + .mode = S_IFREG | 0600, }, }; @@ -1164,39 +1164,39 @@ enum { static const struct rpc_filelist files[] = { [RPCAUTH_lockd] = { .name = "lockd", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_mount] = { .name = "mount", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_nfs] = { .name = "nfs", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_portmap] = { .name = "portmap", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_statd] = { .name = "statd", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_nfsd4_cb] = { .name = "nfsd4_cb", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_cache] = { .name = "cache", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_nfsd] = { .name = "nfsd", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_gssd] = { .name = "gssd", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, }; @@ -1261,7 +1261,7 @@ EXPORT_SYMBOL_GPL(rpc_put_sb_net); static const struct rpc_filelist gssd_dummy_clnt_dir[] = { [0] = { .name = "clntXX", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, }; @@ -1310,7 +1310,7 @@ static const struct rpc_filelist gssd_dummy_info_file[] = { [0] = { .name = "info", .i_fop = &rpc_dummy_info_operations, - .mode = S_IFREG | S_IRUSR, + .mode = S_IFREG | 0400, }, }; @@ -1397,7 +1397,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &simple_dentry_operations; sb->s_time_gran = 1; - inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); + inode = rpc_get_inode(sb, S_IFDIR | 0555); sb->s_root = root = d_make_root(inode); if (!root) return -ENOMEM; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 943f2a745cd5..08cd951aaeea 100644 --- a/net/sunrpc/svcsock.c +++ 
b/net/sunrpc/svcsock.c @@ -832,12 +832,13 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) } set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); - err = kernel_getpeername(newsock, sin, &slen); + err = kernel_getpeername(newsock, sin); if (err < 0) { net_warn_ratelimited("%s: peername failed (err %d)!\n", serv->sv_name, -err); goto failed; /* aborted connection or whatever */ } + slen = err; /* Ideally, we would want to reject connections from unauthorized * hosts here, but when we get encryption, the IP of the host won't @@ -866,7 +867,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) if (IS_ERR(newsvsk)) goto failed; svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); - err = kernel_getsockname(newsock, sin, &slen); + err = kernel_getsockname(newsock, sin); + slen = err; if (unlikely(err < 0)) { dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); slen = offsetof(struct sockaddr, sa_data); @@ -1465,7 +1467,8 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, err = PTR_ERR(svsk); goto out; } - if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) + salen = kernel_getsockname(svsk->sk_sock, sin); + if (salen >= 0) svc_xprt_set_local(&svsk->sk_xprt, sin, salen); svc_add_new_perm_xprt(serv, &svsk->sk_xprt); return svc_one_sock_name(svsk, name_return, len); @@ -1539,10 +1542,10 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, if (error < 0) goto bummer; - newlen = len; - error = kernel_getsockname(sock, newsin, &newlen); + error = kernel_getsockname(sock, newsin); if (error < 0) goto bummer; + newlen = error; if (protocol == IPPROTO_TCP) { if ((error = kernel_listen(sock, 64)) < 0) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a6b8c1f8f92a..956e29c1438d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1794,10 +1794,9 @@ static void xs_sock_set_reuseport(struct socket *sock) static unsigned short xs_sock_getport(struct socket *sock) { struct 
sockaddr_storage buf; - int buflen; unsigned short port = 0; - if (kernel_getsockname(sock, (struct sockaddr *)&buf, &buflen) < 0) + if (kernel_getsockname(sock, (struct sockaddr *)&buf) < 0) goto out; switch (buf.ss_family) { case AF_INET6: diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index c25a3a149dc4..e450212121d2 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -34,3 +34,11 @@ config TIPC_MEDIA_UDP Saying Y here will enable support for running TIPC over IP/UDP bool default y + +config TIPC_DIAG + tristate "TIPC: socket monitoring interface" + depends on TIPC + default y + ---help--- + Support for TIPC socket monitoring interface used by ss tool. + If unsure, say Y. diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 37bb0bfbd936..aca168f2abb1 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -9,8 +9,13 @@ tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ - server.o socket.o group.o + topsrv.o socket.o group.o tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o + + +obj-$(CONFIG_TIPC_DIAG) += diag.o + +tipc_diag-y := diag.o diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 48fd3b5a73fb..b88d48d00913 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -1,7 +1,7 @@ /* * net/tipc/addr.c: TIPC address utility routines * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -34,113 +34,90 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -#include <linux/kernel.h> #include "addr.h" #include "core.h" -/** - * in_own_cluster - test for cluster inclusion; <0.0.0> always matches - */ -int in_own_cluster(struct net *net, u32 addr) -{ - return in_own_cluster_exact(net, addr) || !addr; -} - -int in_own_cluster_exact(struct net *net, u32 addr) +bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - - return !((addr ^ tn->own_addr) >> 12); + if (!domain || (domain == addr)) + return true; + if (!legacy_format) + return false; + if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */ + return true; + if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain <Z.C.0> */ + return true; + if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */ + return true; + return false; } -/** - * in_own_node - test for node inclusion; <0.0.0> always matches - */ -int in_own_node(struct net *net, u32 addr) +void tipc_set_node_id(struct net *net, u8 *id) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); + u32 *tmp = (u32 *)id; - return (addr == tn->own_addr) || !addr; + memcpy(tn->node_id, id, NODE_ID_LEN); + tipc_nodeid2string(tn->node_id_string, id); + tn->trial_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; + pr_info("Own node identity %s, cluster identity %u\n", + tipc_own_id_string(net), tn->net_id); } -/** - * addr_domain - convert 2-bit scope value to equivalent message lookup domain - * - * Needed when address of a named message must be looked up a second time - * after a network hop. 
- */ -u32 addr_domain(struct net *net, u32 sc) +void tipc_set_node_addr(struct net *net, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); + u8 node_id[NODE_ID_LEN] = {0,}; - if (likely(sc == TIPC_NODE_SCOPE)) - return tn->own_addr; - if (sc == TIPC_CLUSTER_SCOPE) - return tipc_cluster_mask(tn->own_addr); - return tipc_zone_mask(tn->own_addr); + tn->node_addr = addr; + if (!tipc_own_id(net)) { + sprintf(node_id, "%x", addr); + tipc_set_node_id(net, node_id); + } + tn->trial_addr = addr; + pr_info("32-bit node address hash set to %x\n", addr); } -/** - * tipc_addr_domain_valid - validates a network domain address - * - * Accepts <Z.C.N>, <Z.C.0>, <Z.0.0>, and <0.0.0>, - * where Z, C, and N are non-zero. - * - * Returns 1 if domain address is valid, otherwise 0 - */ -int tipc_addr_domain_valid(u32 addr) +char *tipc_nodeid2string(char *str, u8 *id) { - u32 n = tipc_node(addr); - u32 c = tipc_cluster(addr); - u32 z = tipc_zone(addr); - - if (n && (!z || !c)) - return 0; - if (c && !z) - return 0; - return 1; -} + int i; + u8 c; -/** - * tipc_addr_node_valid - validates a proposed network address for this node - * - * Accepts <Z.C.N>, where Z, C, and N are non-zero. - * - * Returns 1 if address can be used, otherwise 0 - */ -int tipc_addr_node_valid(u32 addr) -{ - return tipc_addr_domain_valid(addr) && tipc_node(addr); -} + /* Already a string ? 
*/ + for (i = 0; i < NODE_ID_LEN; i++) { + c = id[i]; + if (c >= '0' && c <= '9') + continue; + if (c >= 'A' && c <= 'Z') + continue; + if (c >= 'a' && c <= 'z') + continue; + if (c == '.') + continue; + if (c == ':') + continue; + if (c == '_') + continue; + if (c == '-') + continue; + if (c == '@') + continue; + if (c != 0) + break; + } + if (i == NODE_ID_LEN) { + memcpy(str, id, NODE_ID_LEN); + str[NODE_ID_LEN] = 0; + return str; + } -int tipc_in_scope(u32 domain, u32 addr) -{ - if (!domain || (domain == addr)) - return 1; - if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */ - return 1; - if (domain == tipc_zone_mask(addr)) /* domain <Z.0.0> */ - return 1; - return 0; -} + /* Translate to hex string */ + for (i = 0; i < NODE_ID_LEN; i++) + sprintf(&str[2 * i], "%02x", id[i]); -/** - * tipc_addr_scope - convert message lookup domain to a 2-bit scope value - */ -int tipc_addr_scope(u32 domain) -{ - if (likely(!domain)) - return TIPC_ZONE_SCOPE; - if (tipc_node(domain)) - return TIPC_NODE_SCOPE; - if (tipc_cluster(domain)) - return TIPC_CLUSTER_SCOPE; - return TIPC_ZONE_SCOPE; -} + /* Strip off trailing zeroes */ + for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) + str[i] = 0; -char *tipc_addr_string_fill(char *string, u32 addr) -{ - snprintf(string, 16, "<%u.%u.%u>", - tipc_zone(addr), tipc_cluster(addr), tipc_node(addr)); - return string; + return str; } diff --git a/net/tipc/addr.h b/net/tipc/addr.h index bebb347803ce..31bee0ea7b3e 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -1,7 +1,7 @@ /* * net/tipc/addr.h: Include file for TIPC address utility routines * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, Wind River Systems * All rights reserved. 
* @@ -45,14 +45,21 @@ static inline u32 tipc_own_addr(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + return tipc_net(net)->node_addr; +} + +static inline u8 *tipc_own_id(struct net *net) +{ + struct tipc_net *tn = tipc_net(net); - return tn->own_addr; + if (!strlen(tn->node_id_string)) + return NULL; + return tn->node_id; } -static inline u32 tipc_zone_mask(u32 addr) +static inline char *tipc_own_id_string(struct net *net) { - return addr & TIPC_ZONE_MASK; + return tipc_net(net)->node_id_string; } static inline u32 tipc_cluster_mask(u32 addr) @@ -60,15 +67,25 @@ static inline u32 tipc_cluster_mask(u32 addr) return addr & TIPC_ZONE_CLUSTER_MASK; } -u32 tipc_own_addr(struct net *net); -int in_own_cluster(struct net *net, u32 addr); -int in_own_cluster_exact(struct net *net, u32 addr); -int in_own_node(struct net *net, u32 addr); -u32 addr_domain(struct net *net, u32 sc); -int tipc_addr_domain_valid(u32); -int tipc_addr_node_valid(u32 addr); -int tipc_in_scope(u32 domain, u32 addr); -int tipc_addr_scope(u32 domain); -char *tipc_addr_string_fill(char *string, u32 addr); +static inline int tipc_node2scope(u32 node) +{ + return node ? TIPC_NODE_SCOPE : TIPC_CLUSTER_SCOPE; +} + +static inline int tipc_scope2node(struct net *net, int sc) +{ + return sc != TIPC_NODE_SCOPE ? 
0 : tipc_own_addr(net); +} + +static inline int in_own_node(struct net *net, u32 addr) +{ + return addr == tipc_own_addr(net) || !addr; +} + +bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); +void tipc_set_node_id(struct net *net, u8 *id); +void tipc_set_node_addr(struct net *net, u32 addr); +char *tipc_nodeid2string(char *str, u8 *id); +u32 tipc_node_id2hash(u8 *id128); #endif diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 37892b3909af..f3711176be45 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -574,5 +574,5 @@ void tipc_nlist_purge(struct tipc_nlist *nl) { tipc_dest_list_purge(&nl->list); nl->remote = 0; - nl->local = 0; + nl->local = false; } diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 3e3dce3d4c63..f7d47c89d658 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -210,7 +210,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) rcu_read_lock(); b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); if (b) - tipc_disc_add_dest(b->link_req); + tipc_disc_add_dest(b->disc); rcu_read_unlock(); } @@ -222,7 +222,7 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) rcu_read_lock(); b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); if (b) - tipc_disc_remove_dest(b->link_req); + tipc_disc_remove_dest(b->disc); rcu_read_unlock(); } @@ -230,88 +230,67 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) * tipc_enable_bearer - enable bearer with the given name */ static int tipc_enable_bearer(struct net *net, const char *name, - u32 disc_domain, u32 priority, + u32 disc_domain, u32 prio, struct nlattr *attr[]) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); + struct tipc_bearer_names b_names; + int with_this_prio = 1; struct tipc_bearer *b; struct tipc_media *m; - struct tipc_bearer_names b_names; struct sk_buff *skb; - char addr_string[16]; - u32 bearer_id; - u32 with_this_prio; - u32 i; + int bearer_id = 0; int 
res = -EINVAL; + char *errstr = ""; - if (!tn->own_addr) { - pr_warn("Bearer <%s> rejected, not supported in standalone mode\n", - name); - return -ENOPROTOOPT; - } if (!bearer_name_validate(name, &b_names)) { - pr_warn("Bearer <%s> rejected, illegal name\n", name); - return -EINVAL; - } - if (tipc_addr_domain_valid(disc_domain) && - (disc_domain != tn->own_addr)) { - if (tipc_in_scope(disc_domain, tn->own_addr)) { - disc_domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK; - res = 0; /* accept any node in own cluster */ - } else if (in_own_cluster_exact(net, disc_domain)) - res = 0; /* accept specified node in own cluster */ + errstr = "illegal name"; + goto rejected; } - if (res) { - pr_warn("Bearer <%s> rejected, illegal discovery domain\n", - name); - return -EINVAL; - } - if ((priority > TIPC_MAX_LINK_PRI) && - (priority != TIPC_MEDIA_LINK_PRI)) { - pr_warn("Bearer <%s> rejected, illegal priority\n", name); - return -EINVAL; + + if (prio > TIPC_MAX_LINK_PRI && prio != TIPC_MEDIA_LINK_PRI) { + errstr = "illegal priority"; + goto rejected; } m = tipc_media_find(b_names.media_name); if (!m) { - pr_warn("Bearer <%s> rejected, media <%s> not registered\n", - name, b_names.media_name); - return -EINVAL; + errstr = "media not registered"; + goto rejected; } - if (priority == TIPC_MEDIA_LINK_PRI) - priority = m->priority; + if (prio == TIPC_MEDIA_LINK_PRI) + prio = m->priority; -restart: - bearer_id = MAX_BEARERS; - with_this_prio = 1; - for (i = MAX_BEARERS; i-- != 0; ) { - b = rtnl_dereference(tn->bearer_list[i]); - if (!b) { - bearer_id = i; - continue; - } + /* Check new bearer vs existing ones and find free bearer id if any */ + while (bearer_id < MAX_BEARERS) { + b = rtnl_dereference(tn->bearer_list[bearer_id]); + if (!b) + break; if (!strcmp(name, b->name)) { - pr_warn("Bearer <%s> rejected, already enabled\n", - name); - return -EINVAL; + errstr = "already enabled"; + goto rejected; } - if ((b->priority == priority) && - (++with_this_prio > 2)) { - if (priority-- 
== 0) { - pr_warn("Bearer <%s> rejected, duplicate priority\n", - name); - return -EINVAL; - } - pr_warn("Bearer <%s> priority adjustment required %u->%u\n", - name, priority + 1, priority); - goto restart; + bearer_id++; + if (b->priority != prio) + continue; + if (++with_this_prio <= 2) + continue; + pr_warn("Bearer <%s>: already 2 bearers with priority %u\n", + name, prio); + if (prio == TIPC_MIN_LINK_PRI) { + errstr = "cannot adjust to lower"; + goto rejected; } + pr_warn("Bearer <%s>: trying with adjusted priority\n", name); + prio--; + bearer_id = 0; + with_this_prio = 1; } + if (bearer_id >= MAX_BEARERS) { - pr_warn("Bearer <%s> rejected, bearer limit reached (%u)\n", - name, MAX_BEARERS); - return -EINVAL; + errstr = "max 3 bearers permitted"; + goto rejected; } b = kzalloc(sizeof(*b), GFP_ATOMIC); @@ -322,10 +301,9 @@ restart: b->media = m; res = m->enable_media(net, b, attr); if (res) { - pr_warn("Bearer <%s> rejected, enable failure (%d)\n", - name, -res); kfree(b); - return -EINVAL; + errstr = "failed to enable media"; + goto rejected; } b->identity = bearer_id; @@ -333,15 +311,15 @@ restart: b->window = m->window; b->domain = disc_domain; b->net_plane = bearer_id + 'A'; - b->priority = priority; + b->priority = prio; test_and_set_bit_lock(0, &b->up); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { bearer_disable(net, b); - pr_warn("Bearer <%s> rejected, discovery object creation failed\n", - name); - return -EINVAL; + kfree(b); + errstr = "failed to create discoverer"; + goto rejected; } rcu_assign_pointer(tn->bearer_list[bearer_id], b); @@ -353,9 +331,11 @@ restart: return -ENOMEM; } - pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", - name, - tipc_addr_string_fill(addr_string, disc_domain), priority); + pr_info("Enabled bearer <%s>, priority %u\n", name, prio); + + return res; +rejected: + pr_warn("Enabling of bearer <%s> rejected, %s\n", name, errstr); return res; } @@ -385,8 +365,8 @@ static void 
bearer_disable(struct net *net, struct tipc_bearer *b) tipc_node_delete_links(net, bearer_id); b->media->disable_media(b); RCU_INIT_POINTER(b->media_ptr, NULL); - if (b->link_req) - tipc_disc_delete(b->link_req); + if (b->disc) + tipc_disc_delete(b->disc); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); kfree_rcu(b, rcu); tipc_mon_delete(net, bearer_id); @@ -395,11 +375,13 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attr[]) { + char *dev_name = strchr((const char *)b->name, ':') + 1; + int hwaddr_len = b->media->hwaddr_len; + u8 node_id[NODE_ID_LEN] = {0,}; struct net_device *dev; - char *driver_name = strchr((const char *)b->name, ':') + 1; /* Find device with specified name */ - dev = dev_get_by_name(net, driver_name); + dev = dev_get_by_name(net, dev_name); if (!dev) return -ENODEV; if (tipc_mtu_bad(dev, 0)) { @@ -407,6 +389,16 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, return -EINVAL; } + /* Autoconfigure own node identity if needed */ + if (!tipc_own_id(net) && hwaddr_len <= NODE_ID_LEN) { + memcpy(node_id, dev->dev_addr, hwaddr_len); + tipc_net_init(net, node_id, 0); + } + if (!tipc_own_id(net)) { + pr_warn("Failed to obtain node identity\n"); + return -EINVAL; + } + /* Associate TIPC bearer with L2 bearer */ rcu_assign_pointer(b->media_ptr, dev); b->pt.dev = dev; @@ -414,7 +406,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, b->pt.func = tipc_l2_rcv_msg; dev_add_pack(&b->pt); memset(&b->bcast_addr, 0, sizeof(b->bcast_addr)); - memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len); + memcpy(b->bcast_addr.value, dev->broadcast, hwaddr_len); b->bcast_addr.media_id = b->media->type_id; b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; b->mtu = dev->mtu; @@ -861,12 +853,10 @@ int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) char *bearer; struct nlattr 
*attrs[TIPC_NLA_BEARER_MAX + 1]; struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); - u32 domain; + u32 domain = 0; u32 prio; prio = TIPC_MEDIA_LINK_PRI; - domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK; if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; @@ -956,11 +946,11 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) { - int err; - char *name; struct tipc_bearer *b; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; struct net *net = sock_net(skb->sk); + char *name; + int err; if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; @@ -987,8 +977,10 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (err) return err; - if (props[TIPC_NLA_PROP_TOL]) + if (props[TIPC_NLA_PROP_TOL]) { b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); + tipc_node_apply_tolerance(net, b); + } if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index a53613d95bc9..6efcee63a381 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -159,7 +159,7 @@ struct tipc_bearer { u32 tolerance; u32 domain; u32 identity; - struct tipc_link_req *link_req; + struct tipc_discoverer *disc; char net_plane; unsigned long up; }; diff --git a/net/tipc/core.c b/net/tipc/core.c index 0b982d048fb9..5b38f5164281 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -56,7 +56,11 @@ static int __net_init tipc_init_net(struct net *net) int err; tn->net_id = 4711; - tn->own_addr = 0; + tn->node_addr = 0; + tn->trial_addr = 0; + tn->addr_trial_end = 0; + memset(tn->node_id, 0, sizeof(tn->node_id)); + memset(tn->node_id_string, 0, sizeof(tn->node_id_string)); tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; get_random_bytes(&tn->random, sizeof(int)); INIT_LIST_HEAD(&tn->node_list); diff --git a/net/tipc/core.h b/net/tipc/core.h index 
20b21af2ff14..8020a6c360ff 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -1,7 +1,7 @@ /* * net/tipc/core.h: Include file for TIPC global declarations * - * Copyright (c) 2005-2006, 2013 Ericsson AB + * Copyright (c) 2005-2006, 2013-2018 Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -58,13 +58,14 @@ #include <linux/etherdevice.h> #include <net/netns/generic.h> #include <linux/rhashtable.h> +#include <net/genetlink.h> struct tipc_node; struct tipc_bearer; struct tipc_bc_base; struct tipc_link; struct tipc_name_table; -struct tipc_server; +struct tipc_topsrv; struct tipc_monitor; #define TIPC_MOD_VER "2.0.0" @@ -72,15 +73,22 @@ struct tipc_monitor; #define NODE_HTABLE_SIZE 512 #define MAX_BEARERS 3 #define TIPC_DEF_MON_THRESHOLD 32 +#define NODE_ID_LEN 16 +#define NODE_ID_STR_LEN (NODE_ID_LEN * 2 + 1) extern unsigned int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; struct tipc_net { - u32 own_addr; + u8 node_id[NODE_ID_LEN]; + u32 node_addr; + u32 trial_addr; + unsigned long addr_trial_end; + char node_id_string[NODE_ID_STR_LEN]; int net_id; int random; + bool legacy_addr_format; /* Node table and node list */ spinlock_t node_list_lock; @@ -112,7 +120,7 @@ struct tipc_net { struct list_head dist_queue; /* Topology subscription server */ - struct tipc_server *topsrv; + struct tipc_topsrv *topsrv; atomic_t subscription_count; }; @@ -131,7 +139,12 @@ static inline struct list_head *tipc_nodes(struct net *net) return &tipc_net(net)->node_list; } -static inline struct tipc_server *tipc_topsrv(struct net *net) +static inline struct name_table *tipc_name_table(struct net *net) +{ + return tipc_net(net)->nametbl; +} + +static inline struct tipc_topsrv *tipc_topsrv(struct net *net) { return tipc_net(net)->topsrv; } diff --git a/net/tipc/diag.c b/net/tipc/diag.c new file mode 100644 index 000000000000..46d9cd62f781 --- /dev/null +++ 
b/net/tipc/diag.c @@ -0,0 +1,114 @@ +/* + * net/tipc/diag.c: TIPC socket diag + * + * Copyright (c) 2018, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "socket.h" +#include <linux/sock_diag.h> +#include <linux/tipc_sockets_diag.h> + +static u64 __tipc_diag_gen_cookie(struct sock *sk) +{ + u32 res[2]; + + sock_diag_save_cookie(sk, res); + return *((u64 *)res); +} + +static int __tipc_add_sock_diag(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk) +{ + struct tipc_sock_diag_req *req = nlmsg_data(cb->nlh); + struct nlmsghdr *nlh; + int err; + + nlh = nlmsg_put_answer(skb, cb, SOCK_DIAG_BY_FAMILY, 0, + NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + err = tipc_sk_fill_sock_diag(skb, tsk, req->tidiag_states, + __tipc_diag_gen_cookie); + if (err) + return err; + + nlmsg_end(skb, nlh); + return 0; +} + +static int tipc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + return tipc_nl_sk_walk(skb, cb, __tipc_add_sock_diag); +} + +static int tipc_sock_diag_handler_dump(struct sk_buff *skb, + struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct tipc_sock_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = tipc_diag_dump, + }; + netlink_dump_start(net->diag_nlsk, skb, h, &c); + return 0; + } + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler tipc_sock_diag_handler = { + .family = AF_TIPC, + .dump = tipc_sock_diag_handler_dump, +}; + +static int __init tipc_diag_init(void) +{ + return sock_diag_register(&tipc_sock_diag_handler); +} + +static void __exit tipc_diag_exit(void) +{ + sock_diag_unregister(&tipc_sock_diag_handler); +} + +module_init(tipc_diag_init); +module_exit(tipc_diag_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 92e4828c6b09..9f666e0650e2 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -1,7 +1,7 @@ /* * net/tipc/discover.c * - * Copyright (c) 
2003-2006, 2014-2015, Ericsson AB + * Copyright (c) 2003-2006, 2014-2018, Ericsson AB * Copyright (c) 2005-2006, 2010-2011, Wind River Systems * All rights reserved. * @@ -39,34 +39,34 @@ #include "discover.h" /* min delay during bearer start up */ -#define TIPC_LINK_REQ_INIT msecs_to_jiffies(125) +#define TIPC_DISC_INIT msecs_to_jiffies(125) /* max delay if bearer has no links */ -#define TIPC_LINK_REQ_FAST msecs_to_jiffies(1000) +#define TIPC_DISC_FAST msecs_to_jiffies(1000) /* max delay if bearer has links */ -#define TIPC_LINK_REQ_SLOW msecs_to_jiffies(60000) +#define TIPC_DISC_SLOW msecs_to_jiffies(60000) /* indicates no timer in use */ -#define TIPC_LINK_REQ_INACTIVE 0xffffffff +#define TIPC_DISC_INACTIVE 0xffffffff /** - * struct tipc_link_req - information about an ongoing link setup request + * struct tipc_discoverer - information about an ongoing link setup request * @bearer_id: identity of bearer issuing requests * @net: network namespace instance * @dest: destination address for request messages * @domain: network domain to which links can be established * @num_nodes: number of nodes currently discovered (i.e. 
with an active link) * @lock: spinlock for controlling access to requests - * @buf: request message to be (repeatedly) sent + * @skb: request message to be (repeatedly) sent * @timer: timer governing period between requests * @timer_intv: current interval between requests (in ms) */ -struct tipc_link_req { +struct tipc_discoverer { u32 bearer_id; struct tipc_media_addr dest; struct net *net; u32 domain; int num_nodes; spinlock_t lock; - struct sk_buff *buf; + struct sk_buff *skb; struct timer_list timer; unsigned long timer_intv; }; @@ -77,22 +77,42 @@ struct tipc_link_req { * @type: message type (request or response) * @b: ptr to bearer issuing message */ -static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type, - struct tipc_bearer *b) +static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, + u32 mtyp, struct tipc_bearer *b) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_msg *msg; + struct tipc_net *tn = tipc_net(net); u32 dest_domain = b->domain; + struct tipc_msg *hdr; - msg = buf_msg(buf); - tipc_msg_init(tn->own_addr, msg, LINK_CONFIG, type, + hdr = buf_msg(skb); + tipc_msg_init(tn->trial_addr, hdr, LINK_CONFIG, mtyp, MAX_H_SIZE, dest_domain); - msg_set_non_seq(msg, 1); - msg_set_node_sig(msg, tn->random); - msg_set_node_capabilities(msg, TIPC_NODE_CAPABILITIES); - msg_set_dest_domain(msg, dest_domain); - msg_set_bc_netid(msg, tn->net_id); - b->media->addr2msg(msg_media_addr(msg), &b->addr); + msg_set_size(hdr, MAX_H_SIZE + NODE_ID_LEN); + msg_set_non_seq(hdr, 1); + msg_set_node_sig(hdr, tn->random); + msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES); + msg_set_dest_domain(hdr, dest_domain); + msg_set_bc_netid(hdr, tn->net_id); + b->media->addr2msg(msg_media_addr(hdr), &b->addr); + msg_set_node_id(hdr, tipc_own_id(net)); +} + +static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, + u32 src, u32 sugg_addr, + struct tipc_media_addr *maddr, + struct tipc_bearer *b) +{ + struct 
tipc_msg *hdr; + struct sk_buff *skb; + + skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); + if (!skb) + return; + hdr = buf_msg(skb); + tipc_disc_init_msg(net, skb, mtyp, b); + msg_set_sugg_node_addr(hdr, sugg_addr); + msg_set_dest_domain(hdr, dst); + tipc_bearer_xmit_skb(net, b->identity, skb, maddr); } /** @@ -104,161 +124,207 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type, static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, struct tipc_media_addr *media_addr) { - char node_addr_str[16]; char media_addr_str[64]; - tipc_addr_string_fill(node_addr_str, node_addr); tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str), media_addr); - pr_warn("Duplicate %s using %s seen on <%s>\n", node_addr_str, + pr_warn("Duplicate %x using %s seen on <%s>\n", node_addr, media_addr_str, b->name); } +/* tipc_disc_addr_trial_msg(): - handle an address uniqueness trial from peer + */ +static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, + struct tipc_media_addr *maddr, + struct tipc_bearer *b, + u32 dst, u32 src, + u32 sugg_addr, + u8 *peer_id, + int mtyp) +{ + struct net *net = d->net; + struct tipc_net *tn = tipc_net(net); + bool trial = time_before(jiffies, tn->addr_trial_end); + u32 self = tipc_own_addr(net); + + if (mtyp == DSC_TRIAL_FAIL_MSG) { + if (!trial) + return true; + + /* Ignore if somebody else already gave new suggestion */ + if (dst != tn->trial_addr) + return true; + + /* Otherwise update trial address and restart trial period */ + tn->trial_addr = sugg_addr; + msg_set_prevnode(buf_msg(d->skb), sugg_addr); + tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); + return true; + } + + /* Apply trial address if we just left trial period */ + if (!trial && !self) { + tipc_net_finalize(net, tn->trial_addr); + msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); + } + + if (mtyp != DSC_TRIAL_MSG) + return false; + + sugg_addr = tipc_node_try_addr(net, peer_id, src); + if (sugg_addr) + 
tipc_disc_msg_xmit(net, DSC_TRIAL_FAIL_MSG, src, + self, sugg_addr, maddr, b); + return true; +} + /** * tipc_disc_rcv - handle incoming discovery message (request or response) - * @net: the applicable net namespace - * @buf: buffer containing message - * @bearer: bearer that message arrived on + * @net: applicable net namespace + * @skb: buffer containing message + * @b: bearer that message arrived on */ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, - struct tipc_bearer *bearer) + struct tipc_bearer *b) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_media_addr maddr; - struct sk_buff *rskb; + struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); - u32 ddom = msg_dest_domain(hdr); - u32 onode = msg_prevnode(hdr); + u16 caps = msg_node_capabilities(hdr); + bool legacy = tn->legacy_addr_format; + u32 sugg = msg_sugg_node_addr(hdr); + u32 signature = msg_node_sig(hdr); + u8 peer_id[NODE_ID_LEN] = {0,}; + u32 dst = msg_dest_domain(hdr); u32 net_id = msg_bc_netid(hdr); + struct tipc_media_addr maddr; + u32 src = msg_prevnode(hdr); u32 mtyp = msg_type(hdr); - u32 signature = msg_node_sig(hdr); - u16 caps = msg_node_capabilities(hdr); - bool respond = false; bool dupl_addr = false; + bool respond = false; + u32 self; int err; - err = bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr)); - kfree_skb(skb); - if (err) - return; + skb_linearize(skb); + hdr = buf_msg(skb); - /* Ensure message from node is valid and communication is permitted */ - if (net_id != tn->net_id) + if (caps & TIPC_NODE_ID128) + memcpy(peer_id, msg_node_id(hdr), NODE_ID_LEN); + else + sprintf(peer_id, "%x", src); + + err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr)); + kfree_skb(skb); + if (err || maddr.broadcast) { + pr_warn_ratelimited("Rcv corrupt discovery message\n"); return; - if (maddr.broadcast) + } + /* Ignore discovery messages from own node */ + if (!memcmp(&maddr, &b->addr, sizeof(maddr))) return; - if 
(!tipc_addr_domain_valid(ddom)) + if (net_id != tn->net_id) return; - if (!tipc_addr_node_valid(onode)) + if (tipc_disc_addr_trial_msg(b->disc, &maddr, b, dst, + src, sugg, peer_id, mtyp)) return; + self = tipc_own_addr(net); - if (in_own_node(net, onode)) { - if (memcmp(&maddr, &bearer->addr, sizeof(maddr))) - disc_dupl_alert(bearer, tn->own_addr, &maddr); + /* Message from somebody using this node's address */ + if (in_own_node(net, src)) { + disc_dupl_alert(b, self, &maddr); return; } - if (!tipc_in_scope(ddom, tn->own_addr)) + if (!tipc_in_scope(legacy, dst, self)) return; - if (!tipc_in_scope(bearer->domain, onode)) + if (!tipc_in_scope(legacy, b->domain, src)) return; - - tipc_node_check_dest(net, onode, bearer, caps, signature, + tipc_node_check_dest(net, src, peer_id, b, caps, signature, &maddr, &respond, &dupl_addr); if (dupl_addr) - disc_dupl_alert(bearer, onode, &maddr); - - /* Send response, if necessary */ - if (respond && (mtyp == DSC_REQ_MSG)) { - rskb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); - if (!rskb) - return; - tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer); - tipc_bearer_xmit_skb(net, bearer->identity, rskb, &maddr); - } + disc_dupl_alert(b, src, &maddr); + if (!respond) + return; + if (mtyp != DSC_REQ_MSG) + return; + tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, 0, &maddr, b); } -/** - * disc_update - update frequency of periodic link setup requests - * @req: ptr to link request structure - * - * Reinitiates discovery process if discovery object has no associated nodes - * and is either not currently searching or is searching at a slow rate +/* tipc_disc_add_dest - increment set of discovered nodes */ -static void disc_update(struct tipc_link_req *req) +void tipc_disc_add_dest(struct tipc_discoverer *d) { - if (!req->num_nodes) { - if ((req->timer_intv == TIPC_LINK_REQ_INACTIVE) || - (req->timer_intv > TIPC_LINK_REQ_FAST)) { - req->timer_intv = TIPC_LINK_REQ_INIT; - mod_timer(&req->timer, jiffies + req->timer_intv); - } - } + 
spin_lock_bh(&d->lock); + d->num_nodes++; + spin_unlock_bh(&d->lock); } -/** - * tipc_disc_add_dest - increment set of discovered nodes - * @req: ptr to link request structure +/* tipc_disc_remove_dest - decrement set of discovered nodes */ -void tipc_disc_add_dest(struct tipc_link_req *req) +void tipc_disc_remove_dest(struct tipc_discoverer *d) { - spin_lock_bh(&req->lock); - req->num_nodes++; - spin_unlock_bh(&req->lock); -} + int intv, num; -/** - * tipc_disc_remove_dest - decrement set of discovered nodes - * @req: ptr to link request structure - */ -void tipc_disc_remove_dest(struct tipc_link_req *req) -{ - spin_lock_bh(&req->lock); - req->num_nodes--; - disc_update(req); - spin_unlock_bh(&req->lock); + spin_lock_bh(&d->lock); + d->num_nodes--; + num = d->num_nodes; + intv = d->timer_intv; + if (!num && (intv == TIPC_DISC_INACTIVE || intv > TIPC_DISC_FAST)) { + d->timer_intv = TIPC_DISC_INIT; + mod_timer(&d->timer, jiffies + d->timer_intv); + } + spin_unlock_bh(&d->lock); } -/** - * disc_timeout - send a periodic link setup request - * @data: ptr to link request structure - * +/* tipc_disc_timeout - send a periodic link setup request * Called whenever a link setup request timer associated with a bearer expires. 
+ * - Keep doubling time between sent request until limit is reached; + * - Hold at fast polling rate if we don't have any associated nodes + * - Otherwise hold at slow polling rate */ -static void disc_timeout(struct timer_list *t) +static void tipc_disc_timeout(struct timer_list *t) { - struct tipc_link_req *req = from_timer(req, t, timer); - struct sk_buff *skb; - int max_delay; + struct tipc_discoverer *d = from_timer(d, t, timer); + struct tipc_net *tn = tipc_net(d->net); + u32 self = tipc_own_addr(d->net); + struct tipc_media_addr maddr; + struct sk_buff *skb = NULL; + struct net *net = d->net; + u32 bearer_id; - spin_lock_bh(&req->lock); + spin_lock_bh(&d->lock); /* Stop searching if only desired node has been found */ - if (tipc_node(req->domain) && req->num_nodes) { - req->timer_intv = TIPC_LINK_REQ_INACTIVE; + if (tipc_node(d->domain) && d->num_nodes) { + d->timer_intv = TIPC_DISC_INACTIVE; goto exit; } - /* - * Send discovery message, then update discovery timer - * - * Keep doubling time between requests until limit is reached; - * hold at fast polling rate if don't have any associated nodes, - * otherwise hold at slow polling rate - */ - skb = skb_clone(req->buf, GFP_ATOMIC); - if (skb) - tipc_bearer_xmit_skb(req->net, req->bearer_id, skb, &req->dest); - req->timer_intv *= 2; - if (req->num_nodes) - max_delay = TIPC_LINK_REQ_SLOW; - else - max_delay = TIPC_LINK_REQ_FAST; - if (req->timer_intv > max_delay) - req->timer_intv = max_delay; + /* Did we just leave the address trial period ? 
*/ + if (!self && !time_before(jiffies, tn->addr_trial_end)) { + self = tn->trial_addr; + tipc_net_finalize(net, self); + msg_set_prevnode(buf_msg(d->skb), self); + msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); + } + + /* Adjust timeout interval according to discovery phase */ + if (time_before(jiffies, tn->addr_trial_end)) { + d->timer_intv = TIPC_DISC_INIT; + } else { + d->timer_intv *= 2; + if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW) + d->timer_intv = TIPC_DISC_SLOW; + else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) + d->timer_intv = TIPC_DISC_FAST; + } - mod_timer(&req->timer, jiffies + req->timer_intv); + mod_timer(&d->timer, jiffies + d->timer_intv); + memcpy(&maddr, &d->dest, sizeof(maddr)); + skb = skb_clone(d->skb, GFP_ATOMIC); + bearer_id = d->bearer_id; exit: - spin_unlock_bh(&req->lock); + spin_unlock_bh(&d->lock); + if (skb) + tipc_bearer_xmit_skb(net, bearer_id, skb, &maddr); } /** @@ -273,41 +339,47 @@ exit: int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) { - struct tipc_link_req *req; + struct tipc_net *tn = tipc_net(net); + struct tipc_discoverer *d; - req = kmalloc(sizeof(*req), GFP_ATOMIC); - if (!req) + d = kmalloc(sizeof(*d), GFP_ATOMIC); + if (!d) return -ENOMEM; - req->buf = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); - if (!req->buf) { - kfree(req); + d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); + if (!d->skb) { + kfree(d); return -ENOMEM; } + tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); - tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b); - memcpy(&req->dest, dest, sizeof(*dest)); - req->net = net; - req->bearer_id = b->identity; - req->domain = b->domain; - req->num_nodes = 0; - req->timer_intv = TIPC_LINK_REQ_INIT; - spin_lock_init(&req->lock); - timer_setup(&req->timer, disc_timeout, 0); - mod_timer(&req->timer, jiffies + req->timer_intv); - b->link_req = req; - *skb = skb_clone(req->buf, GFP_ATOMIC); + /* Do we need an address trial 
period first ? */ + if (!tipc_own_addr(net)) { + tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); + msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG); + } + memcpy(&d->dest, dest, sizeof(*dest)); + d->net = net; + d->bearer_id = b->identity; + d->domain = b->domain; + d->num_nodes = 0; + d->timer_intv = TIPC_DISC_INIT; + spin_lock_init(&d->lock); + timer_setup(&d->timer, tipc_disc_timeout, 0); + mod_timer(&d->timer, jiffies + d->timer_intv); + b->disc = d; + *skb = skb_clone(d->skb, GFP_ATOMIC); return 0; } /** * tipc_disc_delete - destroy object sending periodic link setup requests - * @req: ptr to link request structure + * @d: ptr to link request structure */ -void tipc_disc_delete(struct tipc_link_req *req) +void tipc_disc_delete(struct tipc_discoverer *d) { - del_timer_sync(&req->timer); - kfree_skb(req->buf); - kfree(req); + del_timer_sync(&d->timer); + kfree_skb(d->skb); + kfree(d); } /** @@ -318,19 +390,21 @@ void tipc_disc_delete(struct tipc_link_req *req) */ void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { - struct tipc_link_req *req = b->link_req; + struct tipc_discoverer *d = b->disc; + struct tipc_media_addr maddr; struct sk_buff *skb; - spin_lock_bh(&req->lock); - tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b); - req->net = net; - req->bearer_id = b->identity; - req->domain = b->domain; - req->num_nodes = 0; - req->timer_intv = TIPC_LINK_REQ_INIT; - mod_timer(&req->timer, jiffies + req->timer_intv); - skb = skb_clone(req->buf, GFP_ATOMIC); + spin_lock_bh(&d->lock); + tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); + d->net = net; + d->bearer_id = b->identity; + d->domain = b->domain; + d->num_nodes = 0; + d->timer_intv = TIPC_DISC_INIT; + memcpy(&maddr, &d->dest, sizeof(maddr)); + mod_timer(&d->timer, jiffies + d->timer_intv); + skb = skb_clone(d->skb, GFP_ATOMIC); + spin_unlock_bh(&d->lock); if (skb) - tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest); - spin_unlock_bh(&req->lock); + tipc_bearer_xmit_skb(net, b->identity, 
skb, &maddr); } diff --git a/net/tipc/discover.h b/net/tipc/discover.h index b80a335389c0..521d96c41dfd 100644 --- a/net/tipc/discover.h +++ b/net/tipc/discover.h @@ -37,14 +37,14 @@ #ifndef _TIPC_DISCOVER_H #define _TIPC_DISCOVER_H -struct tipc_link_req; +struct tipc_discoverer; int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr, struct tipc_media_addr *dest, struct sk_buff **skb); -void tipc_disc_delete(struct tipc_link_req *req); +void tipc_disc_delete(struct tipc_discoverer *req); void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr); -void tipc_disc_add_dest(struct tipc_link_req *req); -void tipc_disc_remove_dest(struct tipc_link_req *req); +void tipc_disc_add_dest(struct tipc_discoverer *req); +void tipc_disc_remove_dest(struct tipc_discoverer *req); void tipc_disc_rcv(struct net *net, struct sk_buff *buf, struct tipc_bearer *b_ptr); diff --git a/net/tipc/group.c b/net/tipc/group.c index 122162a31816..d7a7befeddd4 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -37,7 +37,7 @@ #include "addr.h" #include "group.h" #include "bcast.h" -#include "server.h" +#include "topsrv.h" #include "msg.h" #include "socket.h" #include "node.h" @@ -189,6 +189,7 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; grp->open = group_is_open; + *grp->open = false; filter |= global ? 
TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, filter, &grp->subid)) diff --git a/net/tipc/link.c b/net/tipc/link.c index 2d6b2aed30e0..695acb783969 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -434,14 +434,16 @@ char *tipc_link_name(struct tipc_link *l) */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, u32 peer, - u16 peer_caps, + int window, u32 session, u32 self, + u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link **link) { + char peer_str[NODE_ID_STR_LEN] = {0,}; + char self_str[NODE_ID_STR_LEN] = {0,}; struct tipc_link *l; l = kzalloc(sizeof(*l), GFP_ATOMIC); @@ -450,10 +452,19 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, *link = l; l->session = session; - /* Note: peer i/f name is completed by reset/activate message */ - sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown", - tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode), - if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); + /* Set link name for unicast links only */ + if (peer_id) { + tipc_nodeid2string(self_str, tipc_own_id(net)); + if (strlen(self_str) > 16) + sprintf(self_str, "%x", self); + tipc_nodeid2string(peer_str, peer_id); + if (strlen(peer_str) > 16) + sprintf(peer_str, "%x", peer); + } + /* Peer i/f name will be completed by reset/activate message */ + snprintf(l->name, sizeof(l->name), "%s:%s-%s:unknown", + self_str, if_name, peer_str); + strcpy(l->if_name, if_name); l->addr = peer; l->peer_caps = peer_caps; @@ -501,7 +512,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct tipc_link *l; if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, - 0, ownnode, peer, peer_caps, bc_sndlink, + 0, ownnode, peer, NULL, 
peer_caps, bc_sndlink, NULL, inputq, namedq, link)) return false; @@ -1800,7 +1811,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) { - int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE); + int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); l->window = win; l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); @@ -1938,11 +1949,11 @@ msg_full: int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *link, int nlflags) { - int err; - void *hdr; + u32 self = tipc_own_addr(net); struct nlattr *attrs; struct nlattr *prop; - struct tipc_net *tn = net_generic(net, tipc_net_id); + void *hdr; + int err; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, nlflags, TIPC_NL_LINK_GET); @@ -1955,8 +1966,7 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, link->name)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, - tipc_cluster_mask(tn->own_addr))) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, tipc_cluster_mask(self))) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu)) goto attr_msg_full; @@ -2126,7 +2136,8 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq) { l->tolerance = tol; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); + if (link_is_up(l)) + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); } void tipc_link_set_prio(struct tipc_link *l, u32 prio, diff --git a/net/tipc/link.h b/net/tipc/link.h index d1bd1787a768..ec59348a81e8 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -73,8 +73,8 @@ enum { bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, u32 peer, - u16 peer_caps, + int window, u32 session, u32 ownnode, + u32 peer, u8 
*peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, struct sk_buff_head *inputq, diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 4e1c6f6450bb..b6c45dccba3d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -580,7 +580,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; - dnode = addr_domain(net, msg_lookup_scope(msg)); + dnode = tipc_scope2node(net, msg_lookup_scope(msg)); dport = tipc_nametbl_translate(net, msg_nametype(msg), msg_nameinst(msg), &dnode); if (!dport) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index b4ba1b4f9ae7..a4e944d59394 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -550,6 +550,8 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) */ #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 +#define DSC_TRIAL_MSG 2 +#define DSC_TRIAL_FAIL_MSG 3 /* * Group protocol message types @@ -627,7 +629,6 @@ static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) msg_set_bits(m, 2, 0, 0xffff, n); } - /* * Word 4 */ @@ -925,6 +926,26 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } +static inline u32 msg_sugg_node_addr(struct tipc_msg *m) +{ + return msg_word(m, 14); +} + +static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 14, n); +} + +static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id) +{ + memcpy(msg_data(hdr), id, 16); +} + +static inline u8 *msg_node_id(struct tipc_msg *hdr) +{ + return (u8 *)msg_data(hdr); +} + struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 23f8899e0f8c..51b4b96f89db 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -56,7 +56,7 @@ static void publ_to_item(struct 
distr_item *i, struct publication *p) i->type = htonl(p->type); i->lower = htonl(p->lower); i->upper = htonl(p->upper); - i->ref = htonl(p->ref); + i->port = htonl(p->port); i->key = htonl(p->key); } @@ -68,14 +68,14 @@ static void publ_to_item(struct distr_item *i, struct publication *p) static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) { - struct tipc_net *tn = net_generic(net, tipc_net_id); struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); + u32 self = tipc_own_addr(net); struct tipc_msg *msg; if (buf != NULL) { msg = buf_msg(buf); - tipc_msg_init(tn->own_addr, msg, NAME_DISTRIBUTOR, type, - INT_H_SIZE, dest); + tipc_msg_init(self, msg, NAME_DISTRIBUTOR, + type, INT_H_SIZE, dest); msg_set_size(msg, INT_H_SIZE + size); } return buf; @@ -86,25 +86,25 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, */ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sk_buff *buf; + struct name_table *nt = tipc_name_table(net); struct distr_item *item; + struct sk_buff *skb; - list_add_tail_rcu(&publ->local_list, - &tn->nametbl->publ_list[publ->scope]); - - if (publ->scope == TIPC_NODE_SCOPE) + if (publ->scope == TIPC_NODE_SCOPE) { + list_add_tail_rcu(&publ->binding_node, &nt->node_scope); return NULL; + } + list_add_tail_rcu(&publ->binding_node, &nt->cluster_scope); - buf = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); - if (!buf) { + skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); + if (!skb) { pr_warn("Publication distribution failure\n"); return NULL; } - item = (struct distr_item *)msg_data(buf_msg(buf)); + item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, publ); - return buf; + return skb; } /** @@ -115,7 +115,7 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) struct sk_buff *buf; struct distr_item *item; - 
list_del(&publ->local_list); + list_del(&publ->binding_node); if (publ->scope == TIPC_NODE_SCOPE) return NULL; @@ -147,7 +147,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; - list_for_each_entry(publ, pls, local_list) { + list_for_each_entry(publ, pls, binding_node) { /* Prepare next buffer: */ if (!skb) { skb = named_prepare_buf(net, PUBLICATION, msg_rem, @@ -184,16 +184,13 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, */ void tipc_named_node_up(struct net *net, u32 dnode) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct name_table *nt = tipc_name_table(net); struct sk_buff_head head; __skb_queue_head_init(&head); rcu_read_lock(); - named_distribute(net, &head, dnode, - &tn->nametbl->publ_list[TIPC_CLUSTER_SCOPE]); - named_distribute(net, &head, dnode, - &tn->nametbl->publ_list[TIPC_ZONE_SCOPE]); + named_distribute(net, &head, dnode, &nt->cluster_scope); rcu_read_unlock(); tipc_node_xmit(net, &head, dnode, 0); @@ -207,20 +204,20 @@ void tipc_named_node_up(struct net *net, u32 dnode) */ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct publication *p; spin_lock_bh(&tn->nametbl_lock); - p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, - publ->node, publ->ref, publ->key); + p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, publ->upper, + publ->node, publ->key); if (p) - tipc_node_unsubscribe(net, &p->nodesub_list, addr); + tipc_node_unsubscribe(net, &p->binding_node, addr); spin_unlock_bh(&tn->nametbl_lock); if (p != publ) { pr_err("Unable to remove publication from failed node\n" - " (type=%u, lower=%u, node=0x%x, ref=%u, key=%u)\n", - publ->type, publ->lower, publ->node, publ->ref, + " (type=%u, lower=%u, node=0x%x, port=%u, key=%u)\n", + publ->type, publ->lower, publ->node, publ->port, 
publ->key); } @@ -249,7 +246,7 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr) { struct publication *publ, *tmp; - list_for_each_entry_safe(publ, tmp, nsub_list, nodesub_list) + list_for_each_entry_safe(publ, tmp, nsub_list, binding_node) tipc_publ_purge(net, publ, addr); tipc_dist_queue_purge(net, addr); } @@ -264,28 +261,31 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr) static bool tipc_update_nametbl(struct net *net, struct distr_item *i, u32 node, u32 dtype) { - struct publication *publ = NULL; + struct publication *p = NULL; + u32 lower = ntohl(i->lower); + u32 upper = ntohl(i->upper); + u32 type = ntohl(i->type); + u32 port = ntohl(i->port); + u32 key = ntohl(i->key); if (dtype == PUBLICATION) { - publ = tipc_nametbl_insert_publ(net, ntohl(i->type), - ntohl(i->lower), - ntohl(i->upper), - TIPC_CLUSTER_SCOPE, node, - ntohl(i->ref), ntohl(i->key)); - if (publ) { - tipc_node_subscribe(net, &publ->nodesub_list, node); + p = tipc_nametbl_insert_publ(net, type, lower, upper, + TIPC_CLUSTER_SCOPE, node, + port, key); + if (p) { + tipc_node_subscribe(net, &p->binding_node, node); return true; } } else if (dtype == WITHDRAWAL) { - publ = tipc_nametbl_remove_publ(net, ntohl(i->type), - ntohl(i->lower), - node, ntohl(i->ref), - ntohl(i->key)); - if (publ) { - tipc_node_unsubscribe(net, &publ->nodesub_list, node); - kfree_rcu(publ, rcu); + p = tipc_nametbl_remove_publ(net, type, lower, + upper, node, key); + if (p) { + tipc_node_unsubscribe(net, &p->binding_node, node); + kfree_rcu(p, rcu); return true; } + pr_warn_ratelimited("Failed to remove binding %u,%u from %x\n", + type, lower, node); } else { pr_warn("Unrecognized name table message received\n"); } @@ -293,55 +293,6 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, } /** - * tipc_named_add_backlog - add a failed name table update to the backlog - * - */ -static void tipc_named_add_backlog(struct net *net, struct distr_item 
*i, - u32 type, u32 node) -{ - struct distr_queue_item *e; - struct tipc_net *tn = net_generic(net, tipc_net_id); - unsigned long now = get_jiffies_64(); - - e = kzalloc(sizeof(*e), GFP_ATOMIC); - if (!e) - return; - e->dtype = type; - e->node = node; - e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout); - memcpy(e, i, sizeof(*i)); - list_add_tail(&e->next, &tn->dist_queue); -} - -/** - * tipc_named_process_backlog - try to process any pending name table updates - * from the network. - */ -void tipc_named_process_backlog(struct net *net) -{ - struct distr_queue_item *e, *tmp; - struct tipc_net *tn = net_generic(net, tipc_net_id); - char addr[16]; - unsigned long now = get_jiffies_64(); - - list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { - if (time_after(e->expires, now)) { - if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype)) - continue; - } else { - tipc_addr_string_fill(addr, e->node); - pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %s key=%u\n", - e->dtype, ntohl(e->i.type), - ntohl(e->i.lower), - ntohl(e->i.upper), - addr, ntohl(e->i.key)); - } - list_del(&e->next); - kfree(e); - } -} - -/** * tipc_named_rcv - process name table update messages sent by another node */ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) @@ -363,12 +314,10 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) count = msg_data_sz(msg) / ITEM_SIZE; node = msg_orignode(msg); while (count--) { - if (!tipc_update_nametbl(net, item, node, mtype)) - tipc_named_add_backlog(net, item, mtype, node); + tipc_update_nametbl(net, item, node, mtype); item++; } kfree_skb(skb); - tipc_named_process_backlog(net); } spin_unlock_bh(&tn->nametbl_lock); } @@ -382,16 +331,17 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) */ void tipc_named_reinit(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = 
tipc_net(net); struct publication *publ; - int scope; + u32 self = tipc_own_addr(net); spin_lock_bh(&tn->nametbl_lock); - for (scope = TIPC_ZONE_SCOPE; scope <= TIPC_NODE_SCOPE; scope++) - list_for_each_entry_rcu(publ, &tn->nametbl->publ_list[scope], - local_list) - publ->node = tn->own_addr; + list_for_each_entry_rcu(publ, &nt->node_scope, binding_node) + publ->node = self; + list_for_each_entry_rcu(publ, &nt->cluster_scope, binding_node) + publ->node = self; spin_unlock_bh(&tn->nametbl_lock); } diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index 1264ba0af937..63fc73e0fa6c 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -63,7 +63,7 @@ struct distr_item { __be32 type; __be32 lower; __be32 upper; - __be32 ref; + __be32 port; __be32 key; }; @@ -72,7 +72,6 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ); void tipc_named_node_up(struct net *net, u32 dnode); void tipc_named_rcv(struct net *net, struct sk_buff_head *msg_queue); void tipc_named_reinit(struct net *net); -void tipc_named_process_backlog(struct net *net); void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr); #endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index ed0457cc99d6..b1fe20972aa9 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -1,7 +1,7 @@ /* * net/tipc/name_table.c: TIPC name table code * - * Copyright (c) 2000-2006, 2014-2015, Ericsson AB + * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems * All rights reserved. 
* @@ -44,64 +44,40 @@ #include "addr.h" #include "node.h" #include "group.h" -#include <net/genetlink.h> - -#define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ /** - * struct name_info - name sequence publication info - * @node_list: circular list of publications made by own node - * @cluster_list: circular list of publications made by own cluster - * @zone_list: circular list of publications made by own zone - * @node_list_size: number of entries in "node_list" - * @cluster_list_size: number of entries in "cluster_list" - * @zone_list_size: number of entries in "zone_list" - * - * Note: The zone list always contains at least one entry, since all - * publications of the associated name sequence belong to it. - * (The cluster and node lists may be empty.) + * struct service_range - container for all bindings of a service range + * @lower: service range lower bound + * @upper: service range upper bound + * @tree_node: member of service range RB tree + * @local_publ: list of identical publications made from this node + * Used by closest_first lookup and multicast lookup algorithm + * @all_publ: all publications identical to this one, whatever node and scope + * Used by round-robin lookup algorithm */ -struct name_info { - struct list_head node_list; - struct list_head cluster_list; - struct list_head zone_list; - u32 node_list_size; - u32 cluster_list_size; - u32 zone_list_size; -}; - -/** - * struct sub_seq - container for all published instances of a name sequence - * @lower: name sequence lower bound - * @upper: name sequence upper bound - * @info: pointer to name sequence publication info - */ -struct sub_seq { +struct service_range { u32 lower; u32 upper; - struct name_info *info; + struct rb_node tree_node; + struct list_head local_publ; + struct list_head all_publ; }; /** - * struct name_seq - container for all published instances of a name type - * @type: 32 bit 'type' value for name sequence - * @sseq: pointer to dynamically-sized array of sub-sequences 
of this 'type'; - * sub-sequences are sorted in ascending order - * @alloc: number of sub-sequences currently in array - * @first_free: array index of first unused sub-sequence entry - * @ns_list: links to adjacent name sequences in hash chain - * @subscriptions: list of subscriptions for this 'type' - * @lock: spinlock controlling access to publication lists of all sub-sequences + * struct tipc_service - container for all published instances of a service type + * @type: 32 bit 'type' value for service + * @ranges: rb tree containing all service ranges for this service + * @service_list: links to adjacent name ranges in hash chain + * @subscriptions: list of subscriptions for this service type + * @lock: spinlock controlling access to pertaining service ranges/publications * @rcu: RCU callback head used for deferred freeing */ -struct name_seq { +struct tipc_service { u32 type; - struct sub_seq *sseqs; - u32 alloc; - u32 first_free; - struct hlist_node ns_list; + struct rb_root ranges; + struct hlist_node service_list; struct list_head subscriptions; - spinlock_t lock; + spinlock_t lock; /* Covers service range list */ struct rcu_head rcu; }; @@ -111,494 +87,380 @@ static int hash(int x) } /** - * publ_create - create a publication structure + * tipc_publ_create - create a publication structure */ -static struct publication *publ_create(u32 type, u32 lower, u32 upper, - u32 scope, u32 node, u32 port_ref, - u32 key) +static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, + u32 scope, u32 node, u32 port, + u32 key) { struct publication *publ = kzalloc(sizeof(*publ), GFP_ATOMIC); - if (publ == NULL) { - pr_warn("Publication creation failure, no memory\n"); + + if (!publ) return NULL; - } publ->type = type; publ->lower = lower; publ->upper = upper; publ->scope = scope; publ->node = node; - publ->ref = port_ref; + publ->port = port; publ->key = key; - INIT_LIST_HEAD(&publ->pport_list); + INIT_LIST_HEAD(&publ->binding_sock); + 
INIT_LIST_HEAD(&publ->binding_node); + INIT_LIST_HEAD(&publ->local_publ); + INIT_LIST_HEAD(&publ->all_publ); return publ; } /** - * tipc_subseq_alloc - allocate a specified number of sub-sequence structures - */ -static struct sub_seq *tipc_subseq_alloc(u32 cnt) -{ - return kcalloc(cnt, sizeof(struct sub_seq), GFP_ATOMIC); -} - -/** - * tipc_nameseq_create - create a name sequence structure for the specified 'type' + * tipc_service_create - create a service structure for the specified 'type' * - * Allocates a single sub-sequence structure and sets it to all 0's. + * Allocates a single range structure and sets it to all 0's. */ -static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_head) +static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd) { - struct name_seq *nseq = kzalloc(sizeof(*nseq), GFP_ATOMIC); - struct sub_seq *sseq = tipc_subseq_alloc(1); + struct tipc_service *service = kzalloc(sizeof(*service), GFP_ATOMIC); - if (!nseq || !sseq) { - pr_warn("Name sequence creation failed, no memory\n"); - kfree(nseq); - kfree(sseq); + if (!service) { + pr_warn("Service creation failed, no memory\n"); return NULL; } - spin_lock_init(&nseq->lock); - nseq->type = type; - nseq->sseqs = sseq; - nseq->alloc = 1; - INIT_HLIST_NODE(&nseq->ns_list); - INIT_LIST_HEAD(&nseq->subscriptions); - hlist_add_head_rcu(&nseq->ns_list, seq_head); - return nseq; + spin_lock_init(&service->lock); + service->type = type; + service->ranges = RB_ROOT; + INIT_HLIST_NODE(&service->service_list); + INIT_LIST_HEAD(&service->subscriptions); + hlist_add_head_rcu(&service->service_list, hd); + return service; } /** - * nameseq_find_subseq - find sub-sequence (if any) matching a name instance + * tipc_service_find_range - find service range matching a service instance * - * Very time-critical, so binary searches through sub-sequence array. 
+ * Very time-critical, so binary search through range rb tree */ -static struct sub_seq *nameseq_find_subseq(struct name_seq *nseq, - u32 instance) +static struct service_range *tipc_service_find_range(struct tipc_service *sc, + u32 instance) { - struct sub_seq *sseqs = nseq->sseqs; - int low = 0; - int high = nseq->first_free - 1; - int mid; - - while (low <= high) { - mid = (low + high) / 2; - if (instance < sseqs[mid].lower) - high = mid - 1; - else if (instance > sseqs[mid].upper) - low = mid + 1; + struct rb_node *n = sc->ranges.rb_node; + struct service_range *sr; + + while (n) { + sr = container_of(n, struct service_range, tree_node); + if (sr->lower > instance) + n = n->rb_left; + else if (sr->upper < instance) + n = n->rb_right; else - return &sseqs[mid]; + return sr; } return NULL; } -/** - * nameseq_locate_subseq - determine position of name instance in sub-sequence - * - * Returns index in sub-sequence array of the entry that contains the specified - * instance value; if no entry contains that value, returns the position - * where a new entry for it would be inserted in the array. - * - * Note: Similar to binary search code for locating a sub-sequence. 
- */ -static u32 nameseq_locate_subseq(struct name_seq *nseq, u32 instance) +static struct service_range *tipc_service_create_range(struct tipc_service *sc, + u32 lower, u32 upper) { - struct sub_seq *sseqs = nseq->sseqs; - int low = 0; - int high = nseq->first_free - 1; - int mid; - - while (low <= high) { - mid = (low + high) / 2; - if (instance < sseqs[mid].lower) - high = mid - 1; - else if (instance > sseqs[mid].upper) - low = mid + 1; + struct rb_node **n, *parent = NULL; + struct service_range *sr, *tmp; + + n = &sc->ranges.rb_node; + while (*n) { + tmp = container_of(*n, struct service_range, tree_node); + parent = *n; + tmp = container_of(parent, struct service_range, tree_node); + if (lower < tmp->lower) + n = &(*n)->rb_left; + else if (lower > tmp->lower) + n = &(*n)->rb_right; + else if (upper < tmp->upper) + n = &(*n)->rb_left; + else if (upper > tmp->upper) + n = &(*n)->rb_right; else - return mid; + return tmp; } - return low; + sr = kzalloc(sizeof(*sr), GFP_ATOMIC); + if (!sr) + return NULL; + sr->lower = lower; + sr->upper = upper; + INIT_LIST_HEAD(&sr->local_publ); + INIT_LIST_HEAD(&sr->all_publ); + rb_link_node(&sr->tree_node, parent, n); + rb_insert_color(&sr->tree_node, &sc->ranges); + return sr; } -/** - * tipc_nameseq_insert_publ - */ -static struct publication *tipc_nameseq_insert_publ(struct net *net, - struct name_seq *nseq, +static struct publication *tipc_service_insert_publ(struct net *net, + struct tipc_service *sc, u32 type, u32 lower, u32 upper, u32 scope, - u32 node, u32 port, u32 key) + u32 node, u32 port, + u32 key) { - struct tipc_subscription *s; - struct tipc_subscription *st; - struct publication *publ; - struct sub_seq *sseq; - struct name_info *info; - int created_subseq = 0; - - sseq = nameseq_find_subseq(nseq, lower); - if (sseq) { - - /* Lower end overlaps existing entry => need an exact match */ - if ((sseq->lower != lower) || (sseq->upper != upper)) { - return NULL; - } - - info = sseq->info; - - /* Check if an 
identical publication already exists */ - list_for_each_entry(publ, &info->zone_list, zone_list) { - if ((publ->ref == port) && (publ->key == key) && - (!publ->node || (publ->node == node))) - return NULL; - } - } else { - u32 inspos; - struct sub_seq *freesseq; - - /* Find where lower end should be inserted */ - inspos = nameseq_locate_subseq(nseq, lower); - - /* Fail if upper end overlaps into an existing entry */ - if ((inspos < nseq->first_free) && - (upper >= nseq->sseqs[inspos].lower)) { - return NULL; - } + struct tipc_subscription *sub, *tmp; + struct service_range *sr; + struct publication *p; + bool first = false; - /* Ensure there is space for new sub-sequence */ - if (nseq->first_free == nseq->alloc) { - struct sub_seq *sseqs = tipc_subseq_alloc(nseq->alloc * 2); + sr = tipc_service_create_range(sc, lower, upper); + if (!sr) + goto err; - if (!sseqs) { - pr_warn("Cannot publish {%u,%u,%u}, no memory\n", - type, lower, upper); - return NULL; - } - memcpy(sseqs, nseq->sseqs, - nseq->alloc * sizeof(struct sub_seq)); - kfree(nseq->sseqs); - nseq->sseqs = sseqs; - nseq->alloc *= 2; - } + first = list_empty(&sr->all_publ); - info = kzalloc(sizeof(*info), GFP_ATOMIC); - if (!info) { - pr_warn("Cannot publish {%u,%u,%u}, no memory\n", - type, lower, upper); + /* Return if the publication already exists */ + list_for_each_entry(p, &sr->all_publ, all_publ) { + if (p->key == key && (!p->node || p->node == node)) return NULL; - } - - INIT_LIST_HEAD(&info->node_list); - INIT_LIST_HEAD(&info->cluster_list); - INIT_LIST_HEAD(&info->zone_list); - - /* Insert new sub-sequence */ - sseq = &nseq->sseqs[inspos]; - freesseq = &nseq->sseqs[nseq->first_free]; - memmove(sseq + 1, sseq, (freesseq - sseq) * sizeof(*sseq)); - memset(sseq, 0, sizeof(*sseq)); - nseq->first_free++; - sseq->lower = lower; - sseq->upper = upper; - sseq->info = info; - created_subseq = 1; } - /* Insert a publication */ - publ = publ_create(type, lower, upper, scope, node, port, key); - if (!publ) - 
return NULL; - - list_add(&publ->zone_list, &info->zone_list); - info->zone_list_size++; - - if (in_own_cluster(net, node)) { - list_add(&publ->cluster_list, &info->cluster_list); - info->cluster_list_size++; - } - - if (in_own_node(net, node)) { - list_add(&publ->node_list, &info->node_list); - info->node_list_size++; - } + /* Create and insert publication */ + p = tipc_publ_create(type, lower, upper, scope, node, port, key); + if (!p) + goto err; + if (in_own_node(net, node)) + list_add(&p->local_publ, &sr->local_publ); + list_add(&p->all_publ, &sr->all_publ); /* Any subscriptions waiting for notification? */ - list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscrp_report_overlap(s, publ->lower, publ->upper, - TIPC_PUBLISHED, publ->ref, - publ->node, publ->scope, - created_subseq); + list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_PUBLISHED, + p->port, p->node, p->scope, first); } - return publ; + return p; +err: + pr_warn("Failed to bind to %u,%u,%u, no memory\n", type, lower, upper); + return NULL; } /** - * tipc_nameseq_remove_publ - * - * NOTE: There may be cases where TIPC is asked to remove a publication - * that is not in the name table. For example, if another node issues a - * publication for a name sequence that overlaps an existing name sequence - * the publication will not be recorded, which means the publication won't - * be found when the name sequence is later withdrawn by that node. - * A failed withdraw request simply returns a failure indication and lets the - * caller issue any error or warning messages associated with such a problem. 
+ * tipc_service_remove_publ - remove a publication from a service */ -static struct publication *tipc_nameseq_remove_publ(struct net *net, - struct name_seq *nseq, - u32 inst, u32 node, - u32 ref, u32 key) +static struct publication *tipc_service_remove_publ(struct net *net, + struct tipc_service *sc, + u32 lower, u32 upper, + u32 node, u32 key) { - struct publication *publ; - struct sub_seq *sseq = nameseq_find_subseq(nseq, inst); - struct name_info *info; - struct sub_seq *free; - struct tipc_subscription *s, *st; - int removed_subseq = 0; - - if (!sseq) - return NULL; + struct tipc_subscription *sub, *tmp; + struct service_range *sr; + struct publication *p; + bool found = false; + bool last = false; + struct rb_node *n; - info = sseq->info; + sr = tipc_service_find_range(sc, lower); + if (!sr) + return NULL; - /* Locate publication, if it exists */ - list_for_each_entry(publ, &info->zone_list, zone_list) { - if ((publ->key == key) && (publ->ref == ref) && - (!publ->node || (publ->node == node))) - goto found; + /* Find exact matching service range */ + for (n = &sr->tree_node; n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->upper == upper) + break; } - return NULL; - -found: - /* Remove publication from zone scope list */ - list_del(&publ->zone_list); - info->zone_list_size--; + if (!n || sr->lower != lower || sr->upper != upper) + return NULL; - /* Remove publication from cluster scope list, if present */ - if (in_own_cluster(net, node)) { - list_del(&publ->cluster_list); - info->cluster_list_size--; + /* Find publication, if it exists */ + list_for_each_entry(p, &sr->all_publ, all_publ) { + if (p->key != key || (node && node != p->node)) + continue; + found = true; + break; } + if (!found) + return NULL; - /* Remove publication from node scope list, if present */ - if (in_own_node(net, node)) { - list_del(&publ->node_list); - info->node_list_size--; - } + list_del(&p->all_publ); + list_del(&p->local_publ); - /* 
Contract subseq list if no more publications for that subseq */ - if (list_empty(&info->zone_list)) { - kfree(info); - free = &nseq->sseqs[nseq->first_free--]; - memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq)); - removed_subseq = 1; + /* Remove service range item if this was its last publication */ + if (list_empty(&sr->all_publ)) { + last = true; + rb_erase(&sr->tree_node, &sc->ranges); + kfree(sr); } /* Notify any waiting subscriptions */ - list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscrp_report_overlap(s, publ->lower, publ->upper, - TIPC_WITHDRAWN, publ->ref, - publ->node, publ->scope, - removed_subseq); + list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN, + p->port, p->node, p->scope, last); } - - return publ; + return p; } /** - * tipc_nameseq_subscribe - attach a subscription, and optionally - * issue the prescribed number of events if there is any sub- - * sequence overlapping with the requested sequence + * tipc_service_subscribe - attach a subscription, and optionally + * issue the prescribed number of events if there is any service + * range overlapping with the requested range */ -static void tipc_nameseq_subscribe(struct name_seq *nseq, - struct tipc_subscription *s, - bool status) +static void tipc_service_subscribe(struct tipc_service *service, + struct tipc_subscription *sub) { - struct sub_seq *sseq = nseq->sseqs; + struct tipc_subscr *sb = &sub->evt.s; + struct service_range *sr; struct tipc_name_seq ns; + struct publication *p; + struct rb_node *n; + bool first; - tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); + ns.type = tipc_sub_read(sb, seq.type); + ns.lower = tipc_sub_read(sb, seq.lower); + ns.upper = tipc_sub_read(sb, seq.upper); - tipc_subscrp_get(s); - list_add(&s->nameseq_list, &nseq->subscriptions); + tipc_sub_get(sub); + list_add(&sub->service_list, &service->subscriptions); - if (!status || 
!sseq) + if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS) return; - while (sseq != &nseq->sseqs[nseq->first_free]) { - if (tipc_subscrp_check_overlap(&ns, sseq->lower, sseq->upper)) { - struct publication *crs; - struct name_info *info = sseq->info; - int must_report = 1; - - list_for_each_entry(crs, &info->zone_list, zone_list) { - tipc_subscrp_report_overlap(s, sseq->lower, - sseq->upper, - TIPC_PUBLISHED, - crs->ref, crs->node, - crs->scope, - must_report); - must_report = 0; - } + for (n = rb_first(&service->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->lower > ns.upper) + break; + if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) + continue; + first = true; + + list_for_each_entry(p, &sr->all_publ, all_publ) { + tipc_sub_report_overlap(sub, sr->lower, sr->upper, + TIPC_PUBLISHED, p->port, + p->node, p->scope, first); + first = false; } - sseq++; } } -static struct name_seq *nametbl_find_seq(struct net *net, u32 type) +static struct tipc_service *tipc_service_find(struct net *net, u32 type) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct hlist_head *seq_head; - struct name_seq *ns; - - seq_head = &tn->nametbl->seq_hlist[hash(type)]; - hlist_for_each_entry_rcu(ns, seq_head, ns_list) { - if (ns->type == type) - return ns; + struct name_table *nt = tipc_name_table(net); + struct hlist_head *service_head; + struct tipc_service *service; + + service_head = &nt->services[hash(type)]; + hlist_for_each_entry_rcu(service, service_head, service_list) { + if (service->type == type) + return service; } - return NULL; }; struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, - u32 lower, u32 upper, u32 scope, - u32 node, u32 port, u32 key) + u32 lower, u32 upper, + u32 scope, u32 node, + u32 port, u32 key) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct publication *publ; - struct name_seq *seq = nametbl_find_seq(net, type); - int index = hash(type); - - if 
((scope < TIPC_ZONE_SCOPE) || (scope > TIPC_NODE_SCOPE) || - (lower > upper)) { - pr_debug("Failed to publish illegal {%u,%u,%u} with scope %u\n", + struct name_table *nt = tipc_name_table(net); + struct tipc_service *sc; + struct publication *p; + + if (scope > TIPC_NODE_SCOPE || lower > upper) { + pr_debug("Failed to bind illegal {%u,%u,%u} with scope %u\n", type, lower, upper, scope); return NULL; } - - if (!seq) - seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]); - if (!seq) + sc = tipc_service_find(net, type); + if (!sc) + sc = tipc_service_create(type, &nt->services[hash(type)]); + if (!sc) return NULL; - spin_lock_bh(&seq->lock); - publ = tipc_nameseq_insert_publ(net, seq, type, lower, upper, - scope, node, port, key); - spin_unlock_bh(&seq->lock); - return publ; + spin_lock_bh(&sc->lock); + p = tipc_service_insert_publ(net, sc, type, lower, upper, + scope, node, port, key); + spin_unlock_bh(&sc->lock); + return p; } struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, - u32 lower, u32 node, u32 ref, - u32 key) + u32 lower, u32 upper, + u32 node, u32 key) { - struct publication *publ; - struct name_seq *seq = nametbl_find_seq(net, type); + struct tipc_service *sc = tipc_service_find(net, type); + struct publication *p = NULL; - if (!seq) + if (!sc) return NULL; - spin_lock_bh(&seq->lock); - publ = tipc_nameseq_remove_publ(net, seq, lower, node, ref, key); - if (!seq->first_free && list_empty(&seq->subscriptions)) { - hlist_del_init_rcu(&seq->ns_list); - kfree(seq->sseqs); - spin_unlock_bh(&seq->lock); - kfree_rcu(seq, rcu); - return publ; + spin_lock_bh(&sc->lock); + p = tipc_service_remove_publ(net, sc, lower, upper, node, key); + + /* Delete service item if this no more publications and subscriptions */ + if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { + hlist_del_init_rcu(&sc->service_list); + kfree_rcu(sc, rcu); } - spin_unlock_bh(&seq->lock); - return publ; + spin_unlock_bh(&sc->lock); + return p; } 
/** - * tipc_nametbl_translate - perform name translation + * tipc_nametbl_translate - perform service instance to socket translation * - * On entry, 'destnode' is the search domain used during translation. + * On entry, 'dnode' is the search domain used during translation. * * On exit: - * - if name translation is deferred to another node/cluster/zone, - * leaves 'destnode' unchanged (will be non-zero) and returns 0 - * - if name translation is attempted and succeeds, sets 'destnode' - * to publishing node and returns port reference (will be non-zero) - * - if name translation is attempted and fails, sets 'destnode' to 0 - * and returns 0 + * - if translation is deferred to another node, leave 'dnode' unchanged and + * return 0 + * - if translation is attempted and succeeds, set 'dnode' to the publishing + * node and return the published (non-zero) port number + * - if translation is attempted and fails, set 'dnode' to 0 and return 0 + * + * Note that for legacy users (node configured with Z.C.N address format) the + * 'closest-first' lookup algorithm must be maintained, i.e., if dnode is 0 + * we must look in the local binding list first */ -u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, - u32 *destnode) +u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sub_seq *sseq; - struct name_info *info; - struct publication *publ; - struct name_seq *seq; - u32 ref = 0; + struct tipc_net *tn = tipc_net(net); + bool legacy = tn->legacy_addr_format; + u32 self = tipc_own_addr(net); + struct service_range *sr; + struct tipc_service *sc; + struct list_head *list; + struct publication *p; + u32 port = 0; u32 node = 0; - if (!tipc_in_scope(*destnode, tn->own_addr)) + if (!tipc_in_scope(legacy, *dnode, self)) return 0; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (unlikely(!seq)) + sc = tipc_service_find(net, type); + if (unlikely(!sc)) goto 
not_found; - spin_lock_bh(&seq->lock); - sseq = nameseq_find_subseq(seq, instance); - if (unlikely(!sseq)) + + spin_lock_bh(&sc->lock); + sr = tipc_service_find_range(sc, instance); + if (unlikely(!sr)) goto no_match; - info = sseq->info; - - /* Closest-First Algorithm */ - if (likely(!*destnode)) { - if (!list_empty(&info->node_list)) { - publ = list_first_entry(&info->node_list, - struct publication, - node_list); - list_move_tail(&publ->node_list, - &info->node_list); - } else if (!list_empty(&info->cluster_list)) { - publ = list_first_entry(&info->cluster_list, - struct publication, - cluster_list); - list_move_tail(&publ->cluster_list, - &info->cluster_list); - } else { - publ = list_first_entry(&info->zone_list, - struct publication, - zone_list); - list_move_tail(&publ->zone_list, - &info->zone_list); - } - } - /* Round-Robin Algorithm */ - else if (*destnode == tn->own_addr) { - if (list_empty(&info->node_list)) - goto no_match; - publ = list_first_entry(&info->node_list, struct publication, - node_list); - list_move_tail(&publ->node_list, &info->node_list); - } else if (in_own_cluster_exact(net, *destnode)) { - if (list_empty(&info->cluster_list)) + /* Select lookup algorithm: local, closest-first or round-robin */ + if (*dnode == self) { + list = &sr->local_publ; + if (list_empty(list)) goto no_match; - publ = list_first_entry(&info->cluster_list, struct publication, - cluster_list); - list_move_tail(&publ->cluster_list, &info->cluster_list); + p = list_first_entry(list, struct publication, local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); + } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) { + list = &sr->local_publ; + p = list_first_entry(list, struct publication, local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); } else { - publ = list_first_entry(&info->zone_list, struct publication, - zone_list); - list_move_tail(&publ->zone_list, &info->zone_list); + list = &sr->all_publ; + p = list_first_entry(list, struct 
publication, all_publ); + list_move_tail(&p->all_publ, &sr->all_publ); } - - ref = publ->ref; - node = publ->node; + port = p->port; + node = p->node; no_match: - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&sc->lock); not_found: rcu_read_unlock(); - *destnode = node; - return ref; + *dnode = node; + return port; } bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, @@ -606,102 +468,102 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, bool all) { u32 self = tipc_own_addr(net); - struct publication *publ; - struct name_info *info; - struct name_seq *seq; - struct sub_seq *sseq; + struct service_range *sr; + struct tipc_service *sc; + struct publication *p; *dstcnt = 0; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (unlikely(!seq)) + sc = tipc_service_find(net, type); + if (unlikely(!sc)) goto exit; - spin_lock_bh(&seq->lock); - sseq = nameseq_find_subseq(seq, instance); - if (likely(sseq)) { - info = sseq->info; - list_for_each_entry(publ, &info->zone_list, zone_list) { - if (publ->scope != scope) - continue; - if (publ->ref == exclude && publ->node == self) - continue; - tipc_dest_push(dsts, publ->node, publ->ref); - (*dstcnt)++; - if (all) - continue; - list_move_tail(&publ->zone_list, &info->zone_list); - break; - } + + spin_lock_bh(&sc->lock); + + sr = tipc_service_find_range(sc, instance); + if (!sr) + goto no_match; + + list_for_each_entry(p, &sr->all_publ, all_publ) { + if (p->scope != scope) + continue; + if (p->port == exclude && p->node == self) + continue; + tipc_dest_push(dsts, p->node, p->port); + (*dstcnt)++; + if (all) + continue; + list_move_tail(&p->all_publ, &sr->all_publ); + break; } - spin_unlock_bh(&seq->lock); +no_match: + spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return !list_empty(dsts); } -int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, - u32 scope, bool exact, struct list_head *dports) +void tipc_nametbl_mc_lookup(struct net *net, u32 
type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports) { - struct sub_seq *sseq_stop; - struct name_info *info; + struct service_range *sr; + struct tipc_service *sc; struct publication *p; - struct name_seq *seq; - struct sub_seq *sseq; - int res = 0; + struct rb_node *n; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (!seq) + sc = tipc_service_find(net, type); + if (!sc) goto exit; - spin_lock_bh(&seq->lock); - sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); - sseq_stop = seq->sseqs + seq->first_free; - for (; sseq != sseq_stop; sseq++) { - if (sseq->lower > upper) + spin_lock_bh(&sc->lock); + + for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->upper < lower) + continue; + if (sr->lower > upper) break; - info = sseq->info; - list_for_each_entry(p, &info->node_list, node_list) { + list_for_each_entry(p, &sr->local_publ, local_publ) { if (p->scope == scope || (!exact && p->scope < scope)) - tipc_dest_push(dports, 0, p->ref); + tipc_dest_push(dports, 0, p->port); } - - if (info->cluster_list_size != info->node_list_size) - res = 1; } - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); - return res; } /* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes * - Creates list of nodes that overlap the given multicast address - * - Determines if any node local ports overlap + * - Determines if any node local destinations overlap */ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, u32 upper, struct tipc_nlist *nodes) { - struct sub_seq *sseq, *stop; - struct publication *publ; - struct name_info *info; - struct name_seq *seq; + struct service_range *sr; + struct tipc_service *sc; + struct publication *p; + struct rb_node *n; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (!seq) + sc = tipc_service_find(net, type); + if (!sc) goto exit; - spin_lock_bh(&seq->lock); - sseq = seq->sseqs 
+ nameseq_locate_subseq(seq, lower); - stop = seq->sseqs + seq->first_free; - for (; sseq != stop && sseq->lower <= upper; sseq++) { - info = sseq->info; - list_for_each_entry(publ, &info->zone_list, zone_list) { - tipc_nlist_add(nodes, publ->node); + spin_lock_bh(&sc->lock); + + for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->upper < lower) + continue; + if (sr->lower > upper) + break; + list_for_each_entry(p, &sr->all_publ, all_publ) { + tipc_nlist_add(nodes, p->node); } } - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } @@ -711,90 +573,85 @@ exit: void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, u32 type, u32 scope) { - struct sub_seq *sseq, *stop; - struct name_info *info; + struct service_range *sr; + struct tipc_service *sc; struct publication *p; - struct name_seq *seq; + struct rb_node *n; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (!seq) + sc = tipc_service_find(net, type); + if (!sc) goto exit; - spin_lock_bh(&seq->lock); - sseq = seq->sseqs; - stop = seq->sseqs + seq->first_free; - for (; sseq != stop; sseq++) { - info = sseq->info; - list_for_each_entry(p, &info->zone_list, zone_list) { + spin_lock_bh(&sc->lock); + for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != scope) continue; - tipc_group_add_member(grp, p->node, p->ref, p->lower); + tipc_group_add_member(grp, p->node, p->port, p->lower); } } - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } -/* - * tipc_nametbl_publish - add name publication to network name tables +/* tipc_nametbl_publish - add service binding to name table */ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, - u32 upper, u32 scope, u32 port_ref, + u32 upper, u32 scope, u32 port, u32 key) { - 
struct publication *publ; - struct sk_buff *buf = NULL; - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); + struct publication *p = NULL; + struct sk_buff *skb = NULL; spin_lock_bh(&tn->nametbl_lock); - if (tn->nametbl->local_publ_count >= TIPC_MAX_PUBLICATIONS) { - pr_warn("Publication failed, local publication limit reached (%u)\n", - TIPC_MAX_PUBLICATIONS); - spin_unlock_bh(&tn->nametbl_lock); - return NULL; + + if (nt->local_publ_count >= TIPC_MAX_PUBL) { + pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL); + goto exit; } - publ = tipc_nametbl_insert_publ(net, type, lower, upper, scope, - tn->own_addr, port_ref, key); - if (likely(publ)) { - tn->nametbl->local_publ_count++; - buf = tipc_named_publish(net, publ); - /* Any pending external events? */ - tipc_named_process_backlog(net); + p = tipc_nametbl_insert_publ(net, type, lower, upper, scope, + tipc_own_addr(net), port, key); + if (p) { + nt->local_publ_count++; + skb = tipc_named_publish(net, p); } +exit: spin_unlock_bh(&tn->nametbl_lock); - if (buf) - tipc_node_broadcast(net, buf); - return publ; + if (skb) + tipc_node_broadcast(net, skb); + return p; } /** - * tipc_nametbl_withdraw - withdraw name publication from network name tables + * tipc_nametbl_withdraw - withdraw a service binding */ -int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, - u32 key) +int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, + u32 upper, u32 key) { - struct publication *publ; + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); + u32 self = tipc_own_addr(net); struct sk_buff *skb = NULL; - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct publication *p; spin_lock_bh(&tn->nametbl_lock); - publ = tipc_nametbl_remove_publ(net, type, lower, tn->own_addr, - ref, key); - if (likely(publ)) { - tn->nametbl->local_publ_count--; - skb = 
tipc_named_withdraw(net, publ); - /* Any pending external events? */ - tipc_named_process_backlog(net); - list_del_init(&publ->pport_list); - kfree_rcu(publ, rcu); + + p = tipc_nametbl_remove_publ(net, type, lower, upper, self, key); + if (p) { + nt->local_publ_count--; + skb = tipc_named_withdraw(net, p); + list_del_init(&p->binding_sock); + kfree_rcu(p, rcu); } else { - pr_err("Unable to remove local publication\n" - "(type=%u, lower=%u, ref=%u, key=%u)\n", - type, lower, ref, key); + pr_err("Failed to remove local publication {%u,%u,%u}/%u\n", + type, lower, upper, key); } spin_unlock_bh(&tn->nametbl_lock); @@ -808,26 +665,26 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, /** * tipc_nametbl_subscribe - add a subscription object to the name table */ -void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status) +void tipc_nametbl_subscribe(struct tipc_subscription *sub) { - struct tipc_net *tn = net_generic(s->net, tipc_net_id); - u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); - int index = hash(type); - struct name_seq *seq; - struct tipc_name_seq ns; + struct name_table *nt = tipc_name_table(sub->net); + struct tipc_net *tn = tipc_net(sub->net); + struct tipc_subscr *s = &sub->evt.s; + u32 type = tipc_sub_read(s, seq.type); + struct tipc_service *sc; spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(s->net, type); - if (!seq) - seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]); - if (seq) { - spin_lock_bh(&seq->lock); - tipc_nameseq_subscribe(seq, s, status); - spin_unlock_bh(&seq->lock); + sc = tipc_service_find(sub->net, type); + if (!sc) + sc = tipc_service_create(type, &nt->services[hash(type)]); + if (sc) { + spin_lock_bh(&sc->lock); + tipc_service_subscribe(sc, sub); + spin_unlock_bh(&sc->lock); } else { - tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); - pr_warn("Failed to create subscription for {%u,%u,%u}\n", - ns.type, ns.lower, ns.upper); + pr_warn("Failed to 
subscribe for {%u,%u,%u}\n", type, + tipc_sub_read(s, seq.lower), + tipc_sub_read(s, seq.upper)); } spin_unlock_bh(&tn->nametbl_lock); } @@ -835,126 +692,124 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status) /** * tipc_nametbl_unsubscribe - remove a subscription object from name table */ -void tipc_nametbl_unsubscribe(struct tipc_subscription *s) +void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { - struct tipc_net *tn = net_generic(s->net, tipc_net_id); - struct name_seq *seq; - u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); + struct tipc_net *tn = tipc_net(sub->net); + struct tipc_subscr *s = &sub->evt.s; + u32 type = tipc_sub_read(s, seq.type); + struct tipc_service *sc; spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(s->net, type); - if (seq != NULL) { - spin_lock_bh(&seq->lock); - list_del_init(&s->nameseq_list); - tipc_subscrp_put(s); - if (!seq->first_free && list_empty(&seq->subscriptions)) { - hlist_del_init_rcu(&seq->ns_list); - kfree(seq->sseqs); - spin_unlock_bh(&seq->lock); - kfree_rcu(seq, rcu); - } else { - spin_unlock_bh(&seq->lock); - } + sc = tipc_service_find(sub->net, type); + if (!sc) + goto exit; + + spin_lock_bh(&sc->lock); + list_del_init(&sub->service_list); + tipc_sub_put(sub); + + /* Delete service item if no more publications and subscriptions */ + if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { + hlist_del_init_rcu(&sc->service_list); + kfree_rcu(sc, rcu); } + spin_unlock_bh(&sc->lock); +exit: spin_unlock_bh(&tn->nametbl_lock); } int tipc_nametbl_init(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct name_table *tipc_nametbl; + struct tipc_net *tn = tipc_net(net); + struct name_table *nt; int i; - tipc_nametbl = kzalloc(sizeof(*tipc_nametbl), GFP_ATOMIC); - if (!tipc_nametbl) + nt = kzalloc(sizeof(*nt), GFP_ATOMIC); + if (!nt) return -ENOMEM; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) - 
INIT_HLIST_HEAD(&tipc_nametbl->seq_hlist[i]); + INIT_HLIST_HEAD(&nt->services[i]); - INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_ZONE_SCOPE]); - INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_CLUSTER_SCOPE]); - INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_NODE_SCOPE]); - tn->nametbl = tipc_nametbl; + INIT_LIST_HEAD(&nt->node_scope); + INIT_LIST_HEAD(&nt->cluster_scope); + tn->nametbl = nt; spin_lock_init(&tn->nametbl_lock); return 0; } /** - * tipc_purge_publications - remove all publications for a given type - * - * tipc_nametbl_lock must be held when calling this function + * tipc_service_delete - purge all publications for a service and delete it */ -static void tipc_purge_publications(struct net *net, struct name_seq *seq) +static void tipc_service_delete(struct net *net, struct tipc_service *sc) { - struct publication *publ, *safe; - struct sub_seq *sseq; - struct name_info *info; - - spin_lock_bh(&seq->lock); - sseq = seq->sseqs; - info = sseq->info; - list_for_each_entry_safe(publ, safe, &info->zone_list, zone_list) { - tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node, - publ->ref, publ->key); - kfree_rcu(publ, rcu); + struct service_range *sr, *tmpr; + struct publication *p, *tmpb; + + spin_lock_bh(&sc->lock); + rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { + list_for_each_entry_safe(p, tmpb, + &sr->all_publ, all_publ) { + tipc_service_remove_publ(net, sc, p->lower, p->upper, + p->node, p->key); + kfree_rcu(p, rcu); + } } - hlist_del_init_rcu(&seq->ns_list); - kfree(seq->sseqs); - spin_unlock_bh(&seq->lock); - - kfree_rcu(seq, rcu); + hlist_del_init_rcu(&sc->service_list); + spin_unlock_bh(&sc->lock); + kfree_rcu(sc, rcu); } void tipc_nametbl_stop(struct net *net) { + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); + struct hlist_head *service_head; + struct tipc_service *service; u32 i; - struct name_seq *seq; - struct hlist_head *seq_head; - struct tipc_net *tn = net_generic(net, 
tipc_net_id); - struct name_table *tipc_nametbl = tn->nametbl; /* Verify name table is empty and purge any lingering * publications, then release the name table */ spin_lock_bh(&tn->nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { - if (hlist_empty(&tipc_nametbl->seq_hlist[i])) + if (hlist_empty(&nt->services[i])) continue; - seq_head = &tipc_nametbl->seq_hlist[i]; - hlist_for_each_entry_rcu(seq, seq_head, ns_list) { - tipc_purge_publications(net, seq); + service_head = &nt->services[i]; + hlist_for_each_entry_rcu(service, service_head, service_list) { + tipc_service_delete(net, service); } } spin_unlock_bh(&tn->nametbl_lock); synchronize_net(); - kfree(tipc_nametbl); - + kfree(nt); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, - struct name_seq *seq, - struct sub_seq *sseq, u32 *last_publ) + struct tipc_service *service, + struct service_range *sr, + u32 *last_key) { - void *hdr; - struct nlattr *attrs; - struct nlattr *publ; struct publication *p; + struct nlattr *attrs; + struct nlattr *b; + void *hdr; - if (*last_publ) { - list_for_each_entry(p, &sseq->info->zone_list, zone_list) - if (p->key == *last_publ) + if (*last_key) { + list_for_each_entry(p, &sr->all_publ, all_publ) + if (p->key == *last_key) break; - if (p->key != *last_publ) + if (p->key != *last_key) return -EPIPE; } else { - p = list_first_entry(&sseq->info->zone_list, struct publication, - zone_list); + p = list_first_entry(&sr->all_publ, + struct publication, + all_publ); } - list_for_each_entry_from(p, &sseq->info->zone_list, zone_list) { - *last_publ = p->key; + list_for_each_entry_from(p, &sr->all_publ, all_publ) { + *last_key = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, @@ -966,35 +821,35 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, if (!attrs) goto msg_full; - publ = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); - if (!publ) + b = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); + if 
(!b) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, seq->type)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type)) goto publ_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sseq->lower)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower)) goto publ_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sseq->upper)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->node)) goto publ_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->ref)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->port)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; - nla_nest_end(msg->skb, publ); + nla_nest_end(msg->skb, b); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); } - *last_publ = 0; + *last_key = 0; return 0; publ_msg_full: - nla_nest_cancel(msg->skb, publ); + nla_nest_cancel(msg->skb, b); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: @@ -1003,39 +858,34 @@ msg_full: return -EMSGSIZE; } -static int __tipc_nl_subseq_list(struct tipc_nl_msg *msg, struct name_seq *seq, - u32 *last_lower, u32 *last_publ) +static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg, + struct tipc_service *sc, + u32 *last_lower, u32 *last_key) { - struct sub_seq *sseq; - struct sub_seq *sseq_start; + struct service_range *sr; + struct rb_node *n; int err; - if (*last_lower) { - sseq_start = nameseq_find_subseq(seq, *last_lower); - if (!sseq_start) - return -EPIPE; - } else { - sseq_start = seq->sseqs; - } - - for (sseq = sseq_start; sseq != &seq->sseqs[seq->first_free]; sseq++) { - err = __tipc_nl_add_nametable_publ(msg, seq, sseq, last_publ); + for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->lower < *last_lower) + continue; + err = 
__tipc_nl_add_nametable_publ(msg, sc, sr, last_key); if (err) { - *last_lower = sseq->lower; + *last_lower = sr->lower; return err; } } *last_lower = 0; - return 0; } -static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg, - u32 *last_type, u32 *last_lower, u32 *last_publ) +static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, + u32 *last_type, u32 *last_lower, u32 *last_key) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct hlist_head *seq_head; - struct name_seq *seq = NULL; + struct tipc_net *tn = tipc_net(net); + struct tipc_service *service = NULL; + struct hlist_head *head; int err; int i; @@ -1045,30 +895,31 @@ static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg, i = 0; for (; i < TIPC_NAMETBL_SIZE; i++) { - seq_head = &tn->nametbl->seq_hlist[i]; + head = &tn->nametbl->services[i]; if (*last_type) { - seq = nametbl_find_seq(net, *last_type); - if (!seq) + service = tipc_service_find(net, *last_type); + if (!service) return -EPIPE; } else { - hlist_for_each_entry_rcu(seq, seq_head, ns_list) + hlist_for_each_entry_rcu(service, head, service_list) break; - if (!seq) + if (!service) continue; } - hlist_for_each_entry_from_rcu(seq, ns_list) { - spin_lock_bh(&seq->lock); - err = __tipc_nl_subseq_list(msg, seq, last_lower, - last_publ); + hlist_for_each_entry_from_rcu(service, service_list) { + spin_lock_bh(&service->lock); + err = __tipc_nl_service_range_list(msg, service, + last_lower, + last_key); if (err) { - *last_type = seq->type; - spin_unlock_bh(&seq->lock); + *last_type = service->type; + spin_unlock_bh(&service->lock); return err; } - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&service->lock); } *last_type = 0; } @@ -1077,13 +928,13 @@ static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int err; - int done = cb->args[3]; + struct net *net = sock_net(skb->sk); u32 last_type = cb->args[0]; 
u32 last_lower = cb->args[1]; - u32 last_publ = cb->args[2]; - struct net *net = sock_net(skb->sk); + u32 last_key = cb->args[2]; + int done = cb->args[3]; struct tipc_nl_msg msg; + int err; if (done) return 0; @@ -1093,7 +944,8 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); - err = tipc_nl_seq_list(net, &msg, &last_type, &last_lower, &last_publ); + err = tipc_nl_service_list(net, &msg, &last_type, + &last_lower, &last_key); if (!err) { done = 1; } else if (err != -EMSGSIZE) { @@ -1109,7 +961,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0] = last_type; cb->args[1] = last_lower; - cb->args[2] = last_publ; + cb->args[2] = last_key; cb->args[3] = done; return skb->len; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index f56e7cb3d436..4b14fc28d9e2 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -1,7 +1,7 @@ /* * net/tipc/name_table.h: Include file for TIPC name table code * - * Copyright (c) 2000-2006, 2014-2015, Ericsson AB + * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * All rights reserved. 
* @@ -54,19 +54,22 @@ struct tipc_group; * @type: name sequence type * @lower: name sequence lower bound * @upper: name sequence upper bound - * @scope: scope of publication - * @node: network address of publishing port's node - * @ref: publishing port - * @key: publication key - * @nodesub_list: subscription to "node down" event (off-node publication only) - * @local_list: adjacent entries in list of publications made by this node - * @pport_list: adjacent entries in list of publications made by this port - * @node_list: adjacent matching name seq publications with >= node scope - * @cluster_list: adjacent matching name seq publications with >= cluster scope - * @zone_list: adjacent matching name seq publications with >= zone scope + * @scope: scope of publication, TIPC_NODE_SCOPE or TIPC_CLUSTER_SCOPE + * @node: network address of publishing socket's node + * @port: publishing port + * @key: publication key, unique across the cluster + * @binding_node: all publications from the same node which bound this one + * - Remote publications: in node->publ_list + * Used by node/name distr to withdraw publications when node is lost + * - Local/node scope publications: in name_table->node_scope list + * - Local/cluster scope publications: in name_table->cluster_scope list + * @binding_sock: all publications from the same socket which bound this one + * Used by socket to withdraw publications when socket is unbound/released + * @local_publ: list of identical publications made from this node + * Used by closest_first and multicast receive lookup algorithms + * @all_publ: all publications identical to this one, whatever node and scope + * Used by round-robin lookup algorithm * @rcu: RCU callback head used for deferred freeing - * - * Note that the node list, cluster list, and zone list are circular lists. 
*/ struct publication { u32 type; @@ -74,34 +77,37 @@ struct publication { u32 upper; u32 scope; u32 node; - u32 ref; + u32 port; u32 key; - struct list_head nodesub_list; - struct list_head local_list; - struct list_head pport_list; - struct list_head node_list; - struct list_head cluster_list; - struct list_head zone_list; + struct list_head binding_node; + struct list_head binding_sock; + struct list_head local_publ; + struct list_head all_publ; struct rcu_head rcu; }; /** * struct name_table - table containing all existing port name publications * @seq_hlist: name sequence hash lists - * @publ_list: pulication lists + * @node_scope: all local publications with node scope + * - used by name_distr during re-init of name table + * @cluster_scope: all local publications with cluster scope + * - used by name_distr to send bulk updates to new nodes + * - used by name_distr during re-init of name table * @local_publ_count: number of publications issued by this node */ struct name_table { - struct hlist_head seq_hlist[TIPC_NAMETBL_SIZE]; - struct list_head publ_list[TIPC_PUBL_SCOPE_NUM]; + struct hlist_head services[TIPC_NAMETBL_SIZE]; + struct list_head node_scope; + struct list_head cluster_scope; u32 local_publ_count; }; int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); -int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, - u32 scope, bool exact, struct list_head *dports); +void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports); void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, u32 type, u32 domain); void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, @@ -110,17 +116,17 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, struct list_head *dsts, int *dstcnt, u32 exclude, bool all); struct 
publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, - u32 upper, u32 scope, u32 port_ref, + u32 upper, u32 scope, u32 port, u32 key); -int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, +int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 upper, u32 key); struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, u32 lower, u32 upper, u32 scope, u32 node, u32 ref, u32 key); struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, - u32 lower, u32 node, u32 ref, - u32 key); -void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status); + u32 lower, u32 upper, + u32 node, u32 key); +void tipc_nametbl_subscribe(struct tipc_subscription *s); void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); void tipc_nametbl_stop(struct net *net); diff --git a/net/tipc/net.c b/net/tipc/net.c index 1a2fde0d6f61..856f9e97ea29 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -104,38 +104,39 @@ * - A local spin_lock protecting the queue of subscriber events. */ -int tipc_net_start(struct net *net, u32 addr) +int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - char addr_string[16]; + if (tipc_own_id(net)) { + pr_info("Cannot configure node identity twice\n"); + return -1; + } + pr_info("Started in network mode\n"); - tn->own_addr = addr; + if (node_id) + tipc_set_node_id(net, node_id); + if (addr) + tipc_net_finalize(net, addr); + return 0; +} - /* Ensure that the new address is visible before we reinit. 
*/ +void tipc_net_finalize(struct net *net, u32 addr) +{ + tipc_set_node_addr(net, addr); smp_mb(); - tipc_named_reinit(net); tipc_sk_reinit(net); - - tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr, - TIPC_ZONE_SCOPE, 0, tn->own_addr); - - pr_info("Started in network mode\n"); - pr_info("Own node address %s, network identity %u\n", - tipc_addr_string_fill(addr_string, tn->own_addr), - tn->net_id); - return 0; + tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, + TIPC_CLUSTER_SCOPE, 0, addr); } void tipc_net_stop(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + u32 self = tipc_own_addr(net); - if (!tn->own_addr) + if (!self) return; - tipc_nametbl_withdraw(net, TIPC_CFG_SRV, tn->own_addr, 0, - tn->own_addr); + tipc_nametbl_withdraw(net, TIPC_CFG_SRV, self, self, self); rtnl_lock(); tipc_bearer_stop(net); tipc_node_stop(net); @@ -147,8 +148,10 @@ void tipc_net_stop(struct net *net) static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = net_generic(net, tipc_net_id); - void *hdr; + u64 *w0 = (u64 *)&tn->node_id[0]; + u64 *w1 = (u64 *)&tn->node_id[8]; struct nlattr *attrs; + void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NET_GET); @@ -161,7 +164,10 @@ static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id)) goto attr_msg_full; - + if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID, *w0, 0)) + goto attr_msg_full; + if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID_W1, *w1, 0)) + goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); @@ -202,9 +208,9 @@ out: int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { - struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = 
tipc_net(net); int err; if (!info->attrs[TIPC_NLA_NET]) @@ -213,16 +219,17 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); + if (err) return err; + /* Can't change net id once TIPC has joined a network */ + if (tipc_own_addr(net)) + return -EPERM; + if (attrs[TIPC_NLA_NET_ID]) { u32 val; - /* Can't change net id once TIPC has joined a network */ - if (tn->own_addr) - return -EPERM; - val = nla_get_u32(attrs[TIPC_NLA_NET_ID]); if (val < 1 || val > 9999) return -EINVAL; @@ -233,17 +240,22 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) if (attrs[TIPC_NLA_NET_ADDR]) { u32 addr; - /* Can't change net addr once TIPC has joined a network */ - if (tn->own_addr) - return -EPERM; - addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); - if (!tipc_addr_node_valid(addr)) + if (!addr) return -EINVAL; - - tipc_net_start(net, addr); + tn->legacy_addr_format = true; + tipc_net_init(net, NULL, addr); } + if (attrs[TIPC_NLA_NET_NODEID]) { + u8 node_id[NODE_ID_LEN]; + u64 *w0 = (u64 *)&node_id[0]; + u64 *w1 = (u64 *)&node_id[8]; + + *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); + *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); + tipc_net_init(net, node_id, 0); + } return 0; } diff --git a/net/tipc/net.h b/net/tipc/net.h index c0306aa2374b..09ad02b50bb1 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -41,10 +41,9 @@ extern const struct nla_policy tipc_nl_net_policy[]; -int tipc_net_start(struct net *net, u32 addr); - +int tipc_net_init(struct net *net, u8 *node_id, u32 addr); +void tipc_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); - int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/node.c b/net/tipc/node.c index 
9036d8756e73..c77dd2f3c589 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -115,6 +115,7 @@ struct tipc_node { u16 capabilities; u32 signature; u32 link_id; + u8 peer_id[16]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; @@ -156,6 +157,7 @@ static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); +static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static void tipc_node_put(struct tipc_node *node); static bool node_is_up(struct tipc_node *n); @@ -233,9 +235,6 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) struct tipc_node *node; unsigned int thash = tipc_hashfn(addr); - if (unlikely(!in_own_cluster_exact(net, addr))) - return NULL; - rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { if (node->addr != addr) @@ -248,6 +247,30 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) return node; } +/* tipc_node_find_by_id - locate specified node object by its 128-bit id + * Note: this function is called only when a discovery request failed + * to find the node by its 32-bit id, and is not time critical + */ +static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n; + bool found = false; + + rcu_read_lock(); + list_for_each_entry_rcu(n, &tn->node_list, list) { + read_lock_bh(&n->lock); + if (!memcmp(id, n->peer_id, 16) && + kref_get_unless_zero(&n->kref)) + found = true; + read_unlock_bh(&n->lock); + if (found) + break; + } + rcu_read_unlock(); + return found ? 
n : NULL; +} + static void tipc_node_read_lock(struct tipc_node *n) { read_lock_bh(&n->lock); @@ -301,16 +324,17 @@ static void tipc_node_write_unlock(struct tipc_node *n) if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, addr, bearer_id); tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr, - TIPC_NODE_SCOPE, link_id, addr); + TIPC_NODE_SCOPE, link_id, link_id); } if (flags & TIPC_NOTIFY_LINK_DOWN) { tipc_mon_peer_down(net, addr, bearer_id); tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, - link_id, addr); + addr, link_id); } } -struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) +static struct tipc_node *tipc_node_create(struct net *net, u32 addr, + u8 *peer_id, u16 capabilities) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; @@ -329,6 +353,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) goto exit; } n->addr = addr; + memcpy(&n->peer_id, peer_id, 16); n->net = net; n->capabilities = capabilities; kref_init(&n->kref); @@ -347,8 +372,8 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; - if (!tipc_link_bc_create(net, tipc_own_addr(net), n->addr, - U16_MAX, + if (!tipc_link_bc_create(net, tipc_own_addr(net), + addr, U16_MAX, tipc_link_window(tipc_bc_sndlink(net)), n->capabilities, &n->bc_entry.inputq1, @@ -738,8 +763,51 @@ bool tipc_node_is_up(struct net *net, u32 addr) return retval; } -void tipc_node_check_dest(struct net *net, u32 onode, - struct tipc_bearer *b, +static u32 tipc_node_suggest_addr(struct net *net, u32 addr) +{ + struct tipc_node *n; + + addr ^= tipc_net(net)->random; + while ((n = tipc_node_find(net, addr))) { + tipc_node_put(n); + addr++; + } + return addr; +} + +/* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not + */ +u32 tipc_node_try_addr(struct net *net, 
u8 *id, u32 addr) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n; + + /* Suggest new address if some other peer is using this one */ + n = tipc_node_find(net, addr); + if (n) { + if (!memcmp(n->peer_id, id, NODE_ID_LEN)) + addr = 0; + tipc_node_put(n); + if (!addr) + return 0; + return tipc_node_suggest_addr(net, addr); + } + + /* Suggest previously used address if peer is known */ + n = tipc_node_find_by_id(net, id); + if (n) { + addr = n->addr; + tipc_node_put(n); + } + /* Even this node may be in trial phase */ + if (tn->trial_addr == addr) + return tipc_node_suggest_addr(net, addr); + + return addr; +} + +void tipc_node_check_dest(struct net *net, u32 addr, + u8 *peer_id, struct tipc_bearer *b, u16 capabilities, u32 signature, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) @@ -758,7 +826,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, *dupl_addr = false; *respond = false; - n = tipc_node_create(net, onode, capabilities); + n = tipc_node_create(net, addr, peer_id, capabilities); if (!n) return; @@ -836,15 +904,14 @@ void tipc_node_check_dest(struct net *net, u32 onode, /* Now create new link if not already existing */ if (!l) { - if (n->link_cnt == 2) { - pr_warn("Cannot establish 3rd link to %x\n", n->addr); + if (n->link_cnt == 2) goto exit; - } + if_name = strchr(b->name, ':') + 1; if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, b->window, mod(tipc_net(net)->random), - tipc_own_addr(net), onode, + tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, &le->inputq, @@ -887,11 +954,9 @@ void tipc_node_delete_links(struct net *net, int bearer_id) static void tipc_node_reset_links(struct tipc_node *n) { - char addr_string[16]; int i; - pr_warn("Resetting all links to %s\n", - tipc_addr_string_fill(addr_string, n->addr)); + pr_warn("Resetting all links to %x\n", n->addr); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, 
false); @@ -1078,15 +1143,13 @@ illegal_evt: static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { - char addr_string[16]; struct tipc_sock_conn *conn, *safe; struct tipc_link *l; struct list_head *conns = &n->conn_sks; struct sk_buff *skb; uint i; - pr_debug("Lost contact with %s\n", - tipc_addr_string_fill(addr_string, n->addr)); + pr_debug("Lost contact with %x\n", n->addr); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); @@ -1618,6 +1681,30 @@ discard: kfree_skb(skb); } +void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b) +{ + struct tipc_net *tn = tipc_net(net); + int bearer_id = b->identity; + struct sk_buff_head xmitq; + struct tipc_link_entry *e; + struct tipc_node *n; + + __skb_queue_head_init(&xmitq); + + rcu_read_lock(); + + list_for_each_entry_rcu(n, &tn->node_list, list) { + tipc_node_write_lock(n); + e = &n->links[bearer_id]; + if (e->link) + tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); + tipc_node_write_unlock(n); + tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr); + } + + rcu_read_unlock(); +} + int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); diff --git a/net/tipc/node.h b/net/tipc/node.h index acd58d23a70e..f24b83500df1 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -49,22 +49,25 @@ enum { TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), TIPC_BCAST_RCAST = (1 << 4), - TIPC_MCAST_GROUPS = (1 << 5) + TIPC_NODE_ID128 = (1 << 5) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ TIPC_BCAST_STATE_NACK | \ TIPC_BCAST_RCAST | \ - TIPC_BLOCK_FLOWCTL) + TIPC_BLOCK_FLOWCTL | \ + TIPC_NODE_ID128) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); -void tipc_node_check_dest(struct net *net, u32 onode, +u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); +void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, u16 
capabilities, u32 signature, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); +void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b); int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, diff --git a/net/tipc/server.c b/net/tipc/server.c deleted file mode 100644 index df0c563c90cd..000000000000 --- a/net/tipc/server.c +++ /dev/null @@ -1,710 +0,0 @@ -/* - * net/tipc/server.c: TIPC server infrastructure - * - * Copyright (c) 2012-2013, Wind River Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "server.h" -#include "core.h" -#include "socket.h" -#include "addr.h" -#include "msg.h" -#include <net/sock.h> -#include <linux/module.h> - -/* Number of messages to send before rescheduling */ -#define MAX_SEND_MSG_COUNT 25 -#define MAX_RECV_MSG_COUNT 25 -#define CF_CONNECTED 1 -#define CF_SERVER 2 - -#define sock2con(x) ((struct tipc_conn *)(x)->sk_user_data) - -/** - * struct tipc_conn - TIPC connection structure - * @kref: reference counter to connection object - * @conid: connection identifier - * @sock: socket handler associated with connection - * @flags: indicates connection state - * @server: pointer to connected server - * @rwork: receive work item - * @usr_data: user-specified field - * @rx_action: what to do when connection socket is active - * @outqueue: pointer to first outbound message in queue - * @outqueue_lock: control access to the outqueue - * @outqueue: list of connection objects for its server - * @swork: send work item - */ -struct tipc_conn { - struct kref kref; - int conid; - struct socket *sock; - unsigned long flags; - struct tipc_server *server; - struct work_struct rwork; - int (*rx_action) (struct tipc_conn *con); - void *usr_data; - struct list_head outqueue; - spinlock_t outqueue_lock; - struct work_struct swork; -}; - -/* An entry waiting to be sent */ -struct outqueue_entry { - struct list_head list; - struct kvec iov; - struct sockaddr_tipc dest; -}; - -static void 
tipc_recv_work(struct work_struct *work); -static void tipc_send_work(struct work_struct *work); -static void tipc_clean_outqueues(struct tipc_conn *con); - -static void tipc_conn_kref_release(struct kref *kref) -{ - struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); - struct tipc_server *s = con->server; - struct sockaddr_tipc *saddr = s->saddr; - struct socket *sock = con->sock; - struct sock *sk; - - if (sock) { - sk = sock->sk; - if (test_bit(CF_SERVER, &con->flags)) { - __module_get(sock->ops->owner); - __module_get(sk->sk_prot_creator->owner); - } - saddr->scope = -TIPC_NODE_SCOPE; - kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); - sock_release(sock); - con->sock = NULL; - } - spin_lock_bh(&s->idr_lock); - idr_remove(&s->conn_idr, con->conid); - s->idr_in_use--; - spin_unlock_bh(&s->idr_lock); - tipc_clean_outqueues(con); - kfree(con); -} - -static void conn_put(struct tipc_conn *con) -{ - kref_put(&con->kref, tipc_conn_kref_release); -} - -static void conn_get(struct tipc_conn *con) -{ - kref_get(&con->kref); -} - -static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) -{ - struct tipc_conn *con; - - spin_lock_bh(&s->idr_lock); - con = idr_find(&s->conn_idr, conid); - if (con) { - if (!test_bit(CF_CONNECTED, &con->flags) || - !kref_get_unless_zero(&con->kref)) - con = NULL; - } - spin_unlock_bh(&s->idr_lock); - return con; -} - -static void sock_data_ready(struct sock *sk) -{ - struct tipc_conn *con; - - read_lock_bh(&sk->sk_callback_lock); - con = sock2con(sk); - if (con && test_bit(CF_CONNECTED, &con->flags)) { - conn_get(con); - if (!queue_work(con->server->rcv_wq, &con->rwork)) - conn_put(con); - } - read_unlock_bh(&sk->sk_callback_lock); -} - -static void sock_write_space(struct sock *sk) -{ - struct tipc_conn *con; - - read_lock_bh(&sk->sk_callback_lock); - con = sock2con(sk); - if (con && test_bit(CF_CONNECTED, &con->flags)) { - conn_get(con); - if (!queue_work(con->server->send_wq, &con->swork)) - 
conn_put(con); - } - read_unlock_bh(&sk->sk_callback_lock); -} - -static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con) -{ - struct sock *sk = sock->sk; - - write_lock_bh(&sk->sk_callback_lock); - - sk->sk_data_ready = sock_data_ready; - sk->sk_write_space = sock_write_space; - sk->sk_user_data = con; - - con->sock = sock; - - write_unlock_bh(&sk->sk_callback_lock); -} - -static void tipc_close_conn(struct tipc_conn *con) -{ - struct tipc_server *s = con->server; - struct sock *sk = con->sock->sk; - bool disconnect = false; - - write_lock_bh(&sk->sk_callback_lock); - disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags); - if (disconnect) { - sk->sk_user_data = NULL; - if (con->conid) - s->tipc_conn_release(con->conid, con->usr_data); - } - write_unlock_bh(&sk->sk_callback_lock); - - /* Handle concurrent calls from sending and receiving threads */ - if (!disconnect) - return; - - /* Don't flush pending works, -just let them expire */ - kernel_sock_shutdown(con->sock, SHUT_RDWR); - conn_put(con); -} - -static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s) -{ - struct tipc_conn *con; - int ret; - - con = kzalloc(sizeof(struct tipc_conn), GFP_ATOMIC); - if (!con) - return ERR_PTR(-ENOMEM); - - kref_init(&con->kref); - INIT_LIST_HEAD(&con->outqueue); - spin_lock_init(&con->outqueue_lock); - INIT_WORK(&con->swork, tipc_send_work); - INIT_WORK(&con->rwork, tipc_recv_work); - - spin_lock_bh(&s->idr_lock); - ret = idr_alloc(&s->conn_idr, con, 0, 0, GFP_ATOMIC); - if (ret < 0) { - kfree(con); - spin_unlock_bh(&s->idr_lock); - return ERR_PTR(-ENOMEM); - } - con->conid = ret; - s->idr_in_use++; - spin_unlock_bh(&s->idr_lock); - - set_bit(CF_CONNECTED, &con->flags); - con->server = s; - - return con; -} - -static int tipc_receive_from_sock(struct tipc_conn *con) -{ - struct tipc_server *s = con->server; - struct sock *sk = con->sock->sk; - struct sockaddr_tipc addr; - struct msghdr msg = {}; - struct kvec iov; - void *buf; - int ret; - 
- buf = kmem_cache_alloc(s->rcvbuf_cache, GFP_ATOMIC); - if (!buf) { - ret = -ENOMEM; - goto out_close; - } - - iov.iov_base = buf; - iov.iov_len = s->max_rcvbuf_size; - msg.msg_name = &addr; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); - ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); - if (ret <= 0) { - kmem_cache_free(s->rcvbuf_cache, buf); - goto out_close; - } - - read_lock_bh(&sk->sk_callback_lock); - if (test_bit(CF_CONNECTED, &con->flags)) - ret = s->tipc_conn_recvmsg(sock_net(con->sock->sk), con->conid, - &addr, con->usr_data, buf, ret); - read_unlock_bh(&sk->sk_callback_lock); - kmem_cache_free(s->rcvbuf_cache, buf); - if (ret < 0) - tipc_conn_terminate(s, con->conid); - return ret; - -out_close: - if (ret != -EWOULDBLOCK) - tipc_close_conn(con); - else if (ret == 0) - /* Don't return success if we really got EOF */ - ret = -EAGAIN; - - return ret; -} - -static int tipc_accept_from_sock(struct tipc_conn *con) -{ - struct tipc_server *s = con->server; - struct socket *sock = con->sock; - struct socket *newsock; - struct tipc_conn *newcon; - int ret; - - ret = kernel_accept(sock, &newsock, O_NONBLOCK); - if (ret < 0) - return ret; - - newcon = tipc_alloc_conn(con->server); - if (IS_ERR(newcon)) { - ret = PTR_ERR(newcon); - sock_release(newsock); - return ret; - } - - newcon->rx_action = tipc_receive_from_sock; - tipc_register_callbacks(newsock, newcon); - - /* Notify that new connection is incoming */ - newcon->usr_data = s->tipc_conn_new(newcon->conid); - if (!newcon->usr_data) { - sock_release(newsock); - conn_put(newcon); - return -ENOMEM; - } - - /* Wake up receive process in case of 'SYN+' message */ - newsock->sk->sk_data_ready(newsock->sk); - return ret; -} - -static struct socket *tipc_create_listen_sock(struct tipc_conn *con) -{ - struct tipc_server *s = con->server; - struct socket *sock = NULL; - int ret; - - ret = sock_create_kern(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock); - if (ret < 0) - return NULL; - ret = 
kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, - (char *)&s->imp, sizeof(s->imp)); - if (ret < 0) - goto create_err; - ret = kernel_bind(sock, (struct sockaddr *)s->saddr, sizeof(*s->saddr)); - if (ret < 0) - goto create_err; - - switch (s->type) { - case SOCK_STREAM: - case SOCK_SEQPACKET: - con->rx_action = tipc_accept_from_sock; - - ret = kernel_listen(sock, 0); - if (ret < 0) - goto create_err; - break; - case SOCK_DGRAM: - case SOCK_RDM: - con->rx_action = tipc_receive_from_sock; - break; - default: - pr_err("Unknown socket type %d\n", s->type); - goto create_err; - } - - /* As server's listening socket owner and creator is the same module, - * we have to decrease TIPC module reference count to guarantee that - * it remains zero after the server socket is created, otherwise, - * executing "rmmod" command is unable to make TIPC module deleted - * after TIPC module is inserted successfully. - * - * However, the reference count is ever increased twice in - * sock_create_kern(): one is to increase the reference count of owner - * of TIPC socket's proto_ops struct; another is to increment the - * reference count of owner of TIPC proto struct. Therefore, we must - * decrement the module reference count twice to ensure that it keeps - * zero after server's listening socket is created. Of course, we - * must bump the module reference count twice as well before the socket - * is closed. 
- */ - module_put(sock->ops->owner); - module_put(sock->sk->sk_prot_creator->owner); - set_bit(CF_SERVER, &con->flags); - - return sock; - -create_err: - kernel_sock_shutdown(sock, SHUT_RDWR); - sock_release(sock); - return NULL; -} - -static int tipc_open_listening_sock(struct tipc_server *s) -{ - struct socket *sock; - struct tipc_conn *con; - - con = tipc_alloc_conn(s); - if (IS_ERR(con)) - return PTR_ERR(con); - - sock = tipc_create_listen_sock(con); - if (!sock) { - idr_remove(&s->conn_idr, con->conid); - s->idr_in_use--; - kfree(con); - return -EINVAL; - } - - tipc_register_callbacks(sock, con); - return 0; -} - -static struct outqueue_entry *tipc_alloc_entry(void *data, int len) -{ - struct outqueue_entry *entry; - void *buf; - - entry = kmalloc(sizeof(struct outqueue_entry), GFP_ATOMIC); - if (!entry) - return NULL; - - buf = kmemdup(data, len, GFP_ATOMIC); - if (!buf) { - kfree(entry); - return NULL; - } - - entry->iov.iov_base = buf; - entry->iov.iov_len = len; - - return entry; -} - -static void tipc_free_entry(struct outqueue_entry *e) -{ - kfree(e->iov.iov_base); - kfree(e); -} - -static void tipc_clean_outqueues(struct tipc_conn *con) -{ - struct outqueue_entry *e, *safe; - - spin_lock_bh(&con->outqueue_lock); - list_for_each_entry_safe(e, safe, &con->outqueue, list) { - list_del(&e->list); - tipc_free_entry(e); - } - spin_unlock_bh(&con->outqueue_lock); -} - -int tipc_conn_sendmsg(struct tipc_server *s, int conid, - struct sockaddr_tipc *addr, void *data, size_t len) -{ - struct outqueue_entry *e; - struct tipc_conn *con; - - con = tipc_conn_lookup(s, conid); - if (!con) - return -EINVAL; - - if (!test_bit(CF_CONNECTED, &con->flags)) { - conn_put(con); - return 0; - } - - e = tipc_alloc_entry(data, len); - if (!e) { - conn_put(con); - return -ENOMEM; - } - - if (addr) - memcpy(&e->dest, addr, sizeof(struct sockaddr_tipc)); - - spin_lock_bh(&con->outqueue_lock); - list_add_tail(&e->list, &con->outqueue); - spin_unlock_bh(&con->outqueue_lock); - - if 
(!queue_work(s->send_wq, &con->swork)) - conn_put(con); - return 0; -} - -void tipc_conn_terminate(struct tipc_server *s, int conid) -{ - struct tipc_conn *con; - - con = tipc_conn_lookup(s, conid); - if (con) { - tipc_close_conn(con); - conn_put(con); - } -} - -bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, - u32 upper, u32 filter, int *conid) -{ - struct tipc_subscriber *scbr; - struct tipc_subscr sub; - struct tipc_server *s; - struct tipc_conn *con; - - sub.seq.type = type; - sub.seq.lower = lower; - sub.seq.upper = upper; - sub.timeout = TIPC_WAIT_FOREVER; - sub.filter = filter; - *(u32 *)&sub.usr_handle = port; - - con = tipc_alloc_conn(tipc_topsrv(net)); - if (IS_ERR(con)) - return false; - - *conid = con->conid; - s = con->server; - scbr = s->tipc_conn_new(*conid); - if (!scbr) { - conn_put(con); - return false; - } - - con->usr_data = scbr; - con->sock = NULL; - s->tipc_conn_recvmsg(net, *conid, NULL, scbr, &sub, sizeof(sub)); - return true; -} - -void tipc_topsrv_kern_unsubscr(struct net *net, int conid) -{ - struct tipc_conn *con; - struct tipc_server *srv; - - con = tipc_conn_lookup(tipc_topsrv(net), conid); - if (!con) - return; - - test_and_clear_bit(CF_CONNECTED, &con->flags); - srv = con->server; - if (con->conid) - srv->tipc_conn_release(con->conid, con->usr_data); - conn_put(con); - conn_put(con); -} - -static void tipc_send_kern_top_evt(struct net *net, struct tipc_event *evt) -{ - u32 port = *(u32 *)&evt->s.usr_handle; - u32 self = tipc_own_addr(net); - struct sk_buff_head evtq; - struct sk_buff *skb; - - skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt), - self, self, port, port, 0); - if (!skb) - return; - msg_set_dest_droppable(buf_msg(skb), true); - memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt)); - skb_queue_head_init(&evtq); - __skb_queue_tail(&evtq, skb); - tipc_sk_rcv(net, &evtq); -} - -static void tipc_send_to_sock(struct tipc_conn *con) -{ - struct tipc_server *s = con->server; - struct 
outqueue_entry *e; - struct tipc_event *evt; - struct msghdr msg; - int count = 0; - int ret; - - spin_lock_bh(&con->outqueue_lock); - while (test_bit(CF_CONNECTED, &con->flags)) { - e = list_entry(con->outqueue.next, struct outqueue_entry, list); - if ((struct list_head *) e == &con->outqueue) - break; - - spin_unlock_bh(&con->outqueue_lock); - - if (con->sock) { - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; - if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) { - msg.msg_name = &e->dest; - msg.msg_namelen = sizeof(struct sockaddr_tipc); - } - ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, - e->iov.iov_len); - if (ret == -EWOULDBLOCK || ret == 0) { - cond_resched(); - goto out; - } else if (ret < 0) { - goto send_err; - } - } else { - evt = e->iov.iov_base; - tipc_send_kern_top_evt(s->net, evt); - } - /* Don't starve users filling buffers */ - if (++count >= MAX_SEND_MSG_COUNT) { - cond_resched(); - count = 0; - } - - spin_lock_bh(&con->outqueue_lock); - list_del(&e->list); - tipc_free_entry(e); - } - spin_unlock_bh(&con->outqueue_lock); -out: - return; - -send_err: - tipc_close_conn(con); -} - -static void tipc_recv_work(struct work_struct *work) -{ - struct tipc_conn *con = container_of(work, struct tipc_conn, rwork); - int count = 0; - - while (test_bit(CF_CONNECTED, &con->flags)) { - if (con->rx_action(con)) - break; - - /* Don't flood Rx machine */ - if (++count >= MAX_RECV_MSG_COUNT) { - cond_resched(); - count = 0; - } - } - conn_put(con); -} - -static void tipc_send_work(struct work_struct *work) -{ - struct tipc_conn *con = container_of(work, struct tipc_conn, swork); - - if (test_bit(CF_CONNECTED, &con->flags)) - tipc_send_to_sock(con); - - conn_put(con); -} - -static void tipc_work_stop(struct tipc_server *s) -{ - destroy_workqueue(s->rcv_wq); - destroy_workqueue(s->send_wq); -} - -static int tipc_work_start(struct tipc_server *s) -{ - s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0); - if (!s->rcv_wq) { - pr_err("can't start tipc 
receive workqueue\n"); - return -ENOMEM; - } - - s->send_wq = alloc_ordered_workqueue("tipc_send", 0); - if (!s->send_wq) { - pr_err("can't start tipc send workqueue\n"); - destroy_workqueue(s->rcv_wq); - return -ENOMEM; - } - - return 0; -} - -int tipc_server_start(struct tipc_server *s) -{ - int ret; - - spin_lock_init(&s->idr_lock); - idr_init(&s->conn_idr); - s->idr_in_use = 0; - - s->rcvbuf_cache = kmem_cache_create(s->name, s->max_rcvbuf_size, - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!s->rcvbuf_cache) - return -ENOMEM; - - ret = tipc_work_start(s); - if (ret < 0) { - kmem_cache_destroy(s->rcvbuf_cache); - return ret; - } - ret = tipc_open_listening_sock(s); - if (ret < 0) { - tipc_work_stop(s); - kmem_cache_destroy(s->rcvbuf_cache); - return ret; - } - return ret; -} - -void tipc_server_stop(struct tipc_server *s) -{ - struct tipc_conn *con; - int id; - - spin_lock_bh(&s->idr_lock); - for (id = 0; s->idr_in_use; id++) { - con = idr_find(&s->conn_idr, id); - if (con) { - spin_unlock_bh(&s->idr_lock); - tipc_close_conn(con); - spin_lock_bh(&s->idr_lock); - } - } - spin_unlock_bh(&s->idr_lock); - - tipc_work_stop(s); - kmem_cache_destroy(s->rcvbuf_cache); - idr_destroy(&s->conn_idr); -} diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b0323ec7971e..cee6674a3bf4 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -289,10 +289,9 @@ static bool tipc_sk_type_connectionless(struct sock *sk) static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) { struct sock *sk = &tsk->sk; - struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id); + u32 self = tipc_own_addr(sock_net(sk)); u32 peer_port = tsk_peer_port(tsk); - u32 orig_node; - u32 peer_node; + u32 orig_node, peer_node; if (unlikely(!tipc_sk_connected(sk))) return false; @@ -306,10 +305,10 @@ static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) if (likely(orig_node == peer_node)) return true; - if (!orig_node && (peer_node == tn->own_addr)) + if (!orig_node && peer_node 
== self) return true; - if (!peer_node && (orig_node == tn->own_addr)) + if (!peer_node && orig_node == self) return true; return false; @@ -461,8 +460,8 @@ static int tipc_sk_create(struct net *net, struct socket *sock, /* Ensure tsk is visible before we read own_addr. */ smp_mb(); - tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, - NAMED_H_SIZE, 0); + tipc_msg_init(tipc_own_addr(net), msg, TIPC_LOW_IMPORTANCE, + TIPC_NAMED_MSG, NAMED_H_SIZE, 0); msg_set_origport(msg, tsk->portid); timer_setup(&sk->sk_timer, tipc_sk_timeout, 0); @@ -473,6 +472,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sk->sk_write_space = tipc_write_space; sk->sk_destruct = tipc_sock_destruct; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; + tsk->group_is_open = true; atomic_set(&tsk->dupl_rcvcnt, 0); /* Start out with safe limits until we receive an advertised window */ @@ -643,7 +643,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, goto exit; } - res = (addr->scope > 0) ? + res = (addr->scope >= 0) ? tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) : tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); exit: @@ -665,12 +665,11 @@ exit: * a completely predictable manner). 
*/ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_net *tn = net_generic(sock_net(sock->sk), tipc_net_id); memset(addr, 0, sizeof(*addr)); if (peer) { @@ -681,16 +680,15 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, addr->addr.id.node = tsk_peer_node(tsk); } else { addr->addr.id.ref = tsk->portid; - addr->addr.id.node = tn->own_addr; + addr->addr.id.node = tipc_own_addr(sock_net(sk)); } - *uaddr_len = sizeof(*addr); addr->addrtype = TIPC_ADDR_ID; addr->family = AF_TIPC; addr->scope = 0; addr->addr.name.domain = 0; - return 0; + return sizeof(*addr); } /** @@ -1280,8 +1278,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; - u32 type, inst, domain; u32 dnode, dport; + u32 type, inst; int mtu, rc; if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) @@ -1332,13 +1330,12 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (dest->addrtype == TIPC_ADDR_NAME) { type = dest->addr.name.name.type; inst = dest->addr.name.name.instance; - domain = dest->addr.name.domain; - dnode = domain; + dnode = dest->addr.name.domain; msg_set_type(hdr, TIPC_NAMED_MSG); msg_set_hdr_sz(hdr, NAMED_H_SIZE); msg_set_nametype(hdr, type); msg_set_nameinst(hdr, inst); - msg_set_lookup_scope(hdr, tipc_addr_scope(domain)); + msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); dport = tipc_nametbl_translate(net, type, inst, &dnode); msg_set_destnode(hdr, dnode); msg_set_destport(hdr, dport); @@ -2123,8 +2120,10 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, (!sk_conn && msg_connected(hdr)) || (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; - else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) + else 
if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { + atomic_inc(&sk->sk_drops); err = TIPC_ERR_OVERLOAD; + } if (unlikely(err)) { tipc_skb_reject(net, err, skb, xmitq); @@ -2203,6 +2202,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); + atomic_inc(&sk->sk_drops); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) __skb_queue_tail(xmitq, skb); break; @@ -2592,6 +2592,9 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct publication *publ; u32 key; + if (scope != TIPC_NODE_SCOPE) + scope = TIPC_CLUSTER_SCOPE; + if (tipc_sk_connected(sk)) return -EINVAL; key = tsk->portid + tsk->pub_count + 1; @@ -2603,7 +2606,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, if (unlikely(!publ)) return -EINVAL; - list_add(&publ->pport_list, &tsk->publications); + list_add(&publ->binding_sock, &tsk->publications); tsk->pub_count++; tsk->published = 1; return 0; @@ -2617,7 +2620,10 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct publication *safe; int rc = -EINVAL; - list_for_each_entry_safe(publ, safe, &tsk->publications, pport_list) { + if (scope != TIPC_NODE_SCOPE) + scope = TIPC_CLUSTER_SCOPE; + + list_for_each_entry_safe(publ, safe, &tsk->publications, binding_sock) { if (seq) { if (publ->scope != scope) continue; @@ -2628,12 +2634,12 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, if (publ->upper != seq->upper) break; tipc_nametbl_withdraw(net, publ->type, publ->lower, - publ->ref, publ->key); + publ->upper, publ->key); rc = 0; break; } tipc_nametbl_withdraw(net, publ->type, publ->lower, - publ->ref, publ->key); + publ->upper, publ->key); rc = 0; } if (list_empty(&tsk->publications)) @@ -2659,8 +2665,8 @@ void tipc_sk_reinit(struct net *net) while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { spin_lock_bh(&tsk->sk.sk_lock.slock); msg = &tsk->phdr; - msg_set_prevnode(msg, 
tn->own_addr); - msg_set_orignode(msg, tn->own_addr); + msg_set_prevnode(msg, tipc_own_addr(net)); + msg_set_orignode(msg, tipc_own_addr(net)); spin_unlock_bh(&tsk->sk.sk_lock.slock); } @@ -3155,16 +3161,32 @@ msg_full: return -EMSGSIZE; } +static int __tipc_nl_add_sk_info(struct sk_buff *skb, struct tipc_sock + *tsk) +{ + struct net *net = sock_net(skb->sk); + struct sock *sk = &tsk->sk; + + if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid) || + nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tipc_own_addr(net))) + return -EMSGSIZE; + + if (tipc_sk_connected(sk)) { + if (__tipc_nl_add_sk_con(skb, tsk)) + return -EMSGSIZE; + } else if (!list_empty(&tsk->publications)) { + if (nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL)) + return -EMSGSIZE; + } + return 0; +} + /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk) { - int err; - void *hdr; struct nlattr *attrs; - struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sock *sk = &tsk->sk; + void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_SOCK_GET); @@ -3174,19 +3196,10 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb, attrs = nla_nest_start(skb, TIPC_NLA_SOCK); if (!attrs) goto genlmsg_cancel; - if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid)) - goto attr_msg_cancel; - if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr)) + + if (__tipc_nl_add_sk_info(skb, tsk)) goto attr_msg_cancel; - if (tipc_sk_connected(sk)) { - err = __tipc_nl_add_sk_con(skb, tsk); - if (err) - goto attr_msg_cancel; - } else if (!list_empty(&tsk->publications)) { - if (nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL)) - goto attr_msg_cancel; - } nla_nest_end(skb, attrs); genlmsg_end(skb, hdr); @@ -3200,16 +3213,19 @@ msg_cancel: return -EMSGSIZE; } -int tipc_nl_sk_dump(struct sk_buff *skb, struct 
netlink_callback *cb) +int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, + int (*skb_handler)(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk)) { - int err; - struct tipc_sock *tsk; - const struct bucket_table *tbl; - struct rhash_head *pos; struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); - u32 tbl_id = cb->args[0]; + struct tipc_net *tn = tipc_net(net); + const struct bucket_table *tbl; u32 prev_portid = cb->args[1]; + u32 tbl_id = cb->args[0]; + struct rhash_head *pos; + struct tipc_sock *tsk; + int err; rcu_read_lock(); tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); @@ -3221,12 +3237,13 @@ int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; } - err = __tipc_nl_add_sk(skb, cb, tsk); + err = skb_handler(skb, cb, tsk); if (err) { prev_portid = tsk->portid; spin_unlock_bh(&tsk->sk.sk_lock.slock); goto out; } + prev_portid = 0; spin_unlock_bh(&tsk->sk.sk_lock.slock); } @@ -3238,6 +3255,76 @@ out: return skb->len; } +EXPORT_SYMBOL(tipc_nl_sk_walk); + +int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, + u32 sk_filter_state, + u64 (*tipc_diag_gen_cookie)(struct sock *sk)) +{ + struct sock *sk = &tsk->sk; + struct nlattr *attrs; + struct nlattr *stat; + + /*filter response w.r.t sk_state*/ + if (!(sk_filter_state & (1 << sk->sk_state))) + return 0; + + attrs = nla_nest_start(skb, TIPC_NLA_SOCK); + if (!attrs) + goto msg_cancel; + + if (__tipc_nl_add_sk_info(skb, tsk)) + goto attr_msg_cancel; + + if (nla_put_u32(skb, TIPC_NLA_SOCK_TYPE, (u32)sk->sk_type) || + nla_put_u32(skb, TIPC_NLA_SOCK_TIPC_STATE, (u32)sk->sk_state) || + nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) || + nla_put_u32(skb, TIPC_NLA_SOCK_UID, + from_kuid_munged(sk_user_ns(NETLINK_CB(skb).sk), + sock_i_uid(sk))) || + nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE, + tipc_diag_gen_cookie(sk), + TIPC_NLA_SOCK_PAD)) + goto attr_msg_cancel; + + stat = 
nla_nest_start(skb, TIPC_NLA_SOCK_STAT); + if (!stat) + goto attr_msg_cancel; + + if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_RCVQ, + skb_queue_len(&sk->sk_receive_queue)) || + nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, + skb_queue_len(&sk->sk_write_queue)) || + nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, + atomic_read(&sk->sk_drops))) + goto stat_msg_cancel; + + if (tsk->cong_link_cnt && + nla_put_flag(skb, TIPC_NLA_SOCK_STAT_LINK_CONG)) + goto stat_msg_cancel; + + if (tsk_conn_cong(tsk) && + nla_put_flag(skb, TIPC_NLA_SOCK_STAT_CONN_CONG)) + goto stat_msg_cancel; + + nla_nest_end(skb, stat); + nla_nest_end(skb, attrs); + + return 0; + +stat_msg_cancel: + nla_nest_cancel(skb, stat); +attr_msg_cancel: + nla_nest_cancel(skb, attrs); +msg_cancel: + return -EMSGSIZE; +} +EXPORT_SYMBOL(tipc_sk_fill_sock_diag); + +int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + return tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk); +} /* Caller should hold socket lock for the passed tipc socket. 
*/ static int __tipc_nl_add_sk_publ(struct sk_buff *skb, @@ -3287,7 +3374,7 @@ static int __tipc_nl_list_sk_publ(struct sk_buff *skb, struct publication *p; if (*last_publ) { - list_for_each_entry(p, &tsk->publications, pport_list) { + list_for_each_entry(p, &tsk->publications, binding_sock) { if (p->key == *last_publ) break; } @@ -3304,10 +3391,10 @@ static int __tipc_nl_list_sk_publ(struct sk_buff *skb, } } else { p = list_first_entry(&tsk->publications, struct publication, - pport_list); + binding_sock); } - list_for_each_entry_from(p, &tsk->publications, pport_list) { + list_for_each_entry_from(p, &tsk->publications, binding_sock) { err = __tipc_nl_add_sk_publ(skb, cb, p); if (err) { *last_publ = p->key; diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 06fb5944cf76..aae3fd4cd06c 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -49,6 +49,8 @@ #define RCVBUF_DEF (FLOWCTL_BLK_SZ * 1024 * 2) #define RCVBUF_MAX (FLOWCTL_BLK_SZ * 1024 * 16) +struct tipc_sock; + int tipc_socket_init(void); void tipc_socket_stop(void); void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); @@ -59,5 +61,11 @@ int tipc_sk_rht_init(struct net *net); void tipc_sk_rht_destroy(struct net *net); int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb); - +int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, + u32 sk_filter_state, + u64 (*tipc_diag_gen_cookie)(struct sock *sk)); +int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, + int (*skb_handler)(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk)); #endif diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 68e26470c516..b7d80bc5f4ab 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -1,7 +1,7 @@ /* * net/tipc/subscr.c: TIPC network topology service * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2017, Ericsson AB * Copyright (c) 
2005-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -38,61 +38,30 @@ #include "name_table.h" #include "subscr.h" -/** - * struct tipc_subscriber - TIPC network topology subscriber - * @kref: reference counter to tipc_subscription object - * @conid: connection identifier to server connecting to subscriber - * @lock: control access to subscriber - * @subscrp_list: list of subscription objects for this subscriber - */ -struct tipc_subscriber { - struct kref kref; - int conid; - spinlock_t lock; - struct list_head subscrp_list; -}; - -static void tipc_subscrb_put(struct tipc_subscriber *subscriber); - -/** - * htohl - convert value to endianness used by destination - * @in: value to convert - * @swap: non-zero if endianness must be reversed - * - * Returns converted value - */ -static u32 htohl(u32 in, int swap) -{ - return swap ? swab32(in) : in; -} - -static void tipc_subscrp_send_event(struct tipc_subscription *sub, - u32 found_lower, u32 found_upper, - u32 event, u32 port_ref, u32 node) +static void tipc_sub_send_event(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port, u32 node) { - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); - struct tipc_subscriber *subscriber = sub->subscriber; - struct kvec msg_sect; + struct tipc_event *evt = &sub->evt; - msg_sect.iov_base = (void *)&sub->evt; - msg_sect.iov_len = sizeof(struct tipc_event); - sub->evt.event = htohl(event, sub->swap); - sub->evt.found_lower = htohl(found_lower, sub->swap); - sub->evt.found_upper = htohl(found_upper, sub->swap); - sub->evt.port.ref = htohl(port_ref, sub->swap); - sub->evt.port.node = htohl(node, sub->swap); - tipc_conn_sendmsg(tn->topsrv, subscriber->conid, NULL, - msg_sect.iov_base, msg_sect.iov_len); + if (sub->inactive) + return; + tipc_evt_write(evt, event, event); + tipc_evt_write(evt, found_lower, found_lower); + tipc_evt_write(evt, found_upper, found_upper); + tipc_evt_write(evt, port.ref, port); + tipc_evt_write(evt, 
port.node, node); + tipc_topsrv_queue_evt(sub->net, sub->conid, event, evt); } /** - * tipc_subscrp_check_overlap - test for subscription overlap with the + * tipc_sub_check_overlap - test for subscription overlap with the * given values * * Returns 1 if there is overlap, otherwise 0. */ -int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, - u32 found_upper) +int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower, + u32 found_upper) { if (found_lower < seq->lower) found_lower = seq->lower; @@ -103,298 +72,100 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, return 1; } -u32 tipc_subscrp_convert_seq_type(u32 type, int swap) -{ - return htohl(type, swap); -} - -void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, - struct tipc_name_seq *out) -{ - out->type = htohl(in->type, swap); - out->lower = htohl(in->lower, swap); - out->upper = htohl(in->upper, swap); -} - -void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node, u32 scope, int must) +void tipc_sub_report_overlap(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port, u32 node, + u32 scope, int must) { - u32 filter = htohl(sub->evt.s.filter, sub->swap); + struct tipc_subscr *s = &sub->evt.s; + u32 filter = tipc_sub_read(s, filter); struct tipc_name_seq seq; - tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); - if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) + seq.type = tipc_sub_read(s, seq.type); + seq.lower = tipc_sub_read(s, seq.lower); + seq.upper = tipc_sub_read(s, seq.upper); + + if (!tipc_sub_check_overlap(&seq, found_lower, found_upper)) return; + if (!must && !(filter & TIPC_SUB_PORTS)) return; if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE) return; if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE) return; - - tipc_subscrp_send_event(sub, found_lower, 
found_upper, event, port_ref, - node); + spin_lock(&sub->lock); + tipc_sub_send_event(sub, found_lower, found_upper, + event, port, node); + spin_unlock(&sub->lock); } -static void tipc_subscrp_timeout(struct timer_list *t) +static void tipc_sub_timeout(struct timer_list *t) { struct tipc_subscription *sub = from_timer(sub, t, timer); - struct tipc_subscriber *subscriber = sub->subscriber; - - spin_lock_bh(&subscriber->lock); - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscrp_list); - spin_unlock_bh(&subscriber->lock); - - /* Notify subscriber of timeout */ - tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, - TIPC_SUBSCR_TIMEOUT, 0, 0); - - tipc_subscrp_put(sub); -} - -static void tipc_subscrb_kref_release(struct kref *kref) -{ - kfree(container_of(kref,struct tipc_subscriber, kref)); -} - -static void tipc_subscrb_put(struct tipc_subscriber *subscriber) -{ - kref_put(&subscriber->kref, tipc_subscrb_kref_release); -} + struct tipc_subscr *s = &sub->evt.s; -static void tipc_subscrb_get(struct tipc_subscriber *subscriber) -{ - kref_get(&subscriber->kref); + spin_lock(&sub->lock); + tipc_sub_send_event(sub, s->seq.lower, s->seq.upper, + TIPC_SUBSCR_TIMEOUT, 0, 0); + sub->inactive = true; + spin_unlock(&sub->lock); } -static void tipc_subscrp_kref_release(struct kref *kref) +static void tipc_sub_kref_release(struct kref *kref) { - struct tipc_subscription *sub = container_of(kref, - struct tipc_subscription, - kref); - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); - struct tipc_subscriber *subscriber = sub->subscriber; - - atomic_dec(&tn->subscription_count); - kfree(sub); - tipc_subscrb_put(subscriber); + kfree(container_of(kref, struct tipc_subscription, kref)); } -void tipc_subscrp_put(struct tipc_subscription *subscription) +void tipc_sub_put(struct tipc_subscription *subscription) { - kref_put(&subscription->kref, tipc_subscrp_kref_release); + kref_put(&subscription->kref, tipc_sub_kref_release); } -void 
tipc_subscrp_get(struct tipc_subscription *subscription) +void tipc_sub_get(struct tipc_subscription *subscription) { kref_get(&subscription->kref); } -/* tipc_subscrb_subscrp_delete - delete a specific subscription or all - * subscriptions for a given subscriber. - */ -static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, - struct tipc_subscr *s) -{ - struct list_head *subscription_list = &subscriber->subscrp_list; - struct tipc_subscription *sub, *temp; - u32 timeout; - - spin_lock_bh(&subscriber->lock); - list_for_each_entry_safe(sub, temp, subscription_list, subscrp_list) { - if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) - continue; - - timeout = htohl(sub->evt.s.timeout, sub->swap); - if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) { - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscrp_list); - tipc_subscrp_put(sub); - } - - if (s) - break; - } - spin_unlock_bh(&subscriber->lock); -} - -static struct tipc_subscriber *tipc_subscrb_create(int conid) -{ - struct tipc_subscriber *subscriber; - - subscriber = kzalloc(sizeof(*subscriber), GFP_ATOMIC); - if (!subscriber) { - pr_warn("Subscriber rejected, no memory\n"); - return NULL; - } - INIT_LIST_HEAD(&subscriber->subscrp_list); - kref_init(&subscriber->kref); - subscriber->conid = conid; - spin_lock_init(&subscriber->lock); - - return subscriber; -} - -static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) -{ - tipc_subscrb_subscrp_delete(subscriber, NULL); - tipc_subscrb_put(subscriber); -} - -static void tipc_subscrp_cancel(struct tipc_subscr *s, - struct tipc_subscriber *subscriber) -{ - tipc_subscrb_get(subscriber); - tipc_subscrb_subscrp_delete(subscriber, s); - tipc_subscrb_put(subscriber); -} - -static struct tipc_subscription *tipc_subscrp_create(struct net *net, - struct tipc_subscr *s, - int swap) +struct tipc_subscription *tipc_sub_subscribe(struct net *net, + struct tipc_subscr *s, + int conid) { - struct tipc_net *tn = 
net_generic(net, tipc_net_id); + u32 filter = tipc_sub_read(s, filter); struct tipc_subscription *sub; - u32 filter = htohl(s->filter, swap); + u32 timeout; - /* Refuse subscription if global limit exceeded */ - if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) { - pr_warn("Subscription rejected, limit reached (%u)\n", - TIPC_MAX_SUBSCRIPTIONS); + if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) || + (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper))) { + pr_warn("Subscription rejected, illegal request\n"); return NULL; } - - /* Allocate subscription object */ sub = kmalloc(sizeof(*sub), GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); return NULL; } - - /* Initialize subscription object */ + INIT_LIST_HEAD(&sub->service_list); + INIT_LIST_HEAD(&sub->sub_list); sub->net = net; - if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) || - (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) { - pr_warn("Subscription rejected, illegal request\n"); - kfree(sub); - return NULL; - } - - sub->swap = swap; + sub->conid = conid; + sub->inactive = false; memcpy(&sub->evt.s, s, sizeof(*s)); - atomic_inc(&tn->subscription_count); + spin_lock_init(&sub->lock); kref_init(&sub->kref); - return sub; -} - -static int tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, int swap, - bool status) -{ - struct tipc_subscription *sub = NULL; - u32 timeout; - - sub = tipc_subscrp_create(net, s, swap); - if (!sub) - return -1; - - spin_lock_bh(&subscriber->lock); - list_add(&sub->subscrp_list, &subscriber->subscrp_list); - sub->subscriber = subscriber; - tipc_nametbl_subscribe(sub, status); - tipc_subscrb_get(subscriber); - spin_unlock_bh(&subscriber->lock); - - timer_setup(&sub->timer, tipc_subscrp_timeout, 0); - timeout = htohl(sub->evt.s.timeout, swap); - + tipc_nametbl_subscribe(sub); + timer_setup(&sub->timer, tipc_sub_timeout, 0); + timeout = 
tipc_sub_read(&sub->evt.s, timeout); if (timeout != TIPC_WAIT_FOREVER) mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); - return 0; -} - -/* Handle one termination request for the subscriber */ -static void tipc_subscrb_release_cb(int conid, void *usr_data) -{ - tipc_subscrb_delete((struct tipc_subscriber *)usr_data); -} - -/* Handle one request to create a new subscription for the subscriber */ -static int tipc_subscrb_rcv_cb(struct net *net, int conid, - struct sockaddr_tipc *addr, void *usr_data, - void *buf, size_t len) -{ - struct tipc_subscriber *subscriber = usr_data; - struct tipc_subscr *s = (struct tipc_subscr *)buf; - bool status; - int swap; - - /* Determine subscriber's endianness */ - swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | - TIPC_SUB_CANCEL)); - - /* Detect & process a subscription cancellation request */ - if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { - s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); - tipc_subscrp_cancel(s, subscriber); - return 0; - } - status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap)); - return tipc_subscrp_subscribe(net, s, subscriber, swap, status); -} - -/* Handle one request to establish a new subscriber */ -static void *tipc_subscrb_connect_cb(int conid) -{ - return (void *)tipc_subscrb_create(conid); -} - -int tipc_topsrv_start(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - const char name[] = "topology_server"; - struct tipc_server *topsrv; - struct sockaddr_tipc *saddr; - - saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC); - if (!saddr) - return -ENOMEM; - saddr->family = AF_TIPC; - saddr->addrtype = TIPC_ADDR_NAMESEQ; - saddr->addr.nameseq.type = TIPC_TOP_SRV; - saddr->addr.nameseq.lower = TIPC_TOP_SRV; - saddr->addr.nameseq.upper = TIPC_TOP_SRV; - saddr->scope = TIPC_NODE_SCOPE; - - topsrv = kzalloc(sizeof(*topsrv), GFP_ATOMIC); - if (!topsrv) { - kfree(saddr); - return -ENOMEM; - } - topsrv->net = net; - topsrv->saddr = saddr; - topsrv->imp = 
TIPC_CRITICAL_IMPORTANCE; - topsrv->type = SOCK_SEQPACKET; - topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr); - topsrv->tipc_conn_recvmsg = tipc_subscrb_rcv_cb; - topsrv->tipc_conn_new = tipc_subscrb_connect_cb; - topsrv->tipc_conn_release = tipc_subscrb_release_cb; - - strncpy(topsrv->name, name, strlen(name) + 1); - tn->topsrv = topsrv; - atomic_set(&tn->subscription_count, 0); - - return tipc_server_start(topsrv); + return sub; } -void tipc_topsrv_stop(struct net *net) +void tipc_sub_unsubscribe(struct tipc_subscription *sub) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_server *topsrv = tn->topsrv; - - tipc_server_stop(topsrv); - kfree(topsrv->saddr); - kfree(topsrv); + tipc_nametbl_unsubscribe(sub); + if (sub->evt.s.timeout != TIPC_WAIT_FOREVER) + del_timer_sync(&sub->timer); + list_del(&sub->sub_list); + tipc_sub_put(sub); } diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index f3edca775d9f..d793b4343885 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -1,7 +1,7 @@ /* * net/tipc/subscr.h: Include file for TIPC network topology service * - * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2003-2017, Ericsson AB * Copyright (c) 2005-2007, 2012-2013, Wind River Systems * All rights reserved. 
* @@ -37,48 +37,72 @@ #ifndef _TIPC_SUBSCR_H #define _TIPC_SUBSCR_H -#include "server.h" +#include "topsrv.h" -#define TIPC_MAX_SUBSCRIPTIONS 65535 -#define TIPC_MAX_PUBLICATIONS 65535 +#define TIPC_MAX_SUBSCR 65535 +#define TIPC_MAX_PUBL 65535 struct tipc_subscription; -struct tipc_subscriber; +struct tipc_conn; /** * struct tipc_subscription - TIPC network topology subscription object * @subscriber: pointer to its subscriber * @seq: name sequence associated with subscription - * @net: point to network namespace * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list - * @subscrp_list: adjacent subscriptions in subscriber's subscription list - * @swap: indicates if subscriber uses opposite endianness in its messages + * @sub_list: adjacent subscriptions in subscriber's subscription list * @evt: template for events generated by subscription */ struct tipc_subscription { struct kref kref; - struct tipc_subscriber *subscriber; struct net *net; struct timer_list timer; - struct list_head nameseq_list; - struct list_head subscrp_list; - int swap; + struct list_head service_list; + struct list_head sub_list; struct tipc_event evt; + int conid; + bool inactive; + spinlock_t lock; /* serialize up/down and timer events */ }; -int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, - u32 found_upper); -void tipc_subscrp_report_overlap(struct tipc_subscription *sub, - u32 found_lower, u32 found_upper, u32 event, - u32 port_ref, u32 node, u32 scope, int must); -void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, - struct tipc_name_seq *out); -u32 tipc_subscrp_convert_seq_type(u32 type, int swap); +struct tipc_subscription *tipc_sub_subscribe(struct net *net, + struct tipc_subscr *s, + int conid); +void tipc_sub_unsubscribe(struct tipc_subscription *sub); + +int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower, + u32 found_upper); +void 
tipc_sub_report_overlap(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port, u32 node, + u32 scope, int must); int tipc_topsrv_start(struct net *net); void tipc_topsrv_stop(struct net *net); -void tipc_subscrp_put(struct tipc_subscription *subscription); -void tipc_subscrp_get(struct tipc_subscription *subscription); +void tipc_sub_put(struct tipc_subscription *subscription); +void tipc_sub_get(struct tipc_subscription *subscription); + +#define TIPC_FILTER_MASK (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | TIPC_SUB_CANCEL) + +/* tipc_sub_read - return field_ of struct sub_ in host endian format + */ +#define tipc_sub_read(sub_, field_) \ + ({ \ + struct tipc_subscr *sub__ = sub_; \ + u32 val__ = (sub__)->field_; \ + int swap_ = !((sub__)->filter & TIPC_FILTER_MASK); \ + (swap_ ? swab32(val__) : val__); \ + }) + +/* tipc_evt_write - write val_ to field_ of struct evt_ in user endian format + */ +#define tipc_evt_write(evt_, field_, val_) \ + ({ \ + struct tipc_event *evt__ = evt_; \ + u32 val__ = val_; \ + int swap_ = !((evt__)->s.filter & (TIPC_FILTER_MASK)); \ + (evt__)->field_ = swap_ ? swab32(val__) : val__; \ + }) #endif diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c new file mode 100644 index 000000000000..c8e34ef22c30 --- /dev/null +++ b/net/tipc/topsrv.c @@ -0,0 +1,703 @@ +/* + * net/tipc/server.c: TIPC server infrastructure + * + * Copyright (c) 2012-2013, Wind River Systems + * Copyright (c) 2017-2018, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "subscr.h" +#include "topsrv.h" +#include "core.h" +#include "socket.h" +#include "addr.h" +#include "msg.h" +#include <net/sock.h> +#include <linux/module.h> + +/* Number of messages to send before rescheduling */ +#define MAX_SEND_MSG_COUNT 25 +#define MAX_RECV_MSG_COUNT 25 +#define CF_CONNECTED 1 +#define CF_SERVER 2 + +#define TIPC_SERVER_NAME_LEN 32 + +/** + * struct tipc_topsrv - TIPC server structure + * @conn_idr: identifier set of connection + * @idr_lock: protect the connection identifier set + * @idr_in_use: amount of allocated identifier entry + * @net: network namspace instance + * @rcvbuf_cache: memory cache of server receive buffer + * @rcv_wq: receive workqueue + * @send_wq: send workqueue + * @max_rcvbuf_size: maximum permitted receive message length + * @tipc_conn_new: callback will be called when new connection is incoming + * @tipc_conn_release: callback will be called before releasing the connection + * @tipc_conn_recvmsg: callback will be called when message arrives + * @name: server name + * @imp: message importance + * @type: socket type + */ +struct tipc_topsrv { + struct idr conn_idr; + spinlock_t idr_lock; /* for idr list */ + int idr_in_use; + struct net *net; + struct work_struct awork; + struct workqueue_struct *rcv_wq; + struct workqueue_struct *send_wq; + int max_rcvbuf_size; + struct socket *listener; + char name[TIPC_SERVER_NAME_LEN]; +}; + +/** + * struct tipc_conn - TIPC connection structure + * @kref: reference counter to connection object + * @conid: connection identifier + * @sock: socket handler associated with connection + * @flags: indicates connection state + * @server: pointer to connected server + * @sub_list: lsit to all pertaing subscriptions + * @sub_lock: lock protecting the subscription list + * @outqueue_lock: control access to the outqueue + * @rwork: receive work item + * @rx_action: what to do when connection socket is active + * @outqueue: pointer to first outbound message in queue + * 
@outqueue_lock: control access to the outqueue + * @swork: send work item + */ +struct tipc_conn { + struct kref kref; + int conid; + struct socket *sock; + unsigned long flags; + struct tipc_topsrv *server; + struct list_head sub_list; + spinlock_t sub_lock; /* for subscription list */ + struct work_struct rwork; + struct list_head outqueue; + spinlock_t outqueue_lock; /* for outqueue */ + struct work_struct swork; +}; + +/* An entry waiting to be sent */ +struct outqueue_entry { + bool inactive; + struct tipc_event evt; + struct list_head list; +}; + +static void tipc_conn_recv_work(struct work_struct *work); +static void tipc_conn_send_work(struct work_struct *work); +static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt); +static void tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s); + +static bool connected(struct tipc_conn *con) +{ + return con && test_bit(CF_CONNECTED, &con->flags); +} + +static void tipc_conn_kref_release(struct kref *kref) +{ + struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); + struct tipc_topsrv *s = con->server; + struct outqueue_entry *e, *safe; + + spin_lock_bh(&s->idr_lock); + idr_remove(&s->conn_idr, con->conid); + s->idr_in_use--; + spin_unlock_bh(&s->idr_lock); + if (con->sock) + sock_release(con->sock); + + spin_lock_bh(&con->outqueue_lock); + list_for_each_entry_safe(e, safe, &con->outqueue, list) { + list_del(&e->list); + kfree(e); + } + spin_unlock_bh(&con->outqueue_lock); + kfree(con); +} + +static void conn_put(struct tipc_conn *con) +{ + kref_put(&con->kref, tipc_conn_kref_release); +} + +static void conn_get(struct tipc_conn *con) +{ + kref_get(&con->kref); +} + +static void tipc_conn_close(struct tipc_conn *con) +{ + struct sock *sk = con->sock->sk; + bool disconnect = false; + + write_lock_bh(&sk->sk_callback_lock); + disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags); + + if (disconnect) { + sk->sk_user_data = NULL; + tipc_conn_delete_sub(con, NULL); + } + 
write_unlock_bh(&sk->sk_callback_lock); + + /* Handle concurrent calls from sending and receiving threads */ + if (!disconnect) + return; + + /* Don't flush pending works, -just let them expire */ + kernel_sock_shutdown(con->sock, SHUT_RDWR); + + conn_put(con); +} + +static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) +{ + struct tipc_conn *con; + int ret; + + con = kzalloc(sizeof(*con), GFP_ATOMIC); + if (!con) + return ERR_PTR(-ENOMEM); + + kref_init(&con->kref); + INIT_LIST_HEAD(&con->outqueue); + INIT_LIST_HEAD(&con->sub_list); + spin_lock_init(&con->outqueue_lock); + spin_lock_init(&con->sub_lock); + INIT_WORK(&con->swork, tipc_conn_send_work); + INIT_WORK(&con->rwork, tipc_conn_recv_work); + + spin_lock_bh(&s->idr_lock); + ret = idr_alloc(&s->conn_idr, con, 0, 0, GFP_ATOMIC); + if (ret < 0) { + kfree(con); + spin_unlock_bh(&s->idr_lock); + return ERR_PTR(-ENOMEM); + } + con->conid = ret; + s->idr_in_use++; + spin_unlock_bh(&s->idr_lock); + + set_bit(CF_CONNECTED, &con->flags); + con->server = s; + + return con; +} + +static struct tipc_conn *tipc_conn_lookup(struct tipc_topsrv *s, int conid) +{ + struct tipc_conn *con; + + spin_lock_bh(&s->idr_lock); + con = idr_find(&s->conn_idr, conid); + if (!connected(con) || !kref_get_unless_zero(&con->kref)) + con = NULL; + spin_unlock_bh(&s->idr_lock); + return con; +} + +/* tipc_conn_delete_sub - delete a specific or all subscriptions + * for a given subscriber + */ +static void tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) +{ + struct tipc_net *tn = tipc_net(con->server->net); + struct list_head *sub_list = &con->sub_list; + struct tipc_subscription *sub, *tmp; + + spin_lock_bh(&con->sub_lock); + list_for_each_entry_safe(sub, tmp, sub_list, sub_list) { + if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) { + tipc_sub_unsubscribe(sub); + atomic_dec(&tn->subscription_count); + } else if (s) { + break; + } + } + spin_unlock_bh(&con->sub_lock); +} + +static void tipc_conn_send_to_sock(struct 
tipc_conn *con) +{ + struct list_head *queue = &con->outqueue; + struct tipc_topsrv *srv = con->server; + struct outqueue_entry *e; + struct tipc_event *evt; + struct msghdr msg; + struct kvec iov; + int count = 0; + int ret; + + spin_lock_bh(&con->outqueue_lock); + + while (!list_empty(queue)) { + e = list_first_entry(queue, struct outqueue_entry, list); + evt = &e->evt; + spin_unlock_bh(&con->outqueue_lock); + + if (e->inactive) + tipc_conn_delete_sub(con, &evt->s); + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT; + iov.iov_base = evt; + iov.iov_len = sizeof(*evt); + msg.msg_name = NULL; + + if (con->sock) { + ret = kernel_sendmsg(con->sock, &msg, &iov, + 1, sizeof(*evt)); + if (ret == -EWOULDBLOCK || ret == 0) { + cond_resched(); + return; + } else if (ret < 0) { + return tipc_conn_close(con); + } + } else { + tipc_topsrv_kern_evt(srv->net, evt); + } + + /* Don't starve users filling buffers */ + if (++count >= MAX_SEND_MSG_COUNT) { + cond_resched(); + count = 0; + } + spin_lock_bh(&con->outqueue_lock); + list_del(&e->list); + kfree(e); + } + spin_unlock_bh(&con->outqueue_lock); +} + +static void tipc_conn_send_work(struct work_struct *work) +{ + struct tipc_conn *con = container_of(work, struct tipc_conn, swork); + + if (connected(con)) + tipc_conn_send_to_sock(con); + + conn_put(con); +} + +/* tipc_conn_queue_evt() - interrupt level call from a subscription instance + * The queued work is launched into tipc_send_work()->tipc_send_to_sock() + */ +void tipc_topsrv_queue_evt(struct net *net, int conid, + u32 event, struct tipc_event *evt) +{ + struct tipc_topsrv *srv = tipc_topsrv(net); + struct outqueue_entry *e; + struct tipc_conn *con; + + con = tipc_conn_lookup(srv, conid); + if (!con) + return; + + if (!connected(con)) + goto err; + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (!e) + goto err; + e->inactive = (event == TIPC_SUBSCR_TIMEOUT); + memcpy(&e->evt, evt, sizeof(*evt)); + spin_lock_bh(&con->outqueue_lock); + list_add_tail(&e->list, 
&con->outqueue); + spin_unlock_bh(&con->outqueue_lock); + + if (queue_work(srv->send_wq, &con->swork)) + return; +err: + conn_put(con); +} + +/* tipc_conn_write_space - interrupt callback after a sendmsg EAGAIN + * Indicates that there now is more space in the send buffer + * The queued work is launched into tipc_send_work()->tipc_conn_send_to_sock() + */ +static void tipc_conn_write_space(struct sock *sk) +{ + struct tipc_conn *con; + + read_lock_bh(&sk->sk_callback_lock); + con = sk->sk_user_data; + if (connected(con)) { + conn_get(con); + if (!queue_work(con->server->send_wq, &con->swork)) + conn_put(con); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static int tipc_conn_rcv_sub(struct tipc_topsrv *srv, + struct tipc_conn *con, + struct tipc_subscr *s) +{ + struct tipc_net *tn = tipc_net(srv->net); + struct tipc_subscription *sub; + + if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) { + tipc_conn_delete_sub(con, s); + return 0; + } + if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) { + pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR); + return -1; + } + sub = tipc_sub_subscribe(srv->net, s, con->conid); + if (!sub) + return -1; + atomic_inc(&tn->subscription_count); + spin_lock_bh(&con->sub_lock); + list_add(&sub->sub_list, &con->sub_list); + spin_unlock_bh(&con->sub_lock); + return 0; +} + +static int tipc_conn_rcv_from_sock(struct tipc_conn *con) +{ + struct tipc_topsrv *srv = con->server; + struct sock *sk = con->sock->sk; + struct msghdr msg = {}; + struct tipc_subscr s; + struct kvec iov; + int ret; + + iov.iov_base = &s; + iov.iov_len = sizeof(s); + msg.msg_name = NULL; + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); + ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); + if (ret == -EWOULDBLOCK) + return -EWOULDBLOCK; + if (ret > 0) { + read_lock_bh(&sk->sk_callback_lock); + ret = tipc_conn_rcv_sub(srv, con, &s); + read_unlock_bh(&sk->sk_callback_lock); + } + if (ret < 0) + tipc_conn_close(con); + + 
return ret; +} + +static void tipc_conn_recv_work(struct work_struct *work) +{ + struct tipc_conn *con = container_of(work, struct tipc_conn, rwork); + int count = 0; + + while (connected(con)) { + if (tipc_conn_rcv_from_sock(con)) + break; + + /* Don't flood Rx machine */ + if (++count >= MAX_RECV_MSG_COUNT) { + cond_resched(); + count = 0; + } + } + conn_put(con); +} + +/* tipc_conn_data_ready - interrupt callback indicating the socket has data + * The queued work is launched into tipc_recv_work()->tipc_conn_rcv_from_sock() + */ +static void tipc_conn_data_ready(struct sock *sk) +{ + struct tipc_conn *con; + + read_lock_bh(&sk->sk_callback_lock); + con = sk->sk_user_data; + if (connected(con)) { + conn_get(con); + if (!queue_work(con->server->rcv_wq, &con->rwork)) + conn_put(con); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static void tipc_topsrv_accept(struct work_struct *work) +{ + struct tipc_topsrv *srv = container_of(work, struct tipc_topsrv, awork); + struct socket *lsock = srv->listener; + struct socket *newsock; + struct tipc_conn *con; + struct sock *newsk; + int ret; + + while (1) { + ret = kernel_accept(lsock, &newsock, O_NONBLOCK); + if (ret < 0) + return; + con = tipc_conn_alloc(srv); + if (IS_ERR(con)) { + ret = PTR_ERR(con); + sock_release(newsock); + return; + } + /* Register callbacks */ + newsk = newsock->sk; + write_lock_bh(&newsk->sk_callback_lock); + newsk->sk_data_ready = tipc_conn_data_ready; + newsk->sk_write_space = tipc_conn_write_space; + newsk->sk_user_data = con; + con->sock = newsock; + write_unlock_bh(&newsk->sk_callback_lock); + + /* Wake up receive process in case of 'SYN+' message */ + newsk->sk_data_ready(newsk); + } +} + +/* tipc_toprsv_listener_data_ready - interrupt callback with connection request + * The queued job is launched into tipc_topsrv_accept() + */ +static void tipc_topsrv_listener_data_ready(struct sock *sk) +{ + struct tipc_topsrv *srv; + + read_lock_bh(&sk->sk_callback_lock); + srv = sk->sk_user_data; + 
if (srv->listener) + queue_work(srv->rcv_wq, &srv->awork); + read_unlock_bh(&sk->sk_callback_lock); +} + +static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) +{ + int imp = TIPC_CRITICAL_IMPORTANCE; + struct socket *lsock = NULL; + struct sockaddr_tipc saddr; + struct sock *sk; + int rc; + + rc = sock_create_kern(srv->net, AF_TIPC, SOCK_SEQPACKET, 0, &lsock); + if (rc < 0) + return rc; + + srv->listener = lsock; + sk = lsock->sk; + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = tipc_topsrv_listener_data_ready; + sk->sk_user_data = srv; + write_unlock_bh(&sk->sk_callback_lock); + + rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE, + (char *)&imp, sizeof(imp)); + if (rc < 0) + goto err; + + saddr.family = AF_TIPC; + saddr.addrtype = TIPC_ADDR_NAMESEQ; + saddr.addr.nameseq.type = TIPC_TOP_SRV; + saddr.addr.nameseq.lower = TIPC_TOP_SRV; + saddr.addr.nameseq.upper = TIPC_TOP_SRV; + saddr.scope = TIPC_NODE_SCOPE; + + rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr)); + if (rc < 0) + goto err; + rc = kernel_listen(lsock, 0); + if (rc < 0) + goto err; + + /* As server's listening socket owner and creator is the same module, + * we have to decrease TIPC module reference count to guarantee that + * it remains zero after the server socket is created, otherwise, + * executing "rmmod" command is unable to make TIPC module deleted + * after TIPC module is inserted successfully. + * + * However, the reference count is ever increased twice in + * sock_create_kern(): one is to increase the reference count of owner + * of TIPC socket's proto_ops struct; another is to increment the + * reference count of owner of TIPC proto struct. Therefore, we must + * decrement the module reference count twice to ensure that it keeps + * zero after server's listening socket is created. Of course, we + * must bump the module reference count twice as well before the socket + * is closed. 
+ */ + module_put(lsock->ops->owner); + module_put(sk->sk_prot_creator->owner); + + return 0; +err: + sock_release(lsock); + return -EINVAL; +} + +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, + u32 upper, u32 filter, int *conid) +{ + struct tipc_subscr sub; + struct tipc_conn *con; + int rc; + + sub.seq.type = type; + sub.seq.lower = lower; + sub.seq.upper = upper; + sub.timeout = TIPC_WAIT_FOREVER; + sub.filter = filter; + *(u32 *)&sub.usr_handle = port; + + con = tipc_conn_alloc(tipc_topsrv(net)); + if (IS_ERR(con)) + return false; + + *conid = con->conid; + con->sock = NULL; + rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); + if (rc >= 0) + return true; + conn_put(con); + return false; +} + +void tipc_topsrv_kern_unsubscr(struct net *net, int conid) +{ + struct tipc_conn *con; + + con = tipc_conn_lookup(tipc_topsrv(net), conid); + if (!con) + return; + + test_and_clear_bit(CF_CONNECTED, &con->flags); + tipc_conn_delete_sub(con, NULL); + conn_put(con); + conn_put(con); +} + +static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt) +{ + u32 port = *(u32 *)&evt->s.usr_handle; + u32 self = tipc_own_addr(net); + struct sk_buff_head evtq; + struct sk_buff *skb; + + skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt), + self, self, port, port, 0); + if (!skb) + return; + msg_set_dest_droppable(buf_msg(skb), true); + memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt)); + skb_queue_head_init(&evtq); + __skb_queue_tail(&evtq, skb); + tipc_sk_rcv(net, &evtq); +} + +static int tipc_topsrv_work_start(struct tipc_topsrv *s) +{ + s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0); + if (!s->rcv_wq) { + pr_err("can't start tipc receive workqueue\n"); + return -ENOMEM; + } + + s->send_wq = alloc_ordered_workqueue("tipc_send", 0); + if (!s->send_wq) { + pr_err("can't start tipc send workqueue\n"); + destroy_workqueue(s->rcv_wq); + return -ENOMEM; + } + + return 0; +} + +static void tipc_topsrv_work_stop(struct tipc_topsrv 
*s) +{ + destroy_workqueue(s->rcv_wq); + destroy_workqueue(s->send_wq); +} + +int tipc_topsrv_start(struct net *net) +{ + struct tipc_net *tn = tipc_net(net); + const char name[] = "topology_server"; + struct tipc_topsrv *srv; + int ret; + + srv = kzalloc(sizeof(*srv), GFP_ATOMIC); + if (!srv) + return -ENOMEM; + + srv->net = net; + srv->max_rcvbuf_size = sizeof(struct tipc_subscr); + INIT_WORK(&srv->awork, tipc_topsrv_accept); + + strncpy(srv->name, name, strlen(name) + 1); + tn->topsrv = srv; + atomic_set(&tn->subscription_count, 0); + + spin_lock_init(&srv->idr_lock); + idr_init(&srv->conn_idr); + srv->idr_in_use = 0; + + ret = tipc_topsrv_work_start(srv); + if (ret < 0) + return ret; + + ret = tipc_topsrv_create_listener(srv); + if (ret < 0) + tipc_topsrv_work_stop(srv); + + return ret; +} + +void tipc_topsrv_stop(struct net *net) +{ + struct tipc_topsrv *srv = tipc_topsrv(net); + struct socket *lsock = srv->listener; + struct tipc_conn *con; + int id; + + spin_lock_bh(&srv->idr_lock); + for (id = 0; srv->idr_in_use; id++) { + con = idr_find(&srv->conn_idr, id); + if (con) { + spin_unlock_bh(&srv->idr_lock); + tipc_conn_close(con); + spin_lock_bh(&srv->idr_lock); + } + } + __module_get(lsock->ops->owner); + __module_get(lsock->sk->sk_prot_creator->owner); + srv->listener = NULL; + spin_unlock_bh(&srv->idr_lock); + sock_release(lsock); + tipc_topsrv_work_stop(srv); + idr_destroy(&srv->conn_idr); + kfree(srv); +} diff --git a/net/tipc/server.h b/net/tipc/topsrv.h index 64df7513cd70..c7ea71293748 100644 --- a/net/tipc/server.h +++ b/net/tipc/topsrv.h @@ -2,6 +2,7 @@ * net/tipc/server.h: Include file for TIPC server code * * Copyright (c) 2012-2013, Wind River Systems + * Copyright (c) 2017, Ericsson AB * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -36,68 +37,18 @@ #ifndef _TIPC_SERVER_H #define _TIPC_SERVER_H -#include <linux/idr.h> -#include <linux/tipc.h> -#include <net/net_namespace.h> +#include "core.h" #define TIPC_SERVER_NAME_LEN 32 #define TIPC_SUB_CLUSTER_SCOPE 0x20 #define TIPC_SUB_NODE_SCOPE 0x40 #define TIPC_SUB_NO_STATUS 0x80 -/** - * struct tipc_server - TIPC server structure - * @conn_idr: identifier set of connection - * @idr_lock: protect the connection identifier set - * @idr_in_use: amount of allocated identifier entry - * @net: network namspace instance - * @rcvbuf_cache: memory cache of server receive buffer - * @rcv_wq: receive workqueue - * @send_wq: send workqueue - * @max_rcvbuf_size: maximum permitted receive message length - * @tipc_conn_new: callback will be called when new connection is incoming - * @tipc_conn_release: callback will be called before releasing the connection - * @tipc_conn_recvmsg: callback will be called when message arrives - * @saddr: TIPC server address - * @name: server name - * @imp: message importance - * @type: socket type - */ -struct tipc_server { - struct idr conn_idr; - spinlock_t idr_lock; - int idr_in_use; - struct net *net; - struct kmem_cache *rcvbuf_cache; - struct workqueue_struct *rcv_wq; - struct workqueue_struct *send_wq; - int max_rcvbuf_size; - void *(*tipc_conn_new)(int conid); - void (*tipc_conn_release)(int conid, void *usr_data); - int (*tipc_conn_recvmsg)(struct net *net, int conid, - struct sockaddr_tipc *addr, void *usr_data, - void *buf, size_t len); - struct sockaddr_tipc *saddr; - char name[TIPC_SERVER_NAME_LEN]; - int imp; - int type; -}; - -int tipc_conn_sendmsg(struct tipc_server *s, int conid, - struct sockaddr_tipc *addr, void *data, size_t len); +void tipc_topsrv_queue_evt(struct net *net, int conid, + u32 event, struct tipc_event *evt); bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, u32 upper, u32 filter, int *conid); void 
tipc_topsrv_kern_unsubscr(struct net *net, int conid); -/** - * tipc_conn_terminate - terminate connection with server - * - * Note: Must call it in process context since it might sleep - */ -void tipc_conn_terminate(struct tipc_server *s, int conid); -int tipc_server_start(struct tipc_server *s); - -void tipc_server_stop(struct tipc_server *s); - #endif diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 3deabcab4882..e7d91f5d5cae 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -47,6 +47,8 @@ #include <net/addrconf.h> #include <linux/tipc_netlink.h> #include "core.h" +#include "addr.h" +#include "net.h" #include "bearer.h" #include "netlink.h" #include "msg.h" @@ -647,6 +649,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, struct udp_port_cfg udp_conf = {0}; struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; + u8 node_id[NODE_ID_LEN] = {0,}; ub = kzalloc(sizeof(*ub), GFP_ATOMIC); if (!ub) @@ -677,6 +680,17 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (err) goto err; + /* Autoconfigure own node identity if needed */ + if (!tipc_own_id(net)) { + memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16); + tipc_net_init(net, node_id, 0); + } + if (!tipc_own_id(net)) { + pr_warn("Failed to set node id, please configure manually\n"); + err = -EINVAL; + goto err; + } + b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; rcu_assign_pointer(b->media_ptr, ub); diff --git a/net/tls/Kconfig b/net/tls/Kconfig index eb583038c67e..89b8745a986f 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -7,6 +7,7 @@ config TLS select CRYPTO select CRYPTO_AES select CRYPTO_GCM + select STREAM_PARSER default n ---help--- Enable kernel support for TLS protocol. 
This allows symmetric diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index e9b4b53ab53e..0d379970960e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -38,6 +38,7 @@ #include <linux/highmem.h> #include <linux/netdevice.h> #include <linux/sched/signal.h> +#include <linux/inetdevice.h> #include <net/tls.h> @@ -46,16 +47,32 @@ MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); enum { - TLS_BASE_TX, + TLSV4, + TLSV6, + TLS_NUM_PROTS, +}; + +enum { + TLS_BASE, TLS_SW_TX, + TLS_SW_RX, + TLS_SW_RXTX, + TLS_HW_RECORD, TLS_NUM_CONFIG, }; -static struct proto tls_prots[TLS_NUM_CONFIG]; +static struct proto *saved_tcpv6_prot; +static DEFINE_MUTEX(tcpv6_prot_mutex); +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_mutex); +static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; +static struct proto_ops tls_sw_proto_ops; static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) { - sk->sk_prot = &tls_prots[ctx->tx_conf]; + int ip_ver = sk->sk_family == AF_INET6 ? 
TLSV6 : TLSV4; + + sk->sk_prot = &tls_prots[ip_ver][ctx->conf]; } int wait_on_pending_writer(struct sock *sk, long *timeo) @@ -228,8 +245,12 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) lock_sock(sk); sk_proto_close = ctx->sk_proto_close; - if (ctx->tx_conf == TLS_BASE_TX) { + if (ctx->conf == TLS_HW_RECORD) + goto skip_tx_cleanup; + + if (ctx->conf == TLS_BASE) { kfree(ctx); + ctx = NULL; goto skip_tx_cleanup; } @@ -249,15 +270,25 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } } - kfree(ctx->rec_seq); - kfree(ctx->iv); + kfree(ctx->tx.rec_seq); + kfree(ctx->tx.iv); + kfree(ctx->rx.rec_seq); + kfree(ctx->rx.iv); - if (ctx->tx_conf == TLS_SW_TX) - tls_sw_free_tx_resources(sk); + if (ctx->conf == TLS_SW_TX || + ctx->conf == TLS_SW_RX || + ctx->conf == TLS_SW_RXTX) { + tls_sw_free_resources(sk); + } skip_tx_cleanup: release_sock(sk); sk_proto_close(sk, timeout); + /* free ctx for TLS_HW_RECORD, used by tcp_set_state + * for sk->sk_prot->unhash [tls_hw_unhash] + */ + if (ctx && ctx->conf == TLS_HW_RECORD) + kfree(ctx); } static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, @@ -309,9 +340,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } lock_sock(sk); memcpy(crypto_info_aes_gcm_128->iv, - ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); - memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->rec_seq, + memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); release_sock(sk); if (copy_to_user(optval, @@ -355,20 +386,24 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, return do_tls_getsockopt(sk, optname, optval, optlen); } -static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, - unsigned int optlen) +static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, + unsigned int optlen, int tx) { struct tls_crypto_info *crypto_info; 
struct tls_context *ctx = tls_get_ctx(sk); int rc = 0; - int tx_conf; + int conf; if (!optval || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; goto out; } - crypto_info = &ctx->crypto_send; + if (tx) + crypto_info = &ctx->crypto_send; + else + crypto_info = &ctx->crypto_recv; + /* Currently we don't support set crypto info more than one time */ if (TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; @@ -407,15 +442,31 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, } /* currently SW is default, we will have ethtool in future */ - rc = tls_set_sw_offload(sk, ctx); - tx_conf = TLS_SW_TX; + if (tx) { + rc = tls_set_sw_offload(sk, ctx, 1); + if (ctx->conf == TLS_SW_RX) + conf = TLS_SW_RXTX; + else + conf = TLS_SW_TX; + } else { + rc = tls_set_sw_offload(sk, ctx, 0); + if (ctx->conf == TLS_SW_TX) + conf = TLS_SW_RXTX; + else + conf = TLS_SW_RX; + } + if (rc) goto err_crypto_info; - ctx->tx_conf = tx_conf; + ctx->conf = conf; update_sk_prot(sk, ctx); - ctx->sk_write_space = sk->sk_write_space; - sk->sk_write_space = tls_write_space; + if (tx) { + ctx->sk_write_space = sk->sk_write_space; + sk->sk_write_space = tls_write_space; + } else { + sk->sk_socket->ops = &tls_sw_proto_ops; + } goto out; err_crypto_info: @@ -431,8 +482,10 @@ static int do_tls_setsockopt(struct sock *sk, int optname, switch (optname) { case TLS_TX: + case TLS_RX: lock_sock(sk); - rc = do_tls_setsockopt_tx(sk, optval, optlen); + rc = do_tls_setsockopt_conf(sk, optval, optlen, + optname == TLS_TX); release_sock(sk); break; default: @@ -453,12 +506,113 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } -static int tls_init(struct sock *sk) +static struct tls_context *create_ctx(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + icsk->icsk_ulp_data = ctx; + return ctx; +} + +static int 
tls_hw_prot(struct sock *sk) +{ + struct tls_context *ctx; + struct tls_device *dev; + int rc = 0; + + mutex_lock(&device_mutex); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->feature && dev->feature(dev)) { + ctx = create_ctx(sk); + if (!ctx) + goto out; + + ctx->hash = sk->sk_prot->hash; + ctx->unhash = sk->sk_prot->unhash; + ctx->sk_proto_close = sk->sk_prot->close; + ctx->conf = TLS_HW_RECORD; + update_sk_prot(sk, ctx); + rc = 1; + break; + } + } +out: + mutex_unlock(&device_mutex); + return rc; +} + +static void tls_hw_unhash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_device *dev; + + mutex_lock(&device_mutex); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->unhash) + dev->unhash(dev, sk); + } + mutex_unlock(&device_mutex); + ctx->unhash(sk); +} + +static int tls_hw_hash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_device *dev; + int err; + + err = ctx->hash(sk); + mutex_lock(&device_mutex); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->hash) + err |= dev->hash(dev, sk); + } + mutex_unlock(&device_mutex); + + if (err) + tls_hw_unhash(sk); + return err; +} + +static void build_protos(struct proto *prot, struct proto *base) +{ + prot[TLS_BASE] = *base; + prot[TLS_BASE].setsockopt = tls_setsockopt; + prot[TLS_BASE].getsockopt = tls_getsockopt; + prot[TLS_BASE].close = tls_sk_proto_close; + + prot[TLS_SW_TX] = prot[TLS_BASE]; + prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; + prot[TLS_SW_TX].sendpage = tls_sw_sendpage; + + prot[TLS_SW_RX] = prot[TLS_BASE]; + prot[TLS_SW_RX].recvmsg = tls_sw_recvmsg; + prot[TLS_SW_RX].close = tls_sk_proto_close; + + prot[TLS_SW_RXTX] = prot[TLS_SW_TX]; + prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg; + prot[TLS_SW_RXTX].close = tls_sk_proto_close; + + prot[TLS_HW_RECORD] = *base; + prot[TLS_HW_RECORD].hash = tls_hw_hash; + prot[TLS_HW_RECORD].unhash = tls_hw_unhash; + prot[TLS_HW_RECORD].close = tls_sk_proto_close; +} + 
+static int tls_init(struct sock *sk) +{ + int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; + struct tls_context *ctx; int rc = 0; + if (tls_hw_prot(sk)) + goto out; + /* The TLS ulp is currently supported only for TCP sockets * in ESTABLISHED state. * Supporting sockets in LISTEN state will require us @@ -469,22 +623,48 @@ static int tls_init(struct sock *sk) return -ENOTSUPP; /* allocate tls context */ - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = create_ctx(sk); if (!ctx) { rc = -ENOMEM; goto out; } - icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; ctx->sk_proto_close = sk->sk_prot->close; - ctx->tx_conf = TLS_BASE_TX; + /* Build IPv6 TLS whenever the address of tcpv6_prot changes */ + if (ip_ver == TLSV6 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { + mutex_lock(&tcpv6_prot_mutex); + if (likely(sk->sk_prot != saved_tcpv6_prot)) { + build_protos(tls_prots[TLSV6], sk->sk_prot); + smp_store_release(&saved_tcpv6_prot, sk->sk_prot); + } + mutex_unlock(&tcpv6_prot_mutex); + } + + ctx->conf = TLS_BASE; update_sk_prot(sk, ctx); out: return rc; } +void tls_register_device(struct tls_device *device) +{ + mutex_lock(&device_mutex); + list_add_tail(&device->dev_list, &device_list); + mutex_unlock(&device_mutex); +} +EXPORT_SYMBOL(tls_register_device); + +void tls_unregister_device(struct tls_device *device) +{ + mutex_lock(&device_mutex); + list_del(&device->dev_list); + mutex_unlock(&device_mutex); +} +EXPORT_SYMBOL(tls_unregister_device); + static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", .uid = TCP_ULP_TLS, @@ -493,21 +673,13 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .init = tls_init, }; -static void build_protos(struct proto *prot, struct proto *base) -{ - prot[TLS_BASE_TX] = *base; - prot[TLS_BASE_TX].setsockopt = tls_setsockopt; - prot[TLS_BASE_TX].getsockopt = tls_getsockopt; - prot[TLS_BASE_TX].close = 
tls_sk_proto_close; - - prot[TLS_SW_TX] = prot[TLS_BASE_TX]; - prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; - prot[TLS_SW_TX].sendpage = tls_sw_sendpage; -} - static int __init tls_register(void) { - build_protos(tls_prots, &tcp_prot); + build_protos(tls_prots[TLSV4], &tcp_prot); + + tls_sw_proto_ops = inet_stream_ops; + tls_sw_proto_ops.poll = tls_sw_poll; + tls_sw_proto_ops.splice_read = tls_sw_splice_read; tcp_register_ulp(&tcp_tls_ulp_ops); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f26376e954ae..4dc766b03f00 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -34,11 +34,60 @@ * SOFTWARE. */ +#include <linux/sched/signal.h> #include <linux/module.h> #include <crypto/aead.h> +#include <net/strparser.h> #include <net/tls.h> +static int tls_do_decryption(struct sock *sk, + struct scatterlist *sgin, + struct scatterlist *sgout, + char *iv_recv, + size_t data_len, + struct sk_buff *skb, + gfp_t flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + struct aead_request *aead_req; + + int ret; + unsigned int req_size = sizeof(struct aead_request) + + crypto_aead_reqsize(ctx->aead_recv); + + aead_req = kzalloc(req_size, flags); + if (!aead_req) + return -ENOMEM; + + aead_request_set_tfm(aead_req, ctx->aead_recv); + aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); + aead_request_set_crypt(aead_req, sgin, sgout, + data_len + tls_ctx->rx.tag_size, + (u8 *)iv_recv); + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &ctx->async_wait); + + ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); + + if (ret < 0) + goto out; + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; + tls_advance_record_sn(sk, &tls_ctx->rx); + + ctx->decrypted = true; + + ctx->saved_data_ready(sk); + +out: + kfree(aead_req); + return ret; +} + static void trim_sg(struct sock *sk, struct scatterlist 
*sg, int *sg_num_elem, unsigned int *sg_size, int target_size) { @@ -79,7 +128,7 @@ static void trim_both_sgl(struct sock *sk, int target_size) target_size); if (target_size > 0) - target_size += tls_ctx->overhead_size; + target_size += tls_ctx->tx.overhead_size; trim_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, @@ -87,71 +136,16 @@ static void trim_both_sgl(struct sock *sk, int target_size) target_size); } -static int alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, - int first_coalesce) -{ - struct page_frag *pfrag; - unsigned int size = *sg_size; - int num_elem = *sg_num_elem, use = 0, rc = 0; - struct scatterlist *sge; - unsigned int orig_offset; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + num_elem - 1; - if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && - sg->offset + sg->length == orig_offset) { - sg->length += use; - } else { - sge++; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - ++num_elem; - if (num_elem == MAX_SKB_FRAGS) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } - goto out; - -out: - *sg_size = size; - *sg_num_elem = num_elem; - return rc; -} - static int alloc_encrypted_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = alloc_sg(sk, len, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0); + rc = sk_alloc_sg(sk, len, + ctx->sg_encrypted_data, 0, + &ctx->sg_encrypted_num_elem, + &ctx->sg_encrypted_size, 0); return rc; } @@ -162,9 +156,9 @@ static int 
alloc_plaintext_sg(struct sock *sk, int len) struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = alloc_sg(sk, len, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, - tls_ctx->pending_open_record_frags); + rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, + &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, + tls_ctx->pending_open_record_frags); return rc; } @@ -207,21 +201,21 @@ static int tls_do_encryption(struct tls_context *tls_ctx, if (!aead_req) return -ENOMEM; - ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size; - ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size; + ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; + ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, - data_len, tls_ctx->iv); + data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &ctx->async_wait); rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait); - ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size; - ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size; + ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; + ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; kfree(aead_req); return rc; @@ -238,7 +232,7 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, - tls_ctx->rec_seq, tls_ctx->rec_seq_size, + tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, record_type); tls_fill_prepend(tls_ctx, @@ -269,9 +263,9 @@ static int tls_push_record(struct sock *sk, int flags, /* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */ rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags); if (rc < 0 && rc != 
-EAGAIN) - tls_err_abort(sk); + tls_err_abort(sk, EBADMSG); - tls_advance_record_sn(sk, tls_ctx); + tls_advance_record_sn(sk, &tls_ctx->tx); return rc; } @@ -281,23 +275,24 @@ static int tls_sw_push_pending_record(struct sock *sk, int flags) } static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, - int length) + int length, int *pages_used, + unsigned int *size_used, + struct scatterlist *to, int to_max_pages, + bool charge) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); struct page *pages[MAX_SKB_FRAGS]; size_t offset; ssize_t copied, use; int i = 0; - unsigned int size = ctx->sg_plaintext_size; - int num_elem = ctx->sg_plaintext_num_elem; + unsigned int size = *size_used; + int num_elem = *pages_used; int rc = 0; int maxpages; while (length > 0) { i = 0; - maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem; + maxpages = to_max_pages - num_elem; if (maxpages == 0) { rc = -EFAULT; goto out; @@ -317,10 +312,11 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, while (copied) { use = min_t(int, copied, PAGE_SIZE - offset); - sg_set_page(&ctx->sg_plaintext_data[num_elem], + sg_set_page(&to[num_elem], pages[i], use, offset); - sg_unmark_end(&ctx->sg_plaintext_data[num_elem]); - sk_mem_charge(sk, use); + sg_unmark_end(&to[num_elem]); + if (charge) + sk_mem_charge(sk, use); offset = 0; copied -= use; @@ -331,8 +327,9 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, } out: - ctx->sg_plaintext_size = size; - ctx->sg_plaintext_num_elem = num_elem; + *size_used = size; + *pages_used = num_elem; + return rc; } @@ -409,7 +406,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } required_size = ctx->sg_plaintext_size + try_to_copy + - tls_ctx->overhead_size; + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -429,7 +426,11 @@ alloc_encrypted: if (full_record || eor) { ret = zerocopy_from_iter(sk, 
&msg->msg_iter, - try_to_copy); + try_to_copy, &ctx->sg_plaintext_num_elem, + &ctx->sg_plaintext_size, + ctx->sg_plaintext_data, + ARRAY_SIZE(ctx->sg_plaintext_data), + true); if (ret) goto fallback_to_reg_send; @@ -468,7 +469,7 @@ alloc_plaintext: &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, ctx->sg_plaintext_size + - tls_ctx->overhead_size); + tls_ctx->tx.overhead_size); } ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); @@ -560,7 +561,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, full_record = true; } required_size = ctx->sg_plaintext_size + copy + - tls_ctx->overhead_size; + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -629,13 +630,404 @@ sendpage_end: return ret; } -void tls_sw_free_tx_resources(struct sock *sk) +static struct sk_buff *tls_wait_data(struct sock *sk, int flags, + long timeo, int *err) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct sk_buff *skb; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + while (!(skb = ctx->recv_pkt)) { + if (sk->sk_err) { + *err = sock_error(sk); + return NULL; + } + + if (sock_flag(sk, SOCK_DONE)) + return NULL; + + if ((flags & MSG_DONTWAIT) || !timeo) { + *err = -EAGAIN; + return NULL; + } + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + + /* Handle signals */ + if (signal_pending(current)) { + *err = sock_intr_errno(timeo); + return NULL; + } + } + + return skb; +} + +static int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size]; + struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; + struct scatterlist *sgin = 
&sgin_arr[0]; + struct strp_msg *rxm = strp_msg(skb); + int ret, nsg = ARRAY_SIZE(sgin_arr); + char aad_recv[TLS_AAD_SPACE_SIZE]; + struct sk_buff *unused; + + ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, + iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + tls_ctx->rx.iv_size); + if (ret < 0) + return ret; + + memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + if (!sgout) { + nsg = skb_cow_data(skb, 0, &unused) + 1; + sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); + if (!sgout) + sgout = sgin; + } + + sg_init_table(sgin, nsg); + sg_set_buf(&sgin[0], aad_recv, sizeof(aad_recv)); + + nsg = skb_to_sgvec(skb, &sgin[1], + rxm->offset + tls_ctx->rx.prepend_size, + rxm->full_len - tls_ctx->rx.prepend_size); + + tls_make_aad(aad_recv, + rxm->full_len - tls_ctx->rx.overhead_size, + tls_ctx->rx.rec_seq, + tls_ctx->rx.rec_seq_size, + ctx->control); + + ret = tls_do_decryption(sk, sgin, sgout, iv, + rxm->full_len - tls_ctx->rx.overhead_size, + skb, sk->sk_allocation); + + if (sgin != &sgin_arr[0]) + kfree(sgin); + + return ret; +} + +static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, + unsigned int len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + + if (len < rxm->full_len) { + rxm->offset += len; + rxm->full_len -= len; + + return false; + } + + /* Finished with message */ + ctx->recv_pkt = NULL; + kfree_skb(skb); + strp_unpause(&ctx->strp); + + return true; +} + +int tls_sw_recvmsg(struct sock *sk, + struct msghdr *msg, + size_t len, + int nonblock, + int flags, + int *addr_len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + unsigned char control; + struct strp_msg *rxm; + struct sk_buff *skb; + ssize_t copied = 0; + bool cmsg = false; + int err = 0; + long timeo; + + flags |= nonblock; + + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, 
SOL_IP, IP_RECVERR); + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + do { + bool zc = false; + int chunk = 0; + + skb = tls_wait_data(sk, flags, timeo, &err); + if (!skb) + goto recv_end; + + rxm = strp_msg(skb); + if (!cmsg) { + int cerr; + + cerr = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE, + sizeof(ctx->control), &ctx->control); + cmsg = true; + control = ctx->control; + if (ctx->control != TLS_RECORD_TYPE_DATA) { + if (cerr || msg->msg_flags & MSG_CTRUNC) { + err = -EIO; + goto recv_end; + } + } + } else if (control != ctx->control) { + goto recv_end; + } + + if (!ctx->decrypted) { + int page_count; + int to_copy; + + page_count = iov_iter_npages(&msg->msg_iter, + MAX_SKB_FRAGS); + to_copy = rxm->full_len - tls_ctx->rx.overhead_size; + if (to_copy <= len && page_count < MAX_SKB_FRAGS && + likely(!(flags & MSG_PEEK))) { + struct scatterlist sgin[MAX_SKB_FRAGS + 1]; + char unused[21]; + int pages = 0; + + zc = true; + sg_init_table(sgin, MAX_SKB_FRAGS + 1); + sg_set_buf(&sgin[0], unused, 13); + + err = zerocopy_from_iter(sk, &msg->msg_iter, + to_copy, &pages, + &chunk, &sgin[1], + MAX_SKB_FRAGS, false); + if (err < 0) + goto fallback_to_reg_recv; + + err = decrypt_skb(sk, skb, sgin); + for (; pages > 0; pages--) + put_page(sg_page(&sgin[pages])); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; + } + } else { +fallback_to_reg_recv: + err = decrypt_skb(sk, skb, NULL); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; + } + } + ctx->decrypted = true; + } + + if (!zc) { + chunk = min_t(unsigned int, rxm->full_len, len); + err = skb_copy_datagram_msg(skb, rxm->offset, msg, + chunk); + if (err < 0) + goto recv_end; + } + + copied += chunk; + len -= chunk; + if (likely(!(flags & MSG_PEEK))) { + u8 control = ctx->control; + + if (tls_sw_advance_skb(sk, skb, chunk)) { + /* Return full control message to + * userspace before trying to parse + * another message type + */ + msg->msg_flags |= MSG_EOR; + if (control != 
TLS_RECORD_TYPE_DATA) + goto recv_end; + } + } + } while (len); + +recv_end: + release_sock(sk); + return copied ? : err; +} + +ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sock->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = NULL; + struct sock *sk = sock->sk; + struct sk_buff *skb; + ssize_t copied = 0; + int err = 0; + long timeo; + int chunk; + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + skb = tls_wait_data(sk, flags, timeo, &err); + if (!skb) + goto splice_read_end; + + /* splice does not support reading control messages */ + if (ctx->control != TLS_RECORD_TYPE_DATA) { + err = -ENOTSUPP; + goto splice_read_end; + } + + if (!ctx->decrypted) { + err = decrypt_skb(sk, skb, NULL); + + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto splice_read_end; + } + ctx->decrypted = true; + } + rxm = strp_msg(skb); + + chunk = min_t(unsigned int, rxm->full_len, len); + copied = skb_splice_bits(skb, sk, rxm->offset, pipe, chunk, flags); + if (copied < 0) + goto splice_read_end; + + if (likely(!(flags & MSG_PEEK))) + tls_sw_advance_skb(sk, skb, copied); + +splice_read_end: + release_sock(sk); + return copied ? 
: err; +} + +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + unsigned int ret; + struct sock *sk = sock->sk; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + /* Grab POLLOUT and POLLHUP from the underlying socket */ + ret = ctx->sk_poll(file, sock, wait); + + /* Clear POLLIN bits, and set based on recv_pkt */ + ret &= ~(POLLIN | POLLRDNORM); + if (ctx->recv_pkt) + ret |= POLLIN | POLLRDNORM; + + return ret; +} + +static int tls_read_size(struct strparser *strp, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(strp->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + char header[tls_ctx->rx.prepend_size]; + struct strp_msg *rxm = strp_msg(skb); + size_t cipher_overhead; + size_t data_len = 0; + int ret; + + /* Verify that we have a full TLS header, or wait for more data */ + if (rxm->offset + tls_ctx->rx.prepend_size > skb->len) + return 0; + + /* Linearize header to local buffer */ + ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size); + + if (ret < 0) + goto read_failure; + + ctx->control = header[0]; + + data_len = ((header[4] & 0xFF) | (header[3] << 8)); + + cipher_overhead = tls_ctx->rx.tag_size + tls_ctx->rx.iv_size; + + if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead) { + ret = -EMSGSIZE; + goto read_failure; + } + if (data_len < cipher_overhead) { + ret = -EBADMSG; + goto read_failure; + } + + if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) || + header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) { + ret = -EINVAL; + goto read_failure; + } + + return data_len + TLS_HEADER_SIZE; + +read_failure: + tls_err_abort(strp->sk, ret); + + return ret; +} + +static void tls_queue(struct strparser *strp, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(strp->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm; + + rxm = strp_msg(skb); + 
+ ctx->decrypted = false; + + ctx->recv_pkt = skb; + strp_pause(strp); + + strp->sk->sk_state_change(strp->sk); +} + +static void tls_data_ready(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + strp_data_ready(&ctx->strp); +} + +void tls_sw_free_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); if (ctx->aead_send) crypto_free_aead(ctx->aead_send); + if (ctx->aead_recv) { + if (ctx->recv_pkt) { + kfree_skb(ctx->recv_pkt); + ctx->recv_pkt = NULL; + } + crypto_free_aead(ctx->aead_recv); + strp_stop(&ctx->strp); + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = ctx->saved_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); + strp_done(&ctx->strp); + lock_sock(sk); + } tls_free_both_sg(sk); @@ -643,12 +1035,15 @@ void tls_sw_free_tx_resources(struct sock *sk) kfree(tls_ctx); } -int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls_sw_context *sw_ctx; + struct cipher_context *cctx; + struct crypto_aead **aead; + struct strp_callbacks cb; u16 nonce_size, tag_size, iv_size, rec_seq_size; char *iv, *rec_seq; int rc = 0; @@ -658,22 +1053,29 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto out; } - if (ctx->priv_ctx) { - rc = -EEXIST; - goto out; - } - - sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); - if (!sw_ctx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx) { + sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); + if (!sw_ctx) { + rc = -ENOMEM; + goto out; + } + crypto_init_wait(&sw_ctx->async_wait); + } else { + sw_ctx = ctx->priv_ctx; } - crypto_init_wait(&sw_ctx->async_wait); - ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - 
crypto_info = &ctx->crypto_send; + if (tx) { + crypto_info = &ctx->crypto_send; + cctx = &ctx->tx; + aead = &sw_ctx->aead_send; + } else { + crypto_info = &ctx->crypto_recv; + cctx = &ctx->rx; + aead = &sw_ctx->aead_recv; + } + switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: { nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; @@ -692,46 +1094,49 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto free_priv; } - ctx->prepend_size = TLS_HEADER_SIZE + nonce_size; - ctx->tag_size = tag_size; - ctx->overhead_size = ctx->prepend_size + ctx->tag_size; - ctx->iv_size = iv_size; - ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); - if (!ctx->iv) { + cctx->prepend_size = TLS_HEADER_SIZE + nonce_size; + cctx->tag_size = tag_size; + cctx->overhead_size = cctx->prepend_size + cctx->tag_size; + cctx->iv_size = iv_size; + cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); + if (!cctx->iv) { rc = -ENOMEM; goto free_priv; } - memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); - memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); - ctx->rec_seq_size = rec_seq_size; - ctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); - if (!ctx->rec_seq) { + memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + cctx->rec_seq_size = rec_seq_size; + cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + if (!cctx->rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(ctx->rec_seq, rec_seq, rec_seq_size); - - sg_init_table(sw_ctx->sg_encrypted_data, - ARRAY_SIZE(sw_ctx->sg_encrypted_data)); - sg_init_table(sw_ctx->sg_plaintext_data, - ARRAY_SIZE(sw_ctx->sg_plaintext_data)); - - sg_init_table(sw_ctx->sg_aead_in, 2); - sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_in[1]); - sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); - 
sg_init_table(sw_ctx->sg_aead_out, 2); - sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_out[1]); - sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); - - if (!sw_ctx->aead_send) { - sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0); - if (IS_ERR(sw_ctx->aead_send)) { - rc = PTR_ERR(sw_ctx->aead_send); - sw_ctx->aead_send = NULL; + memcpy(cctx->rec_seq, rec_seq, rec_seq_size); + + if (tx) { + sg_init_table(sw_ctx->sg_encrypted_data, + ARRAY_SIZE(sw_ctx->sg_encrypted_data)); + sg_init_table(sw_ctx->sg_plaintext_data, + ARRAY_SIZE(sw_ctx->sg_plaintext_data)); + + sg_init_table(sw_ctx->sg_aead_in, 2); + sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_in[1]); + sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); + sg_init_table(sw_ctx->sg_aead_out, 2); + sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_out[1]); + sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); + } + + if (!*aead) { + *aead = crypto_alloc_aead("gcm(aes)", 0, 0); + if (IS_ERR(*aead)) { + rc = PTR_ERR(*aead); + *aead = NULL; goto free_rec_seq; } } @@ -740,24 +1145,44 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); - rc = crypto_aead_setkey(sw_ctx->aead_send, keyval, + rc = crypto_aead_setkey(*aead, keyval, TLS_CIPHER_AES_GCM_128_KEY_SIZE); if (rc) goto free_aead; - rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size); - if (!rc) - return 0; + rc = crypto_aead_setauthsize(*aead, cctx->tag_size); + if (rc) + goto free_aead; + + if (!tx) { + /* Set up strparser */ + memset(&cb, 0, sizeof(cb)); + cb.rcv_msg = tls_queue; + cb.parse_msg = tls_read_size; + + strp_init(&sw_ctx->strp, sk, &cb); + + write_lock_bh(&sk->sk_callback_lock); + sw_ctx->saved_data_ready = 
sk->sk_data_ready; + sk->sk_data_ready = tls_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + + sw_ctx->sk_poll = sk->sk_socket->ops->poll; + + strp_check_rcv(&sw_ctx->strp); + } + + goto out; free_aead: - crypto_free_aead(sw_ctx->aead_send); - sw_ctx->aead_send = NULL; + crypto_free_aead(*aead); + *aead = NULL; free_rec_seq: - kfree(ctx->rec_seq); - ctx->rec_seq = NULL; + kfree(cctx->rec_seq); + cctx->rec_seq = NULL; free_iv: - kfree(ctx->iv); - ctx->iv = NULL; + kfree(ctx->tx.iv); + ctx->tx.iv = NULL; free_priv: kfree(ctx->priv_ctx); ctx->priv_ctx = NULL; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2d465bdeccbc..68bb70a62afe 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -637,7 +637,7 @@ static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); -static int unix_getname(struct socket *, struct sockaddr *, int *, int); +static int unix_getname(struct socket *, struct sockaddr *, int); static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, poll_table *); @@ -745,14 +745,6 @@ static struct proto unix_proto = { .obj_size = sizeof(struct unix_sock), }; -/* - * AF_UNIX sockets do not interact with hardware, hence they - * dont trigger interrupts - so it's safe for them to have - * bh-unsafe locking for their sk_receive_queue.lock. 
Split off - * this special lock-class by reinitializing the spinlock key: - */ -static struct lock_class_key af_unix_sk_receive_queue_lock_key; - static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) { struct sock *sk = NULL; @@ -767,8 +759,6 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) goto out; sock_init_data(sock, sk); - lockdep_set_class(&sk->sk_receive_queue.lock, - &af_unix_sk_receive_queue_lock_key); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; @@ -1453,7 +1443,7 @@ out: } -static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) +static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct unix_sock *u; @@ -1476,12 +1466,12 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_ if (!u->addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; - *uaddr_len = sizeof(short); + err = sizeof(short); } else { struct unix_address *addr = u->addr; - *uaddr_len = addr->len; - memcpy(sunaddr, addr->name, *uaddr_len); + err = addr->len; + memcpy(sunaddr, addr->name, addr->len); } unix_state_unlock(sk); sock_put(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index e0fc84daed94..aac9b8f6552e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -759,7 +759,7 @@ vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } static int vsock_getname(struct socket *sock, - struct sockaddr *addr, int *addr_len, int peer) + struct sockaddr *addr, int peer) { int err; struct sock *sk; @@ -794,7 +794,7 @@ static int vsock_getname(struct socket *sock, */ BUILD_BUG_ON(sizeof(*vm_addr) > 128); memcpy(addr, vm_addr, sizeof(*vm_addr)); - *addr_len = sizeof(*vm_addr); + err = sizeof(*vm_addr); out: release_sock(sk); diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 1abcc4fc4df1..41722046b937 
100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -34,9 +34,10 @@ config CFG80211 When built as a module it will be called cfg80211. +if CFG80211 + config NL80211_TESTMODE bool "nl80211 testmode command" - depends on CFG80211 help The nl80211 testmode command helps implementing things like factory calibration or validation tools for wireless chips. @@ -51,7 +52,6 @@ config NL80211_TESTMODE config CFG80211_DEVELOPER_WARNINGS bool "enable developer warnings" - depends on CFG80211 default n help This option enables some additional warnings that help @@ -68,7 +68,7 @@ config CFG80211_DEVELOPER_WARNINGS config CFG80211_CERTIFICATION_ONUS bool "cfg80211 certification onus" - depends on CFG80211 && EXPERT + depends on EXPERT default n ---help--- You should disable this option unless you are both capable @@ -159,7 +159,6 @@ config CFG80211_REG_RELAX_NO_IR config CFG80211_DEFAULT_PS bool "enable powersave by default" - depends on CFG80211 default y help This option enables powersave mode by default. @@ -170,7 +169,6 @@ config CFG80211_DEFAULT_PS config CFG80211_DEBUGFS bool "cfg80211 DebugFS entries" - depends on CFG80211 depends on DEBUG_FS ---help--- You can enable this if you want debugfs entries for cfg80211. @@ -180,7 +178,6 @@ config CFG80211_DEBUGFS config CFG80211_CRDA_SUPPORT bool "support CRDA" if EXPERT default y - depends on CFG80211 help You should enable this option unless you know for sure you have no need for it, for example when using internal regdb (above) or the @@ -190,7 +187,6 @@ config CFG80211_CRDA_SUPPORT config CFG80211_WEXT bool "cfg80211 wireless extensions compatibility" if !CFG80211_WEXT_EXPORT - depends on CFG80211 select WEXT_CORE default y if CFG80211_WEXT_EXPORT help @@ -199,11 +195,12 @@ config CFG80211_WEXT config CFG80211_WEXT_EXPORT bool - depends on CFG80211 help Drivers should select this option if they require cfg80211's wext compatibility symbols to be exported. 
+endif # CFG80211 + config LIB80211 tristate default n diff --git a/net/wireless/ap.c b/net/wireless/ap.c index 63682176c96c..882d97bdc6bf 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -27,6 +27,7 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, err = rdev_stop_ap(rdev, dev); if (!err) { + wdev->conn_owner_nlportid = 0; wdev->beacon_interval = 0; memset(&wdev->chandef, 0, sizeof(wdev->chandef)); wdev->ssid_len = 0; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index a48859982a32..2db713d18f71 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -579,6 +579,10 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, { struct ieee80211_channel *c; u32 freq, start_freq, end_freq; + bool dfs_offload; + + dfs_offload = wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_DFS_OFFLOAD); start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); @@ -596,8 +600,9 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, if (c->flags & IEEE80211_CHAN_DISABLED) return false; - if ((c->flags & IEEE80211_CHAN_RADAR) && - (c->dfs_state != NL80211_DFS_AVAILABLE)) + if ((c->flags & IEEE80211_CHAN_RADAR) && + (c->dfs_state != NL80211_DFS_AVAILABLE) && + !(c->dfs_state == NL80211_DFS_USABLE && dfs_offload)) return false; } diff --git a/net/wireless/core.h b/net/wireless/core.h index eaff636169c2..63eb1b5fdd04 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -282,10 +282,10 @@ void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs); /* IBSS */ -int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys); +int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys); void 
cfg80211_clear_ibss(struct net_device *dev, bool nowext); int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext); @@ -303,10 +303,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_setup *setup, const struct mesh_config *conf); -int cfg80211_join_mesh(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct mesh_setup *setup, - const struct mesh_config *conf); int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index a1d10993d08a..d1743e6abc34 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -84,14 +84,15 @@ void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, } EXPORT_SYMBOL(cfg80211_ibss_joined); -static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys) +int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; + ASSERT_RTNL(); ASSERT_WDEV_LOCK(wdev); if (wdev->ssid_len) @@ -146,23 +147,6 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, return 0; } -int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - int err; - - ASSERT_RTNL(); - - wdev_lock(wdev); - err = __cfg80211_join_ibss(rdev, dev, params, connkeys); - wdev_unlock(wdev); - - return err; -} - static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; @@ 
-224,6 +208,7 @@ int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, if (err) return err; + wdev->conn_owner_nlportid = 0; __cfg80211_clear_ibss(dev, nowext); return 0; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index 51aa55618ef7..eac5aa1419fc 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -170,9 +170,28 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, enum nl80211_bss_scan_width scan_width; struct ieee80211_supported_band *sband = rdev->wiphy.bands[setup->chandef.chan->band]; - scan_width = cfg80211_chandef_to_scan_width(&setup->chandef); - setup->basic_rates = ieee80211_mandatory_rates(sband, - scan_width); + + if (setup->chandef.chan->band == NL80211_BAND_2GHZ) { + int i; + + /* + * Older versions selected the mandatory rates for + * 2.4 GHz as well, but were broken in that only + * 1 Mbps was regarded as a mandatory rate. Keep + * using just 1 Mbps as the default basic rate for + * mesh to be interoperable with older versions. 
+ */ + for (i = 0; i < sband->n_bitrates; i++) { + if (sband->bitrates[i].bitrate == 10) { + setup->basic_rates = BIT(i); + break; + } + } + } else { + scan_width = cfg80211_chandef_to_scan_width(&setup->chandef); + setup->basic_rates = ieee80211_mandatory_rates(sband, + scan_width); + } } err = cfg80211_chandef_dfs_required(&rdev->wiphy, @@ -198,21 +217,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, return err; } -int cfg80211_join_mesh(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct mesh_setup *setup, - const struct mesh_config *conf) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - int err; - - wdev_lock(wdev); - err = __cfg80211_join_mesh(rdev, dev, setup, conf); - wdev_unlock(wdev); - - return err; -} - int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef) @@ -267,6 +271,7 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, err = rdev_leave_mesh(rdev, dev); if (!err) { + wdev->conn_owner_nlportid = 0; wdev->mesh_id_len = 0; wdev->beacon_interval = 0; memset(&wdev->chandef, 0, sizeof(wdev->chandef)); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index bbb9907bfa86..12b3edf70a7b 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -872,7 +872,7 @@ void cfg80211_cac_event(struct net_device *netdev, trace_cfg80211_cac_event(netdev, event); - if (WARN_ON(!wdev->cac_started)) + if (WARN_ON(!wdev->cac_started && event != NL80211_RADAR_CAC_STARTED)) return; if (WARN_ON(!wdev->chandef.chan)) @@ -888,14 +888,17 @@ void cfg80211_cac_event(struct net_device *netdev, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); - break; + /* fall through */ case NL80211_RADAR_CAC_ABORTED: + wdev->cac_started = false; + break; + case NL80211_RADAR_CAC_STARTED: + wdev->cac_started = true; break; default: WARN_ON(1); return; } - 
wdev->cac_started = false; nl80211_radar_notify(rdev, chandef, event, netdev, gfp); } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9c0dcc8324b0..ff28f8feeb09 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -287,6 +287,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG }, [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 }, [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, + [NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG }, [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, @@ -421,6 +422,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_CACHE_ID] = { .len = 2 }, [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, + [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -3923,9 +3925,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return false; return true; case NL80211_CMD_CONNECT: - /* SAE not supported yet */ - if (auth_type == NL80211_AUTHTYPE_SAE) + if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && + auth_type == NL80211_AUTHTYPE_SAE) return false; + /* FILS with SK PFS or PK not supported yet */ if (auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || auth_type == NL80211_AUTHTYPE_FILS_PK) @@ -4132,6 +4135,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) wdev->chandef = params.chandef; wdev->ssid_len = params.ssid_len; memcpy(wdev->ssid, params.ssid, wdev->ssid_len); + + if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) + wdev->conn_owner_nlportid = info->snd_portid; } wdev_unlock(wdev); @@ -4487,6 +4493,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, 
PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc); PUT_SINFO_U64(BEACON_RX, rx_beacon); PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); + PUT_SINFO(ACK_SIGNAL, ack_signal, u8); #undef PUT_SINFO #undef PUT_SINFO_U64 @@ -5848,7 +5855,6 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, return genlmsg_reply(msg, info); nla_put_failure: - genlmsg_cancel(msg, hdr); out: nlmsg_free(msg); return -ENOBUFS; @@ -6329,7 +6335,6 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: - genlmsg_cancel(msg, hdr); put_failure: nlmsg_free(msg); return -EMSGSIZE; @@ -6718,8 +6723,17 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, *flags = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]); - if ((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && - !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) + if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && + !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) || + ((*flags & NL80211_SCAN_FLAG_LOW_SPAN) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_LOW_SPAN_SCAN)) || + ((*flags & NL80211_SCAN_FLAG_LOW_POWER) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_LOW_POWER_SCAN)) || + ((*flags & NL80211_SCAN_FLAG_HIGH_ACCURACY) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN))) return -EOPNOTSUPP; if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { @@ -7541,12 +7555,13 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; struct cfg80211_chan_def chandef; enum nl80211_dfs_regions dfs_region; unsigned int cac_time_ms; int err; - dfs_region = reg_get_dfs_region(wdev->wiphy); + dfs_region = reg_get_dfs_region(wiphy); if (dfs_region == NL80211_DFS_UNSET) return -EINVAL; @@ -7560,17 +7575,20 @@ 
static int nl80211_start_radar_detection(struct sk_buff *skb, if (wdev->cac_started) return -EBUSY; - err = cfg80211_chandef_dfs_required(wdev->wiphy, &chandef, - wdev->iftype); + err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype); if (err < 0) return err; if (err == 0) return -EINVAL; - if (!cfg80211_chandef_dfs_usable(wdev->wiphy, &chandef)) + if (!cfg80211_chandef_dfs_usable(wiphy, &chandef)) return -EINVAL; + /* CAC start is offloaded to HW and can't be started manually */ + if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD)) + return -EOPNOTSUPP; + if (!rdev->ops->start_radar_detection) return -EOPNOTSUPP; @@ -8194,6 +8212,22 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return err; } +static int validate_pae_over_nl80211(struct cfg80211_registered_device *rdev, + struct genl_info *info) +{ + if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + GENL_SET_ERR_MSG(info, "SOCKET_OWNER not set"); + return -EINVAL; + } + + if (!rdev->ops->tx_control_port || + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211)) + return -EOPNOTSUPP; + + return 0; +} + static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, struct genl_info *info, struct cfg80211_crypto_settings *settings, @@ -8217,6 +8251,15 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, } else settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE); + if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) { + int r = validate_pae_over_nl80211(rdev, info); + + if (r < 0) + return r; + + settings->control_port_over_nl80211 = true; + } + if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) { void *data; int len, i; @@ -8662,12 +8705,26 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) ibss.control_port = nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT]); + if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) { + int r = 
validate_pae_over_nl80211(rdev, info); + + if (r < 0) + return r; + + ibss.control_port_over_nl80211 = true; + } + ibss.userspace_handles_dfs = nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]); - err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_join_ibss(rdev, dev, &ibss, connkeys); if (err) kzfree(connkeys); + else if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) + dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; + wdev_unlock(dev->ieee80211_ptr); + return err; } @@ -9155,6 +9212,15 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (nla_get_flag(info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])) { + if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + GENL_SET_ERR_MSG(info, + "external auth requires connection ownership"); + return -EINVAL; + } + connect.flags |= CONNECT_REQ_EXTERNAL_AUTH_SUPPORT; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, @@ -10064,7 +10130,7 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; } else { - /* cfg80211_join_mesh() will sort it out */ + /* __cfg80211_join_mesh() will sort it out */ setup.chandef.chan = NULL; } @@ -10102,7 +10168,22 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) setup.userspace_handles_dfs = nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]); - return cfg80211_join_mesh(rdev, dev, &setup, &cfg); + if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) { + int r = validate_pae_over_nl80211(rdev, info); + + if (r < 0) + return r; + + setup.control_port_over_nl80211 = true; + } + + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_join_mesh(rdev, dev, &setup, &cfg); + if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) + dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; + wdev_unlock(dev->ieee80211_ptr); + + return err; } static int nl80211_leave_mesh(struct sk_buff *skb, struct 
genl_info *info) @@ -12463,6 +12544,103 @@ static int nl80211_del_pmk(struct sk_buff *skb, struct genl_info *info) return ret; } +static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_external_auth_params params; + + if (!rdev->ops->external_auth) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_SSID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_BSSID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_STATUS_CODE]) + return -EINVAL; + + memset(¶ms, 0, sizeof(params)); + + params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + if (params.ssid.ssid_len == 0 || + params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + memcpy(params.ssid.ssid, nla_data(info->attrs[NL80211_ATTR_SSID]), + params.ssid.ssid_len); + + memcpy(params.bssid, nla_data(info->attrs[NL80211_ATTR_BSSID]), + ETH_ALEN); + + params.status = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]); + + return rdev_external_auth(rdev, dev, ¶ms); +} + +static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *buf; + size_t len; + u8 *dest; + u16 proto; + bool noencrypt; + int err; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211)) + return -EOPNOTSUPP; + + if (!rdev->ops->tx_control_port) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_FRAME] || + !info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) { + GENL_SET_ERR_MSG(info, "Frame, MAC or ethertype missing"); + return -EINVAL; + } + + wdev_lock(wdev); + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + break; + case 
NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (wdev->current_bss) + break; + err = -ENOTCONN; + goto out; + default: + err = -EOPNOTSUPP; + goto out; + } + + wdev_unlock(wdev); + + buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); + len = nla_len(info->attrs[NL80211_ATTR_FRAME]); + dest = nla_data(info->attrs[NL80211_ATTR_MAC]); + proto = nla_get_u16(info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]); + noencrypt = + nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); + + return rdev_tx_control_port(rdev, dev, buf, len, + dest, cpu_to_be16(proto), noencrypt); + + out: + wdev_unlock(wdev); + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -13358,7 +13536,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, - + { + .cmd = NL80211_CMD_EXTERNAL_AUTH, + .doit = nl80211_external_auth, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_CONTROL_PORT_FRAME, + .doit = nl80211_tx_control_port, + .policy = nl80211_policy, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { @@ -13672,7 +13865,6 @@ void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13720,7 +13912,6 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13808,7 +13999,6 @@ static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13884,7 +14074,6 @@ void nl80211_send_connect_result(struct 
cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13924,7 +14113,6 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13954,7 +14142,6 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13991,7 +14178,6 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14024,7 +14210,6 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14065,7 +14250,6 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_notify_new_peer_candidate); @@ -14104,7 +14288,6 @@ void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14159,7 +14342,6 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14205,7 +14387,6 @@ static void nl80211_send_remain_on_chan_event( return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14319,7 +14500,6 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_conn_failed); @@ -14356,7 +14536,6 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, return true; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); return true; } @@ -14440,7 +14619,6 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, 
nlportid); nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); return -ENOBUFS; } @@ -14484,11 +14662,68 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_mgmt_tx_status); +static int __nl80211_rx_control_port(struct net_device *dev, + const u8 *buf, size_t len, + const u8 *addr, u16 proto, + bool unencrypted, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct sk_buff *msg; + void *hdr; + u32 nlportid = READ_ONCE(wdev->conn_owner_nlportid); + + if (!nlportid) + return -ENOENT; + + msg = nlmsg_new(100 + len, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CONTROL_PORT_FRAME); + if (!hdr) { + nlmsg_free(msg); + return -ENOBUFS; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_FRAME, len, buf) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || + nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) || + (unencrypted && nla_put_flag(msg, + NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid); + + nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + +bool cfg80211_rx_control_port(struct net_device *dev, + const u8 *buf, size_t len, + const u8 *addr, u16 proto, bool unencrypted) +{ + int ret; + + trace_cfg80211_rx_control_port(dev, buf, len, addr, proto, unencrypted); + ret = __nl80211_rx_control_port(dev, buf, len, addr, proto, + unencrypted, GFP_ATOMIC); + trace_cfg80211_return_bool(ret == 0); + return ret == 0; +} +EXPORT_SYMBOL(cfg80211_rx_control_port); + static struct sk_buff *cfg80211_prepare_cqm(struct 
net_device *dev, const char *mac, gfp_t gfp) { @@ -14693,7 +14928,6 @@ static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14751,7 +14985,6 @@ nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14804,7 +15037,6 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14886,12 +15118,67 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } +void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, + struct sta_opmode_info *sta_opmode, + gfp_t gfp) +{ + struct sk_buff *msg; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + void *hdr; + + if (WARN_ON(!mac)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_STA_OPMODE_CHANGED); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx)) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac)) + goto nla_put_failure; + + if ((sta_opmode->changed & STA_OPMODE_SMPS_MODE_CHANGED) && + nla_put_u8(msg, NL80211_ATTR_SMPS_MODE, sta_opmode->smps_mode)) + goto nla_put_failure; + + if ((sta_opmode->changed & STA_OPMODE_MAX_BW_CHANGED) && + nla_put_u8(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw)) + goto nla_put_failure; + + if ((sta_opmode->changed & STA_OPMODE_N_SS_CHANGED) && + nla_put_u8(msg, NL80211_ATTR_NSS, sta_opmode->rx_nss)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, 
wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, gfp); + + return; + +nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_sta_opmode_change_notify); + void cfg80211_probe_status(struct net_device *dev, const u8 *addr, - u64 cookie, bool acked, gfp_t gfp) + u64 cookie, bool acked, s32 ack_signal, + bool is_valid_ack_signal, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -14916,7 +15203,9 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, NL80211_ATTR_PAD) || - (acked && nla_put_flag(msg, NL80211_ATTR_ACK))) + (acked && nla_put_flag(msg, NL80211_ATTR_ACK)) || + (is_valid_ack_signal && nla_put_s32(msg, NL80211_ATTR_ACK_SIGNAL, + ack_signal))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -14926,7 +15215,6 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_probe_status); @@ -14971,8 +15259,6 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, nla_put_failure: spin_unlock_bh(&rdev->beacon_registrations_lock); - if (hdr) - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_report_obss_beacon); @@ -15188,7 +15474,6 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_tdls_oper_request); @@ -15333,8 +15618,6 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp) return; nla_put_failure: - if (hdr) - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_crit_proto_stopped); @@ -15369,6 +15652,47 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev) nlmsg_free(msg); } +int cfg80211_external_auth_request(struct net_device *dev, + struct cfg80211_external_auth_params 
*params, + gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct sk_buff *msg; + void *hdr; + + if (!wdev->conn_owner_nlportid) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_EXTERNAL_AUTH); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || + nla_put_u32(msg, NL80211_ATTR_AKM_SUITES, params->key_mgmt_suite) || + nla_put_u32(msg, NL80211_ATTR_EXTERNAL_AUTH_ACTION, + params->action) || + nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, params->bssid) || + nla_put(msg, NL80211_ATTR_SSID, params->ssid.ssid_len, + params->ssid.ssid)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, + wdev->conn_owner_nlportid); + return 0; + + nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(cfg80211_external_auth_request); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 0c06240d25af..87479a53411b 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -714,6 +714,21 @@ static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, + struct net_device *dev, + const void *buf, size_t len, + const u8 *dest, __be16 proto, + const bool noencrypt) +{ + int ret; + trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, + dest, proto, noencrypt); + ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, + dest, proto, noencrypt); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline int rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) @@ -1190,4 
+1205,19 @@ static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev, trace_rdev_return_int(&rdev->wiphy, ret); return ret; } + +static inline int +rdev_external_auth(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_external_auth_params *params) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_external_auth(&rdev->wiphy, dev, params); + if (rdev->ops->external_auth) + ret = rdev->ops->external_auth(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 7b42f0bacfd8..16c7e4ef5820 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,6 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -134,12 +135,12 @@ static void restore_regulatory_settings(bool reset_user); static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { - return rtnl_dereference(cfg80211_regdomain); + return rcu_dereference_rtnl(cfg80211_regdomain); } const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { - return rtnl_dereference(wiphy->regd); + return rcu_dereference_rtnl(wiphy->regd); } static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region) @@ -424,23 +425,36 @@ static const struct ieee80211_regdomain * reg_copy_regd(const struct ieee80211_regdomain *src_regd) { struct ieee80211_regdomain *regd; - int size_of_regd; + int size_of_regd, size_of_wmms; unsigned int i; + struct ieee80211_wmm_rule *d_wmm, *s_wmm; size_of_regd = sizeof(struct ieee80211_regdomain) + src_regd->n_reg_rules * sizeof(struct ieee80211_reg_rule); + size_of_wmms = src_regd->n_wmm_rules * + 
sizeof(struct ieee80211_wmm_rule); - regd = kzalloc(size_of_regd, GFP_KERNEL); + regd = kzalloc(size_of_regd + size_of_wmms, GFP_KERNEL); if (!regd) return ERR_PTR(-ENOMEM); memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain)); - for (i = 0; i < src_regd->n_reg_rules; i++) + d_wmm = (struct ieee80211_wmm_rule *)((u8 *)regd + size_of_regd); + s_wmm = (struct ieee80211_wmm_rule *)((u8 *)src_regd + size_of_regd); + memcpy(d_wmm, s_wmm, size_of_wmms); + + for (i = 0; i < src_regd->n_reg_rules; i++) { memcpy(&regd->reg_rules[i], &src_regd->reg_rules[i], sizeof(struct ieee80211_reg_rule)); + if (!src_regd->reg_rules[i].wmm_rule) + continue; + regd->reg_rules[i].wmm_rule = d_wmm + + (src_regd->reg_rules[i].wmm_rule - s_wmm) / + sizeof(struct ieee80211_wmm_rule); + } return regd; } @@ -595,6 +609,17 @@ enum fwdb_flags { FWDB_FLAG_AUTO_BW = BIT(4), }; +struct fwdb_wmm_ac { + u8 ecw; + u8 aifsn; + __be16 cot; +} __packed; + +struct fwdb_wmm_rule { + struct fwdb_wmm_ac client[IEEE80211_NUM_ACS]; + struct fwdb_wmm_ac ap[IEEE80211_NUM_ACS]; +} __packed; + struct fwdb_rule { u8 len; u8 flags; @@ -602,6 +627,7 @@ struct fwdb_rule { __be32 start, end, max_bw; /* start of optional data */ __be16 cac_timeout; + __be16 wmm_ptr; } __packed __aligned(4); #define FWDB_MAGIC 0x52474442 @@ -613,6 +639,31 @@ struct fwdb_header { struct fwdb_country country[]; } __packed __aligned(4); +static int ecw2cw(int ecw) +{ + return (1 << ecw) - 1; +} + +static bool valid_wmm(struct fwdb_wmm_rule *rule) +{ + struct fwdb_wmm_ac *ac = (struct fwdb_wmm_ac *)rule; + int i; + + for (i = 0; i < IEEE80211_NUM_ACS * 2; i++) { + u16 cw_min = ecw2cw((ac[i].ecw & 0xf0) >> 4); + u16 cw_max = ecw2cw(ac[i].ecw & 0x0f); + u8 aifsn = ac[i].aifsn; + + if (cw_min >= cw_max) + return false; + + if (aifsn < 1) + return false; + } + + return true; +} + static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr) { struct fwdb_rule *rule = (void *)(data + (rule_ptr << 2)); @@ -623,7 +674,18 @@ static bool 
valid_rule(const u8 *data, unsigned int size, u16 rule_ptr) /* mandatory fields */ if (rule->len < offsetofend(struct fwdb_rule, max_bw)) return false; + if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) { + u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; + struct fwdb_wmm_rule *wmm; + + if (wmm_ptr + sizeof(struct fwdb_wmm_rule) > size) + return false; + wmm = (void *)(data + wmm_ptr); + + if (!valid_wmm(wmm)) + return false; + } return true; } @@ -798,23 +860,118 @@ static bool valid_regdb(const u8 *data, unsigned int size) return true; } +static void set_wmm_rule(struct ieee80211_wmm_rule *rule, + struct fwdb_wmm_rule *wmm) +{ + unsigned int i; + + for (i = 0; i < IEEE80211_NUM_ACS; i++) { + rule->client[i].cw_min = + ecw2cw((wmm->client[i].ecw & 0xf0) >> 4); + rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f); + rule->client[i].aifsn = wmm->client[i].aifsn; + rule->client[i].cot = 1000 * be16_to_cpu(wmm->client[i].cot); + rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4); + rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f); + rule->ap[i].aifsn = wmm->ap[i].aifsn; + rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot); + } +} + +static int __regdb_query_wmm(const struct fwdb_header *db, + const struct fwdb_country *country, int freq, + u32 *dbptr, struct ieee80211_wmm_rule *rule) +{ + unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; + struct fwdb_collection *coll = (void *)((u8 *)db + ptr); + int i; + + for (i = 0; i < coll->n_rules; i++) { + __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); + unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; + struct fwdb_rule *rrule = (void *)((u8 *)db + rule_ptr); + struct fwdb_wmm_rule *wmm; + unsigned int wmm_ptr; + + if (rrule->len < offsetofend(struct fwdb_rule, wmm_ptr)) + continue; + + if (freq >= KHZ_TO_MHZ(be32_to_cpu(rrule->start)) && + freq <= KHZ_TO_MHZ(be32_to_cpu(rrule->end))) { + wmm_ptr = be16_to_cpu(rrule->wmm_ptr) << 2; + wmm = (void *)((u8 *)db + wmm_ptr); + 
set_wmm_rule(rule, wmm); + if (dbptr) + *dbptr = wmm_ptr; + return 0; + } + } + + return -ENODATA; +} + +int reg_query_regdb_wmm(char *alpha2, int freq, u32 *dbptr, + struct ieee80211_wmm_rule *rule) +{ + const struct fwdb_header *hdr = regdb; + const struct fwdb_country *country; + + if (IS_ERR(regdb)) + return PTR_ERR(regdb); + + country = &hdr->country[0]; + while (country->coll_ptr) { + if (alpha2_equal(alpha2, country->alpha2)) + return __regdb_query_wmm(regdb, country, freq, dbptr, + rule); + + country++; + } + + return -ENODATA; +} +EXPORT_SYMBOL(reg_query_regdb_wmm); + +struct wmm_ptrs { + struct ieee80211_wmm_rule *rule; + u32 ptr; +}; + +static struct ieee80211_wmm_rule *find_wmm_ptr(struct wmm_ptrs *wmm_ptrs, + u32 wmm_ptr, int n_wmms) +{ + int i; + + for (i = 0; i < n_wmms; i++) { + if (wmm_ptrs[i].ptr == wmm_ptr) + return wmm_ptrs[i].rule; + } + return NULL; +} + static int regdb_query_country(const struct fwdb_header *db, const struct fwdb_country *country) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); struct ieee80211_regdomain *regdom; - unsigned int size_of_regd; - unsigned int i; + struct ieee80211_regdomain *tmp_rd; + unsigned int size_of_regd, i, n_wmms = 0; + struct wmm_ptrs *wmm_ptrs; - size_of_regd = - sizeof(struct ieee80211_regdomain) + + size_of_regd = sizeof(struct ieee80211_regdomain) + coll->n_rules * sizeof(struct ieee80211_reg_rule); regdom = kzalloc(size_of_regd, GFP_KERNEL); if (!regdom) return -ENOMEM; + wmm_ptrs = kcalloc(coll->n_rules, sizeof(*wmm_ptrs), GFP_KERNEL); + if (!wmm_ptrs) { + kfree(regdom); + return -ENOMEM; + } + regdom->n_reg_rules = coll->n_rules; regdom->alpha2[0] = country->alpha2[0]; regdom->alpha2[1] = country->alpha2[1]; @@ -851,7 +1008,38 @@ static int regdb_query_country(const struct fwdb_header *db, if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout)) rrule->dfs_cac_ms = 1000 * be16_to_cpu(rule->cac_timeout); + if (rule->len >= 
offsetofend(struct fwdb_rule, wmm_ptr)) { + u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; + struct ieee80211_wmm_rule *wmm_pos = + find_wmm_ptr(wmm_ptrs, wmm_ptr, n_wmms); + struct fwdb_wmm_rule *wmm; + struct ieee80211_wmm_rule *wmm_rule; + + if (wmm_pos) { + rrule->wmm_rule = wmm_pos; + continue; + } + wmm = (void *)((u8 *)db + wmm_ptr); + tmp_rd = krealloc(regdom, size_of_regd + (n_wmms + 1) * + sizeof(struct ieee80211_wmm_rule), + GFP_KERNEL); + + if (!tmp_rd) { + kfree(regdom); + return -ENOMEM; + } + regdom = tmp_rd; + + wmm_rule = (struct ieee80211_wmm_rule *) + ((u8 *)regdom + size_of_regd + n_wmms * + sizeof(struct ieee80211_wmm_rule)); + + set_wmm_rule(wmm_rule, wmm); + wmm_ptrs[n_wmms].ptr = wmm_ptr; + wmm_ptrs[n_wmms++].rule = wmm_rule; + } } + kfree(wmm_ptrs); return reg_schedule_apply(regdom); } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index fdb3646274a5..5df6b33db786 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1032,6 +1032,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, wdev->current_bss = NULL; wdev->ssid_len = 0; wdev->conn_owner_nlportid = 0; + kzfree(wdev->connect_keys); + wdev->connect_keys = NULL; nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); @@ -1237,17 +1239,38 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) wdev_lock(wdev); if (wdev->conn_owner_nlportid) { - /* - * Use disconnect_bssid if still connecting and ops->disconnect - * not implemented. Otherwise we can use cfg80211_disconnect. 
- */ - if (rdev->ops->disconnect || wdev->current_bss) - cfg80211_disconnect(rdev, wdev->netdev, - WLAN_REASON_DEAUTH_LEAVING, true); - else - cfg80211_mlme_deauth(rdev, wdev->netdev, - wdev->disconnect_bssid, NULL, 0, - WLAN_REASON_DEAUTH_LEAVING, false); + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + cfg80211_leave_ibss(rdev, wdev->netdev, false); + break; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + cfg80211_stop_ap(rdev, wdev->netdev, false); + break; + case NL80211_IFTYPE_MESH_POINT: + cfg80211_leave_mesh(rdev, wdev->netdev); + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + /* + * Use disconnect_bssid if still connecting and + * ops->disconnect not implemented. Otherwise we can + * use cfg80211_disconnect. + */ + if (rdev->ops->disconnect || wdev->current_bss) + cfg80211_disconnect(rdev, wdev->netdev, + WLAN_REASON_DEAUTH_LEAVING, + true); + else + cfg80211_mlme_deauth(rdev, wdev->netdev, + wdev->disconnect_bssid, + NULL, 0, + WLAN_REASON_DEAUTH_LEAVING, + false); + break; + default: + break; + } } wdev_unlock(wdev); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index bcfedd39e7a3..55fb279a5196 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1882,6 +1882,32 @@ TRACE_EVENT(rdev_mgmt_tx, BOOL_TO_STR(__entry->dont_wait_for_ack)) ); +TRACE_EVENT(rdev_tx_control_port, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *buf, size_t len, const u8 *dest, __be16 proto, + bool unencrypted), + TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(dest) + __field(__be16, proto) + __field(bool, unencrypted) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(dest, dest); + __entry->proto = proto; + __entry->unencrypted = unencrypted; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT "," + " proto: 0x%x, unencrypted: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest), + 
be16_to_cpu(__entry->proto), + BOOL_TO_STR(__entry->unencrypted)) +); + TRACE_EVENT(rdev_set_noack_map, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u16 noack_map), @@ -2319,6 +2345,29 @@ TRACE_EVENT(rdev_del_pmk, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(aa)) ); +TRACE_EVENT(rdev_external_auth, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_external_auth_params *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(bssid) + __array(u8, ssid, IEEE80211_MAX_SSID_LEN + 1) + __field(u16, status) + ), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(bssid, params->bssid); + memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1); + memcpy(__entry->ssid, params->ssid.ssid, + params->ssid.ssid_len); + __entry->status = params->status; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT + ", ssid: %s, status: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->bssid, __entry->ssid, __entry->status) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ @@ -2577,6 +2626,27 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) ); +TRACE_EVENT(cfg80211_rx_control_port, + TP_PROTO(struct net_device *netdev, const u8 *buf, size_t len, + const u8 *addr, u16 proto, bool unencrypted), + TP_ARGS(netdev, buf, len, addr, proto, unencrypted), + TP_STRUCT__entry( + NETDEV_ENTRY + MAC_ENTRY(addr) + __field(u16, proto) + __field(bool, unencrypted) + ), + TP_fast_assign( + NETDEV_ASSIGN; + MAC_ASSIGN(addr, addr); + __entry->proto = proto; + __entry->unencrypted = unencrypted; + ), + TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT " proto: 0x%x, unencrypted: %s", + NETDEV_PR_ARG, MAC_PR_ARG(addr), + __entry->proto, BOOL_TO_STR(__entry->unencrypted)) +); + TRACE_EVENT(cfg80211_cqm_rssi_notify, TP_PROTO(struct net_device *netdev, enum 
nl80211_cqm_rssi_threshold_event rssi_event, @@ -3114,7 +3184,7 @@ TRACE_EVENT(rdev_start_radar_detection, TRACE_EVENT(rdev_set_mcast_rate, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - int mcast_rate[NUM_NL80211_BANDS]), + int *mcast_rate), TP_ARGS(wiphy, netdev, mcast_rate), TP_STRUCT__entry( WIPHY_ENTRY diff --git a/net/wireless/util.c b/net/wireless/util.c index c69160694b6c..d112e9a89364 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -420,7 +420,8 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, - const u8 *addr, enum nl80211_iftype iftype) + const u8 *addr, enum nl80211_iftype iftype, + u8 data_offset) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct { @@ -434,7 +435,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; - hdrlen = ieee80211_hdrlen(hdr->frame_control); + hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; if (skb->len < hdrlen + 8) return -1; diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 9efbfc753347..5e677dac2a0c 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -347,13 +347,13 @@ void wireless_nlevent_flush(void) struct sk_buff *skb; struct net *net; - ASSERT_RTNL(); - + down_read(&net_rwsem); for_each_net(net) { while ((skb = skb_dequeue(&net->wext_nlevents))) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); } + up_read(&net_rwsem); } EXPORT_SYMBOL_GPL(wireless_nlevent_flush); @@ -410,9 +410,7 @@ subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. 
*/ static void wireless_nlevent_process(struct work_struct *work) { - rtnl_lock(); wireless_nlevent_flush(); - rtnl_unlock(); } static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process); diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c index 5511f989ef47..b4c464594a5e 100644 --- a/net/wireless/wext-proc.c +++ b/net/wireless/wext-proc.c @@ -142,7 +142,7 @@ static const struct file_operations wireless_seq_fops = { int __net_init wext_proc_init(struct net *net) { /* Create /proc/net/wireless entry */ - if (!proc_create("wireless", S_IRUGO, net->proc_net, + if (!proc_create("wireless", 0444, net->proc_net, &wireless_seq_fops)) return -ENOMEM; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 562cc11131f6..d49aa79b7997 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -896,7 +896,7 @@ out: } static int x25_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; struct sock *sk = sock->sk; @@ -913,7 +913,7 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr, sx25->sx25_addr = x25->source_addr; sx25->sx25_family = AF_X25; - *uaddr_len = sizeof(*sx25); + rc = sizeof(*sx25); out: return rc; diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c index 0917f047f2cf..64b415e93f6a 100644 --- a/net/x25/x25_proc.c +++ b/net/x25/x25_proc.c @@ -212,16 +212,16 @@ int __init x25_proc_init(void) if (!proc_mkdir("x25", init_net.proc_net)) return -ENOMEM; - if (!proc_create("x25/route", S_IRUGO, init_net.proc_net, - &x25_seq_route_fops)) + if (!proc_create("x25/route", 0444, init_net.proc_net, + &x25_seq_route_fops)) goto out; - if (!proc_create("x25/socket", S_IRUGO, init_net.proc_net, - &x25_seq_socket_fops)) + if (!proc_create("x25/socket", 0444, init_net.proc_net, + &x25_seq_socket_fops)) goto out; - if (!proc_create("x25/forward", S_IRUGO, init_net.proc_net, - &x25_seq_forward_fops)) + if (!proc_create("x25/forward", 0444, 
init_net.proc_net, + &x25_seq_forward_fops)) goto out; return 0; diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index db0b1315d577..9c214ec681ac 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -335,8 +335,7 @@ int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q, } } - pr_debug("invalid PLP frame %02X %02X %02X\n", - frame[0], frame[1], frame[2]); + pr_debug("invalid PLP frame %3ph\n", frame); return X25_ILLEGAL; } diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 8e70291e586a..175941e15a6e 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -217,7 +217,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) if (skb->len <= mtu) goto ok; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) goto ok; } @@ -350,7 +350,7 @@ static struct notifier_block xfrm_dev_notifier = { .notifier_call = xfrm_dev_event, }; -void __net_init xfrm_dev_init(void) +void __init xfrm_dev_init(void) { register_netdevice_notifier(&xfrm_dev_notifier); } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 1472c0857975..352abca2605f 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -9,6 +9,7 @@ */ #include <linux/bottom_half.h> +#include <linux/cache.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/module.h> @@ -26,12 +27,18 @@ struct xfrm_trans_tasklet { }; struct xfrm_trans_cb { + union { + struct inet_skb_parm h4; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_skb_parm h6; +#endif + } header; int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); }; #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) -static struct kmem_cache *secpath_cachep __read_mostly; +static struct kmem_cache *secpath_cachep __ro_after_init; static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; diff 
--git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index ccfdc7115a83..a00ec715aa46 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -283,7 +283,7 @@ static struct crypto_comp * __percpu *ipcomp_alloc_tfms(const char *alg_name) struct crypto_comp *tfm; /* This can be any valid CPU ID so we don't need locking. */ - tfm = __this_cpu_read(*pos->tfms); + tfm = this_cpu_read(*pos->tfms); if (!strcmp(crypto_comp_name(tfm), alg_name)) { pos->users++; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 23468672a767..89b178a78dc7 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -285,8 +285,9 @@ void xfrm_local_error(struct sk_buff *skb, int mtu) return; afinfo = xfrm_state_get_afinfo(proto); - if (afinfo) + if (afinfo) { afinfo->local_error(skb, mtu); - rcu_read_unlock(); + rcu_read_unlock(); + } } EXPORT_SYMBOL_GPL(xfrm_local_error); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7a23078132cf..40b54cc64243 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -51,7 +51,7 @@ static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; -static struct kmem_cache *xfrm_dst_cache __read_mostly; +static struct kmem_cache *xfrm_dst_cache __ro_after_init; static __read_mostly seqcount_t xfrm_policy_hash_generation; static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); @@ -1458,10 +1458,13 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, static int xfrm_get_tos(const struct flowi *fl, int family) { const struct xfrm_policy_afinfo *afinfo; - int tos = 0; + int tos; afinfo = xfrm_policy_get_afinfo(family); - tos = afinfo ? 
afinfo->get_tos(fl) : 0; + if (!afinfo) + return 0; + + tos = afinfo->get_tos(fl); rcu_read_unlock(); @@ -1740,7 +1743,7 @@ static void xfrm_pcpu_work_fn(struct work_struct *work) void xfrm_policy_cache_flush(void) { struct xfrm_dst *old; - bool found = 0; + bool found = false; int cpu; might_sleep(); @@ -1891,7 +1894,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); - dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, 0); + dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) goto purge_queue; @@ -2729,14 +2732,14 @@ static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; + dst = xfrm_dst_child(dst); + if (xfrm->props.mode == XFRM_MODE_TRANSPORT) continue; if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; - - dst = xfrm_dst_child(dst); } return daddr; } @@ -2892,8 +2895,6 @@ static int __net_init xfrm_policy_init(struct net *net) INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); - if (net_eq(net, &init_net)) - xfrm_dev_init(); return 0; out_bydst: @@ -2996,6 +2997,7 @@ void __init xfrm_init(void) INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn); register_pernet_subsys(&xfrm_net_ops); + xfrm_dev_init(); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); } diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index 6d5f85f4e672..ed06903cd84d 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -79,7 +79,7 @@ static const struct file_operations xfrm_statistics_seq_fops = { int __net_init xfrm_proc_init(struct net *net) { - if (!proc_create("xfrm_stat", S_IRUGO, net->proc_net, + if (!proc_create("xfrm_stat", 0444, net->proc_net, 
&xfrm_statistics_seq_fops)) return -ENOMEM; return 0; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 1d38c6acf8af..9e3a5e85f828 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -660,7 +660,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff } else { XFRM_SKB_CB(skb)->seq.output.low = oseq + 1; XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi; - xo->seq.low = oseq = oseq + 1; + xo->seq.low = oseq + 1; xo->seq.hi = oseq_hi; oseq += skb_shinfo(skb)->gso_segs; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 54e21f19d722..f9d2f2233f09 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2056,6 +2056,11 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + return -EOPNOTSUPP; +#endif + if (!optval && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7f52b8eb177d..080035f056d9 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -121,22 +121,17 @@ static inline int verify_replay(struct xfrm_usersa_info *p, struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; struct xfrm_replay_state_esn *rs; - if (p->flags & XFRM_STATE_ESN) { - if (!rt) - return -EINVAL; + if (!rt) + return (p->flags & XFRM_STATE_ESN) ? -EINVAL : 0; - rs = nla_data(rt); + rs = nla_data(rt); - if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) - return -EINVAL; - - if (nla_len(rt) < (int)xfrm_replay_state_esn_len(rs) && - nla_len(rt) != sizeof(*rs)) - return -EINVAL; - } + if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) + return -EINVAL; - if (!rt) - return 0; + if (nla_len(rt) < (int)xfrm_replay_state_esn_len(rs) && + nla_len(rt) != sizeof(*rs)) + return -EINVAL; /* As only ESP and AH support ESN feature. 
*/ if ((p->id.proto != IPPROTO_ESP) && (p->id.proto != IPPROTO_AH)) |