diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-17 22:44:48 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-17 22:44:48 +0300 |
commit | 9bd553929f68921be0f2014dd06561e0c8249a0d (patch) | |
tree | 720e556374e3500af9a0210178fabfc6bd0f754c /drivers/infiniband/ulp/ipoib | |
parent | 022ff62c3d8c3758d15ccc6b58615fd8f257ba85 (diff) | |
parent | 0a3173a5f09bc58a3638ecfd0a80bdbae55e123c (diff) | |
download | linux-9bd553929f68921be0f2014dd06561e0c8249a0d.tar.xz |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"This has been a large cycle for RDMA, with several major patch series
reworking parts of the core code.
- Rework the so-called 'gid cache' and internal APIs to use a kref'd
pointer to a struct instead of copying, push this upwards into the
callers and add more stuff to the struct. The new design avoids
some ugly races the old one suffered with. This is part of the
namespace enablement work as the new struct is learning to be
namespace aware.
- Various uapi cleanups, moving more stuff to include/uapi and fixing
some long-standing bugs that have recently been discovered.
- Driver updates for mlx5, mlx4, i40iw, rxe, cxgb4, hfi1, usnic,
pvrdma, and hns
- Provide max_send_sge and max_recv_sge attributes to better support
HW where these values are asymmetric.
- mlx5 user API 'devx' allows sending commands directly to the device
FW, instead of trying to cram every wild and niche feature into the
common API. Sort of like what GPU does.
- Major write() and ioctl() API rework to cleanly support PCI device
hot unplug and advance the ioctl conversion work
- Sparse and compile warning cleanups
- Add 'const' to the ib_poll_cq() signature, and permit a NULL
'bad_wr', which is the common use case
- Various patches to avoid high order allocations across the stack
- SRQ support for cxgb4, hns and qedr
- Changes to IPoIB to better follow the netdev model for working with
struct net_device lifetime"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (312 commits)
Revert "net/smc: Replace ib_query_gid with rdma_get_gid_attr"
RDMA/hns: Fix usage of bitmap allocation functions return values
IB/core: Change filter function return type from int to bool
IB/core: Update GID entries for netdevice whose mac address changes
IB/core: Add default GIDs of the bond master netdev
IB/core: Consider adding default GIDs of bond device
IB/core: Delete lower netdevice default GID entries in bonding scenario
IB/core: Avoid confusing del_netdev_default_ips
IB/core: Add comment for change upper netevent handling
qedr: Add user space support for SRQ
qedr: Add support for kernel mode SRQ's
qedr: Add wrapping generic structure for qpidr and adjust idr routines.
IB/mlx5: Fix leaking stack memory to userspace
Update the e-mail address of Bart Van Assche
IB/ucm: Fix compiling ucm.c
IB/uverbs: Do not check for device disassociation during ioctl
IB/uverbs: Remove struct uverbs_root_spec and all supporting code
IB/uverbs: Use uverbs_api to unmarshal ioctl commands
IB/uverbs: Use uverbs_alloc for allocations
IB/uverbs: Add a simple allocator to uverbs_attr_bundle
...
Diffstat (limited to 'drivers/infiniband/ulp/ipoib')
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib.h | 32 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_cm.c | 81 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 2 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_fs.c | 6 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_ib.c | 15 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_main.c | 444 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 7 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 23 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 8 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 261 |
10 files changed, 480 insertions, 399 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index a50b062ed13e..1abe3c62f106 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -91,11 +91,9 @@ enum { IPOIB_STOP_REAPER = 7, IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, - IPOIB_STOP_NEIGH_GC = 11, IPOIB_NEIGH_TBL_FLUSH = 12, IPOIB_FLAG_DEV_ADDR_SET = 13, IPOIB_FLAG_DEV_ADDR_CTRL = 14, - IPOIB_FLAG_GOING_DOWN = 15, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -252,11 +250,11 @@ struct ipoib_cm_tx { struct ipoib_neigh *neigh; struct ipoib_path *path; struct ipoib_tx_buf *tx_ring; - unsigned tx_head; - unsigned tx_tail; + unsigned int tx_head; + unsigned int tx_tail; unsigned long flags; u32 mtu; - unsigned max_send_sge; + unsigned int max_send_sge; }; struct ipoib_cm_rx_buf { @@ -325,15 +323,22 @@ struct ipoib_dev_priv { spinlock_t lock; struct net_device *dev; + void (*next_priv_destructor)(struct net_device *dev); struct napi_struct send_napi; struct napi_struct recv_napi; unsigned long flags; + /* + * This protects access to the child_intfs list. + * To READ from child_intfs the RTNL or vlan_rwsem read side must be + * held. To WRITE RTNL and the vlan_rwsem write side must be held (in + * that order) This lock exists because we have a few contexts where + * we need the child_intfs, but do not want to grab the RTNL. 
+ */ struct rw_semaphore vlan_rwsem; struct mutex mcast_mutex; - struct mutex sysfs_mutex; struct rb_root path_tree; struct list_head path_list; @@ -373,8 +378,8 @@ struct ipoib_dev_priv { struct ipoib_rx_buf *rx_ring; struct ipoib_tx_buf *tx_ring; - unsigned tx_head; - unsigned tx_tail; + unsigned int tx_head; + unsigned int tx_tail; struct ib_sge tx_sge[MAX_SKB_FRAGS + 1]; struct ib_ud_wr tx_wr; struct ib_wc send_wc[MAX_SEND_CQE]; @@ -404,7 +409,7 @@ struct ipoib_dev_priv { #endif u64 hca_caps; struct ipoib_ethtool_st ethtool; - unsigned max_send_sge; + unsigned int max_send_sge; bool sm_fullmember_sendonly_support; const struct net_device_ops *rn_ops; }; @@ -414,7 +419,7 @@ struct ipoib_ah { struct ib_ah *ah; struct list_head list; struct kref ref; - unsigned last_send; + unsigned int last_send; int valid; }; @@ -483,6 +488,7 @@ static inline void ipoib_put_ah(struct ipoib_ah *ah) kref_put(&ah->ref, ipoib_free_ah); } int ipoib_open(struct net_device *dev); +void ipoib_intf_free(struct net_device *dev); int ipoib_add_pkey_attr(struct net_device *dev); int ipoib_add_umcast_attr(struct net_device *dev); @@ -510,9 +516,6 @@ void ipoib_ib_dev_down(struct net_device *dev); int ipoib_ib_dev_stop_default(struct net_device *dev); void ipoib_pkey_dev_check_presence(struct net_device *dev); -int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); -void ipoib_dev_cleanup(struct net_device *dev); - void ipoib_mcast_join_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); @@ -600,7 +603,6 @@ void ipoib_pkey_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); -void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); #define IPOIB_FLAGS_RC 0x80 #define IPOIB_FLAGS_UC 0x40 @@ -729,7 +731,7 @@ void ipoib_cm_dev_stop(struct net_device *dev) 
static inline int ipoib_cm_dev_init(struct net_device *dev) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 6535d9beb24d..ea01b8dd2be6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -78,7 +78,7 @@ static struct ib_send_wr ipoib_cm_rx_drain_wr = { }; static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event); + const struct ib_cm_event *event); static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, u64 mapping[IPOIB_CM_RX_SG]) @@ -94,7 +94,6 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - struct ib_recv_wr *bad_wr; int i, ret; priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; @@ -102,7 +101,7 @@ static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) for (i = 0; i < priv->cm.num_frags; ++i) priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; - ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, NULL); if (unlikely(ret)) { ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1, @@ -120,7 +119,6 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, struct ib_sge *sge, int id) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - struct ib_recv_wr *bad_wr; int i, ret; wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; @@ -128,7 +126,7 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, for (i = 0; i < IPOIB_CM_RX_SG; ++i) sge[i].addr = rx->rx_ring[id].mapping[i]; - ret = ib_post_recv(rx->qp, wr, &bad_wr); + ret = ib_post_recv(rx->qp, wr, NULL); if (unlikely(ret)) { ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); 
ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, @@ -212,7 +210,6 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev, static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) { - struct ib_send_wr *bad_wr; struct ipoib_cm_rx *p; /* We only reserved 1 extra slot in CQ for drain WRs, so @@ -227,7 +224,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) */ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID; - if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) + if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, NULL)) ipoib_warn(priv, "failed to post drain wr\n"); list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); @@ -275,7 +272,7 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, static int ipoib_cm_modify_rx_qp(struct net_device *dev, struct ib_cm_id *cm_id, struct ib_qp *qp, - unsigned psn) + unsigned int psn) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_qp_attr qp_attr; @@ -363,7 +360,7 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i if (!rx->rx_ring) return -ENOMEM; - t = kmalloc(sizeof *t, GFP_KERNEL); + t = kmalloc(sizeof(*t), GFP_KERNEL); if (!t) { ret = -ENOMEM; goto err_free_1; @@ -421,8 +418,9 @@ err_free_1: } static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, - struct ib_qp *qp, struct ib_cm_req_event_param *req, - unsigned psn) + struct ib_qp *qp, + const struct ib_cm_req_event_param *req, + unsigned int psn) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_data data = {}; @@ -432,7 +430,7 @@ static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); rep.private_data = &data; - rep.private_data_len = sizeof data; + rep.private_data_len = sizeof(data); rep.flow_control = 0; rep.rnr_retry_count = req->rnr_retry_count; rep.srq = ipoib_cm_has_srq(dev); @@ -441,16 +439,17 @@ 
static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, return ib_send_cm_rep(cm_id, &rep); } -static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) { struct net_device *dev = cm_id->context; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_rx *p; - unsigned psn; + unsigned int psn; int ret; ipoib_dbg(priv, "REQ arrived\n"); - p = kzalloc(sizeof *p, GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; p->dev = dev; @@ -503,7 +502,7 @@ err_qp: } static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event) + const struct ib_cm_event *event) { struct ipoib_cm_rx *p; struct ipoib_dev_priv *priv; @@ -547,7 +546,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, 0, PAGE_SIZE); --skb_shinfo(skb)->nr_frags; } else { - size = min(length, (unsigned) PAGE_SIZE); + size = min_t(unsigned int, length, PAGE_SIZE); skb_frag_size_set(frag, size); skb->data_len += size; @@ -641,8 +640,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) } } - frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, - (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; + frags = PAGE_ALIGN(wc->byte_len - + min_t(u32, wc->byte_len, IPOIB_CM_HEAD_SIZE)) / + PAGE_SIZE; newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping, GFP_ATOMIC); @@ -657,7 +657,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) } ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping); - memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping); + memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof(*mapping)); ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", wc->byte_len, wc->slid); @@ -698,13 +698,11 @@ static inline int post_send(struct ipoib_dev_priv *priv, unsigned int wr_id, struct ipoib_tx_buf *tx_req) { - struct ib_send_wr 
*bad_wr; - ipoib_build_sge(priv, tx_req); priv->tx_wr.wr.wr_id = wr_id | IPOIB_OP_CM; - return ib_post_send(tx->qp, &priv->tx_wr.wr, &bad_wr); + return ib_post_send(tx->qp, &priv->tx_wr.wr, NULL); } void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) @@ -712,7 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_tx_buf *tx_req; int rc; - unsigned usable_sge = tx->max_send_sge - !!skb_headlen(skb); + unsigned int usable_sge = tx->max_send_sge - !!skb_headlen(skb); if (unlikely(skb->len > tx->mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", @@ -982,7 +980,8 @@ void ipoib_cm_dev_stop(struct net_device *dev) cancel_delayed_work(&priv->cm.stale_task); } -static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) { struct ipoib_cm_tx *p = cm_id->context; struct ipoib_dev_priv *priv = ipoib_priv(p->dev); @@ -1068,8 +1067,8 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ struct ib_qp *tx_qp; if (dev->features & NETIF_F_SG) - attr.cap.max_send_sge = - min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1); + attr.cap.max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge, + MAX_SKB_FRAGS + 1); tx_qp = ib_create_qp(priv->pd, &attr); tx->max_send_sge = attr.cap.max_send_sge; @@ -1094,7 +1093,7 @@ static int ipoib_cm_send_req(struct net_device *dev, req.qp_num = qp->qp_num; req.qp_type = qp->qp_type; req.private_data = &data; - req.private_data_len = sizeof data; + req.private_data_len = sizeof(data); req.flow_control = 0; req.starting_psn = 0; /* FIXME */ @@ -1152,7 +1151,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, ret = -ENOMEM; goto err_tx; } - memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); + memset(p->tx_ring, 0, 
ipoib_sendq_size * sizeof(*p->tx_ring)); p->qp = ipoib_cm_create_tx_qp(p->dev, p); memalloc_noio_restore(noio_flag); @@ -1248,7 +1247,7 @@ timeout: } static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event) + const struct ib_cm_event *event) { struct ipoib_cm_tx *tx = cm_id->context; struct ipoib_dev_priv *priv = ipoib_priv(tx->dev); @@ -1305,7 +1304,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_tx *tx; - tx = kzalloc(sizeof *tx, GFP_ATOMIC); + tx = kzalloc(sizeof(*tx), GFP_ATOMIC); if (!tx) return NULL; @@ -1370,7 +1369,7 @@ static void ipoib_cm_tx_start(struct work_struct *work) neigh->daddr + QPN_AND_OPTIONS_OFFSET); goto free_neigh; } - memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); + memcpy(&pathrec, &p->path->pathrec, sizeof(pathrec)); spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); @@ -1428,7 +1427,7 @@ static void ipoib_cm_skb_reap(struct work_struct *work) struct net_device *dev = priv->dev; struct sk_buff *skb; unsigned long flags; - unsigned mtu = priv->mcast_mtu; + unsigned int mtu = priv->mcast_mtu; netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); @@ -1518,19 +1517,16 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, { struct net_device *dev = to_net_dev(d); int ret; - struct ipoib_dev_priv *priv = ipoib_priv(dev); - - if (test_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags)) - return -EPERM; - - if (!mutex_trylock(&priv->sysfs_mutex)) - return restart_syscall(); if (!rtnl_trylock()) { - mutex_unlock(&priv->sysfs_mutex); return restart_syscall(); } + if (dev->reg_state != NETREG_REGISTERED) { + rtnl_unlock(); + return -EPERM; + } + ret = ipoib_set_mode(dev, buf); /* The assumption is that the function ipoib_set_mode returned @@ -1539,7 +1535,6 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, */ if (ret != -EBUSY) rtnl_unlock(); - 
mutex_unlock(&priv->sysfs_mutex); return (!ret || ret == -EBUSY) ? count : ret; } @@ -1564,7 +1559,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); if (IS_ERR(priv->cm.srq)) { - if (PTR_ERR(priv->cm.srq) != -ENOSYS) + if (PTR_ERR(priv->cm.srq) != -EOPNOTSUPP) pr_warn("%s: failed to allocate SRQ, error %ld\n", priv->ca->name, PTR_ERR(priv->cm.srq)); priv->cm.srq = NULL; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 2706bf26cbac..83429925dfc6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -102,7 +102,7 @@ static int ipoib_set_coalesce(struct net_device *dev, ret = rdma_set_cq_moderation(priv->recv_cq, coal->rx_max_coalesced_frames, coal->rx_coalesce_usecs); - if (ret && ret != -ENOSYS) { + if (ret && ret != -EOPNOTSUPP) { ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); return ret; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index ea302b054601..178488028734 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -262,15 +262,15 @@ static const struct file_operations ipoib_path_fops = { void ipoib_create_debug_files(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - char name[IFNAMSIZ + sizeof "_path"]; + char name[IFNAMSIZ + sizeof("_path")]; - snprintf(name, sizeof name, "%s_mcg", dev->name); + snprintf(name, sizeof(name), "%s_mcg", dev->name); priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, ipoib_root, dev, &ipoib_mcg_fops); if (!priv->mcg_dentry) ipoib_warn(priv, "failed to create mcg debug file\n"); - snprintf(name, sizeof name, "%s_path", dev->name); + snprintf(name, sizeof(name), "%s_path", dev->name); priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, ipoib_root, dev, &ipoib_path_fops); if 
(!priv->path_dentry) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index f47f9ace1f48..9006a13af1de 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -40,6 +40,7 @@ #include <linux/ip.h> #include <linux/tcp.h> +#include <rdma/ib_cache.h> #include "ipoib.h" @@ -57,7 +58,7 @@ struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ipoib_ah *ah; struct ib_ah *vah; - ah = kmalloc(sizeof *ah, GFP_KERNEL); + ah = kmalloc(sizeof(*ah), GFP_KERNEL); if (!ah) return ERR_PTR(-ENOMEM); @@ -100,7 +101,6 @@ static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, static int ipoib_ib_post_receive(struct net_device *dev, int id) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - struct ib_recv_wr *bad_wr; int ret; priv->rx_wr.wr_id = id | IPOIB_OP_RECV; @@ -108,7 +108,7 @@ static int ipoib_ib_post_receive(struct net_device *dev, int id) priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1]; - ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); + ret = ib_post_recv(priv->qp, &priv->rx_wr, NULL); if (unlikely(ret)) { ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping); @@ -202,7 +202,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) } memcpy(mapping, priv->rx_ring[wr_id].mapping, - IPOIB_UD_RX_SG * sizeof *mapping); + IPOIB_UD_RX_SG * sizeof(*mapping)); /* * If we can't allocate a new RX buffer, dump @@ -541,7 +541,6 @@ static inline int post_send(struct ipoib_dev_priv *priv, struct ipoib_tx_buf *tx_req, void *head, int hlen) { - struct ib_send_wr *bad_wr; struct sk_buff *skb = tx_req->skb; ipoib_build_sge(priv, tx_req); @@ -558,7 +557,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, } else priv->tx_wr.wr.opcode = IB_WR_SEND; - return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr); + return ib_post_send(priv->qp, &priv->tx_wr.wr, NULL); } int ipoib_send(struct 
net_device *dev, struct sk_buff *skb, @@ -568,7 +567,7 @@ int ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_tx_buf *tx_req; int hlen, rc; void *phead; - unsigned usable_sge = priv->max_send_sge - !!skb_headlen(skb); + unsigned int usable_sge = priv->max_send_sge - !!skb_headlen(skb); if (skb_is_gso(skb)) { hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); @@ -1069,7 +1068,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) bool ret = false; netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4); - if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL)) + if (rdma_query_gid(priv->ca, priv->port, 0, &gid0)) return false; netif_addr_lock_bh(priv->dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 26cde95bc0f3..e3d28f9ad9c0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -215,11 +215,6 @@ static int ipoib_stop(struct net_device *dev) return 0; } -static void ipoib_uninit(struct net_device *dev) -{ - ipoib_dev_cleanup(dev); -} - static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -634,7 +629,7 @@ struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) { struct ipoib_path_iter *iter; - iter = kmalloc(sizeof *iter, GFP_KERNEL); + iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return NULL; @@ -770,8 +765,10 @@ static void path_rec_completion(int status, struct rdma_ah_attr av; if (!ib_init_ah_attr_from_path(priv->ca, priv->port, - pathrec, &av)) + pathrec, &av, NULL)) { ah = ipoib_create_ah(dev, priv->pd, &av); + rdma_destroy_ah_attr(&av); + } } spin_lock_irqsave(&priv->lock, flags); @@ -883,7 +880,7 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) if (!priv->broadcast) return NULL; - path = kzalloc(sizeof *path, GFP_ATOMIC); + path = kzalloc(sizeof(*path), 
GFP_ATOMIC); if (!path) return NULL; @@ -1199,11 +1196,13 @@ static void ipoib_timeout(struct net_device *dev) static int ipoib_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, - const void *daddr, const void *saddr, unsigned len) + const void *daddr, + const void *saddr, + unsigned int len) { struct ipoib_header *header; - header = skb_push(skb, sizeof *header); + header = skb_push(skb, sizeof(*header)); header->proto = htons(type); header->reserved = 0; @@ -1306,9 +1305,6 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) int i; LIST_HEAD(remove_list); - if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - return; - spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, @@ -1320,9 +1316,6 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) /* neigh is obsolete if it was idle for two GC periods */ dt = 2 * arp_tbl.gc_interval; neigh_obsolete = jiffies - dt; - /* handle possible race condition */ - if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - goto out_unlock; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; @@ -1360,9 +1353,8 @@ static void ipoib_reap_neigh(struct work_struct *work) __ipoib_reap_neigh(priv); - if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - queue_delayed_work(priv->wq, &priv->neigh_reap_task, - arp_tbl.gc_interval); + queue_delayed_work(priv->wq, &priv->neigh_reap_task, + arp_tbl.gc_interval); } @@ -1371,7 +1363,7 @@ static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, { struct ipoib_neigh *neigh; - neigh = kzalloc(sizeof *neigh, GFP_ATOMIC); + neigh = kzalloc(sizeof(*neigh), GFP_ATOMIC); if (!neigh) return NULL; @@ -1524,9 +1516,8 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); if (!htbl) return -ENOMEM; - set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); size = roundup_pow_of_two(arp_tbl.gc_thresh3); - buckets = kcalloc(size, sizeof(*buckets), GFP_KERNEL); + buckets = kvcalloc(size, 
sizeof(*buckets), GFP_KERNEL); if (!buckets) { kfree(htbl); return -ENOMEM; @@ -1539,7 +1530,6 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) atomic_set(&ntbl->entries, 0); /* start garbage collection */ - clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); @@ -1554,7 +1544,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct ipoib_neigh __rcu **buckets = htbl->buckets; struct ipoib_neigh_table *ntbl = htbl->ntbl; - kfree(buckets); + kvfree(buckets); kfree(htbl); complete(&ntbl->deleted); } @@ -1649,15 +1639,11 @@ out_unlock: static void ipoib_neigh_hash_uninit(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - int stopped; ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); init_completion(&priv->ntbl.deleted); - /* Stop GC if called at init fail need to cancel work */ - stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - if (!stopped) - cancel_delayed_work(&priv->neigh_reap_task); + cancel_delayed_work_sync(&priv->neigh_reap_task); ipoib_flush_neighs(priv); @@ -1755,13 +1741,11 @@ static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, return priv->rn_ops->ndo_do_ioctl(dev, ifr, cmd); } -int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +static int ipoib_dev_init(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret = -ENOMEM; - priv->ca = ca; - priv->port = port; priv->qp = NULL; /* @@ -1777,7 +1761,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) /* create pd, which used both for control and datapath*/ priv->pd = ib_alloc_pd(priv->ca, 0); if (IS_ERR(priv->pd)) { - pr_warn("%s: failed to allocate PD\n", ca->name); + pr_warn("%s: failed to allocate PD\n", priv->ca->name); goto clean_wq; } @@ -1787,7 +1771,8 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) goto out_free_pd; } - if (ipoib_neigh_hash_init(priv) < 0) 
{ + ret = ipoib_neigh_hash_init(priv); + if (ret) { pr_warn("%s failed to init neigh hash\n", dev->name); goto out_dev_uninit; } @@ -1796,12 +1781,15 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) if (ipoib_ib_dev_open(dev)) { pr_warn("%s failed to open device\n", dev->name); ret = -ENODEV; - goto out_dev_uninit; + goto out_hash_uninit; } } return 0; +out_hash_uninit: + ipoib_neigh_hash_uninit(dev); + out_dev_uninit: ipoib_ib_dev_cleanup(dev); @@ -1821,21 +1809,151 @@ out: return ret; } -void ipoib_dev_cleanup(struct net_device *dev) +/* + * This must be called before doing an unregister_netdev on a parent device to + * shut down the IB event handler. + */ +static void ipoib_parent_unregister_pre(struct net_device *ndev) { - struct ipoib_dev_priv *priv = ipoib_priv(dev), *cpriv, *tcpriv; - LIST_HEAD(head); + struct ipoib_dev_priv *priv = ipoib_priv(ndev); - ASSERT_RTNL(); + /* + * ipoib_set_mac checks netif_running before pushing work, clearing + * running ensures that it will not add more work. + */ + rtnl_lock(); + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); + rtnl_unlock(); - /* Delete any child interfaces first */ - list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { - /* Stop GC on child */ - set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); - cancel_delayed_work(&cpriv->neigh_reap_task); - unregister_netdevice_queue(cpriv->dev, &head); + /* ipoib_event() cannot be running once this returns */ + ib_unregister_event_handler(&priv->event_handler); + + /* + * Work on the queue grabs the rtnl lock, so this cannot be done while + * also holding it. 
+ */ + flush_workqueue(ipoib_workqueue); +} + +static void ipoib_set_dev_features(struct ipoib_dev_priv *priv) +{ + priv->hca_caps = priv->ca->attrs.device_cap_flags; + + if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { + priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->hw_features |= NETIF_F_TSO; + + priv->dev->features |= priv->dev->hw_features; + } +} + +static int ipoib_parent_init(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + struct ib_port_attr attr; + int result; + + result = ib_query_port(priv->ca, priv->port, &attr); + if (result) { + pr_warn("%s: ib_query_port %d failed\n", priv->ca->name, + priv->port); + return result; + } + priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", + priv->ca->name, priv->port, result); + return result; } - unregister_netdevice_many(&head); + + result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid); + if (result) { + pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n", + priv->ca->name, priv->port, result); + return result; + } + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, + sizeof(union ib_gid)); + + SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); + priv->dev->dev_id = priv->port - 1; + + return 0; +} + +static void ipoib_child_init(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + dev_hold(priv->parent); + + down_write(&ppriv->vlan_rwsem); + list_add_tail(&priv->list, &ppriv->child_intfs); + up_write(&ppriv->vlan_rwsem); + + priv->max_ib_mtu = ppriv->max_ib_mtu; + set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); + memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); +} + +static int 
ipoib_ndo_init(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + int rc; + + if (priv->parent) { + ipoib_child_init(ndev); + } else { + rc = ipoib_parent_init(ndev); + if (rc) + return rc; + } + + /* MTU will be reset when mcast join happens */ + ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = ndev->mtu; + ndev->max_mtu = IPOIB_CM_MTU; + + ndev->neigh_priv_len = sizeof(struct ipoib_neigh); + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + priv->pkey |= 0x8000; + + ndev->broadcast[8] = priv->pkey >> 8; + ndev->broadcast[9] = priv->pkey & 0xff; + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + ipoib_set_dev_features(priv); + + rc = ipoib_dev_init(ndev); + if (rc) { + pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n", + priv->ca->name, priv->dev->name, priv->port, rc); + } + + return 0; +} + +static void ipoib_ndo_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ASSERT_RTNL(); + + /* + * ipoib_remove_one guarantees the children are removed before the + * parent, and that is the only place where a parent can be removed. 
+ */ + WARN_ON(!list_empty(&priv->child_intfs)); ipoib_neigh_hash_uninit(dev); @@ -1847,6 +1965,16 @@ void ipoib_dev_cleanup(struct net_device *dev) destroy_workqueue(priv->wq); priv->wq = NULL; } + + if (priv->parent) { + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + down_write(&ppriv->vlan_rwsem); + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); + + dev_put(priv->parent); + } } static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) @@ -1894,7 +2022,8 @@ static const struct header_ops ipoib_header_ops = { }; static const struct net_device_ops ipoib_netdev_ops_pf = { - .ndo_uninit = ipoib_uninit, + .ndo_init = ipoib_ndo_init, + .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, @@ -1913,7 +2042,8 @@ static const struct net_device_ops ipoib_netdev_ops_pf = { }; static const struct net_device_ops ipoib_netdev_ops_vf = { - .ndo_uninit = ipoib_uninit, + .ndo_init = ipoib_ndo_init, + .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, @@ -1945,6 +2075,13 @@ void ipoib_setup_common(struct net_device *dev) netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); + + /* + * unregister_netdev always frees the netdev, we use this mode + * consistently to unify all the various unregister paths, including + * those connected to rtnl_link_ops which require it. 
+ */ + dev->needs_free_netdev = true; } static void ipoib_build_priv(struct net_device *dev) @@ -1955,7 +2092,6 @@ static void ipoib_build_priv(struct net_device *dev) spin_lock_init(&priv->lock); init_rwsem(&priv->vlan_rwsem); mutex_init(&priv->mcast_mutex); - mutex_init(&priv->sysfs_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); @@ -1999,9 +2135,7 @@ static struct net_device rn->send = ipoib_send; rn->attach_mcast = ipoib_mcast_attach; rn->detach_mcast = ipoib_mcast_detach; - rn->free_rdma_netdev = free_netdev; rn->hca = hca; - dev->netdev_ops = &ipoib_netdev_default_pf; return dev; @@ -2039,6 +2173,9 @@ struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port, if (!priv) return NULL; + priv->ca = hca; + priv->port = port; + dev = ipoib_get_netdev(hca, port, name); if (!dev) goto free_priv; @@ -2053,6 +2190,15 @@ struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port, rn = netdev_priv(dev); rn->clnt_priv = priv; + + /* + * Only the child register_netdev flows can handle priv_destructor + * being set, so we force it to NULL here and handle manually until it + * is safe to turn on. + */ + priv->next_priv_destructor = dev->priv_destructor; + dev->priv_destructor = NULL; + ipoib_build_priv(dev); return priv; @@ -2061,6 +2207,27 @@ free_priv: return NULL; } +void ipoib_intf_free(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + + dev->priv_destructor = priv->next_priv_destructor; + if (dev->priv_destructor) + dev->priv_destructor(dev); + + /* + * There are some error flows around register_netdev failing that may + * attempt to call priv_destructor twice, prevent that from happening. + */ + dev->priv_destructor = NULL; + + /* unregister/destroy is very complicated. Make bugs more obvious. 
*/ + rn->clnt_priv = NULL; + + kfree(priv); +} + static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, char *buf) { @@ -2186,12 +2353,6 @@ static ssize_t create_child(struct device *dev, if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) return -EINVAL; - /* - * Set the full membership bit, so that we join the right - * broadcast group, etc. - */ - pkey |= 0x8000; - ret = ipoib_vlan_add(to_net_dev(dev), pkey); return ret ? ret : count; @@ -2223,87 +2384,19 @@ int ipoib_add_pkey_attr(struct net_device *dev) return device_create_file(&dev->dev, &dev_attr_pkey); } -void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) -{ - priv->hca_caps = hca->attrs.device_cap_flags; - - if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { - priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; - - if (priv->hca_caps & IB_DEVICE_UD_TSO) - priv->dev->hw_features |= NETIF_F_TSO; - - priv->dev->features |= priv->dev->hw_features; - } -} - static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; - struct ib_port_attr attr; - struct rdma_netdev *rn; - int result = -ENOMEM; + struct net_device *ndev; + int result; priv = ipoib_intf_alloc(hca, port, format); if (!priv) { pr_warn("%s, %d: ipoib_intf_alloc failed\n", hca->name, port); - goto alloc_mem_failed; - } - - SET_NETDEV_DEV(priv->dev, hca->dev.parent); - priv->dev->dev_id = port - 1; - - result = ib_query_port(hca, port, &attr); - if (result) { - pr_warn("%s: ib_query_port %d failed\n", hca->name, port); - goto device_init_failed; - } - - priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); - - /* MTU will be reset when mcast join happens */ - priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); - priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; - priv->dev->max_mtu = IPOIB_CM_MTU; - - priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh); - - result = ib_query_pkey(hca, port, 0, &priv->pkey); - if (result) { - pr_warn("%s: 
ib_query_pkey port %d failed (ret = %d)\n", - hca->name, port, result); - goto device_init_failed; - } - - ipoib_set_dev_features(priv, hca); - - /* - * Set the full membership bit, so that we join the right - * broadcast group, etc. - */ - priv->pkey |= 0x8000; - - priv->dev->broadcast[8] = priv->pkey >> 8; - priv->dev->broadcast[9] = priv->pkey & 0xff; - - result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); - if (result) { - pr_warn("%s: ib_query_gid port %d failed (ret = %d)\n", - hca->name, port, result); - goto device_init_failed; - } - - memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, - sizeof(union ib_gid)); - set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); - - result = ipoib_dev_init(priv->dev, hca, port); - if (result) { - pr_warn("%s: failed to initialize port %d (ret = %d)\n", - hca->name, port, result); - goto device_init_failed; + return ERR_PTR(-ENOMEM); } + ndev = priv->dev; INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); @@ -2312,46 +2405,43 @@ static struct net_device *ipoib_add_port(const char *format, /* call event handler to ensure pkey in sync */ queue_work(ipoib_workqueue, &priv->flush_heavy); - result = register_netdev(priv->dev); + result = register_netdev(ndev); if (result) { pr_warn("%s: couldn't register ipoib port %d; error %d\n", hca->name, port, result); - goto register_failed; + + ipoib_parent_unregister_pre(ndev); + ipoib_intf_free(ndev); + free_netdev(ndev); + + return ERR_PTR(result); } - result = -ENOMEM; - if (ipoib_cm_add_mode_attr(priv->dev)) + /* + * We cannot set priv_destructor before register_netdev because we + * need priv to be always valid during the error flow to execute + * ipoib_parent_unregister_pre(). Instead handle it manually and only + * enter priv_destructor mode once we are completely registered. 
+ */ + ndev->priv_destructor = ipoib_intf_free; + + if (ipoib_cm_add_mode_attr(ndev)) goto sysfs_failed; - if (ipoib_add_pkey_attr(priv->dev)) + if (ipoib_add_pkey_attr(ndev)) goto sysfs_failed; - if (ipoib_add_umcast_attr(priv->dev)) + if (ipoib_add_umcast_attr(ndev)) goto sysfs_failed; - if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) + if (device_create_file(&ndev->dev, &dev_attr_create_child)) goto sysfs_failed; - if (device_create_file(&priv->dev->dev, &dev_attr_delete_child)) + if (device_create_file(&ndev->dev, &dev_attr_delete_child)) goto sysfs_failed; - return priv->dev; + return ndev; sysfs_failed: - unregister_netdev(priv->dev); - -register_failed: - ib_unregister_event_handler(&priv->event_handler); - flush_workqueue(ipoib_workqueue); - /* Stop GC if started before flush */ - set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(priv->wq); - ipoib_dev_cleanup(priv->dev); - -device_init_failed: - rn = netdev_priv(priv->dev); - rn->free_rdma_netdev(priv->dev); - kfree(priv); - -alloc_mem_failed: - return ERR_PTR(result); + ipoib_parent_unregister_pre(ndev); + unregister_netdev(ndev); + return ERR_PTR(-ENOMEM); } static void ipoib_add_one(struct ib_device *device) @@ -2362,7 +2452,7 @@ static void ipoib_add_one(struct ib_device *device) int p; int count = 0; - dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); + dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL); if (!dev_list) return; @@ -2396,39 +2486,18 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { - struct rdma_netdev *parent_rn = netdev_priv(priv->dev); - - ib_unregister_event_handler(&priv->event_handler); - flush_workqueue(ipoib_workqueue); - - /* mark interface in the middle of destruction */ - set_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags); + LIST_HEAD(head); + ipoib_parent_unregister_pre(priv->dev); rtnl_lock(); - 
dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); - rtnl_unlock(); - - /* Stop GC */ - set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(priv->wq); - - /* Wrap rtnl_lock/unlock with mutex to protect sysfs calls */ - mutex_lock(&priv->sysfs_mutex); - unregister_netdev(priv->dev); - mutex_unlock(&priv->sysfs_mutex); - - parent_rn->free_rdma_netdev(priv->dev); - list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { - struct rdma_netdev *child_rn; + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, + list) + unregister_netdevice_queue(cpriv->dev, &head); + unregister_netdevice_queue(priv->dev, &head); + unregister_netdevice_many(&head); - child_rn = netdev_priv(cpriv->dev); - child_rn->free_rdma_netdev(cpriv->dev); - kfree(cpriv); - } - - kfree(priv); + rtnl_unlock(); } kfree(dev_list); @@ -2476,8 +2545,7 @@ static int __init ipoib_init_module(void) * its private workqueue, and we only queue up flush events * on our global flush workqueue. This avoids the deadlocks. */ - ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", - WQ_MEM_RECLAIM); + ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 6709328d90f8..b9e9562f5034 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -140,7 +140,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, { struct ipoib_mcast *mcast; - mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC); + mcast = kzalloc(sizeof(*mcast), can_sleep ? 
GFP_KERNEL : GFP_ATOMIC); if (!mcast) return NULL; @@ -822,6 +822,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) if (neigh && list_empty(&neigh->list)) { kref_get(&mcast->ah->ref); neigh->ah = mcast->ah; + neigh->ah->valid = 1; list_add_tail(&neigh->list, &mcast->neigh_list); } } @@ -917,7 +918,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) continue; - memcpy(mgid.raw, ha->addr + 4, sizeof mgid); + memcpy(mgid.raw, ha->addr + 4, sizeof(mgid)); mcast = __ipoib_mcast_find(dev, &mgid); if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { @@ -997,7 +998,7 @@ struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) { struct ipoib_mcast_iter *iter; - iter = kmalloc(sizeof *iter, GFP_KERNEL); + iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return NULL; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index 3e44087935ae..d4d553a51fa9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -122,15 +122,6 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, } else child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]); - if (child_pkey == 0 || child_pkey == 0x8000) - return -EINVAL; - - /* - * Set the full membership bit, so that we join the right - * broadcast group, etc. 
- */ - child_pkey |= 0x8000; - err = __ipoib_vlan_add(ppriv, ipoib_priv(dev), child_pkey, IPOIB_RTNL_CHILD); @@ -139,19 +130,6 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, return err; } -static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head) -{ - struct ipoib_dev_priv *priv, *ppriv; - - priv = ipoib_priv(dev); - ppriv = ipoib_priv(priv->parent); - - down_write(&ppriv->vlan_rwsem); - unregister_netdevice_queue(dev, head); - list_del(&priv->list); - up_write(&ppriv->vlan_rwsem); -} - static size_t ipoib_get_size(const struct net_device *dev) { return nla_total_size(2) + /* IFLA_IPOIB_PKEY */ @@ -167,7 +145,6 @@ static struct rtnl_link_ops ipoib_link_ops __read_mostly = { .setup = ipoib_setup_common, .newlink = ipoib_new_child_link, .changelink = ipoib_changelink, - .dellink = ipoib_unregister_child_dev, .get_size = ipoib_get_size, .fill_info = ipoib_fill_info, }; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 984a88096f39..9f36ca786df8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -52,7 +52,7 @@ int ipoib_mcast_attach(struct net_device *dev, struct ib_device *hca, if (set_qkey) { ret = -ENOMEM; - qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + qp_attr = kmalloc(sizeof(*qp_attr), GFP_KERNEL); if (!qp_attr) goto out; @@ -147,7 +147,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) .cap = { .max_send_wr = ipoib_sendq_size, .max_recv_wr = ipoib_recvq_size, - .max_send_sge = min_t(u32, priv->ca->attrs.max_sge, + .max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge, MAX_SKB_FRAGS + 1), .max_recv_sge = IPOIB_UD_RX_SG }, @@ -168,8 +168,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) else size += ipoib_recvq_size * ipoib_max_conn_qp; } else - if (ret != -ENOSYS) - return -ENODEV; + if (ret != -EOPNOTSUPP) + return ret; 
req_vec = (priv->port - 1) * 2; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 55a9b71ed05a..341753fbda54 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -50,68 +50,112 @@ static ssize_t show_parent(struct device *d, struct device_attribute *attr, } static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); +static bool is_child_unique(struct ipoib_dev_priv *ppriv, + struct ipoib_dev_priv *priv) +{ + struct ipoib_dev_priv *tpriv; + + ASSERT_RTNL(); + + /* + * Since the legacy sysfs interface uses pkey for deletion it cannot + * support more than one interface with the same pkey, it creates + * ambiguity. The RTNL interface deletes using the netdev so it does + * not have a problem to support duplicated pkeys. + */ + if (priv->child_type != IPOIB_LEGACY_CHILD) + return true; + + /* + * First ensure this isn't a duplicate. We check the parent device and + * then all of the legacy child interfaces to make sure the Pkey + * doesn't match. + */ + if (ppriv->pkey == priv->pkey) + return false; + + list_for_each_entry(tpriv, &ppriv->child_intfs, list) { + if (tpriv->pkey == priv->pkey && + tpriv->child_type == IPOIB_LEGACY_CHILD) + return false; + } + + return true; +} + +/* + * NOTE: If this function fails then the priv->dev will remain valid, however + * priv can have been freed and must not be touched by caller in the error + * case. + * + * If (ndev->reg_state == NETREG_UNINITIALIZED) then it is up to the caller to + * free the net_device (just as rtnl_newlink does) otherwise the net_device + * will be freed when the rtnl is unlocked. 
+ */ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, u16 pkey, int type) { + struct net_device *ndev = priv->dev; int result; - priv->max_ib_mtu = ppriv->max_ib_mtu; - /* MTU will be reset when mcast join happens */ - priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); - priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; - priv->parent = ppriv->dev; - set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + ASSERT_RTNL(); + + /* + * Racing with unregister of the parent must be prevented by the + * caller. + */ + WARN_ON(ppriv->dev->reg_state != NETREG_REGISTERED); - ipoib_set_dev_features(priv, ppriv->ca); + if (pkey == 0 || pkey == 0x8000) { + result = -EINVAL; + goto out_early; + } + priv->parent = ppriv->dev; priv->pkey = pkey; + priv->child_type = type; - memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); - memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); - set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); - priv->dev->broadcast[8] = pkey >> 8; - priv->dev->broadcast[9] = pkey & 0xff; - - result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port); - if (result < 0) { - ipoib_warn(ppriv, "failed to initialize subinterface: " - "device %s, port %d", - ppriv->ca->name, ppriv->port); - goto err; + if (!is_child_unique(ppriv, priv)) { + result = -ENOTUNIQ; + goto out_early; } - result = register_netdevice(priv->dev); + /* We do not need to touch priv if register_netdevice fails */ + ndev->priv_destructor = ipoib_intf_free; + + result = register_netdevice(ndev); if (result) { ipoib_warn(priv, "failed to initialize; error %i", result); - goto register_failed; + + /* + * register_netdevice sometimes calls priv_destructor, + * sometimes not. Make sure it was done. 
+ */ + goto out_early; } /* RTNL childs don't need proprietary sysfs entries */ if (type == IPOIB_LEGACY_CHILD) { - if (ipoib_cm_add_mode_attr(priv->dev)) + if (ipoib_cm_add_mode_attr(ndev)) goto sysfs_failed; - if (ipoib_add_pkey_attr(priv->dev)) + if (ipoib_add_pkey_attr(ndev)) goto sysfs_failed; - if (ipoib_add_umcast_attr(priv->dev)) + if (ipoib_add_umcast_attr(ndev)) goto sysfs_failed; - if (device_create_file(&priv->dev->dev, &dev_attr_parent)) + if (device_create_file(&ndev->dev, &dev_attr_parent)) goto sysfs_failed; } - priv->child_type = type; - list_add_tail(&priv->list, &ppriv->child_intfs); - return 0; sysfs_failed: - result = -ENOMEM; unregister_netdevice(priv->dev); + return -ENOMEM; -register_failed: - ipoib_dev_cleanup(priv->dev); - -err: +out_early: + if (ndev->priv_destructor) + ndev->priv_destructor(ndev); return result; } @@ -119,129 +163,124 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) { struct ipoib_dev_priv *ppriv, *priv; char intf_name[IFNAMSIZ]; - struct ipoib_dev_priv *tpriv; + struct net_device *ndev; int result; if (!capable(CAP_NET_ADMIN)) return -EPERM; - ppriv = ipoib_priv(pdev); - - if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) - return -EPERM; - - snprintf(intf_name, sizeof intf_name, "%s.%04x", - ppriv->dev->name, pkey); - - if (!mutex_trylock(&ppriv->sysfs_mutex)) + if (!rtnl_trylock()) return restart_syscall(); - if (!rtnl_trylock()) { - mutex_unlock(&ppriv->sysfs_mutex); - return restart_syscall(); - } - - if (!down_write_trylock(&ppriv->vlan_rwsem)) { + if (pdev->reg_state != NETREG_REGISTERED) { rtnl_unlock(); - mutex_unlock(&ppriv->sysfs_mutex); - return restart_syscall(); + return -EPERM; } + ppriv = ipoib_priv(pdev); + + snprintf(intf_name, sizeof(intf_name), "%s.%04x", + ppriv->dev->name, pkey); + priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); if (!priv) { result = -ENOMEM; goto out; } - - /* - * First ensure this isn't a duplicate. 
We check the parent device and - * then all of the legacy child interfaces to make sure the Pkey - * doesn't match. - */ - if (ppriv->pkey == pkey) { - result = -ENOTUNIQ; - goto out; - } - - list_for_each_entry(tpriv, &ppriv->child_intfs, list) { - if (tpriv->pkey == pkey && - tpriv->child_type == IPOIB_LEGACY_CHILD) { - result = -ENOTUNIQ; - goto out; - } - } + ndev = priv->dev; result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); + if (result && ndev->reg_state == NETREG_UNINITIALIZED) + free_netdev(ndev); + out: - up_write(&ppriv->vlan_rwsem); rtnl_unlock(); - mutex_unlock(&ppriv->sysfs_mutex); - if (result && priv) { - struct rdma_netdev *rn; + return result; +} + +struct ipoib_vlan_delete_work { + struct work_struct work; + struct net_device *dev; +}; + +/* + * sysfs callbacks of a netdevice cannot obtain the rtnl lock as + * unregister_netdev ultimately deletes the sysfs files while holding the rtnl + * lock. This deadlocks the system. + * + * A callback can use rtnl_trylock to avoid the deadlock but it cannot call + * unregister_netdev as that internally takes and releases the rtnl_lock. So + * instead we find the netdev to unregister and then do the actual unregister + * from the global work queue where we can obtain the rtnl_lock safely. 
+ */ +static void ipoib_vlan_delete_task(struct work_struct *work) +{ + struct ipoib_vlan_delete_work *pwork = + container_of(work, struct ipoib_vlan_delete_work, work); + struct net_device *dev = pwork->dev; + + rtnl_lock(); + + /* Unregistering tasks can race with another task or parent removal */ + if (dev->reg_state == NETREG_REGISTERED) { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); - rn = netdev_priv(priv->dev); - rn->free_rdma_netdev(priv->dev); - kfree(priv); + ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); + unregister_netdevice(dev); } - return result; + rtnl_unlock(); + + kfree(pwork); } int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) { struct ipoib_dev_priv *ppriv, *priv, *tpriv; - struct net_device *dev = NULL; + int rc; if (!capable(CAP_NET_ADMIN)) return -EPERM; - ppriv = ipoib_priv(pdev); - - if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) - return -EPERM; - - if (!mutex_trylock(&ppriv->sysfs_mutex)) + if (!rtnl_trylock()) return restart_syscall(); - if (!rtnl_trylock()) { - mutex_unlock(&ppriv->sysfs_mutex); - return restart_syscall(); - } - - if (!down_write_trylock(&ppriv->vlan_rwsem)) { + if (pdev->reg_state != NETREG_REGISTERED) { rtnl_unlock(); - mutex_unlock(&ppriv->sysfs_mutex); - return restart_syscall(); + return -EPERM; } + ppriv = ipoib_priv(pdev); + + rc = -ENODEV; list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { - list_del(&priv->list); - dev = priv->dev; + struct ipoib_vlan_delete_work *work; + + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + rc = -ENOMEM; + goto out; + } + + down_write(&ppriv->vlan_rwsem); + list_del_init(&priv->list); + up_write(&ppriv->vlan_rwsem); + work->dev = priv->dev; + INIT_WORK(&work->work, ipoib_vlan_delete_task); + queue_work(ipoib_workqueue, &work->work); + + rc = 0; break; } } - up_write(&ppriv->vlan_rwsem); - - 
if (dev) { - ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); - unregister_netdevice(dev); - } +out: rtnl_unlock(); - mutex_unlock(&ppriv->sysfs_mutex); - - if (dev) { - struct rdma_netdev *rn; - - rn = netdev_priv(dev); - rn->free_rdma_netdev(priv->dev); - kfree(priv); - return 0; - } - return -ENODEV; + return rc; } |