diff options
36 files changed, 450 insertions, 153 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 46e88ed7f41d..ac77a13d2ea2 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -586,6 +586,21 @@ tcp_min_tso_segs - INTEGER if available window is too small. Default: 2 +tcp_pacing_ss_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in slow start, tcp_pacing_ss_ratio is applied + to let TCP probe for bigger speeds, assuming cwnd can be + doubled every other RTT. + Default: 200 + +tcp_pacing_ca_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio + is applied to conservatively probe for bigger throughput. + Default: 120 + tcp_tso_win_divisor - INTEGER This allows control over what percentage of the congestion window can be consumed by a single TSO frame. diff --git a/MAINTAINERS b/MAINTAINERS index 4e6dcb692d30..38df5702ada0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11053,7 +11053,7 @@ F: drivers/input/mouse/vmmouse.c F: drivers/input/mouse/vmmouse.h VMWARE VMXNET3 ETHERNET DRIVER -M: Shreyas Bhatewara <sbhatewara@vmware.com> +M: Shrikrishna Khare <skhare@vmware.com> M: "VMware, Inc." 
<pv-drivers@vmware.com> L: netdev@vger.kernel.org S: Maintained @@ -11078,6 +11078,14 @@ S: Supported F: drivers/regulator/ F: include/linux/regulator/ +VRF +M: David Ahern <dsa@cumulusnetworks.com> +M: Shrijeet Mukherjee <shm@cumulusnetworks.com> +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/vrf.c +F: include/net/vrf.h + VT1211 HARDWARE MONITOR DRIVER M: Juerg Haefliger <juergh@gmail.com> L: lm-sensors@lm-sensors.org diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c index bab01c849165..48ce83e443c2 100644 --- a/drivers/net/ethernet/allwinner/sun4i-emac.c +++ b/drivers/net/ethernet/allwinner/sun4i-emac.c @@ -28,6 +28,7 @@ #include <linux/of_platform.h> #include <linux/platform_device.h> #include <linux/phy.h> +#include <linux/soc/sunxi/sunxi_sram.h> #include "sun4i-emac.h" @@ -857,11 +858,17 @@ static int emac_probe(struct platform_device *pdev) clk_prepare_enable(db->clk); + ret = sunxi_sram_claim(&pdev->dev); + if (ret) { + dev_err(&pdev->dev, "Error couldn't map SRAM to device\n"); + goto out; + } + db->phy_node = of_parse_phandle(np, "phy", 0); if (!db->phy_node) { dev_err(&pdev->dev, "no associated PHY\n"); ret = -ENODEV; - goto out; + goto out_release_sram; } /* Read MAC-address from DT */ @@ -893,7 +900,7 @@ static int emac_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "Registering netdev failed!\n"); ret = -ENODEV; - goto out; + goto out_release_sram; } dev_info(&pdev->dev, "%s: at %p, IRQ %d MAC: %pM\n", @@ -901,6 +908,8 @@ static int emac_probe(struct platform_device *pdev) return 0; +out_release_sram: + sunxi_sram_release(&pdev->dev); out: dev_err(db->dev, "not found (%d).\n", ret); diff --git a/drivers/net/ethernet/cisco/enic/vnic_dev.c b/drivers/net/ethernet/cisco/enic/vnic_dev.c index 19a49a6e3911..a3badefaf360 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_dev.c +++ b/drivers/net/ethernet/cisco/enic/vnic_dev.c @@ -301,12 +301,12 @@ static int 
_vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, struct devcmd2_result *result = dc2c->result + dc2c->next_result; unsigned int i; int delay, err; - u32 fetch_index, posted, new_posted; + u32 fetch_index, new_posted; + u32 posted = dc2c->posted; - posted = ioread32(&dc2c->wq_ctrl->posted_index); fetch_index = ioread32(&dc2c->wq_ctrl->fetch_index); - if (posted == 0xFFFFFFFF || fetch_index == 0xFFFFFFFF) + if (fetch_index == 0xFFFFFFFF) return -ENODEV; new_posted = (posted + 1) % DEVCMD2_RING_SIZE; @@ -331,6 +331,7 @@ static int _vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, */ wmb(); iowrite32(new_posted, &dc2c->wq_ctrl->posted_index); + dc2c->posted = new_posted; if (dc2c->cmd_ring[posted].flags & DEVCMD2_FNORESULT) return 0; @@ -402,6 +403,7 @@ static int vnic_dev_init_devcmd2(struct vnic_dev *vdev) enic_wq_init_start(&vdev->devcmd2->wq, 0, fetch_index, fetch_index, 0, 0); + vdev->devcmd2->posted = fetch_index; vnic_wq_enable(&vdev->devcmd2->wq); err = vnic_dev_alloc_desc_ring(vdev, &vdev->devcmd2->results_ring, diff --git a/drivers/net/ethernet/cisco/enic/vnic_wq.h b/drivers/net/ethernet/cisco/enic/vnic_wq.h index 8944af935a60..01209613d57d 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_wq.h +++ b/drivers/net/ethernet/cisco/enic/vnic_wq.h @@ -97,6 +97,7 @@ struct devcmd2_controller { int color; struct vnic_dev_ring results_ring; struct vnic_wq wq; + u32 posted; }; static inline unsigned int vnic_wq_desc_avail(struct vnic_wq *wq) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 27ca4596775a..0983a208b299 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -405,7 +405,6 @@ struct mlx5e_channel { __be32 mkey_be; u8 num_tc; unsigned long flags; - int tc_to_txq_map[MLX5E_MAX_NUM_TC]; /* control */ struct mlx5e_priv *priv; @@ -475,6 +474,7 @@ struct mlx5e_priv { /* priv data path fields - start */ int 
default_vlan_prio; struct mlx5e_sq **txq_to_sq_map; + int channeltc_to_txq_map[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC]; /* priv data path fields - end */ unsigned long state; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 55166dd5b4ea..59874d666cff 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -949,13 +949,13 @@ static void mlx5e_close_sqs(struct mlx5e_channel *c) mlx5e_close_sq(&c->sq[tc]); } -static void mlx5e_build_tc_to_txq_map(struct mlx5e_channel *c, - int num_channels) +static void mlx5e_build_channeltc_to_txq_map(struct mlx5e_priv *priv, int ix) { int i; for (i = 0; i < MLX5E_MAX_NUM_TC; i++) - c->tc_to_txq_map[i] = c->ix + i * num_channels; + priv->channeltc_to_txq_map[ix][i] = + ix + i * priv->params.num_channels; } static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, @@ -979,7 +979,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->mkey_be = cpu_to_be32(priv->mr.key); c->num_tc = priv->params.num_tc; - mlx5e_build_tc_to_txq_map(c, priv->params.num_channels); + mlx5e_build_channeltc_to_txq_map(priv, ix); netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 64380bc0cd6a..b73672f32e2c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -106,7 +106,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, priv->default_vlan_prio; int tc = netdev_get_prio_tc_map(dev, up); - return priv->channel[channel_ix]->tc_to_txq_map[tc]; + return priv->channeltc_to_txq_map[channel_ix][tc]; } static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq, diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index a34f4742aa00..045f98fed476 100644 --- 
a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1726,6 +1726,7 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) mlxsw_pci_dbg_root); if (!mlxsw_pci->dbg_dir) { dev_err(&pdev->dev, "Failed to create debugfs dir\n"); + err = -ENOMEM; goto err_dbg_create_dir; } diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index f790f61ea78a..d6d39df5b3dc 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -637,6 +637,9 @@ enum rtl_register_content { /* _TBICSRBit */ TBILinkOK = 0x02000000, + /* ResetCounterCommand */ + CounterReset = 0x1, + /* DumpCounterCommand */ CounterDump = 0x8, @@ -747,6 +750,14 @@ struct rtl8169_counters { __le16 tx_underun; }; +struct rtl8169_tc_offsets { + bool inited; + __le64 tx_errors; + __le32 tx_multi_collision; + __le32 rx_multicast; + __le16 tx_aborted; +}; + enum rtl_flag { RTL_FLAG_TASK_ENABLED, RTL_FLAG_TASK_SLOW_PENDING, @@ -824,6 +835,7 @@ struct rtl8169_private { struct mii_if_info mii; struct rtl8169_counters counters; + struct rtl8169_tc_offsets tc_offset; u32 saved_wolopts; u32 opts1_mask; @@ -2179,6 +2191,73 @@ static int rtl8169_get_sset_count(struct net_device *dev, int sset) } } +static struct rtl8169_counters *rtl8169_map_counters(struct net_device *dev, + dma_addr_t *paddr, + u32 counter_cmd) +{ + struct rtl8169_private *tp = netdev_priv(dev); + void __iomem *ioaddr = tp->mmio_addr; + struct device *d = &tp->pci_dev->dev; + struct rtl8169_counters *counters; + u32 cmd; + + counters = dma_alloc_coherent(d, sizeof(*counters), paddr, GFP_KERNEL); + if (counters) { + RTL_W32(CounterAddrHigh, (u64)*paddr >> 32); + cmd = (u64)*paddr & DMA_BIT_MASK(32); + RTL_W32(CounterAddrLow, cmd); + RTL_W32(CounterAddrLow, cmd | counter_cmd); + } + return counters; +} + +static void rtl8169_unmap_counters (struct net_device *dev, + dma_addr_t paddr, + struct rtl8169_counters *counters) +{ 
+ struct rtl8169_private *tp = netdev_priv(dev); + void __iomem *ioaddr = tp->mmio_addr; + struct device *d = &tp->pci_dev->dev; + + RTL_W32(CounterAddrLow, 0); + RTL_W32(CounterAddrHigh, 0); + + dma_free_coherent(d, sizeof(*counters), counters, paddr); +} + +DECLARE_RTL_COND(rtl_reset_counters_cond) +{ + void __iomem *ioaddr = tp->mmio_addr; + + return RTL_R32(CounterAddrLow) & CounterReset; +} + +static bool rtl8169_reset_counters(struct net_device *dev) +{ + struct rtl8169_private *tp = netdev_priv(dev); + struct rtl8169_counters *counters; + dma_addr_t paddr; + bool ret = true; + + /* + * Versions prior to RTL_GIGA_MAC_VER_19 don't support resetting the + * tally counters. + */ + if (tp->mac_version < RTL_GIGA_MAC_VER_19) + return true; + + counters = rtl8169_map_counters(dev, &paddr, CounterReset); + if (!counters) + return false; + + if (!rtl_udelay_loop_wait_low(tp, &rtl_reset_counters_cond, 10, 1000)) + ret = false; + + rtl8169_unmap_counters(dev, paddr, counters); + + return ret; +} + DECLARE_RTL_COND(rtl_counters_cond) { void __iomem *ioaddr = tp->mmio_addr; @@ -2186,38 +2265,72 @@ DECLARE_RTL_COND(rtl_counters_cond) return RTL_R32(CounterAddrLow) & CounterDump; } -static void rtl8169_update_counters(struct net_device *dev) +static bool rtl8169_update_counters(struct net_device *dev) { struct rtl8169_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->mmio_addr; - struct device *d = &tp->pci_dev->dev; struct rtl8169_counters *counters; dma_addr_t paddr; - u32 cmd; + bool ret = true; /* * Some chips are unable to dump tally counters when the receiver * is disabled. 
*/ if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0) - return; + return true; - counters = dma_alloc_coherent(d, sizeof(*counters), &paddr, GFP_KERNEL); + counters = rtl8169_map_counters(dev, &paddr, CounterDump); if (!counters) - return; - - RTL_W32(CounterAddrHigh, (u64)paddr >> 32); - cmd = (u64)paddr & DMA_BIT_MASK(32); - RTL_W32(CounterAddrLow, cmd); - RTL_W32(CounterAddrLow, cmd | CounterDump); + return false; if (rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000)) memcpy(&tp->counters, counters, sizeof(*counters)); + else + ret = false; - RTL_W32(CounterAddrLow, 0); - RTL_W32(CounterAddrHigh, 0); + rtl8169_unmap_counters(dev, paddr, counters); - dma_free_coherent(d, sizeof(*counters), counters, paddr); + return ret; +} + +static bool rtl8169_init_counter_offsets(struct net_device *dev) +{ + struct rtl8169_private *tp = netdev_priv(dev); + bool ret = false; + + /* + * rtl8169_init_counter_offsets is called from rtl_open. On chip + * versions prior to RTL_GIGA_MAC_VER_19 the tally counters are only + * reset by a power cycle, while the counter values collected by the + * driver are reset at every driver unload/load cycle. + * + * To make sure the HW values returned by @get_stats64 match the SW + * values, we collect the initial values at first open(*) and use them + * as offsets to normalize the values returned by @get_stats64. + * + * (*) We can't call rtl8169_init_counter_offsets from rtl_init_one + * for the reason stated in rtl8169_update_counters; CmdRxEnb is only + * set at open time by rtl_hw_start. + */ + + if (tp->tc_offset.inited) + return true; + + /* If both, reset and update fail, propagate to caller. 
*/ + if (rtl8169_reset_counters(dev)) + ret = true; + + if (rtl8169_update_counters(dev)) + ret = true; + + tp->tc_offset.tx_errors = tp->counters.tx_errors; + tp->tc_offset.tx_multi_collision = tp->counters.tx_multi_collision; + tp->tc_offset.rx_multicast = tp->counters.rx_multicast; + tp->tc_offset.tx_aborted = tp->counters.tx_aborted; + tp->tc_offset.inited = true; + + return ret; } static void rtl8169_get_ethtool_stats(struct net_device *dev, @@ -7631,6 +7744,9 @@ static int rtl_open(struct net_device *dev) rtl_hw_start(dev); + if (!rtl8169_init_counter_offsets(dev)) + netif_warn(tp, hw, dev, "counter reset/update failed\n"); + netif_start_queue(dev); rtl_unlock_work(tp); @@ -7689,6 +7805,25 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) stats->rx_fifo_errors = dev->stats.rx_fifo_errors; stats->rx_missed_errors = dev->stats.rx_missed_errors; + /* + * Fetch additional counter values missing in stats collected by driver + * from tally counters. + */ + rtl8169_update_counters(dev); + + /* + * Subtract values fetched during initialization. + * See rtl8169_init_counter_offsets for a description why we do that. 
+ */ + stats->tx_errors = le64_to_cpu(tp->counters.tx_errors) - + le64_to_cpu(tp->tc_offset.tx_errors); + stats->collisions = le32_to_cpu(tp->counters.tx_multi_collision) - + le32_to_cpu(tp->tc_offset.tx_multi_collision); + stats->multicast = le32_to_cpu(tp->counters.rx_multicast) - + le32_to_cpu(tp->tc_offset.rx_multicast); + stats->tx_aborted_errors = le16_to_cpu(tp->counters.tx_aborted) - + le16_to_cpu(tp->tc_offset.tx_aborted); + return stats; } diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index aeebc0a7bf47..a21c77bc1b27 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -2004,8 +2004,10 @@ static int davinci_emac_probe(struct platform_device *pdev) if (res_ctrl) { priv->ctrl_base = devm_ioremap_resource(&pdev->dev, res_ctrl); - if (IS_ERR(priv->ctrl_base)) + if (IS_ERR(priv->ctrl_base)) { + rc = PTR_ERR(priv->ctrl_base); goto no_pdata; + } } else { priv->ctrl_base = priv->remap_addr + pdata->ctrl_mod_reg_offset; } diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index fa8f5046afe9..0481daf9201a 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -283,6 +283,8 @@ static int unit_set(struct idr *p, void *ptr, int n); static void unit_put(struct idr *p, int n); static void *unit_find(struct idr *p, int n); +static const struct net_device_ops ppp_netdev_ops; + static struct class *ppp_class; /* per net-namespace data */ @@ -919,13 +921,22 @@ static __net_init int ppp_init_net(struct net *net) static __net_exit void ppp_exit_net(struct net *net) { struct ppp_net *pn = net_generic(net, ppp_net_id); + struct net_device *dev; + struct net_device *aux; struct ppp *ppp; LIST_HEAD(list); int id; rtnl_lock(); + for_each_netdev_safe(net, dev, aux) { + if (dev->netdev_ops == &ppp_netdev_ops) + unregister_netdevice_queue(dev, &list); + } + idr_for_each_entry(&pn->units_idr, ppp, id) - unregister_netdevice_queue(ppp->dev, 
&list); + /* Skip devices already unregistered by previous loop */ + if (!net_eq(dev_net(ppp->dev), net)) + unregister_netdevice_queue(ppp->dev, &list); unregister_netdevice_many(&list); rtnl_unlock(); @@ -1017,6 +1028,7 @@ ppp_start_xmit(struct sk_buff *skb, struct net_device *dev) proto = npindex_to_proto[npi]; put_unaligned_be16(proto, pp); + skb_scrub_packet(skb, !net_eq(ppp->ppp_net, dev_net(dev))); skb_queue_tail(&ppp->file.xq, skb); ppp_xmit_process(ppp); return NETDEV_TX_OK; @@ -1137,7 +1149,6 @@ static void ppp_setup(struct net_device *dev) dev->tx_queue_len = 3; dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; - dev->features |= NETIF_F_NETNS_LOCAL; netif_keep_dst(dev); } @@ -1900,6 +1911,8 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) skb->dev = ppp->dev; skb->protocol = htons(npindex_to_ethertype[npi]); skb_reset_mac_header(skb); + skb_scrub_packet(skb, !net_eq(ppp->ppp_net, + dev_net(ppp->dev))); netif_rx(skb); } } diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index d5332ddcea3f..002f0bd27001 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -65,71 +65,12 @@ struct inet_peer_base { int total; }; -#define INETPEER_BASE_BIT 0x1UL - -static inline struct inet_peer *inetpeer_ptr(unsigned long val) -{ - BUG_ON(val & INETPEER_BASE_BIT); - return (struct inet_peer *) val; -} - -static inline struct inet_peer_base *inetpeer_base_ptr(unsigned long val) -{ - if (!(val & INETPEER_BASE_BIT)) - return NULL; - val &= ~INETPEER_BASE_BIT; - return (struct inet_peer_base *) val; -} - -static inline bool inetpeer_ptr_is_peer(unsigned long val) -{ - return !(val & INETPEER_BASE_BIT); -} - -static inline void __inetpeer_ptr_set_peer(unsigned long *val, struct inet_peer *peer) -{ - /* This implicitly clears INETPEER_BASE_BIT */ - *val = (unsigned long) peer; -} - -static inline bool inetpeer_ptr_set_peer(unsigned long *ptr, struct inet_peer *peer) -{ - unsigned long val = (unsigned long) 
peer; - unsigned long orig = *ptr; - - if (!(orig & INETPEER_BASE_BIT) || - cmpxchg(ptr, orig, val) != orig) - return false; - return true; -} - -static inline void inetpeer_init_ptr(unsigned long *ptr, struct inet_peer_base *base) -{ - *ptr = (unsigned long) base | INETPEER_BASE_BIT; -} - -static inline void inetpeer_transfer_peer(unsigned long *to, unsigned long *from) -{ - unsigned long val = *from; - - *to = val; - if (inetpeer_ptr_is_peer(val)) { - struct inet_peer *peer = inetpeer_ptr(val); - atomic_inc(&peer->refcnt); - } -} - void inet_peer_base_init(struct inet_peer_base *); void inet_initpeers(void) __init; #define INETPEER_METRICS_NEW (~(u32) 0) -static inline bool inet_metrics_new(const struct inet_peer *p) -{ - return p->metrics[RTAX_LOCK-1] == INETPEER_METRICS_NEW; -} - /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, @@ -163,12 +104,4 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); void inetpeer_invalidate_tree(struct inet_peer_base *); -/* - * temporary check to make sure we dont access rid, tcp_ts, - * tcp_ts_stamp if no refcount is taken on inet_peer - */ -static inline void inet_peer_refcheck(const struct inet_peer *p) -{ - WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0); -} #endif /* _NET_INETPEER_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 364426a2be5a..4a7b03947a38 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; +extern int sysctl_tcp_pacing_ss_ratio; +extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -1165,6 +1167,19 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) } u32 tcp_default_init_rwnd(u32 mss); +void 
tcp_cwnd_restart(struct sock *sk, s32 delta); + +static inline void tcp_slow_start_after_idle_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + s32 delta; + + if (!sysctl_tcp_slow_start_after_idle || tp->packets_out) + return; + delta = tcp_time_stamp - tp->lsndtime; + if (delta > inet_csk(sk)->icsk_rto) + tcp_cwnd_restart(sk, delta); +} /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 6b3234599a2c..480a319b4c92 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -241,9 +241,10 @@ static inline void vxlan_get_rx_port(struct net_device *netdev) { } #endif -#endif static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) { return vs->sock->sk->sk_family; } + +#endif diff --git a/net/core/dst.c b/net/core/dst.c index 50dcdbb0ee46..477035ed7903 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -262,11 +262,12 @@ again: if (dst->dev) dev_put(dst->dev); + lwtstate_put(dst->lwtstate); + if (dst->flags & DST_METADATA) kfree(dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); - lwtstate_put(dst->lwtstate); dst = child; if (dst) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bf9a5d93c2d1..8a725cc50a90 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -392,7 +392,7 @@ EXPORT_SYMBOL(napi_alloc_frag); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on - * @length: length to allocate + * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * * Allocate a new &sk_buff and assign it a usage count of one. 
The @@ -461,7 +461,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); /** * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for - * @length: length to allocate + * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index ac9a32ec3ee4..f2a71025a770 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -360,8 +360,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + ahp->icv_trunc_len + seqhi_len); - if (!work_iph) + if (!work_iph) { + err = -ENOMEM; goto out; + } seqhi = (__be32 *)((char *)work_iph + ihl); auth_data = ah_tmp_auth(seqhi, seqhi_len); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0330ab2e2b63..879bdc5c95b1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -29,6 +29,7 @@ static int zero; static int one = 1; static int four = 4; +static int thousand = 1000; static int gso_max_segs = GSO_MAX_SEGS; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; @@ -712,6 +713,24 @@ static struct ctl_table ipv4_table[] = { .extra2 = &gso_max_segs, }, { + .procname = "tcp_pacing_ss_ratio", + .data = &sysctl_tcp_pacing_ss_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { + .procname = "tcp_pacing_ca_ratio", + .data = &sysctl_tcp_pacing_ca_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { .procname = "tcp_autocorking", .data = &sysctl_tcp_autocorking, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 45534a5ab430..b8b8fa184f75 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -627,6 +627,8 @@ static void 
skb_entail(struct sock *sk, struct sk_buff *skb) sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; + + tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4e4d6bcd0ca9..dc08e2352665 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) * TCP pacing, to smooth the burst on large writes when packets * in flight is significantly lower than cwnd (or rwin) */ +int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; +int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; + static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ - rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); + rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); + + /* current rate is (cwnd * mss) / srtt + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. + * In Congestion Avoidance phase, set it to 120 % the current rate. + * + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching + * end of slow start and should slow down. 
+ */ + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate *= sysctl_tcp_pacing_ss_ratio; + else + rate *= sysctl_tcp_pacing_ca_ratio; rate *= max(tp->snd_cwnd, tp->packets_out); @@ -3332,6 +3348,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); + if (tcp_send_head(sk)) + tcp_slow_start_after_idle_check(sk); + if (nwin > tp->max_window) { tp->max_window = nwin; tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 444ab5beecbd..1188e4fcf23b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -137,12 +137,12 @@ static __u16 tcp_advertise_mss(struct sock *sk) } /* RFC2861. Reset CWND after idle period longer RTO to "restart window". - * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) + * This is the first part of cwnd validation mechanism. + */ +void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); - s32 delta = tcp_time_stamp - tp->lsndtime; - u32 restart_cwnd = tcp_init_cwnd(tp, dst); + u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tp->snd_cwnd; tcp_ca_event(sk, CA_EVENT_CWND_RESTART); @@ -164,10 +164,6 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - if (sysctl_tcp_slow_start_after_idle && - (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) - tcp_cwnd_restart(sk, __sk_dst_get(sk)); - tp->lsndtime = now; /* If it is a reply for ato after last received diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 55b3c0f4dde5..bb919b28619f 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -15,6 +15,7 @@ #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> +#include <net/vrf.h> static struct xfrm_policy_afinfo xfrm4_policy_afinfo; @@ -107,8 
+108,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; - if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; + if (skb_dst(skb)) { + oif = vrf_master_ifindex(skb_dst(skb)->dev) ? + : skb_dst(skb)->dev->ifindex; + } memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index ed7d4e3f9c10..0630a4d5daaa 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -577,8 +577,10 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len + ahp->icv_trunc_len + seqhi_len); - if (!work_iph) + if (!work_iph) { + err = -ENOMEM; goto out; + } auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len); seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index a74013d3eceb..30caa289c5db 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -20,6 +20,7 @@ #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> +#include <net/vrf.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif @@ -131,8 +132,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) nexthdr = nh[nhoff]; - if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; + if (skb_dst(skb)) { + oif = vrf_master_ifindex(skb_dst(skb)->dev) ? 
+ : skb_dst(skb)->dev->ifindex; + } memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 896834cd3b9a..a2f28a6d4dc5 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -438,6 +438,14 @@ static const struct proto_ops rds_proto_ops = { .sendpage = sock_no_sendpage, }; +static void rds_sock_destruct(struct sock *sk) +{ + struct rds_sock *rs = rds_sk_to_rs(sk); + + WARN_ON((&rs->rs_item != rs->rs_item.next || + &rs->rs_item != rs->rs_item.prev)); +} + static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { struct rds_sock *rs; @@ -445,6 +453,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) sock_init_data(sock, sk); sock->ops = &rds_proto_ops; sk->sk_protocol = protocol; + sk->sk_destruct = rds_sock_destruct; rs = rds_sk_to_rs(sk); spin_lock_init(&rs->rs_lock); diff --git a/net/rds/connection.c b/net/rds/connection.c index d4fecb21ca25..a50e652eb269 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -301,6 +301,8 @@ void rds_conn_shutdown(struct rds_connection *conn) wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags)); + wait_event(conn->c_waitq, + !test_bit(RDS_RECV_REFILL, &conn->c_flags)); conn->c_trans->conn_shutdown(conn); rds_conn_reset(conn); diff --git a/net/rds/ib.h b/net/rds/ib.h index 86d88ec5d556..6422c52682e5 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -320,7 +320,7 @@ void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); void rds_ib_recv_free_caches(struct rds_ib_connection *ic); -void rds_ib_recv_refill(struct rds_connection *conn, int prefill); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); diff 
--git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f40d8f52b753..d150bb4aa3cb 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -135,7 +135,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_ib_recv_init_ring(ic); /* Post receive buffers - as a side effect, this will update * the posted credit count. */ - rds_ib_recv_refill(conn, 1); + rds_ib_recv_refill(conn, 1, GFP_KERNEL); /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); @@ -640,6 +640,15 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) (atomic_read(&ic->i_signaled_sends) == 0)); tasklet_kill(&ic->i_recv_tasklet); + /* first destroy the ib state that generates callbacks */ + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + + /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * @@ -663,12 +672,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) if (ic->i_recvs) rds_ib_recv_clear_ring(ic); - if (ic->i_cm_id->qp) - rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) - ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) - ib_destroy_cq(ic->i_recv_cq); rdma_destroy_id(ic->i_cm_id); /* diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 657ba9f5d308..7b7aac8cdb56 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -151,12 +151,17 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); - if (rds_ibdev_old) { + if (!rds_ibdev_old) + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + + if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); rds_ib_dev_put(rds_ibdev_old); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } + rds_ib_dev_put(rds_ibdev_old); - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return 0; } void rds_ib_add_conn(struct rds_ib_device 
*rds_ibdev, struct rds_connection *conn) @@ -485,7 +490,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) /* FIXME we need a way to tell a r/w MR * from a r/o MR */ - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index cac5b4506ee3..ed9b41e3b277 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -297,7 +297,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic } static int rds_ib_recv_refill_one(struct rds_connection *conn, - struct rds_ib_recv_work *recv, int prefill) + struct rds_ib_recv_work *recv, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct ib_sge *sge; @@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, gfp_t slab_mask = GFP_NOWAIT; gfp_t page_mask = GFP_NOWAIT; - if (prefill) { + if (gfp & __GFP_WAIT) { slab_mask = GFP_KERNEL; page_mask = GFP_HIGHUSER; } @@ -347,6 +347,24 @@ out: return ret; } +static int acquire_refill(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0; +} + +static void release_refill(struct rds_connection *conn) +{ + clear_bit(RDS_RECV_REFILL, &conn->c_flags); + + /* We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + /* * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into @@ -354,15 +372,23 @@ out: * * -1 is returned if posting fails due to temporary resource exhaustion. 
*/ -void rds_ib_recv_refill(struct rds_connection *conn, int prefill) +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_recv_work *recv; struct ib_recv_wr *failed_wr; unsigned int posted = 0; int ret = 0; + bool can_wait = !!(gfp & __GFP_WAIT); u32 pos; + /* the goal here is to just make sure that someone, somewhere + * is posting buffers. If we can't get the refill lock, + * let them do their thing + */ + if (!acquire_refill(conn)) + return; + while ((prefill || rds_conn_up(conn)) && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { if (pos >= ic->i_recv_ring.w_nr) { @@ -372,7 +398,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) } recv = &ic->i_recvs[pos]; - ret = rds_ib_recv_refill_one(conn, recv, prefill); + ret = rds_ib_recv_refill_one(conn, recv, gfp); if (ret) { break; } @@ -402,6 +428,24 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) if (ret) rds_ib_ring_unalloc(&ic->i_recv_ring, 1); + + release_refill(conn); + + /* if we're called from the softirq handler, we'll be GFP_NOWAIT. + * in this case the ring being low is going to lead to more interrupts + * and we can safely let the softirq code take care of it unless the + * ring is completely empty. + * + * if we're called from krdsd, we'll be GFP_KERNEL. In this case + * we might have raced with the softirq code while we had the refill + * lock held. Use rds_ib_ring_low() instead of ring_empty to decide + * if we should requeue. + */ + if (rds_conn_up(conn) && + ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) || + rds_ib_ring_empty(&ic->i_recv_ring))) { + queue_delayed_work(rds_wq, &conn->c_recv_w, 1); + } } /* @@ -982,10 +1026,17 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, } /* - * It's very important that we only free this ring entry if we've truly - * freed the resources allocated to the entry. The refilling path can - * leak if we don't. 
+ * rds_ib_process_recv() doesn't always consume the frag, and + * we might not have called it at all if the wc didn't indicate + * success. We already unmapped the frag's pages, though, and + * the following rds_ib_ring_free() call tells the refill path + * that it will not find an allocated frag here. Make sure we + * keep that promise by freeing a frag that's still on the ring. */ + if (recv->r_frag) { + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + } rds_ib_ring_free(&ic->i_recv_ring, 1); } } @@ -1016,7 +1067,7 @@ void rds_ib_recv_tasklet_fn(unsigned long data) rds_ib_stats_inc(s_ib_rx_ring_empty); if (rds_ib_ring_low(&ic->i_recv_ring)) - rds_ib_recv_refill(conn, 0); + rds_ib_recv_refill(conn, 0, GFP_NOWAIT); } int rds_ib_recv(struct rds_connection *conn) @@ -1025,8 +1076,10 @@ int rds_ib_recv(struct rds_connection *conn) int ret = 0; rdsdebug("conn %p\n", conn); - if (rds_conn_up(conn)) + if (rds_conn_up(conn)) { rds_ib_attempt_ack(ic); + rds_ib_recv_refill(conn, 0, GFP_KERNEL); + } return ret; } diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 5d0a704fa039..c576ebeb4115 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -709,6 +709,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_op = ic->i_data_op; prev->s_wr.send_flags |= IB_SEND_SOLICITED; + if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + prev->s_wr.send_flags |= IB_SEND_SIGNALED; + nr_sig++; + } ic->i_data_op = NULL; } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 40084d843e9f..c1df9b1cf3b2 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -451,7 +451,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) * is the case for a RDMA_READ which copies from remote * to local memory */ if (!ro->op_write) { - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); } put_page(page); @@ -658,6 +658,8 @@ 
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if (ret < 0) goto out; + else + ret = 0; rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 208240836043..b9b40af5345b 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -34,6 +34,7 @@ #include <rdma/rdma_cm.h> #include "rdma_transport.h" +#include "ib.h" static struct rdma_cm_id *rds_rdma_listen_id; @@ -82,8 +83,18 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ROUTE_RESOLVED: - /* XXX worry about racing with listen acceptance */ - ret = trans->cm_initiate_connect(cm_id); + /* Connection could have been dropped so make sure the + * cm_id is valid before proceeding + */ + if (conn) { + struct rds_ib_connection *ibic; + + ibic = conn->c_transport_data; + if (ibic && ibic->i_cm_id == cm_id) + ret = trans->cm_initiate_connect(cm_id); + else + rds_conn_drop(conn); + } break; case RDMA_CM_EVENT_ESTABLISHED: diff --git a/net/rds/rds.h b/net/rds/rds.h index 9005fb0586f6..afb4048d0cfd 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -80,6 +80,7 @@ enum { #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 +#define RDS_RECV_REFILL 3 struct rds_connection { struct hlist_node c_hash_node; diff --git a/net/rds/send.c b/net/rds/send.c index 2581b8e3dbe7..4df61a515b83 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -282,26 +282,34 @@ restart: /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; + /* The transport owns the mapped memory for now. 
+ * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_rdma_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_atomic_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } /* @@ -411,7 +419,8 @@ over_batch: */ if (ret == 0) { smp_mb(); - if (!list_empty(&conn->c_send_queue) && + if ((test_bit(0, &conn->c_map_queued) || + !list_empty(&conn->c_send_queue)) && send_gen == conn->c_send_gen) { rds_stats_inc(s_send_lock_queue_raced); goto restart; @@ -769,8 +778,22 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); list_del_init(&rm->m_sock_item); - rds_message_wait(rm); + + /* just in case the code above skipped this message + * because RDS_MSG_ON_CONN wasn't set, run it again here + * taking m_rs_lock is the only thing that keeps us + * from racing with ack processing. 
+ */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + rds_message_put(rm); } } @@ -992,6 +1015,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } + if (payload_len > rds_sk_sndbuf(rs)) { + ret = -EMSGSIZE; + goto out; + } + /* size of rm including all sgs */ ret = rds_rm_size(msg, payload_len); if (ret < 0) @@ -1064,11 +1092,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); - /* XXX make sure this is reasonable */ - if (payload_len > rds_sk_sndbuf(rs)) { - ret = -EMSGSIZE; - goto out; - } + if (nonblock) { ret = -EAGAIN; goto out; |