36 files changed, 450 insertions, 153 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 46e88ed7f41d..ac77a13d2ea2 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -586,6 +586,21 @@ tcp_min_tso_segs - INTEGER
 	if available window is too small.
 	Default: 2
 
+tcp_pacing_ss_ratio - INTEGER
+	sk->sk_pacing_rate is set by TCP stack using a ratio applied
+	to current rate. (current_rate = cwnd * mss / srtt)
+	If TCP is in slow start, tcp_pacing_ss_ratio is applied
+	to let TCP probe for bigger speeds, assuming cwnd can be
+	doubled every other RTT.
+	Default: 200
+
+tcp_pacing_ca_ratio - INTEGER
+	sk->sk_pacing_rate is set by TCP stack using a ratio applied
+	to current rate. (current_rate = cwnd * mss / srtt)
+	If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio
+	is applied to conservatively probe for bigger throughput.
+	Default: 120
+
 tcp_tso_win_divisor - INTEGER
 	This allows control over what percentage of the congestion window
 	can be consumed by a single TSO frame.
diff --git a/MAINTAINERS b/MAINTAINERS
index 4e6dcb692d30..38df5702ada0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11053,7 +11053,7 @@ F:	drivers/input/mouse/vmmouse.c
 F:	drivers/input/mouse/vmmouse.h
 
 VMWARE VMXNET3 ETHERNET DRIVER
-M:	Shreyas Bhatewara <sbhatewara@vmware.com>
+M:	Shrikrishna Khare <skhare@vmware.com>
 M:	"VMware, Inc." <pv-drivers@vmware.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
@@ -11078,6 +11078,14 @@ S:	Supported
 F:	drivers/regulator/
 F:	include/linux/regulator/
 
+VRF
+M:	David Ahern <dsa@cumulusnetworks.com>
+M:	Shrijeet Mukherjee <shm@cumulusnetworks.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/vrf.c
+F:	include/net/vrf.h
+
 VT1211 HARDWARE MONITOR DRIVER
 M:	Juerg Haefliger <juergh@gmail.com>
 L:	lm-sensors@lm-sensors.org
diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c
index bab01c849165..48ce83e443c2 100644
--- a/drivers/net/ethernet/allwinner/sun4i-emac.c
+++ b/drivers/net/ethernet/allwinner/sun4i-emac.c
@@ -28,6 +28,7 @@
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
 #include <linux/phy.h>
+#include <linux/soc/sunxi/sunxi_sram.h>
 
 #include "sun4i-emac.h"
 
@@ -857,11 +858,17 @@ static int emac_probe(struct platform_device *pdev)
 
 	clk_prepare_enable(db->clk);
 
+	ret = sunxi_sram_claim(&pdev->dev);
+	if (ret) {
+		dev_err(&pdev->dev, "Error couldn't map SRAM to device\n");
+		goto out;
+	}
+
 	db->phy_node = of_parse_phandle(np, "phy", 0);
 	if (!db->phy_node) {
 		dev_err(&pdev->dev, "no associated PHY\n");
 		ret = -ENODEV;
-		goto out;
+		goto out_release_sram;
 	}
 
 	/* Read MAC-address from DT */
@@ -893,7 +900,7 @@ static int emac_probe(struct platform_device *pdev)
 	if (ret) {
 		dev_err(&pdev->dev, "Registering netdev failed!\n");
 		ret = -ENODEV;
-		goto out;
+		goto out_release_sram;
 	}
 
 	dev_info(&pdev->dev, "%s: at %p, IRQ %d MAC: %pM\n",
@@ -901,6 +908,8 @@ static int emac_probe(struct platform_device *pdev)
 
 	return 0;
 
+out_release_sram:
+	sunxi_sram_release(&pdev->dev);
 out:
 	dev_err(db->dev, "not found (%d).\n", ret);
 
diff --git a/drivers/net/ethernet/cisco/enic/vnic_dev.c b/drivers/net/ethernet/cisco/enic/vnic_dev.c
index 19a49a6e3911..a3badefaf360 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_dev.c
+++ b/drivers/net/ethernet/cisco/enic/vnic_dev.c
@@ -301,12 +301,12 @@ static int _vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd,
 	struct devcmd2_result *result = dc2c->result + dc2c->next_result;
 	unsigned int i;
 	int delay, err;
-	u32 fetch_index, posted, new_posted;
+	u32 fetch_index, new_posted;
+	u32 posted = dc2c->posted;
 
-	posted = ioread32(&dc2c->wq_ctrl->posted_index);
 	fetch_index = ioread32(&dc2c->wq_ctrl->fetch_index);
 
-	if (posted == 0xFFFFFFFF || fetch_index == 0xFFFFFFFF)
+	if (fetch_index == 0xFFFFFFFF)
 		return -ENODEV;
 
 	new_posted = (posted + 1) % DEVCMD2_RING_SIZE;
@@ -331,6 +331,7 @@ static int _vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd,
 	 */
 	wmb();
 	iowrite32(new_posted, &dc2c->wq_ctrl->posted_index);
+	dc2c->posted = new_posted;
 
 	if (dc2c->cmd_ring[posted].flags & DEVCMD2_FNORESULT)
 		return 0;
@@ -402,6 +403,7 @@ static int vnic_dev_init_devcmd2(struct vnic_dev *vdev)
 
 	enic_wq_init_start(&vdev->devcmd2->wq, 0, fetch_index, fetch_index, 0,
 			   0);
+	vdev->devcmd2->posted = fetch_index;
 	vnic_wq_enable(&vdev->devcmd2->wq);
 
 	err = vnic_dev_alloc_desc_ring(vdev, &vdev->devcmd2->results_ring,
diff --git a/drivers/net/ethernet/cisco/enic/vnic_wq.h b/drivers/net/ethernet/cisco/enic/vnic_wq.h
index 8944af935a60..01209613d57d 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_wq.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_wq.h
@@ -97,6 +97,7 @@ struct devcmd2_controller {
 	int color;
 	struct vnic_dev_ring results_ring;
 	struct vnic_wq wq;
+	u32 posted;
 };
 
 static inline unsigned int vnic_wq_desc_avail(struct vnic_wq *wq)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 27ca4596775a..0983a208b299 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -405,7 +405,6 @@ struct mlx5e_channel {
 	__be32                     mkey_be;
 	u8                         num_tc;
 	unsigned long              flags;
-	int                        tc_to_txq_map[MLX5E_MAX_NUM_TC];
 
 	/* control */
 	struct mlx5e_priv         *priv;
@@ -475,6 +474,7 @@ struct mlx5e_priv {
 	/* priv data path fields - start */
 	int                        default_vlan_prio;
 	struct mlx5e_sq            **txq_to_sq_map;
+	int channeltc_to_txq_map[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC];
 	/* priv data path fields - end */
 
 	unsigned long              state;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 55166dd5b4ea..59874d666cff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -949,13 +949,13 @@ static void mlx5e_close_sqs(struct mlx5e_channel *c)
 		mlx5e_close_sq(&c->sq[tc]);
 }
 
-static void mlx5e_build_tc_to_txq_map(struct mlx5e_channel *c,
-				      int num_channels)
+static void mlx5e_build_channeltc_to_txq_map(struct mlx5e_priv *priv, int ix)
 {
 	int i;
 
 	for (i = 0; i < MLX5E_MAX_NUM_TC; i++)
-		c->tc_to_txq_map[i] = c->ix + i * num_channels;
+		priv->channeltc_to_txq_map[ix][i] =
+			ix + i * priv->params.num_channels;
 }
 
 static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
@@ -979,7 +979,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 	c->mkey_be  = cpu_to_be32(priv->mr.key);
 	c->num_tc   = priv->params.num_tc;
 
-	mlx5e_build_tc_to_txq_map(c, priv->params.num_channels);
+	mlx5e_build_channeltc_to_txq_map(priv, ix);
 
 	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 64380bc0cd6a..b73672f32e2c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -106,7 +106,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 		 priv->default_vlan_prio;
 	int tc = netdev_get_prio_tc_map(dev, up);
 
-	return priv->channel[channel_ix]->tc_to_txq_map[tc];
+	return priv->channeltc_to_txq_map[channel_ix][tc];
 }
 
 static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index a34f4742aa00..045f98fed476 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1726,6 +1726,7 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 						mlxsw_pci_dbg_root);
 	if (!mlxsw_pci->dbg_dir) {
 		dev_err(&pdev->dev, "Failed to create debugfs dir\n");
+		err = -ENOMEM;
 		goto err_dbg_create_dir;
 	}
 
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index f790f61ea78a..d6d39df5b3dc 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -637,6 +637,9 @@ enum rtl_register_content {
 	/* _TBICSRBit */
 	TBILinkOK	= 0x02000000,
 
+	/* ResetCounterCommand */
+	CounterReset	= 0x1,
+
 	/* DumpCounterCommand */
 	CounterDump	= 0x8,
 
@@ -747,6 +750,14 @@ struct rtl8169_counters {
 	__le16	tx_underun;
 };
 
+struct rtl8169_tc_offsets {
+	bool	inited;
+	__le64	tx_errors;
+	__le32	tx_multi_collision;
+	__le32	rx_multicast;
+	__le16	tx_aborted;
+};
+
 enum rtl_flag {
 	RTL_FLAG_TASK_ENABLED,
 	RTL_FLAG_TASK_SLOW_PENDING,
@@ -824,6 +835,7 @@ struct rtl8169_private {
 
 	struct mii_if_info mii;
 	struct rtl8169_counters counters;
+	struct rtl8169_tc_offsets tc_offset;
 	u32 saved_wolopts;
 	u32 opts1_mask;
 
@@ -2179,6 +2191,73 @@ static int rtl8169_get_sset_count(struct net_device *dev, int sset)
 	}
 }
 
+static struct rtl8169_counters *rtl8169_map_counters(struct net_device *dev,
+						     dma_addr_t *paddr,
+						     u32 counter_cmd)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+	void __iomem *ioaddr = tp->mmio_addr;
+	struct device *d = &tp->pci_dev->dev;
+	struct rtl8169_counters *counters;
+	u32 cmd;
+
+	counters = dma_alloc_coherent(d, sizeof(*counters), paddr, GFP_KERNEL);
+	if (counters) {
+		RTL_W32(CounterAddrHigh, (u64)*paddr >> 32);
+		cmd = (u64)*paddr & DMA_BIT_MASK(32);
+		RTL_W32(CounterAddrLow, cmd);
+		RTL_W32(CounterAddrLow, cmd | counter_cmd);
+	}
+	return counters;
+}
+
+static void rtl8169_unmap_counters (struct net_device *dev,
+				    dma_addr_t paddr,
+				    struct rtl8169_counters *counters)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+	void __iomem *ioaddr = tp->mmio_addr;
+	struct device *d = &tp->pci_dev->dev;
+
+	RTL_W32(CounterAddrLow, 0);
+	RTL_W32(CounterAddrHigh, 0);
+
+	dma_free_coherent(d, sizeof(*counters), counters, paddr);
+}
+
+DECLARE_RTL_COND(rtl_reset_counters_cond)
+{
+	void __iomem *ioaddr = tp->mmio_addr;
+
+	return RTL_R32(CounterAddrLow) & CounterReset;
+}
+
+static bool rtl8169_reset_counters(struct net_device *dev)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+	struct rtl8169_counters *counters;
+	dma_addr_t paddr;
+	bool ret = true;
+
+	/*
+	 * Versions prior to RTL_GIGA_MAC_VER_19 don't support resetting the
+	 * tally counters.
+	 */
+	if (tp->mac_version < RTL_GIGA_MAC_VER_19)
+		return true;
+
+	counters = rtl8169_map_counters(dev, &paddr, CounterReset);
+	if (!counters)
+		return false;
+
+	if (!rtl_udelay_loop_wait_low(tp, &rtl_reset_counters_cond, 10, 1000))
+		ret = false;
+
+	rtl8169_unmap_counters(dev, paddr, counters);
+
+	return ret;
+}
+
 DECLARE_RTL_COND(rtl_counters_cond)
 {
 	void __iomem *ioaddr = tp->mmio_addr;
@@ -2186,38 +2265,72 @@ DECLARE_RTL_COND(rtl_counters_cond)
 	return RTL_R32(CounterAddrLow) & CounterDump;
 }
 
-static void rtl8169_update_counters(struct net_device *dev)
+static bool rtl8169_update_counters(struct net_device *dev)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
 	void __iomem *ioaddr = tp->mmio_addr;
-	struct device *d = &tp->pci_dev->dev;
 	struct rtl8169_counters *counters;
 	dma_addr_t paddr;
-	u32 cmd;
+	bool ret = true;
 
 	/*
 	 * Some chips are unable to dump tally counters when the receiver
 	 * is disabled.
 	 */
 	if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0)
-		return;
+		return true;
 
-	counters = dma_alloc_coherent(d, sizeof(*counters), &paddr, GFP_KERNEL);
+	counters = rtl8169_map_counters(dev, &paddr, CounterDump);
 	if (!counters)
-		return;
-
-	RTL_W32(CounterAddrHigh, (u64)paddr >> 32);
-	cmd = (u64)paddr & DMA_BIT_MASK(32);
-	RTL_W32(CounterAddrLow, cmd);
-	RTL_W32(CounterAddrLow, cmd | CounterDump);
+		return false;
 
 	if (rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000))
 		memcpy(&tp->counters, counters, sizeof(*counters));
+	else
+		ret = false;
 
-	RTL_W32(CounterAddrLow, 0);
-	RTL_W32(CounterAddrHigh, 0);
+	rtl8169_unmap_counters(dev, paddr, counters);
 
-	dma_free_coherent(d, sizeof(*counters), counters, paddr);
+	return ret;
+}
+
+static bool rtl8169_init_counter_offsets(struct net_device *dev)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+	bool ret = false;
+
+	/*
+	 * rtl8169_init_counter_offsets is called from rtl_open.  On chip
+	 * versions prior to RTL_GIGA_MAC_VER_19 the tally counters are only
+	 * reset by a power cycle, while the counter values collected by the
+	 * driver are reset at every driver unload/load cycle.
+	 *
+	 * To make sure the HW values returned by @get_stats64 match the SW
+	 * values, we collect the initial values at first open(*) and use them
+	 * as offsets to normalize the values returned by @get_stats64.
+	 *
+	 * (*) We can't call rtl8169_init_counter_offsets from rtl_init_one
+	 * for the reason stated in rtl8169_update_counters; CmdRxEnb is only
+	 * set at open time by rtl_hw_start.
+	 */
+
+	if (tp->tc_offset.inited)
+		return true;
+
+	/* If both, reset and update fail, propagate to caller. */
+	if (rtl8169_reset_counters(dev))
+		ret = true;
+
+	if (rtl8169_update_counters(dev))
+		ret = true;
+
+	tp->tc_offset.tx_errors = tp->counters.tx_errors;
+	tp->tc_offset.tx_multi_collision = tp->counters.tx_multi_collision;
+	tp->tc_offset.rx_multicast = tp->counters.rx_multicast;
+	tp->tc_offset.tx_aborted = tp->counters.tx_aborted;
+	tp->tc_offset.inited = true;
+
+	return ret;
 }
 
 static void rtl8169_get_ethtool_stats(struct net_device *dev,
@@ -7631,6 +7744,9 @@ static int rtl_open(struct net_device *dev)
 
 	rtl_hw_start(dev);
 
+	if (!rtl8169_init_counter_offsets(dev))
+		netif_warn(tp, hw, dev, "counter reset/update failed\n");
+
 	netif_start_queue(dev);
 
 	rtl_unlock_work(tp);
@@ -7689,6 +7805,25 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->rx_fifo_errors	= dev->stats.rx_fifo_errors;
 	stats->rx_missed_errors = dev->stats.rx_missed_errors;
 
+	/*
+	 * Fetch additonal counter values missing in stats collected by driver
+	 * from tally counters.
+	 */
+	rtl8169_update_counters(dev);
+
+	/*
+	 * Subtract values fetched during initalization.
+	 * See rtl8169_init_counter_offsets for a description why we do that.
+	 */
+	stats->tx_errors = le64_to_cpu(tp->counters.tx_errors) -
+		le64_to_cpu(tp->tc_offset.tx_errors);
+	stats->collisions = le32_to_cpu(tp->counters.tx_multi_collision) -
+		le32_to_cpu(tp->tc_offset.tx_multi_collision);
+	stats->multicast = le32_to_cpu(tp->counters.rx_multicast) -
+		le32_to_cpu(tp->tc_offset.rx_multicast);
+	stats->tx_aborted_errors = le16_to_cpu(tp->counters.tx_aborted) -
+		le16_to_cpu(tp->tc_offset.tx_aborted);
+
 	return stats;
 }
 
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index aeebc0a7bf47..a21c77bc1b27 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -2004,8 +2004,10 @@ static int davinci_emac_probe(struct platform_device *pdev)
 	if (res_ctrl) {
 		priv->ctrl_base =
 			devm_ioremap_resource(&pdev->dev, res_ctrl);
-		if (IS_ERR(priv->ctrl_base))
+		if (IS_ERR(priv->ctrl_base)) {
+			rc = PTR_ERR(priv->ctrl_base);
 			goto no_pdata;
+		}
 	} else {
 		priv->ctrl_base = priv->remap_addr + pdata->ctrl_mod_reg_offset;
 	}
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index fa8f5046afe9..0481daf9201a 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -283,6 +283,8 @@ static int unit_set(struct idr *p, void *ptr, int n);
 static void unit_put(struct idr *p, int n);
 static void *unit_find(struct idr *p, int n);
 
+static const struct net_device_ops ppp_netdev_ops;
+
 static struct class *ppp_class;
 
 /* per net-namespace data */
@@ -919,13 +921,22 @@ static __net_init int ppp_init_net(struct net *net)
 static __net_exit void ppp_exit_net(struct net *net)
 {
 	struct ppp_net *pn = net_generic(net, ppp_net_id);
+	struct net_device *dev;
+	struct net_device *aux;
 	struct ppp *ppp;
 	LIST_HEAD(list);
 	int id;
 
 	rtnl_lock();
+	for_each_netdev_safe(net, dev, aux) {
+		if (dev->netdev_ops == &ppp_netdev_ops)
+			unregister_netdevice_queue(dev, &list);
+	}
+
 	idr_for_each_entry(&pn->units_idr, ppp, id)
-		unregister_netdevice_queue(ppp->dev, &list);
+		/* Skip devices already unregistered by previous loop */
+		if (!net_eq(dev_net(ppp->dev), net))
+			unregister_netdevice_queue(ppp->dev, &list);
 
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
@@ -1017,6 +1028,7 @@ ppp_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	proto = npindex_to_proto[npi];
 	put_unaligned_be16(proto, pp);
 
+	skb_scrub_packet(skb, !net_eq(ppp->ppp_net, dev_net(dev)));
 	skb_queue_tail(&ppp->file.xq, skb);
 	ppp_xmit_process(ppp);
 	return NETDEV_TX_OK;
@@ -1137,7 +1149,6 @@ static void ppp_setup(struct net_device *dev)
 	dev->tx_queue_len = 3;
 	dev->type = ARPHRD_PPP;
 	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
-	dev->features |= NETIF_F_NETNS_LOCAL;
 	netif_keep_dst(dev);
 }
 
@@ -1900,6 +1911,8 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 			skb->dev = ppp->dev;
 			skb->protocol = htons(npindex_to_ethertype[npi]);
 			skb_reset_mac_header(skb);
+			skb_scrub_packet(skb, !net_eq(ppp->ppp_net,
+						      dev_net(ppp->dev)));
 			netif_rx(skb);
 		}
 	}
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index d5332ddcea3f..002f0bd27001 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -65,71 +65,12 @@ struct inet_peer_base {
 	int			total;
 };
 
-#define INETPEER_BASE_BIT	0x1UL
-
-static inline struct inet_peer *inetpeer_ptr(unsigned long val)
-{
-	BUG_ON(val & INETPEER_BASE_BIT);
-	return (struct inet_peer *) val;
-}
-
-static inline struct inet_peer_base *inetpeer_base_ptr(unsigned long val)
-{
-	if (!(val & INETPEER_BASE_BIT))
-		return NULL;
-	val &= ~INETPEER_BASE_BIT;
-	return (struct inet_peer_base *) val;
-}
-
-static inline bool inetpeer_ptr_is_peer(unsigned long val)
-{
-	return !(val & INETPEER_BASE_BIT);
-}
-
-static inline void __inetpeer_ptr_set_peer(unsigned long *val, struct inet_peer *peer)
-{
-	/* This implicitly clears INETPEER_BASE_BIT */
-	*val = (unsigned long) peer;
-}
-
-static inline bool inetpeer_ptr_set_peer(unsigned long *ptr, struct inet_peer *peer)
-{
-	unsigned long val = (unsigned long) peer;
-	unsigned long orig = *ptr;
-
-	if (!(orig & INETPEER_BASE_BIT) ||
-	    cmpxchg(ptr, orig, val) != orig)
-		return false;
-	return true;
-}
-
-static inline void inetpeer_init_ptr(unsigned long *ptr, struct inet_peer_base *base)
-{
-	*ptr = (unsigned long) base | INETPEER_BASE_BIT;
-}
-
-static inline void inetpeer_transfer_peer(unsigned long *to, unsigned long *from)
-{
-	unsigned long val = *from;
-
-	*to = val;
-	if (inetpeer_ptr_is_peer(val)) {
-		struct inet_peer *peer = inetpeer_ptr(val);
-		atomic_inc(&peer->refcnt);
-	}
-}
-
 void inet_peer_base_init(struct inet_peer_base *);
 
 void inet_initpeers(void) __init;
 
 #define INETPEER_METRICS_NEW	(~(u32) 0)
 
-static inline bool inet_metrics_new(const struct inet_peer *p)
-{
-	return p->metrics[RTAX_LOCK-1] == INETPEER_METRICS_NEW;
-}
-
 /* can be called with or without local BH being disabled */
 struct inet_peer *inet_getpeer(struct inet_peer_base *base,
 			       const struct inetpeer_addr *daddr,
@@ -163,12 +104,4 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);
 
 void inetpeer_invalidate_tree(struct inet_peer_base *);
 
-/*
- * temporary check to make sure we dont access rid, tcp_ts,
- * tcp_ts_stamp if no refcount is taken on inet_peer
- */
-static inline void inet_peer_refcheck(const struct inet_peer *p)
-{
-	WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0);
-}
 #endif /* _NET_INETPEER_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 364426a2be5a..4a7b03947a38 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_pacing_ss_ratio;
+extern int sysctl_tcp_pacing_ca_ratio;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1165,6 +1167,19 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
 }
 
 u32 tcp_default_init_rwnd(u32 mss);
+void tcp_cwnd_restart(struct sock *sk, s32 delta);
+
+static inline void tcp_slow_start_after_idle_check(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 delta;
+
+	if (!sysctl_tcp_slow_start_after_idle || tp->packets_out)
+		return;
+	delta = tcp_time_stamp - tp->lsndtime;
+	if (delta > inet_csk(sk)->icsk_rto)
+		tcp_cwnd_restart(sk, delta);
+}
 
 /* Determine a window scaling and initial window to offer. */
 void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 6b3234599a2c..480a319b4c92 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -241,9 +241,10 @@ static inline void vxlan_get_rx_port(struct net_device *netdev)
 {
 }
 #endif
-#endif
 
 static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
 {
 	return vs->sock->sk->sk_family;
 }
+
+#endif
diff --git a/net/core/dst.c b/net/core/dst.c
index 50dcdbb0ee46..477035ed7903 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -262,11 +262,12 @@ again:
 	if (dst->dev)
 		dev_put(dst->dev);
 
+	lwtstate_put(dst->lwtstate);
+
 	if (dst->flags & DST_METADATA)
 		kfree(dst);
 	else
 		kmem_cache_free(dst->ops->kmem_cachep, dst);
-	lwtstate_put(dst->lwtstate);
 
 	dst = child;
 	if (dst) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bf9a5d93c2d1..8a725cc50a90 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -392,7 +392,7 @@ EXPORT_SYMBOL(napi_alloc_frag);
 /**
  *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
  *	@dev: network device to receive on
- *	@length: length to allocate
+ *	@len: length to allocate
  *	@gfp_mask: get_free_pages mask, passed to alloc_skb
  *
  *	Allocate a new &sk_buff and assign it a usage count of one. The
@@ -461,7 +461,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
 /**
  *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
  *	@napi: napi instance this buffer was allocated for
- *	@length: length to allocate
+ *	@len: length to allocate
  *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
  *
  *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index ac9a32ec3ee4..f2a71025a770 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -360,8 +360,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
 				ahp->icv_trunc_len + seqhi_len);
-	if (!work_iph)
+	if (!work_iph) {
+		err = -ENOMEM;
 		goto out;
+	}
 
 	seqhi = (__be32 *)((char *)work_iph + ihl);
 	auth_data = ah_tmp_auth(seqhi, seqhi_len);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0330ab2e2b63..879bdc5c95b1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
 static int zero;
 static int one = 1;
 static int four = 4;
+static int thousand = 1000;
 static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
@@ -712,6 +713,24 @@ static struct ctl_table ipv4_table[] = {
 		.extra2		= &gso_max_segs,
 	},
 	{
+		.procname	= "tcp_pacing_ss_ratio",
+		.data		= &sysctl_tcp_pacing_ss_ratio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &thousand,
+	},
+	{
+		.procname	= "tcp_pacing_ca_ratio",
+		.data		= &sysctl_tcp_pacing_ca_ratio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &thousand,
+	},
+	{
 		.procname	= "tcp_autocorking",
 		.data		= &sysctl_tcp_autocorking,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 45534a5ab430..b8b8fa184f75 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -627,6 +627,8 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb)
 	sk_mem_charge(sk, skb->truesize);
 	if (tp->nonagle & TCP_NAGLE_PUSH)
 		tp->nonagle &= ~TCP_NAGLE_PUSH;
+
+	tcp_slow_start_after_idle_check(sk);
 }
 
 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4e4d6bcd0ca9..dc08e2352665 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
  * TCP pacing, to smooth the burst on large writes when packets
  * in flight is significantly lower than cwnd (or rwin)
  */
+int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
+int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
+
 static void tcp_update_pacing_rate(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u64 rate;
 
 	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
-	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+	/* current rate is (cwnd * mss) / srtt
+	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+	 * In Congestion Avoidance phase, set it to 120 % the current rate.
+	 *
+	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+	 *	 end of slow start and should slow down.
+	 */
+	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+		rate *= sysctl_tcp_pacing_ss_ratio;
+	else
+		rate *= sysctl_tcp_pacing_ca_ratio;
 
 	rate *= max(tp->snd_cwnd, tp->packets_out);
 
@@ -3332,6 +3348,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 			tp->pred_flags = 0;
 			tcp_fast_path_check(sk);
 
+			if (tcp_send_head(sk))
+				tcp_slow_start_after_idle_check(sk);
+
 			if (nwin > tp->max_window) {
 				tp->max_window = nwin;
 				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 444ab5beecbd..1188e4fcf23b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -137,12 +137,12 @@ static __u16 tcp_advertise_mss(struct sock *sk)
 }
 
 /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
- * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
+ * This is the first part of cwnd validation mechanism.
+ */
+void tcp_cwnd_restart(struct sock *sk, s32 delta)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	s32 delta = tcp_time_stamp - tp->lsndtime;
-	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
 	u32 cwnd = tp->snd_cwnd;
 
 	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
@@ -164,10 +164,6 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const u32 now = tcp_time_stamp;
 
-	if (sysctl_tcp_slow_start_after_idle &&
-	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
-		tcp_cwnd_restart(sk, __sk_dst_get(sk));
-
 	tp->lsndtime = now;
 
 	/* If it is a reply for ato after last received
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 55b3c0f4dde5..bb919b28619f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -15,6 +15,7 @@
 #include <net/dst.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
+#include <net/vrf.h>
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
 
@@ -107,8 +108,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 	struct flowi4 *fl4 = &fl->u.ip4;
 	int oif = 0;
 
-	if (skb_dst(skb))
-		oif = skb_dst(skb)->dev->ifindex;
+	if (skb_dst(skb)) {
+		oif = vrf_master_ifindex(skb_dst(skb)->dev) ?
+			: skb_dst(skb)->dev->ifindex;
+	}
 
 	memset(fl4, 0, sizeof(struct flowi4));
 	fl4->flowi4_mark = skb->mark;
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index ed7d4e3f9c10..0630a4d5daaa 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -577,8 +577,10 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len +
 				ahp->icv_trunc_len + seqhi_len);
-	if (!work_iph)
+	if (!work_iph) {
+		err = -ENOMEM;
 		goto out;
+	}
 
 	auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len);
 	seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len);
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index a74013d3eceb..30caa289c5db 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -20,6 +20,7 @@
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/vrf.h>
 #if IS_ENABLED(CONFIG_IPV6_MIP6)
 #include <net/mip6.h>
 #endif
@@ -131,8 +132,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 
 	nexthdr = nh[nhoff];
 
-	if (skb_dst(skb))
-		oif = skb_dst(skb)->dev->ifindex;
+	if (skb_dst(skb)) {
+		oif = vrf_master_ifindex(skb_dst(skb)->dev) ?
+			: skb_dst(skb)->dev->ifindex;
+	}
 
 	memset(fl6, 0, sizeof(struct flowi6));
 	fl6->flowi6_mark = skb->mark;
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 896834cd3b9a..a2f28a6d4dc5 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -438,6 +438,14 @@ static const struct proto_ops rds_proto_ops = {
 	.sendpage =	sock_no_sendpage,
 };
 
+static void rds_sock_destruct(struct sock *sk)
+{
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+
+	WARN_ON((&rs->rs_item != rs->rs_item.next ||
+		 &rs->rs_item != rs->rs_item.prev));
+}
+
 static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 {
 	struct rds_sock *rs;
@@ -445,6 +453,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 	sock_init_data(sock, sk);
 	sock->ops		= &rds_proto_ops;
 	sk->sk_protocol		= protocol;
+	sk->sk_destruct		= rds_sock_destruct;
 
 	rs = rds_sk_to_rs(sk);
 	spin_lock_init(&rs->rs_lock);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index d4fecb21ca25..a50e652eb269 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -301,6 +301,8 @@ void rds_conn_shutdown(struct rds_connection *conn)
 
 		wait_event(conn->c_waitq,
 			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
+		wait_event(conn->c_waitq,
+			   !test_bit(RDS_RECV_REFILL, &conn->c_flags));
 
 		conn->c_trans->conn_shutdown(conn);
 		rds_conn_reset(conn);
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 86d88ec5d556..6422c52682e5 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -320,7 +320,7 @@ void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
 int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
 void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
 void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
 void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f40d8f52b753..d150bb4aa3cb 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -135,7 +135,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	rds_ib_recv_init_ring(ic);
 	/* Post receive buffers - as a side effect, this will update
 	 * the posted credit count. */
-	rds_ib_recv_refill(conn, 1);
+	rds_ib_recv_refill(conn, 1, GFP_KERNEL);
 
 	/* Tune RNR behavior */
 	rds_ib_tune_rnr(ic, &qp_attr);
@@ -640,6 +640,15 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 			   (atomic_read(&ic->i_signaled_sends) == 0));
 		tasklet_kill(&ic->i_recv_tasklet);
 
+		/* first destroy the ib state that generates callbacks */
+		if (ic->i_cm_id->qp)
+			rdma_destroy_qp(ic->i_cm_id);
+		if (ic->i_send_cq)
+			ib_destroy_cq(ic->i_send_cq);
+		if (ic->i_recv_cq)
+			ib_destroy_cq(ic->i_recv_cq);
+
+		/* then free the resources that ib callbacks use */
 		if (ic->i_send_hdrs)
 			ib_dma_free_coherent(dev,
 					   ic->i_send_ring.w_nr *
@@ -663,12 +672,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 		if (ic->i_recvs)
 			rds_ib_recv_clear_ring(ic);
 
-		if (ic->i_cm_id->qp)
-			rdma_destroy_qp(ic->i_cm_id);
-		if (ic->i_send_cq)
-			ib_destroy_cq(ic->i_send_cq);
-		if (ic->i_recv_cq)
-			ib_destroy_cq(ic->i_recv_cq);
 		rdma_destroy_id(ic->i_cm_id);
 
 		/*
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 657ba9f5d308..7b7aac8cdb56 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -151,12 +151,17 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 	struct rds_ib_device *rds_ibdev_old;
 
 	rds_ibdev_old = rds_ib_get_device(ipaddr);
-	if (rds_ibdev_old) {
+	if (!rds_ibdev_old)
+		return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+
+	if (rds_ibdev_old != rds_ibdev) {
 		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
 		rds_ib_dev_put(rds_ibdev_old);
+		return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
 	}
+	rds_ib_dev_put(rds_ibdev_old);
 
-	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+	return 0;
 }
 
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -485,7 +490,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
 
 			/* FIXME we need a way to tell a r/w MR
 			 * from a r/o MR */
-			BUG_ON(irqs_disabled());
+			WARN_ON(!page->mapping && irqs_disabled());
 			set_page_dirty(page);
 			put_page(page);
 		}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index cac5b4506ee3..ed9b41e3b277 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -297,7 +297,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 }
 
 static int rds_ib_recv_refill_one(struct rds_connection *conn,
-				  struct rds_ib_recv_work *recv, int prefill)
+				  struct rds_ib_recv_work *recv, gfp_t gfp)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct ib_sge *sge;
@@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 	gfp_t slab_mask = GFP_NOWAIT;
 	gfp_t page_mask = GFP_NOWAIT;
 
-	if (prefill) {
+	if (gfp & __GFP_WAIT) {
 		slab_mask = GFP_KERNEL;
 		page_mask = GFP_HIGHUSER;
 	}
@@ -347,6 +347,24 @@ out:
 	return ret;
 }
 
+static int acquire_refill(struct rds_connection *conn)
+{
+	return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0;
+}
+
+static void release_refill(struct rds_connection *conn)
+{
+	clear_bit(RDS_RECV_REFILL, &conn->c_flags);
+
+	/* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+	 * hot path and finding waiters is very rare.  We don't want to walk
+	 * the system-wide hashed waitqueue buckets in the fast path only to
+	 * almost never find waiters.
+	 */
+	if (waitqueue_active(&conn->c_waitq))
+		wake_up_all(&conn->c_waitq);
+}
+
 /*
  * This tries to allocate and post unused work requests after making sure that
  * they have all the allocations they need to queue received fragments into
@@ -354,15 +372,23 @@ out:
  *
  * -1 is returned if posting fails due to temporary resource exhaustion.
  */
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_recv_work *recv;
 	struct ib_recv_wr *failed_wr;
 	unsigned int posted = 0;
 	int ret = 0;
+	bool can_wait = !!(gfp & __GFP_WAIT);
 	u32 pos;
 
+	/* the goal here is to just make sure that someone, somewhere
+	 * is posting buffers.  If we can't get the refill lock,
+	 * let them do their thing
+	 */
+	if (!acquire_refill(conn))
+		return;
+
 	while ((prefill || rds_conn_up(conn)) &&
 	       rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
 		if (pos >= ic->i_recv_ring.w_nr) {
@@ -372,7 +398,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 		}
 
 		recv = &ic->i_recvs[pos];
-		ret = rds_ib_recv_refill_one(conn, recv, prefill);
+		ret = rds_ib_recv_refill_one(conn, recv, gfp);
 		if (ret) {
 			break;
 		}
@@ -402,6 +428,24 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 
 	if (ret)
 		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+
+	release_refill(conn);
+
+	/* if we're called from the softirq handler, we'll be GFP_NOWAIT.
+	 * in this case the ring being low is going to lead to more interrupts
+	 * and we can safely let the softirq code take care of it unless the
+	 * ring is completely empty.
+	 *
+	 * if we're called from krdsd, we'll be GFP_KERNEL.  In this case
+	 * we might have raced with the softirq code while we had the refill
+	 * lock held.  Use rds_ib_ring_low() instead of ring_empty to decide
+	 * if we should requeue.
+	 */
+	if (rds_conn_up(conn) &&
+	    ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
+	    rds_ib_ring_empty(&ic->i_recv_ring))) {
+		queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
+	}
 }
 
 /*
@@ -982,10 +1026,17 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
 		}
 
 		/*
-		 * It's very important that we only free this ring entry if we've truly
-		 * freed the resources allocated to the entry.  The refilling path can
-		 * leak if we don't.
+		 * rds_ib_process_recv() doesn't always consume the frag, and
+		 * we might not have called it at all if the wc didn't indicate
+		 * success. We already unmapped the frag's pages, though, and
+		 * the following rds_ib_ring_free() call tells the refill path
+		 * that it will not find an allocated frag here. Make sure we
+		 * keep that promise by freeing a frag that's still on the ring.
 		 */
+		if (recv->r_frag) {
+			rds_ib_frag_free(ic, recv->r_frag);
+			recv->r_frag = NULL;
+		}
 		rds_ib_ring_free(&ic->i_recv_ring, 1);
 	}
 }
@@ -1016,7 +1067,7 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
 		rds_ib_stats_inc(s_ib_rx_ring_empty);
 
 	if (rds_ib_ring_low(&ic->i_recv_ring))
-		rds_ib_recv_refill(conn, 0);
+		rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
 }
 
 int rds_ib_recv(struct rds_connection *conn)
@@ -1025,8 +1076,10 @@ int rds_ib_recv(struct rds_connection *conn)
 	int ret = 0;
 
 	rdsdebug("conn %p\n", conn);
-	if (rds_conn_up(conn))
+	if (rds_conn_up(conn)) {
 		rds_ib_attempt_ack(ic);
+		rds_ib_recv_refill(conn, 0, GFP_KERNEL);
+	}
 
 	return ret;
 }
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 5d0a704fa039..c576ebeb4115 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -709,6 +709,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	if (scat == &rm->data.op_sg[rm->data.op_count]) {
 		prev->s_op = ic->i_data_op;
 		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+		if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) {
+			ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+			prev->s_wr.send_flags |= IB_SEND_SIGNALED;
+			nr_sig++;
+		}
 		ic->i_data_op = NULL;
 	}
 
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 40084d843e9f..c1df9b1cf3b2 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -451,7 +451,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)
 		 * is the case for a RDMA_READ which copies from remote
 		 * to local memory */
 		if (!ro->op_write) {
-			BUG_ON(irqs_disabled());
+			WARN_ON(!page->mapping && irqs_disabled());
 			set_page_dirty(page);
 		}
 		put_page(page);
@@ -658,6 +658,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
 		if (ret < 0)
 			goto out;
+		else
+			ret = 0;
 
 		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
 			 nr_bytes, nr, iov->bytes, iov->addr);
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 208240836043..b9b40af5345b 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -34,6 +34,7 @@
 #include <rdma/rdma_cm.h>
 
 #include "rdma_transport.h"
+#include "ib.h"
 
 static struct rdma_cm_id *rds_rdma_listen_id;
 
@@ -82,8 +83,18 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 		break;
 
 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
-		/* XXX worry about racing with listen acceptance */
-		ret = trans->cm_initiate_connect(cm_id);
+		/* Connection could have been dropped so make sure the
+		 * cm_id is valid before proceeding
+		 */
+		if (conn) {
+			struct rds_ib_connection *ibic;
+
+			ibic = conn->c_transport_data;
+			if (ibic && ibic->i_cm_id == cm_id)
+				ret = trans->cm_initiate_connect(cm_id);
+			else
+				rds_conn_drop(conn);
+		}
 		break;
 
 	case RDMA_CM_EVENT_ESTABLISHED:
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 9005fb0586f6..afb4048d0cfd 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -80,6 +80,7 @@ enum {
 #define RDS_LL_SEND_FULL	0
 #define RDS_RECONNECT_PENDING	1
 #define RDS_IN_XMIT		2
+#define RDS_RECV_REFILL		3
 
 struct rds_connection {
 	struct hlist_node	c_hash_node;
diff --git a/net/rds/send.c b/net/rds/send.c
index 2581b8e3dbe7..4df61a515b83 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -282,26 +282,34 @@ restart:
 		/* The transport either sends the whole rdma or none of it */
 		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
 			rm->m_final_op = &rm->rdma;
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue
+			 */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
 			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
-			if (ret)
+			if (ret) {
+				clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+				wake_up_interruptible(&rm->m_flush_wait);
 				break;
+			}
 			conn->c_xmit_rdma_sent = 1;
 
-			/* The transport owns the mapped memory for now.
-			 * You can't unmap it while it's on the send queue */
-			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
 		}
 
 		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
 			rm->m_final_op = &rm->atomic;
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue
+			 */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
 			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
-			if (ret)
+			if (ret) {
+				clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+				wake_up_interruptible(&rm->m_flush_wait);
 				break;
+			}
 			conn->c_xmit_atomic_sent = 1;
 
-			/* The transport owns the mapped memory for now.
-			 * You can't unmap it while it's on the send queue */
-			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
 		}
 
 		/*
@@ -411,7 +419,8 @@ over_batch:
 	 */
 	if (ret == 0) {
 		smp_mb();
-		if (!list_empty(&conn->c_send_queue) &&
+		if ((test_bit(0, &conn->c_map_queued) ||
+		     !list_empty(&conn->c_send_queue)) &&
 		    send_gen == conn->c_send_gen) {
 			rds_stats_inc(s_send_lock_queue_raced);
 			goto restart;
@@ -769,8 +778,22 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 	while (!list_empty(&list)) {
 		rm = list_entry(list.next, struct rds_message, m_sock_item);
 		list_del_init(&rm->m_sock_item);
-
 		rds_message_wait(rm);
+
+		/* just in case the code above skipped this message
+		 * because RDS_MSG_ON_CONN wasn't set, run it again here
+		 * taking m_rs_lock is the only thing that keeps us
+		 * from racing with ack processing.
+		 */
+		spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+		spin_lock(&rs->rs_lock);
+		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+		spin_unlock(&rs->rs_lock);
+
+		rm->m_rs = NULL;
+		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
 		rds_message_put(rm);
 	}
 }
@@ -992,6 +1015,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 		goto out;
 	}
 
+	if (payload_len > rds_sk_sndbuf(rs)) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
 	/* size of rm including all sgs */
 	ret = rds_rm_size(msg, payload_len);
 	if (ret < 0)
@@ -1064,11 +1092,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
 				  dport, &queued)) {
 		rds_stats_inc(s_send_queue_full);
-		/* XXX make sure this is reasonable */
-		if (payload_len > rds_sk_sndbuf(rs)) {
-			ret = -EMSGSIZE;
-			goto out;
-		}
+
 		if (nonblock) {
 			ret = -EAGAIN;
 			goto out;