-rw-r--r--  Documentation/networking/ip-sysctl.txt            |  15
-rw-r--r--  MAINTAINERS                                        |  10
-rw-r--r--  drivers/net/ethernet/allwinner/sun4i-emac.c       |  13
-rw-r--r--  drivers/net/ethernet/cisco/enic/vnic_dev.c        |   8
-rw-r--r--  drivers/net/ethernet/cisco/enic/vnic_wq.h         |   1
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en.h      |   2
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   8
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |   2
-rw-r--r--  drivers/net/ethernet/mellanox/mlxsw/pci.c         |   1
-rw-r--r--  drivers/net/ethernet/realtek/r8169.c              | 163
-rw-r--r--  drivers/net/ethernet/ti/davinci_emac.c            |   4
-rw-r--r--  drivers/net/ppp/ppp_generic.c                     |  17
-rw-r--r--  include/net/inetpeer.h                            |  67
-rw-r--r--  include/net/tcp.h                                 |  15
-rw-r--r--  include/net/vxlan.h                               |   3
-rw-r--r--  net/core/dst.c                                    |   3
-rw-r--r--  net/core/skbuff.c                                 |   4
-rw-r--r--  net/ipv4/ah4.c                                    |   4
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                        |  19
-rw-r--r--  net/ipv4/tcp.c                                    |   2
-rw-r--r--  net/ipv4/tcp_input.c                              |  21
-rw-r--r--  net/ipv4/tcp_output.c                             |  12
-rw-r--r--  net/ipv4/xfrm4_policy.c                           |   7
-rw-r--r--  net/ipv6/ah6.c                                    |   4
-rw-r--r--  net/ipv6/xfrm6_policy.c                           |   7
-rw-r--r--  net/rds/af_rds.c                                  |   9
-rw-r--r--  net/rds/connection.c                              |   2
-rw-r--r--  net/rds/ib.h                                      |   2
-rw-r--r--  net/rds/ib_cm.c                                   |  17
-rw-r--r--  net/rds/ib_rdma.c                                 |  11
-rw-r--r--  net/rds/ib_recv.c                                 |  71
-rw-r--r--  net/rds/ib_send.c                                 |   5
-rw-r--r--  net/rds/rdma.c                                    |   4
-rw-r--r--  net/rds/rdma_transport.c                          |  15
-rw-r--r--  net/rds/rds.h                                     |   1
-rw-r--r--  net/rds/send.c                                    |  54
 36 files changed, 450 insertions(+), 153 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 46e88ed7f41d..ac77a13d2ea2 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -586,6 +586,21 @@ tcp_min_tso_segs - INTEGER
if available window is too small.
Default: 2
+tcp_pacing_ss_ratio - INTEGER
+ sk->sk_pacing_rate is set by TCP stack using a ratio applied
+ to current rate. (current_rate = cwnd * mss / srtt)
+ If TCP is in slow start, tcp_pacing_ss_ratio is applied
+ to let TCP probe for bigger speeds, assuming cwnd can be
+ doubled every other RTT.
+ Default: 200
+
+tcp_pacing_ca_ratio - INTEGER
+ sk->sk_pacing_rate is set by TCP stack using a ratio applied
+ to current rate. (current_rate = cwnd * mss / srtt)
+ If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio
+ is applied to conservatively probe for bigger throughput.
+ Default: 120
+
tcp_tso_win_divisor - INTEGER
This allows control over what percentage of the congestion window
can be consumed by a single TSO frame.
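
For a rough feel for these two ratios, below is a minimal userspace sketch of
the documented formula; the mss, cwnd and srtt values are assumed purely for
illustration and are not part of the patch.

#include <stdio.h>

int main(void)
{
	double mss  = 1448;              /* bytes per segment, assumed     */
	double cwnd = 10;                /* congestion window, assumed     */
	double srtt = 0.020;             /* smoothed RTT of 20 ms, assumed */
	double rate = cwnd * mss / srtt; /* current rate: ~724 kB/s        */

	/* 200% of the current rate while in slow start */
	printf("slow start pacing rate      : %.0f bytes/sec\n", rate * 200 / 100);
	/* 120% of the current rate in congestion avoidance */
	printf("congestion avoidance pacing : %.0f bytes/sec\n", rate * 120 / 100);
	return 0;
}
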
diff --git a/MAINTAINERS b/MAINTAINERS
index 4e6dcb692d30..38df5702ada0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11053,7 +11053,7 @@ F: drivers/input/mouse/vmmouse.c
F: drivers/input/mouse/vmmouse.h
VMWARE VMXNET3 ETHERNET DRIVER
-M: Shreyas Bhatewara <sbhatewara@vmware.com>
+M: Shrikrishna Khare <skhare@vmware.com>
M: "VMware, Inc." <pv-drivers@vmware.com>
L: netdev@vger.kernel.org
S: Maintained
@@ -11078,6 +11078,14 @@ S: Supported
F: drivers/regulator/
F: include/linux/regulator/
+VRF
+M: David Ahern <dsa@cumulusnetworks.com>
+M: Shrijeet Mukherjee <shm@cumulusnetworks.com>
+L: netdev@vger.kernel.org
+S: Maintained
+F: drivers/net/vrf.c
+F: include/net/vrf.h
+
VT1211 HARDWARE MONITOR DRIVER
M: Juerg Haefliger <juergh@gmail.com>
L: lm-sensors@lm-sensors.org
diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c
index bab01c849165..48ce83e443c2 100644
--- a/drivers/net/ethernet/allwinner/sun4i-emac.c
+++ b/drivers/net/ethernet/allwinner/sun4i-emac.c
@@ -28,6 +28,7 @@
#include <linux/of_platform.h>
#include <linux/platform_device.h>
#include <linux/phy.h>
+#include <linux/soc/sunxi/sunxi_sram.h>
#include "sun4i-emac.h"
@@ -857,11 +858,17 @@ static int emac_probe(struct platform_device *pdev)
clk_prepare_enable(db->clk);
+ ret = sunxi_sram_claim(&pdev->dev);
+ if (ret) {
+ dev_err(&pdev->dev, "Error couldn't map SRAM to device\n");
+ goto out;
+ }
+
db->phy_node = of_parse_phandle(np, "phy", 0);
if (!db->phy_node) {
dev_err(&pdev->dev, "no associated PHY\n");
ret = -ENODEV;
- goto out;
+ goto out_release_sram;
}
/* Read MAC-address from DT */
@@ -893,7 +900,7 @@ static int emac_probe(struct platform_device *pdev)
if (ret) {
dev_err(&pdev->dev, "Registering netdev failed!\n");
ret = -ENODEV;
- goto out;
+ goto out_release_sram;
}
dev_info(&pdev->dev, "%s: at %p, IRQ %d MAC: %pM\n",
@@ -901,6 +908,8 @@ static int emac_probe(struct platform_device *pdev)
return 0;
+out_release_sram:
+ sunxi_sram_release(&pdev->dev);
out:
dev_err(db->dev, "not found (%d).\n", ret);
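
The error handling above follows the usual unwind pattern: each successfully
claimed resource gets its own label, and later failures jump to the label that
releases everything claimed so far. A minimal standalone sketch of the idiom,
with made-up helpers (claim_sram(), claim_phy()) standing in for
sunxi_sram_claim() and the PHY lookup:

#include <stdio.h>

/* Hypothetical stand-ins for the driver's acquisition steps. */
static int claim_sram(void) { return 0; }
static void release_sram(void) { }
static int claim_phy(void) { return -1; }	/* simulate the failing step */

static int example_probe(void)
{
	int ret;

	ret = claim_sram();
	if (ret)
		goto out;			/* nothing to undo yet */

	ret = claim_phy();
	if (ret)
		goto out_release_sram;		/* undo the SRAM claim first */

	return 0;

out_release_sram:
	release_sram();
out:
	return ret;
}

int main(void)
{
	printf("probe returned %d\n", example_probe());
	return 0;
}
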
diff --git a/drivers/net/ethernet/cisco/enic/vnic_dev.c b/drivers/net/ethernet/cisco/enic/vnic_dev.c
index 19a49a6e3911..a3badefaf360 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_dev.c
+++ b/drivers/net/ethernet/cisco/enic/vnic_dev.c
@@ -301,12 +301,12 @@ static int _vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd,
struct devcmd2_result *result = dc2c->result + dc2c->next_result;
unsigned int i;
int delay, err;
- u32 fetch_index, posted, new_posted;
+ u32 fetch_index, new_posted;
+ u32 posted = dc2c->posted;
- posted = ioread32(&dc2c->wq_ctrl->posted_index);
fetch_index = ioread32(&dc2c->wq_ctrl->fetch_index);
- if (posted == 0xFFFFFFFF || fetch_index == 0xFFFFFFFF)
+ if (fetch_index == 0xFFFFFFFF)
return -ENODEV;
new_posted = (posted + 1) % DEVCMD2_RING_SIZE;
@@ -331,6 +331,7 @@ static int _vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd,
*/
wmb();
iowrite32(new_posted, &dc2c->wq_ctrl->posted_index);
+ dc2c->posted = new_posted;
if (dc2c->cmd_ring[posted].flags & DEVCMD2_FNORESULT)
return 0;
@@ -402,6 +403,7 @@ static int vnic_dev_init_devcmd2(struct vnic_dev *vdev)
enic_wq_init_start(&vdev->devcmd2->wq, 0, fetch_index, fetch_index, 0,
0);
+ vdev->devcmd2->posted = fetch_index;
vnic_wq_enable(&vdev->devcmd2->wq);
err = vnic_dev_alloc_desc_ring(vdev, &vdev->devcmd2->results_ring,
diff --git a/drivers/net/ethernet/cisco/enic/vnic_wq.h b/drivers/net/ethernet/cisco/enic/vnic_wq.h
index 8944af935a60..01209613d57d 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_wq.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_wq.h
@@ -97,6 +97,7 @@ struct devcmd2_controller {
int color;
struct vnic_dev_ring results_ring;
struct vnic_wq wq;
+ u32 posted;
};
static inline unsigned int vnic_wq_desc_avail(struct vnic_wq *wq)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 27ca4596775a..0983a208b299 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -405,7 +405,6 @@ struct mlx5e_channel {
__be32 mkey_be;
u8 num_tc;
unsigned long flags;
- int tc_to_txq_map[MLX5E_MAX_NUM_TC];
/* control */
struct mlx5e_priv *priv;
@@ -475,6 +474,7 @@ struct mlx5e_priv {
/* priv data path fields - start */
int default_vlan_prio;
struct mlx5e_sq **txq_to_sq_map;
+ int channeltc_to_txq_map[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC];
/* priv data path fields - end */
unsigned long state;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 55166dd5b4ea..59874d666cff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -949,13 +949,13 @@ static void mlx5e_close_sqs(struct mlx5e_channel *c)
mlx5e_close_sq(&c->sq[tc]);
}
-static void mlx5e_build_tc_to_txq_map(struct mlx5e_channel *c,
- int num_channels)
+static void mlx5e_build_channeltc_to_txq_map(struct mlx5e_priv *priv, int ix)
{
int i;
for (i = 0; i < MLX5E_MAX_NUM_TC; i++)
- c->tc_to_txq_map[i] = c->ix + i * num_channels;
+ priv->channeltc_to_txq_map[ix][i] =
+ ix + i * priv->params.num_channels;
}
static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
@@ -979,7 +979,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
c->mkey_be = cpu_to_be32(priv->mr.key);
c->num_tc = priv->params.num_tc;
- mlx5e_build_tc_to_txq_map(c, priv->params.num_channels);
+ mlx5e_build_channeltc_to_txq_map(priv, ix);
netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 64380bc0cd6a..b73672f32e2c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -106,7 +106,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
priv->default_vlan_prio;
int tc = netdev_get_prio_tc_map(dev, up);
- return priv->channel[channel_ix]->tc_to_txq_map[tc];
+ return priv->channeltc_to_txq_map[channel_ix][tc];
}
static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
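
For intuition, the (channel, traffic class) to txq mapping used above and in
mlx5e_select_queue() is simply txq = channel_ix + tc * num_channels, so the
queues of one traffic class are spread across all channels. A tiny standalone
sketch with assumed sizes (4 channels, 2 TCs; not taken from the driver):

#include <stdio.h>

#define NUM_CHANNELS 4	/* assumed for illustration */
#define NUM_TC       2	/* assumed for illustration */

int main(void)
{
	for (int ix = 0; ix < NUM_CHANNELS; ix++)
		for (int tc = 0; tc < NUM_TC; tc++)
			printf("channel %d, tc %d -> txq %d\n",
			       ix, tc, ix + tc * NUM_CHANNELS);
	return 0;
}
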
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index a34f4742aa00..045f98fed476 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1726,6 +1726,7 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
mlxsw_pci_dbg_root);
if (!mlxsw_pci->dbg_dir) {
dev_err(&pdev->dev, "Failed to create debugfs dir\n");
+ err = -ENOMEM;
goto err_dbg_create_dir;
}
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index f790f61ea78a..d6d39df5b3dc 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -637,6 +637,9 @@ enum rtl_register_content {
/* _TBICSRBit */
TBILinkOK = 0x02000000,
+ /* ResetCounterCommand */
+ CounterReset = 0x1,
+
/* DumpCounterCommand */
CounterDump = 0x8,
@@ -747,6 +750,14 @@ struct rtl8169_counters {
__le16 tx_underun;
};
+struct rtl8169_tc_offsets {
+ bool inited;
+ __le64 tx_errors;
+ __le32 tx_multi_collision;
+ __le32 rx_multicast;
+ __le16 tx_aborted;
+};
+
enum rtl_flag {
RTL_FLAG_TASK_ENABLED,
RTL_FLAG_TASK_SLOW_PENDING,
@@ -824,6 +835,7 @@ struct rtl8169_private {
struct mii_if_info mii;
struct rtl8169_counters counters;
+ struct rtl8169_tc_offsets tc_offset;
u32 saved_wolopts;
u32 opts1_mask;
@@ -2179,6 +2191,73 @@ static int rtl8169_get_sset_count(struct net_device *dev, int sset)
}
}
+static struct rtl8169_counters *rtl8169_map_counters(struct net_device *dev,
+ dma_addr_t *paddr,
+ u32 counter_cmd)
+{
+ struct rtl8169_private *tp = netdev_priv(dev);
+ void __iomem *ioaddr = tp->mmio_addr;
+ struct device *d = &tp->pci_dev->dev;
+ struct rtl8169_counters *counters;
+ u32 cmd;
+
+ counters = dma_alloc_coherent(d, sizeof(*counters), paddr, GFP_KERNEL);
+ if (counters) {
+ RTL_W32(CounterAddrHigh, (u64)*paddr >> 32);
+ cmd = (u64)*paddr & DMA_BIT_MASK(32);
+ RTL_W32(CounterAddrLow, cmd);
+ RTL_W32(CounterAddrLow, cmd | counter_cmd);
+ }
+ return counters;
+}
+
+static void rtl8169_unmap_counters (struct net_device *dev,
+ dma_addr_t paddr,
+ struct rtl8169_counters *counters)
+{
+ struct rtl8169_private *tp = netdev_priv(dev);
+ void __iomem *ioaddr = tp->mmio_addr;
+ struct device *d = &tp->pci_dev->dev;
+
+ RTL_W32(CounterAddrLow, 0);
+ RTL_W32(CounterAddrHigh, 0);
+
+ dma_free_coherent(d, sizeof(*counters), counters, paddr);
+}
+
+DECLARE_RTL_COND(rtl_reset_counters_cond)
+{
+ void __iomem *ioaddr = tp->mmio_addr;
+
+ return RTL_R32(CounterAddrLow) & CounterReset;
+}
+
+static bool rtl8169_reset_counters(struct net_device *dev)
+{
+ struct rtl8169_private *tp = netdev_priv(dev);
+ struct rtl8169_counters *counters;
+ dma_addr_t paddr;
+ bool ret = true;
+
+ /*
+ * Versions prior to RTL_GIGA_MAC_VER_19 don't support resetting the
+ * tally counters.
+ */
+ if (tp->mac_version < RTL_GIGA_MAC_VER_19)
+ return true;
+
+ counters = rtl8169_map_counters(dev, &paddr, CounterReset);
+ if (!counters)
+ return false;
+
+ if (!rtl_udelay_loop_wait_low(tp, &rtl_reset_counters_cond, 10, 1000))
+ ret = false;
+
+ rtl8169_unmap_counters(dev, paddr, counters);
+
+ return ret;
+}
+
DECLARE_RTL_COND(rtl_counters_cond)
{
void __iomem *ioaddr = tp->mmio_addr;
@@ -2186,38 +2265,72 @@ DECLARE_RTL_COND(rtl_counters_cond)
return RTL_R32(CounterAddrLow) & CounterDump;
}
-static void rtl8169_update_counters(struct net_device *dev)
+static bool rtl8169_update_counters(struct net_device *dev)
{
struct rtl8169_private *tp = netdev_priv(dev);
void __iomem *ioaddr = tp->mmio_addr;
- struct device *d = &tp->pci_dev->dev;
struct rtl8169_counters *counters;
dma_addr_t paddr;
- u32 cmd;
+ bool ret = true;
/*
* Some chips are unable to dump tally counters when the receiver
* is disabled.
*/
if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0)
- return;
+ return true;
- counters = dma_alloc_coherent(d, sizeof(*counters), &paddr, GFP_KERNEL);
+ counters = rtl8169_map_counters(dev, &paddr, CounterDump);
if (!counters)
- return;
-
- RTL_W32(CounterAddrHigh, (u64)paddr >> 32);
- cmd = (u64)paddr & DMA_BIT_MASK(32);
- RTL_W32(CounterAddrLow, cmd);
- RTL_W32(CounterAddrLow, cmd | CounterDump);
+ return false;
if (rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000))
memcpy(&tp->counters, counters, sizeof(*counters));
+ else
+ ret = false;
- RTL_W32(CounterAddrLow, 0);
- RTL_W32(CounterAddrHigh, 0);
+ rtl8169_unmap_counters(dev, paddr, counters);
- dma_free_coherent(d, sizeof(*counters), counters, paddr);
+ return ret;
+}
+
+static bool rtl8169_init_counter_offsets(struct net_device *dev)
+{
+ struct rtl8169_private *tp = netdev_priv(dev);
+ bool ret = false;
+
+ /*
+ * rtl8169_init_counter_offsets is called from rtl_open. On chip
+ * versions prior to RTL_GIGA_MAC_VER_19 the tally counters are only
+ * reset by a power cycle, while the counter values collected by the
+ * driver are reset at every driver unload/load cycle.
+ *
+ * To make sure the HW values returned by @get_stats64 match the SW
+ * values, we collect the initial values at first open(*) and use them
+ * as offsets to normalize the values returned by @get_stats64.
+ *
+ * (*) We can't call rtl8169_init_counter_offsets from rtl_init_one
+ * for the reason stated in rtl8169_update_counters; CmdRxEnb is only
+ * set at open time by rtl_hw_start.
+ */
+
+ if (tp->tc_offset.inited)
+ return true;
+
+ /* If both reset and update fail, propagate the failure to the caller. */
+ if (rtl8169_reset_counters(dev))
+ ret = true;
+
+ if (rtl8169_update_counters(dev))
+ ret = true;
+
+ tp->tc_offset.tx_errors = tp->counters.tx_errors;
+ tp->tc_offset.tx_multi_collision = tp->counters.tx_multi_collision;
+ tp->tc_offset.rx_multicast = tp->counters.rx_multicast;
+ tp->tc_offset.tx_aborted = tp->counters.tx_aborted;
+ tp->tc_offset.inited = true;
+
+ return ret;
}
static void rtl8169_get_ethtool_stats(struct net_device *dev,
@@ -7631,6 +7744,9 @@ static int rtl_open(struct net_device *dev)
rtl_hw_start(dev);
+ if (!rtl8169_init_counter_offsets(dev))
+ netif_warn(tp, hw, dev, "counter reset/update failed\n");
+
netif_start_queue(dev);
rtl_unlock_work(tp);
@@ -7689,6 +7805,25 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
stats->rx_fifo_errors = dev->stats.rx_fifo_errors;
stats->rx_missed_errors = dev->stats.rx_missed_errors;
+ /*
+ * Fetch additional counter values missing in stats collected by driver
+ * from tally counters.
+ */
+ rtl8169_update_counters(dev);
+
+ /*
+ * Subtract values fetched during initialization.
+ * See rtl8169_init_counter_offsets for a description why we do that.
+ */
+ stats->tx_errors = le64_to_cpu(tp->counters.tx_errors) -
+ le64_to_cpu(tp->tc_offset.tx_errors);
+ stats->collisions = le32_to_cpu(tp->counters.tx_multi_collision) -
+ le32_to_cpu(tp->tc_offset.tx_multi_collision);
+ stats->multicast = le32_to_cpu(tp->counters.rx_multicast) -
+ le32_to_cpu(tp->tc_offset.rx_multicast);
+ stats->tx_aborted_errors = le16_to_cpu(tp->counters.tx_aborted) -
+ le16_to_cpu(tp->tc_offset.tx_aborted);
+
return stats;
}
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index aeebc0a7bf47..a21c77bc1b27 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -2004,8 +2004,10 @@ static int davinci_emac_probe(struct platform_device *pdev)
if (res_ctrl) {
priv->ctrl_base =
devm_ioremap_resource(&pdev->dev, res_ctrl);
- if (IS_ERR(priv->ctrl_base))
+ if (IS_ERR(priv->ctrl_base)) {
+ rc = PTR_ERR(priv->ctrl_base);
goto no_pdata;
+ }
} else {
priv->ctrl_base = priv->remap_addr + pdata->ctrl_mod_reg_offset;
}
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index fa8f5046afe9..0481daf9201a 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -283,6 +283,8 @@ static int unit_set(struct idr *p, void *ptr, int n);
static void unit_put(struct idr *p, int n);
static void *unit_find(struct idr *p, int n);
+static const struct net_device_ops ppp_netdev_ops;
+
static struct class *ppp_class;
/* per net-namespace data */
@@ -919,13 +921,22 @@ static __net_init int ppp_init_net(struct net *net)
static __net_exit void ppp_exit_net(struct net *net)
{
struct ppp_net *pn = net_generic(net, ppp_net_id);
+ struct net_device *dev;
+ struct net_device *aux;
struct ppp *ppp;
LIST_HEAD(list);
int id;
rtnl_lock();
+ for_each_netdev_safe(net, dev, aux) {
+ if (dev->netdev_ops == &ppp_netdev_ops)
+ unregister_netdevice_queue(dev, &list);
+ }
+
idr_for_each_entry(&pn->units_idr, ppp, id)
- unregister_netdevice_queue(ppp->dev, &list);
+ /* Skip devices already unregistered by previous loop */
+ if (!net_eq(dev_net(ppp->dev), net))
+ unregister_netdevice_queue(ppp->dev, &list);
unregister_netdevice_many(&list);
rtnl_unlock();
@@ -1017,6 +1028,7 @@ ppp_start_xmit(struct sk_buff *skb, struct net_device *dev)
proto = npindex_to_proto[npi];
put_unaligned_be16(proto, pp);
+ skb_scrub_packet(skb, !net_eq(ppp->ppp_net, dev_net(dev)));
skb_queue_tail(&ppp->file.xq, skb);
ppp_xmit_process(ppp);
return NETDEV_TX_OK;
@@ -1137,7 +1149,6 @@ static void ppp_setup(struct net_device *dev)
dev->tx_queue_len = 3;
dev->type = ARPHRD_PPP;
dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
- dev->features |= NETIF_F_NETNS_LOCAL;
netif_keep_dst(dev);
}
@@ -1900,6 +1911,8 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
skb->dev = ppp->dev;
skb->protocol = htons(npindex_to_ethertype[npi]);
skb_reset_mac_header(skb);
+ skb_scrub_packet(skb, !net_eq(ppp->ppp_net,
+ dev_net(ppp->dev)));
netif_rx(skb);
}
}
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index d5332ddcea3f..002f0bd27001 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -65,71 +65,12 @@ struct inet_peer_base {
int total;
};
-#define INETPEER_BASE_BIT 0x1UL
-
-static inline struct inet_peer *inetpeer_ptr(unsigned long val)
-{
- BUG_ON(val & INETPEER_BASE_BIT);
- return (struct inet_peer *) val;
-}
-
-static inline struct inet_peer_base *inetpeer_base_ptr(unsigned long val)
-{
- if (!(val & INETPEER_BASE_BIT))
- return NULL;
- val &= ~INETPEER_BASE_BIT;
- return (struct inet_peer_base *) val;
-}
-
-static inline bool inetpeer_ptr_is_peer(unsigned long val)
-{
- return !(val & INETPEER_BASE_BIT);
-}
-
-static inline void __inetpeer_ptr_set_peer(unsigned long *val, struct inet_peer *peer)
-{
- /* This implicitly clears INETPEER_BASE_BIT */
- *val = (unsigned long) peer;
-}
-
-static inline bool inetpeer_ptr_set_peer(unsigned long *ptr, struct inet_peer *peer)
-{
- unsigned long val = (unsigned long) peer;
- unsigned long orig = *ptr;
-
- if (!(orig & INETPEER_BASE_BIT) ||
- cmpxchg(ptr, orig, val) != orig)
- return false;
- return true;
-}
-
-static inline void inetpeer_init_ptr(unsigned long *ptr, struct inet_peer_base *base)
-{
- *ptr = (unsigned long) base | INETPEER_BASE_BIT;
-}
-
-static inline void inetpeer_transfer_peer(unsigned long *to, unsigned long *from)
-{
- unsigned long val = *from;
-
- *to = val;
- if (inetpeer_ptr_is_peer(val)) {
- struct inet_peer *peer = inetpeer_ptr(val);
- atomic_inc(&peer->refcnt);
- }
-}
-
void inet_peer_base_init(struct inet_peer_base *);
void inet_initpeers(void) __init;
#define INETPEER_METRICS_NEW (~(u32) 0)
-static inline bool inet_metrics_new(const struct inet_peer *p)
-{
- return p->metrics[RTAX_LOCK-1] == INETPEER_METRICS_NEW;
-}
-
/* can be called with or without local BH being disabled */
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
const struct inetpeer_addr *daddr,
@@ -163,12 +104,4 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);
void inetpeer_invalidate_tree(struct inet_peer_base *);
-/*
- * temporary check to make sure we dont access rid, tcp_ts,
- * tcp_ts_stamp if no refcount is taken on inet_peer
- */
-static inline void inet_peer_refcheck(const struct inet_peer *p)
-{
- WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0);
-}
#endif /* _NET_INETPEER_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 364426a2be5a..4a7b03947a38 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs;
extern int sysctl_tcp_autocorking;
extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_pacing_ss_ratio;
+extern int sysctl_tcp_pacing_ca_ratio;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -1165,6 +1167,19 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
}
u32 tcp_default_init_rwnd(u32 mss);
+void tcp_cwnd_restart(struct sock *sk, s32 delta);
+
+static inline void tcp_slow_start_after_idle_check(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ s32 delta;
+
+ if (!sysctl_tcp_slow_start_after_idle || tp->packets_out)
+ return;
+ delta = tcp_time_stamp - tp->lsndtime;
+ if (delta > inet_csk(sk)->icsk_rto)
+ tcp_cwnd_restart(sk, delta);
+}
/* Determine a window scaling and initial window to offer. */
void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 6b3234599a2c..480a319b4c92 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -241,9 +241,10 @@ static inline void vxlan_get_rx_port(struct net_device *netdev)
{
}
#endif
-#endif
static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
{
return vs->sock->sk->sk_family;
}
+
+#endif
diff --git a/net/core/dst.c b/net/core/dst.c
index 50dcdbb0ee46..477035ed7903 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -262,11 +262,12 @@ again:
if (dst->dev)
dev_put(dst->dev);
+ lwtstate_put(dst->lwtstate);
+
if (dst->flags & DST_METADATA)
kfree(dst);
else
kmem_cache_free(dst->ops->kmem_cachep, dst);
- lwtstate_put(dst->lwtstate);
dst = child;
if (dst) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bf9a5d93c2d1..8a725cc50a90 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -392,7 +392,7 @@ EXPORT_SYMBOL(napi_alloc_frag);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
* @dev: network device to receive on
- * @length: length to allocate
+ * @len: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb
*
* Allocate a new &sk_buff and assign it a usage count of one. The
@@ -461,7 +461,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
/**
* __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
* @napi: napi instance this buffer was allocated for
- * @length: length to allocate
+ * @len: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
*
* Allocate a new sk_buff for use in NAPI receive. This buffer will
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index ac9a32ec3ee4..f2a71025a770 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -360,8 +360,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
ahp->icv_trunc_len + seqhi_len);
- if (!work_iph)
+ if (!work_iph) {
+ err = -ENOMEM;
goto out;
+ }
seqhi = (__be32 *)((char *)work_iph + ihl);
auth_data = ah_tmp_auth(seqhi, seqhi_len);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0330ab2e2b63..879bdc5c95b1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
static int zero;
static int one = 1;
static int four = 4;
+static int thousand = 1000;
static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
@@ -712,6 +713,24 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &gso_max_segs,
},
{
+ .procname = "tcp_pacing_ss_ratio",
+ .data = &sysctl_tcp_pacing_ss_ratio,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &thousand,
+ },
+ {
+ .procname = "tcp_pacing_ca_ratio",
+ .data = &sysctl_tcp_pacing_ca_ratio,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &thousand,
+ },
+ {
.procname = "tcp_autocorking",
.data = &sysctl_tcp_autocorking,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 45534a5ab430..b8b8fa184f75 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -627,6 +627,8 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb)
sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
+
+ tcp_slow_start_after_idle_check(sk);
}
static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4e4d6bcd0ca9..dc08e2352665 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
* TCP pacing, to smooth the burst on large writes when packets
* in flight is significantly lower than cwnd (or rwin)
*/
+int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
+int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
+
static void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
u64 rate;
/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
- rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+ rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+ /* current rate is (cwnd * mss) / srtt
+ * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+ * In Congestion Avoidance phase, set it to 120 % the current rate.
+ *
+ * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+ * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+ * end of slow start and should slow down.
+ */
+ if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+ rate *= sysctl_tcp_pacing_ss_ratio;
+ else
+ rate *= sysctl_tcp_pacing_ca_ratio;
rate *= max(tp->snd_cwnd, tp->packets_out);
@@ -3332,6 +3348,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
tp->pred_flags = 0;
tcp_fast_path_check(sk);
+ if (tcp_send_head(sk))
+ tcp_slow_start_after_idle_check(sk);
+
if (nwin > tp->max_window) {
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 444ab5beecbd..1188e4fcf23b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -137,12 +137,12 @@ static __u16 tcp_advertise_mss(struct sock *sk)
}
/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
- * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
+ * This is the first part of cwnd validation mechanism.
+ */
+void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
struct tcp_sock *tp = tcp_sk(sk);
- s32 delta = tcp_time_stamp - tp->lsndtime;
- u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+ u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
u32 cwnd = tp->snd_cwnd;
tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
@@ -164,10 +164,6 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
- if (sysctl_tcp_slow_start_after_idle &&
- (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
- tcp_cwnd_restart(sk, __sk_dst_get(sk));
-
tp->lsndtime = now;
/* If it is a reply for ato after last received
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 55b3c0f4dde5..bb919b28619f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -15,6 +15,7 @@
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#include <net/vrf.h>
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
@@ -107,8 +108,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
struct flowi4 *fl4 = &fl->u.ip4;
int oif = 0;
- if (skb_dst(skb))
- oif = skb_dst(skb)->dev->ifindex;
+ if (skb_dst(skb)) {
+ oif = vrf_master_ifindex(skb_dst(skb)->dev) ?
+ : skb_dst(skb)->dev->ifindex;
+ }
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
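
The new oif selection above (and the matching IPv6 hunk below) relies on the
GCC/Clang binary conditional "a ?: b", which evaluates a once and falls back
to b when a is zero, so the flow uses the VRF master ifindex when one exists
and the device ifindex otherwise. A minimal sketch of the idiom; pick_oif()
is a hypothetical helper, not part of the patch:

#include <stdio.h>

/* a ?: b behaves like a ? a : b, but evaluates a only once. */
static int pick_oif(int vrf_master_ifindex, int dev_ifindex)
{
	return vrf_master_ifindex ?: dev_ifindex;
}

int main(void)
{
	printf("%d\n", pick_oif(0, 3));		/* not enslaved to a VRF -> 3  */
	printf("%d\n", pick_oif(12, 3));	/* enslaved to a VRF master -> 12 */
	return 0;
}
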
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index ed7d4e3f9c10..0630a4d5daaa 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -577,8 +577,10 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len +
ahp->icv_trunc_len + seqhi_len);
- if (!work_iph)
+ if (!work_iph) {
+ err = -ENOMEM;
goto out;
+ }
auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len);
seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len);
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index a74013d3eceb..30caa289c5db 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -20,6 +20,7 @@
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
+#include <net/vrf.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
@@ -131,8 +132,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
nexthdr = nh[nhoff];
- if (skb_dst(skb))
- oif = skb_dst(skb)->dev->ifindex;
+ if (skb_dst(skb)) {
+ oif = vrf_master_ifindex(skb_dst(skb)->dev) ?
+ : skb_dst(skb)->dev->ifindex;
+ }
memset(fl6, 0, sizeof(struct flowi6));
fl6->flowi6_mark = skb->mark;
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 896834cd3b9a..a2f28a6d4dc5 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -438,6 +438,14 @@ static const struct proto_ops rds_proto_ops = {
.sendpage = sock_no_sendpage,
};
+static void rds_sock_destruct(struct sock *sk)
+{
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+
+ WARN_ON((&rs->rs_item != rs->rs_item.next ||
+ &rs->rs_item != rs->rs_item.prev));
+}
+
static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
struct rds_sock *rs;
@@ -445,6 +453,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
sock_init_data(sock, sk);
sock->ops = &rds_proto_ops;
sk->sk_protocol = protocol;
+ sk->sk_destruct = rds_sock_destruct;
rs = rds_sk_to_rs(sk);
spin_lock_init(&rs->rs_lock);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index d4fecb21ca25..a50e652eb269 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -301,6 +301,8 @@ void rds_conn_shutdown(struct rds_connection *conn)
wait_event(conn->c_waitq,
!test_bit(RDS_IN_XMIT, &conn->c_flags));
+ wait_event(conn->c_waitq,
+ !test_bit(RDS_RECV_REFILL, &conn->c_flags));
conn->c_trans->conn_shutdown(conn);
rds_conn_reset(conn);
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 86d88ec5d556..6422c52682e5 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -320,7 +320,7 @@ void rds_ib_recv_exit(void);
int rds_ib_recv(struct rds_connection *conn);
int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
void rds_ib_inc_free(struct rds_incoming *inc);
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f40d8f52b753..d150bb4aa3cb 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -135,7 +135,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_ib_recv_init_ring(ic);
/* Post receive buffers - as a side effect, this will update
* the posted credit count. */
- rds_ib_recv_refill(conn, 1);
+ rds_ib_recv_refill(conn, 1, GFP_KERNEL);
/* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr);
@@ -640,6 +640,15 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
(atomic_read(&ic->i_signaled_sends) == 0));
tasklet_kill(&ic->i_recv_tasklet);
+ /* first destroy the ib state that generates callbacks */
+ if (ic->i_cm_id->qp)
+ rdma_destroy_qp(ic->i_cm_id);
+ if (ic->i_send_cq)
+ ib_destroy_cq(ic->i_send_cq);
+ if (ic->i_recv_cq)
+ ib_destroy_cq(ic->i_recv_cq);
+
+ /* then free the resources that ib callbacks use */
if (ic->i_send_hdrs)
ib_dma_free_coherent(dev,
ic->i_send_ring.w_nr *
@@ -663,12 +672,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
if (ic->i_recvs)
rds_ib_recv_clear_ring(ic);
- if (ic->i_cm_id->qp)
- rdma_destroy_qp(ic->i_cm_id);
- if (ic->i_send_cq)
- ib_destroy_cq(ic->i_send_cq);
- if (ic->i_recv_cq)
- ib_destroy_cq(ic->i_recv_cq);
rdma_destroy_id(ic->i_cm_id);
/*
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 657ba9f5d308..7b7aac8cdb56 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -151,12 +151,17 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
struct rds_ib_device *rds_ibdev_old;
rds_ibdev_old = rds_ib_get_device(ipaddr);
- if (rds_ibdev_old) {
+ if (!rds_ibdev_old)
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+
+ if (rds_ibdev_old != rds_ibdev) {
rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
rds_ib_dev_put(rds_ibdev_old);
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
}
+ rds_ib_dev_put(rds_ibdev_old);
- return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+ return 0;
}
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -485,7 +490,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
/* FIXME we need a way to tell a r/w MR
* from a r/o MR */
- BUG_ON(irqs_disabled());
+ WARN_ON(!page->mapping && irqs_disabled());
set_page_dirty(page);
put_page(page);
}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index cac5b4506ee3..ed9b41e3b277 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -297,7 +297,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
}
static int rds_ib_recv_refill_one(struct rds_connection *conn,
- struct rds_ib_recv_work *recv, int prefill)
+ struct rds_ib_recv_work *recv, gfp_t gfp)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct ib_sge *sge;
@@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
gfp_t slab_mask = GFP_NOWAIT;
gfp_t page_mask = GFP_NOWAIT;
- if (prefill) {
+ if (gfp & __GFP_WAIT) {
slab_mask = GFP_KERNEL;
page_mask = GFP_HIGHUSER;
}
@@ -347,6 +347,24 @@ out:
return ret;
}
+static int acquire_refill(struct rds_connection *conn)
+{
+ return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0;
+}
+
+static void release_refill(struct rds_connection *conn)
+{
+ clear_bit(RDS_RECV_REFILL, &conn->c_flags);
+
+ /* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+ * hot path and finding waiters is very rare. We don't want to walk
+ * the system-wide hashed waitqueue buckets in the fast path only to
+ * almost never find waiters.
+ */
+ if (waitqueue_active(&conn->c_waitq))
+ wake_up_all(&conn->c_waitq);
+}
+
/*
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
@@ -354,15 +372,23 @@ out:
*
* -1 is returned if posting fails due to temporary resource exhaustion.
*/
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_recv_work *recv;
struct ib_recv_wr *failed_wr;
unsigned int posted = 0;
int ret = 0;
+ bool can_wait = !!(gfp & __GFP_WAIT);
u32 pos;
+ /* the goal here is to just make sure that someone, somewhere
+ * is posting buffers. If we can't get the refill lock,
+ * let them do their thing
+ */
+ if (!acquire_refill(conn))
+ return;
+
while ((prefill || rds_conn_up(conn)) &&
rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
if (pos >= ic->i_recv_ring.w_nr) {
@@ -372,7 +398,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
}
recv = &ic->i_recvs[pos];
- ret = rds_ib_recv_refill_one(conn, recv, prefill);
+ ret = rds_ib_recv_refill_one(conn, recv, gfp);
if (ret) {
break;
}
@@ -402,6 +428,24 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
if (ret)
rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+
+ release_refill(conn);
+
+ /* if we're called from the softirq handler, we'll be GFP_NOWAIT.
+ * in this case the ring being low is going to lead to more interrupts
+ * and we can safely let the softirq code take care of it unless the
+ * ring is completely empty.
+ *
+ * if we're called from krdsd, we'll be GFP_KERNEL. In this case
+ * we might have raced with the softirq code while we had the refill
+ * lock held. Use rds_ib_ring_low() instead of ring_empty to decide
+ * if we should requeue.
+ */
+ if (rds_conn_up(conn) &&
+ ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
+ rds_ib_ring_empty(&ic->i_recv_ring))) {
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
+ }
}
/*
@@ -982,10 +1026,17 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
}
/*
- * It's very important that we only free this ring entry if we've truly
- * freed the resources allocated to the entry. The refilling path can
- * leak if we don't.
+ * rds_ib_process_recv() doesn't always consume the frag, and
+ * we might not have called it at all if the wc didn't indicate
+ * success. We already unmapped the frag's pages, though, and
+ * the following rds_ib_ring_free() call tells the refill path
+ * that it will not find an allocated frag here. Make sure we
+ * keep that promise by freeing a frag that's still on the ring.
*/
+ if (recv->r_frag) {
+ rds_ib_frag_free(ic, recv->r_frag);
+ recv->r_frag = NULL;
+ }
rds_ib_ring_free(&ic->i_recv_ring, 1);
}
}
@@ -1016,7 +1067,7 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
rds_ib_stats_inc(s_ib_rx_ring_empty);
if (rds_ib_ring_low(&ic->i_recv_ring))
- rds_ib_recv_refill(conn, 0);
+ rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
}
int rds_ib_recv(struct rds_connection *conn)
@@ -1025,8 +1076,10 @@ int rds_ib_recv(struct rds_connection *conn)
int ret = 0;
rdsdebug("conn %p\n", conn);
- if (rds_conn_up(conn))
+ if (rds_conn_up(conn)) {
rds_ib_attempt_ack(ic);
+ rds_ib_recv_refill(conn, 0, GFP_KERNEL);
+ }
return ret;
}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 5d0a704fa039..c576ebeb4115 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -709,6 +709,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
if (scat == &rm->data.op_sg[rm->data.op_count]) {
prev->s_op = ic->i_data_op;
prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+ if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ prev->s_wr.send_flags |= IB_SEND_SIGNALED;
+ nr_sig++;
+ }
ic->i_data_op = NULL;
}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 40084d843e9f..c1df9b1cf3b2 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -451,7 +451,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)
* is the case for a RDMA_READ which copies from remote
* to local memory */
if (!ro->op_write) {
- BUG_ON(irqs_disabled());
+ WARN_ON(!page->mapping && irqs_disabled());
set_page_dirty(page);
}
put_page(page);
@@ -658,6 +658,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
if (ret < 0)
goto out;
+ else
+ ret = 0;
rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
nr_bytes, nr, iov->bytes, iov->addr);
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 208240836043..b9b40af5345b 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -34,6 +34,7 @@
#include <rdma/rdma_cm.h>
#include "rdma_transport.h"
+#include "ib.h"
static struct rdma_cm_id *rds_rdma_listen_id;
@@ -82,8 +83,18 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
break;
case RDMA_CM_EVENT_ROUTE_RESOLVED:
- /* XXX worry about racing with listen acceptance */
- ret = trans->cm_initiate_connect(cm_id);
+ /* Connection could have been dropped so make sure the
+ * cm_id is valid before proceeding
+ */
+ if (conn) {
+ struct rds_ib_connection *ibic;
+
+ ibic = conn->c_transport_data;
+ if (ibic && ibic->i_cm_id == cm_id)
+ ret = trans->cm_initiate_connect(cm_id);
+ else
+ rds_conn_drop(conn);
+ }
break;
case RDMA_CM_EVENT_ESTABLISHED:
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 9005fb0586f6..afb4048d0cfd 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -80,6 +80,7 @@ enum {
#define RDS_LL_SEND_FULL 0
#define RDS_RECONNECT_PENDING 1
#define RDS_IN_XMIT 2
+#define RDS_RECV_REFILL 3
struct rds_connection {
struct hlist_node c_hash_node;
diff --git a/net/rds/send.c b/net/rds/send.c
index 2581b8e3dbe7..4df61a515b83 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -282,26 +282,34 @@ restart:
/* The transport either sends the whole rdma or none of it */
if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
rm->m_final_op = &rm->rdma;
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue
+ */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
- if (ret)
+ if (ret) {
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ wake_up_interruptible(&rm->m_flush_wait);
break;
+ }
conn->c_xmit_rdma_sent = 1;
- /* The transport owns the mapped memory for now.
- * You can't unmap it while it's on the send queue */
- set_bit(RDS_MSG_MAPPED, &rm->m_flags);
}
if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
rm->m_final_op = &rm->atomic;
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue
+ */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
- if (ret)
+ if (ret) {
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ wake_up_interruptible(&rm->m_flush_wait);
break;
+ }
conn->c_xmit_atomic_sent = 1;
- /* The transport owns the mapped memory for now.
- * You can't unmap it while it's on the send queue */
- set_bit(RDS_MSG_MAPPED, &rm->m_flags);
}
/*
@@ -411,7 +419,8 @@ over_batch:
*/
if (ret == 0) {
smp_mb();
- if (!list_empty(&conn->c_send_queue) &&
+ if ((test_bit(0, &conn->c_map_queued) ||
+ !list_empty(&conn->c_send_queue)) &&
send_gen == conn->c_send_gen) {
rds_stats_inc(s_send_lock_queue_raced);
goto restart;
@@ -769,8 +778,22 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
while (!list_empty(&list)) {
rm = list_entry(list.next, struct rds_message, m_sock_item);
list_del_init(&rm->m_sock_item);
-
rds_message_wait(rm);
+
+ /* just in case the code above skipped this message
+ * because RDS_MSG_ON_CONN wasn't set, run it again here
+ * taking m_rs_lock is the only thing that keeps us
+ * from racing with ack processing.
+ */
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+ spin_lock(&rs->rs_lock);
+ __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+ spin_unlock(&rs->rs_lock);
+
+ rm->m_rs = NULL;
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
rds_message_put(rm);
}
}
@@ -992,6 +1015,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
goto out;
}
+ if (payload_len > rds_sk_sndbuf(rs)) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
/* size of rm including all sgs */
ret = rds_rm_size(msg, payload_len);
if (ret < 0)
@@ -1064,11 +1092,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
dport, &queued)) {
rds_stats_inc(s_send_queue_full);
- /* XXX make sure this is reasonable */
- if (payload_len > rds_sk_sndbuf(rs)) {
- ret = -EMSGSIZE;
- goto out;
- }
+
if (nonblock) {
ret = -EAGAIN;
goto out;