summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/atm/common.c2
-rw-r--r--net/atm/mpc.c9
-rw-r--r--net/atm/mpoa_caches.c48
-rw-r--r--net/atm/mpoa_caches.h9
-rw-r--r--net/atm/mpoa_proc.c15
-rw-r--r--net/bridge/br_device.c10
-rw-r--r--net/bridge/br_fdb.c392
-rw-r--r--net/bridge/br_mdb.c6
-rw-r--r--net/bridge/br_nf_core.c1
-rw-r--r--net/bridge/br_private.h16
-rw-r--r--net/bridge/br_switchdev.c8
-rw-r--r--net/can/gw.c14
-rw-r--r--net/core/dev.c86
-rw-r--r--net/core/dst.c14
-rw-r--r--net/core/filter.c40
-rw-r--r--net/core/flow_dissector.c12
-rw-r--r--net/core/gen_stats.c9
-rw-r--r--net/core/pktgen.c12
-rw-r--r--net/core/rtnetlink.c312
-rw-r--r--net/core/sock_reuseport.c4
-rw-r--r--net/dccp/minisocks.c7
-rw-r--r--net/decnet/dn_dev.c9
-rw-r--r--net/decnet/dn_fib.c6
-rw-r--r--net/decnet/dn_route.c42
-rw-r--r--net/dsa/Kconfig9
-rw-r--r--net/dsa/Makefile3
-rw-r--r--net/dsa/dsa2.c2
-rw-r--r--net/dsa/dsa_priv.h9
-rw-r--r--net/dsa/legacy.c20
-rw-r--r--net/dsa/slave.c25
-rw-r--r--net/dsa/switch.c111
-rw-r--r--net/dsa/tag_mtk.c38
-rw-r--r--net/ipv4/inet_hashtables.c184
-rw-r--r--net/ipv4/inet_timewait_sock.c27
-rw-r--r--net/ipv4/ip_gre.c172
-rw-r--r--net/ipv4/route.c2
-rw-r--r--net/ipv4/tcp.c3
-rw-r--r--net/ipv4/tcp_fastopen.c30
-rw-r--r--net/ipv4/tcp_input.c31
-rw-r--r--net/ipv4/tcp_metrics.c5
-rw-r--r--net/ipv4/tcp_minisocks.c7
-rw-r--r--net/ipv4/tcp_output.c9
-rw-r--r--net/ipv4/tcp_timer.c17
-rw-r--r--net/ipv4/udp.c44
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c2
-rw-r--r--net/ipv6/addrconf.c48
-rw-r--r--net/ipv6/addrlabel.c25
-rw-r--r--net/ipv6/ila/ila_xlat.c4
-rw-r--r--net/ipv6/inet6_hashtables.c77
-rw-r--r--net/ipv6/ip6_fib.c30
-rw-r--r--net/ipv6/ip6_gre.c649
-rw-r--r--net/ipv6/ip6_output.c4
-rw-r--r--net/ipv6/ip6_tunnel.c5
-rw-r--r--net/ipv6/ip6mr.c9
-rw-r--r--net/ipv6/route.c70
-rw-r--r--net/ipv6/seg6.c4
-rw-r--r--net/ipv6/udp.c54
-rw-r--r--net/ipv6/xfrm6_mode_tunnel.c2
-rw-r--r--net/ipv6/xfrm6_policy.c2
-rw-r--r--net/mac80211/mesh_pathtbl.c34
-rw-r--r--net/mpls/af_mpls.c15
-rw-r--r--net/netfilter/nft_set_hash.c10
-rw-r--r--net/netfilter/xt_policy.c3
-rw-r--r--net/netlink/af_netlink.c71
-rw-r--r--net/netlink/diag.c8
-rw-r--r--net/openvswitch/flow.c6
-rw-r--r--net/openvswitch/flow_netlink.c8
-rw-r--r--net/openvswitch/vport-internal_dev.c9
-rw-r--r--net/phonet/pn_netlink.c21
-rw-r--r--net/qrtr/qrtr.c8
-rw-r--r--net/rds/connection.c7
-rw-r--r--net/rds/rds.h6
-rw-r--r--net/rds/tcp.c38
-rw-r--r--net/rds/tcp.h1
-rw-r--r--net/sched/act_api.c2
-rw-r--r--net/sched/act_bpf.c10
-rw-r--r--net/sched/act_connmark.c8
-rw-r--r--net/sched/act_csum.c8
-rw-r--r--net/sched/act_gact.c8
-rw-r--r--net/sched/act_ife.c18
-rw-r--r--net/sched/act_ipt.c18
-rw-r--r--net/sched/act_mirred.c27
-rw-r--r--net/sched/act_nat.c8
-rw-r--r--net/sched/act_pedit.c10
-rw-r--r--net/sched/act_police.c8
-rw-r--r--net/sched/act_sample.c10
-rw-r--r--net/sched/act_simple.c10
-rw-r--r--net/sched/act_skbedit.c8
-rw-r--r--net/sched/act_skbmod.c10
-rw-r--r--net/sched/act_tunnel_key.c10
-rw-r--r--net/sched/act_vlan.c10
-rw-r--r--net/sched/cls_api.c43
-rw-r--r--net/sched/cls_flower.c1
-rw-r--r--net/sched/sch_api.c105
-rw-r--r--net/sched/sch_generic.c440
-rw-r--r--net/sched/sch_mq.c34
-rw-r--r--net/sched/sch_mqprio.c69
-rw-r--r--net/sctp/Makefile2
-rw-r--r--net/sctp/associola.c2
-rw-r--r--net/sctp/chunk.c8
-rw-r--r--net/sctp/output.c5
-rw-r--r--net/sctp/outqueue.c12
-rw-r--r--net/sctp/proc.c6
-rw-r--r--net/sctp/sm_make_chunk.c69
-rw-r--r--net/sctp/sm_sideeffect.c47
-rw-r--r--net/sctp/sm_statefuns.c45
-rw-r--r--net/sctp/sm_statetable.c5
-rw-r--r--net/sctp/socket.c149
-rw-r--r--net/sctp/stream.c47
-rw-r--r--net/sctp/stream_interleave.c1334
-rw-r--r--net/sctp/stream_sched.c3
-rw-r--r--net/sctp/sysctl.c7
-rw-r--r--net/sctp/ulpevent.c15
-rw-r--r--net/sctp/ulpqueue.c23
-rw-r--r--net/smc/af_smc.c24
-rw-r--r--net/smc/smc_cdc.c12
-rw-r--r--net/smc/smc_clc.c84
-rw-r--r--net/smc/smc_clc.h34
-rw-r--r--net/smc/smc_close.c2
-rw-r--r--net/smc/smc_close.h1
-rw-r--r--net/smc/smc_rx.c5
-rw-r--r--net/smc/smc_tx.c11
-rw-r--r--net/tipc/bcast.c12
-rw-r--r--net/tipc/link.c2
-rw-r--r--net/tipc/msg.c51
-rw-r--r--net/tipc/msg.h3
-rw-r--r--net/tipc/socket.c6
-rw-r--r--net/xfrm/xfrm_device.c6
-rw-r--r--net/xfrm/xfrm_input.c1
-rw-r--r--net/xfrm/xfrm_output.c2
-rw-r--r--net/xfrm/xfrm_policy.c137
-rw-r--r--net/xfrm/xfrm_replay.c3
-rw-r--r--net/xfrm/xfrm_state.c7
133 files changed, 4455 insertions, 1663 deletions
diff --git a/net/atm/common.c b/net/atm/common.c
index 8a4f99114cd2..5763fd241dc3 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -14,7 +14,7 @@
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
-#include <linux/time.h> /* struct timeval */
+#include <linux/time64.h> /* 64-bit time for seconds */
#include <linux/skbuff.h>
#include <linux/bitops.h>
#include <linux/init.h>
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 7c6a1cc760a2..31e0dcb970f8 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -1089,7 +1089,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc)
msg->type = SND_MPOA_RES_RQST;
msg->content.in_info = entry->ctrl_info;
msg_to_mpoad(msg, mpc);
- do_gettimeofday(&(entry->reply_wait));
+ entry->reply_wait = ktime_get_seconds();
mpc->in_ops->put(entry);
return;
}
@@ -1099,7 +1099,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc)
msg->type = SND_MPOA_RES_RQST;
msg->content.in_info = entry->ctrl_info;
msg_to_mpoad(msg, mpc);
- do_gettimeofday(&(entry->reply_wait));
+ entry->reply_wait = ktime_get_seconds();
mpc->in_ops->put(entry);
return;
}
@@ -1175,8 +1175,9 @@ static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc)
}
entry->ctrl_info = msg->content.in_info;
- do_gettimeofday(&(entry->tv));
- do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */
+ entry->time = ktime_get_seconds();
+ /* Used in refreshing func from now on */
+ entry->reply_wait = ktime_get_seconds();
entry->refresh_time = 0;
ddprintk_cont("entry->shortcut = %p\n", entry->shortcut);
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index e01450bb32d6..4bb418313720 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -117,7 +117,7 @@ static in_cache_entry *in_cache_add_entry(__be32 dst_ip,
memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
entry->ctrl_info.in_dst_ip = dst_ip;
- do_gettimeofday(&(entry->tv));
+ entry->time = ktime_get_seconds();
entry->retry_time = client->parameters.mpc_p4;
entry->count = 1;
entry->entry_state = INGRESS_INVALID;
@@ -148,7 +148,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
if (qos != NULL)
msg.qos = qos->qos;
msg_to_mpoad(&msg, mpc);
- do_gettimeofday(&(entry->reply_wait));
+ entry->reply_wait = ktime_get_seconds();
entry->entry_state = INGRESS_RESOLVING;
}
if (entry->shortcut != NULL)
@@ -171,7 +171,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
if (qos != NULL)
msg.qos = qos->qos;
msg_to_mpoad(&msg, mpc);
- do_gettimeofday(&(entry->reply_wait));
+ entry->reply_wait = ktime_get_seconds();
}
return CLOSED;
@@ -227,17 +227,16 @@ static void in_cache_remove_entry(in_cache_entry *entry,
static void clear_count_and_expired(struct mpoa_client *client)
{
in_cache_entry *entry, *next_entry;
- struct timeval now;
+ time64_t now;
- do_gettimeofday(&now);
+ now = ktime_get_seconds();
write_lock_bh(&client->ingress_lock);
entry = client->in_cache;
while (entry != NULL) {
entry->count = 0;
next_entry = entry->next;
- if ((now.tv_sec - entry->tv.tv_sec)
- > entry->ctrl_info.holding_time) {
+ if ((now - entry->time) > entry->ctrl_info.holding_time) {
dprintk("holding time expired, ip = %pI4\n",
&entry->ctrl_info.in_dst_ip);
client->in_ops->remove_entry(entry, client);
@@ -253,35 +252,35 @@ static void check_resolving_entries(struct mpoa_client *client)
struct atm_mpoa_qos *qos;
in_cache_entry *entry;
- struct timeval now;
+ time64_t now;
struct k_message msg;
- do_gettimeofday(&now);
+ now = ktime_get_seconds();
read_lock_bh(&client->ingress_lock);
entry = client->in_cache;
while (entry != NULL) {
if (entry->entry_state == INGRESS_RESOLVING) {
- if ((now.tv_sec - entry->hold_down.tv_sec) <
- client->parameters.mpc_p6) {
+
+ if ((now - entry->hold_down)
+ < client->parameters.mpc_p6) {
entry = entry->next; /* Entry in hold down */
continue;
}
- if ((now.tv_sec - entry->reply_wait.tv_sec) >
- entry->retry_time) {
+ if ((now - entry->reply_wait) > entry->retry_time) {
entry->retry_time = MPC_C1 * (entry->retry_time);
/*
* Retry time maximum exceeded,
* put entry in hold down.
*/
if (entry->retry_time > client->parameters.mpc_p5) {
- do_gettimeofday(&(entry->hold_down));
+ entry->hold_down = ktime_get_seconds();
entry->retry_time = client->parameters.mpc_p4;
entry = entry->next;
continue;
}
/* Ask daemon to send a resolution request. */
- memset(&(entry->hold_down), 0, sizeof(struct timeval));
+ memset(&entry->hold_down, 0, sizeof(time64_t));
msg.type = SND_MPOA_RES_RTRY;
memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN);
msg.content.in_info = entry->ctrl_info;
@@ -289,7 +288,7 @@ static void check_resolving_entries(struct mpoa_client *client)
if (qos != NULL)
msg.qos = qos->qos;
msg_to_mpoad(&msg, client);
- do_gettimeofday(&(entry->reply_wait));
+ entry->reply_wait = ktime_get_seconds();
}
}
entry = entry->next;
@@ -300,18 +299,18 @@ static void check_resolving_entries(struct mpoa_client *client)
/* Call this every MPC-p5 seconds. */
static void refresh_entries(struct mpoa_client *client)
{
- struct timeval now;
+ time64_t now;
struct in_cache_entry *entry = client->in_cache;
ddprintk("refresh_entries\n");
- do_gettimeofday(&now);
+ now = ktime_get_seconds();
read_lock_bh(&client->ingress_lock);
while (entry != NULL) {
if (entry->entry_state == INGRESS_RESOLVED) {
if (!(entry->refresh_time))
entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3;
- if ((now.tv_sec - entry->reply_wait.tv_sec) >
+ if ((now - entry->reply_wait) >
entry->refresh_time) {
dprintk("refreshing an entry.\n");
entry->entry_state = INGRESS_REFRESHING;
@@ -480,7 +479,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg,
memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
entry->ctrl_info = msg->content.eg_info;
- do_gettimeofday(&(entry->tv));
+ entry->time = ktime_get_seconds();
entry->entry_state = EGRESS_RESOLVED;
dprintk("new_eg_cache_entry cache_id %u\n",
ntohl(entry->ctrl_info.cache_id));
@@ -495,7 +494,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg,
static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time)
{
- do_gettimeofday(&(entry->tv));
+ entry->time = ktime_get_seconds();
entry->entry_state = EGRESS_RESOLVED;
entry->ctrl_info.holding_time = holding_time;
}
@@ -503,17 +502,16 @@ static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time)
static void clear_expired(struct mpoa_client *client)
{
eg_cache_entry *entry, *next_entry;
- struct timeval now;
+ time64_t now;
struct k_message msg;
- do_gettimeofday(&now);
+ now = ktime_get_seconds();
write_lock_irq(&client->egress_lock);
entry = client->eg_cache;
while (entry != NULL) {
next_entry = entry->next;
- if ((now.tv_sec - entry->tv.tv_sec)
- > entry->ctrl_info.holding_time) {
+ if ((now - entry->time) > entry->ctrl_info.holding_time) {
msg.type = SND_EGRESS_PURGE;
msg.content.eg_info = entry->ctrl_info;
dprintk("egress_cache: holding time expired, cache_id = %u.\n",
diff --git a/net/atm/mpoa_caches.h b/net/atm/mpoa_caches.h
index 6a266669ebf4..464c4c7f8d1f 100644
--- a/net/atm/mpoa_caches.h
+++ b/net/atm/mpoa_caches.h
@@ -2,6 +2,7 @@
#ifndef MPOA_CACHES_H
#define MPOA_CACHES_H
+#include <linux/time64.h>
#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/atm.h>
@@ -16,9 +17,9 @@ void atm_mpoa_init_cache(struct mpoa_client *mpc);
typedef struct in_cache_entry {
struct in_cache_entry *next;
struct in_cache_entry *prev;
- struct timeval tv;
- struct timeval reply_wait;
- struct timeval hold_down;
+ time64_t time;
+ time64_t reply_wait;
+ time64_t hold_down;
uint32_t packets_fwded;
uint16_t entry_state;
uint32_t retry_time;
@@ -53,7 +54,7 @@ struct in_cache_ops{
typedef struct eg_cache_entry{
struct eg_cache_entry *next;
struct eg_cache_entry *prev;
- struct timeval tv;
+ time64_t time;
uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN];
struct atm_vcc *shortcut;
uint32_t packets_rcvd;
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index 8a0c17e1c203..2212da9c2da2 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -8,7 +8,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
-#include <linux/time.h>
+#include <linux/ktime.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/atmmpc.h>
@@ -138,7 +138,7 @@ static int mpc_show(struct seq_file *m, void *v)
int i;
in_cache_entry *in_entry;
eg_cache_entry *eg_entry;
- struct timeval now;
+ time64_t now;
unsigned char ip_string[16];
if (v == SEQ_START_TOKEN) {
@@ -148,15 +148,17 @@ static int mpc_show(struct seq_file *m, void *v)
seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num);
seq_printf(m, "Ingress Entries:\nIP address State Holding time Packets fwded VPI VCI\n");
- do_gettimeofday(&now);
+ now = ktime_get_seconds();
for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) {
+ unsigned long seconds_delta = now - in_entry->time;
+
sprintf(ip_string, "%pI4", &in_entry->ctrl_info.in_dst_ip);
seq_printf(m, "%-16s%s%-14lu%-12u",
ip_string,
ingress_state_string(in_entry->entry_state),
in_entry->ctrl_info.holding_time -
- (now.tv_sec-in_entry->tv.tv_sec),
+ seconds_delta,
in_entry->packets_fwded);
if (in_entry->shortcut)
seq_printf(m, " %-3d %-3d",
@@ -169,13 +171,14 @@ static int mpc_show(struct seq_file *m, void *v)
seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id State Holding time Packets recvd Latest IP addr VPI VCI\n");
for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) {
unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr;
+ unsigned long seconds_delta = now - eg_entry->time;
+
for (i = 0; i < ATM_ESA_LEN; i++)
seq_printf(m, "%02x", p[i]);
seq_printf(m, "\n%-16lu%s%-14lu%-15u",
(unsigned long)ntohl(eg_entry->ctrl_info.cache_id),
egress_state_string(eg_entry->entry_state),
- (eg_entry->ctrl_info.holding_time -
- (now.tv_sec-eg_entry->tv.tv_sec)),
+ (eg_entry->ctrl_info.holding_time - seconds_delta),
eg_entry->packets_rcvd);
/* latest IP address */
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index af5b8c87f590..1285ca30ab0a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -125,9 +125,16 @@ static int br_dev_init(struct net_device *dev)
if (!br->stats)
return -ENOMEM;
+ err = br_fdb_hash_init(br);
+ if (err) {
+ free_percpu(br->stats);
+ return err;
+ }
+
err = br_vlan_init(br);
if (err) {
free_percpu(br->stats);
+ br_fdb_hash_fini(br);
return err;
}
@@ -135,6 +142,7 @@ static int br_dev_init(struct net_device *dev)
if (err) {
free_percpu(br->stats);
br_vlan_flush(br);
+ br_fdb_hash_fini(br);
}
br_set_lockdep_class(dev);
@@ -148,6 +156,7 @@ static void br_dev_uninit(struct net_device *dev)
br_multicast_dev_del(br);
br_multicast_uninit_stats(br);
br_vlan_flush(br);
+ br_fdb_hash_fini(br);
free_percpu(br->stats);
}
@@ -416,6 +425,7 @@ void br_dev_setup(struct net_device *dev)
br->dev = dev;
spin_lock_init(&br->lock);
INIT_LIST_HEAD(&br->port_list);
+ INIT_HLIST_HEAD(&br->fdb_list);
spin_lock_init(&br->hash_lock);
br->bridge_id.prio[0] = 0x80;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 4ea5c8bbe286..dc87fbc9a23b 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -28,14 +28,20 @@
#include <trace/events/bridge.h>
#include "br_private.h"
+static const struct rhashtable_params br_fdb_rht_params = {
+ .head_offset = offsetof(struct net_bridge_fdb_entry, rhnode),
+ .key_offset = offsetof(struct net_bridge_fdb_entry, key),
+ .key_len = sizeof(struct net_bridge_fdb_key),
+ .automatic_shrinking = true,
+ .locks_mul = 1,
+};
+
static struct kmem_cache *br_fdb_cache __read_mostly;
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr, u16 vid);
static void fdb_notify(struct net_bridge *br,
const struct net_bridge_fdb_entry *, int);
-static u32 fdb_salt __read_mostly;
-
int __init br_fdb_init(void)
{
br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
@@ -45,7 +51,6 @@ int __init br_fdb_init(void)
if (!br_fdb_cache)
return -ENOMEM;
- get_random_bytes(&fdb_salt, sizeof(fdb_salt));
return 0;
}
@@ -54,6 +59,15 @@ void br_fdb_fini(void)
kmem_cache_destroy(br_fdb_cache);
}
+int br_fdb_hash_init(struct net_bridge *br)
+{
+ return rhashtable_init(&br->fdb_hash_tbl, &br_fdb_rht_params);
+}
+
+void br_fdb_hash_fini(struct net_bridge *br)
+{
+ rhashtable_destroy(&br->fdb_hash_tbl);
+}
/* if topology_changing then use forward_delay (default 15 sec)
* otherwise keep longer (default 5 minutes)
@@ -70,13 +84,6 @@ static inline int has_expired(const struct net_bridge *br,
time_before_eq(fdb->updated + hold_time(br), jiffies);
}
-static inline int br_mac_hash(const unsigned char *mac, __u16 vid)
-{
- /* use 1 byte of OUI and 3 bytes of NIC */
- u32 key = get_unaligned((u32 *)(mac + 2));
- return jhash_2words(key, vid, fdb_salt) & (BR_HASH_SIZE - 1);
-}
-
static void fdb_rcu_free(struct rcu_head *head)
{
struct net_bridge_fdb_entry *ent
@@ -84,19 +91,18 @@ static void fdb_rcu_free(struct rcu_head *head)
kmem_cache_free(br_fdb_cache, ent);
}
-static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
+static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl,
const unsigned char *addr,
__u16 vid)
{
- struct net_bridge_fdb_entry *f;
+ struct net_bridge_fdb_key key;
WARN_ON_ONCE(!rcu_read_lock_held());
- hlist_for_each_entry_rcu(f, head, hlist)
- if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid)
- break;
+ key.vlan_id = vid;
+ memcpy(key.addr.addr, addr, sizeof(key.addr.addr));
- return f;
+ return rhashtable_lookup(tbl, &key, br_fdb_rht_params);
}
/* requires bridge hash_lock */
@@ -104,13 +110,12 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
const unsigned char *addr,
__u16 vid)
{
- struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
struct net_bridge_fdb_entry *fdb;
lockdep_assert_held_once(&br->hash_lock);
rcu_read_lock();
- fdb = fdb_find_rcu(head, addr, vid);
+ fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
rcu_read_unlock();
return fdb;
@@ -120,9 +125,7 @@ struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
const unsigned char *addr,
__u16 vid)
{
- struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
-
- return fdb_find_rcu(head, addr, vid);
+ return fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
}
/* When a static FDB entry is added, the mac address from the entry is
@@ -175,9 +178,11 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
trace_fdb_delete(br, f);
if (f->is_static)
- fdb_del_hw_addr(br, f->addr.addr);
+ fdb_del_hw_addr(br, f->key.addr.addr);
- hlist_del_init_rcu(&f->hlist);
+ hlist_del_init_rcu(&f->fdb_node);
+ rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
+ br_fdb_rht_params);
fdb_notify(br, f, RTM_DELNEIGH);
call_rcu(&f->rcu, fdb_rcu_free);
}
@@ -187,11 +192,11 @@ static void fdb_delete_local(struct net_bridge *br,
const struct net_bridge_port *p,
struct net_bridge_fdb_entry *f)
{
- const unsigned char *addr = f->addr.addr;
+ const unsigned char *addr = f->key.addr.addr;
struct net_bridge_vlan_group *vg;
const struct net_bridge_vlan *v;
struct net_bridge_port *op;
- u16 vid = f->vlan_id;
+ u16 vid = f->key.vlan_id;
/* Maybe another port has same hw addr? */
list_for_each_entry(op, &br->port_list, list) {
@@ -233,31 +238,23 @@ void br_fdb_find_delete_local(struct net_bridge *br,
void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
{
struct net_bridge_vlan_group *vg;
+ struct net_bridge_fdb_entry *f;
struct net_bridge *br = p->br;
struct net_bridge_vlan *v;
- int i;
spin_lock_bh(&br->hash_lock);
-
vg = nbp_vlan_group(p);
- /* Search all chains since old address/hash is unknown */
- for (i = 0; i < BR_HASH_SIZE; i++) {
- struct hlist_node *h;
- hlist_for_each(h, &br->hash[i]) {
- struct net_bridge_fdb_entry *f;
-
- f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
- if (f->dst == p && f->is_local && !f->added_by_user) {
- /* delete old one */
- fdb_delete_local(br, p, f);
-
- /* if this port has no vlan information
- * configured, we can safely be done at
- * this point.
- */
- if (!vg || !vg->num_vlans)
- goto insert;
- }
+ hlist_for_each_entry(f, &br->fdb_list, fdb_node) {
+ if (f->dst == p && f->is_local && !f->added_by_user) {
+ /* delete old one */
+ fdb_delete_local(br, p, f);
+
+ /* if this port has no vlan information
+ * configured, we can safely be done at
+ * this point.
+ */
+ if (!vg || !vg->num_vlans)
+ goto insert;
}
}
@@ -316,35 +313,32 @@ void br_fdb_cleanup(struct work_struct *work)
{
struct net_bridge *br = container_of(work, struct net_bridge,
gc_work.work);
+ struct net_bridge_fdb_entry *f = NULL;
unsigned long delay = hold_time(br);
unsigned long work_delay = delay;
unsigned long now = jiffies;
- int i;
- for (i = 0; i < BR_HASH_SIZE; i++) {
- struct net_bridge_fdb_entry *f;
- struct hlist_node *n;
+ /* this part is tricky, in order to avoid blocking learning and
+ * consequently forwarding, we rely on rcu to delete objects with
+ * delayed freeing allowing us to continue traversing
+ */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ unsigned long this_timer;
- if (!br->hash[i].first)
+ if (f->is_static || f->added_by_external_learn)
continue;
-
- spin_lock_bh(&br->hash_lock);
- hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
- unsigned long this_timer;
-
- if (f->is_static)
- continue;
- if (f->added_by_external_learn)
- continue;
- this_timer = f->updated + delay;
- if (time_after(this_timer, now))
- work_delay = min(work_delay, this_timer - now);
- else
+ this_timer = f->updated + delay;
+ if (time_after(this_timer, now)) {
+ work_delay = min(work_delay, this_timer - now);
+ } else {
+ spin_lock_bh(&br->hash_lock);
+ if (!hlist_unhashed(&f->fdb_node))
fdb_delete(br, f);
+ spin_unlock_bh(&br->hash_lock);
}
- spin_unlock_bh(&br->hash_lock);
- cond_resched();
}
+ rcu_read_unlock();
/* Cleanup minimum 10 milliseconds apart */
work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
@@ -354,16 +348,13 @@ void br_fdb_cleanup(struct work_struct *work)
/* Completely flush all dynamic entries in forwarding database.*/
void br_fdb_flush(struct net_bridge *br)
{
- int i;
+ struct net_bridge_fdb_entry *f;
+ struct hlist_node *tmp;
spin_lock_bh(&br->hash_lock);
- for (i = 0; i < BR_HASH_SIZE; i++) {
- struct net_bridge_fdb_entry *f;
- struct hlist_node *n;
- hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
- if (!f->is_static)
- fdb_delete(br, f);
- }
+ hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
+ if (!f->is_static)
+ fdb_delete(br, f);
}
spin_unlock_bh(&br->hash_lock);
}
@@ -377,27 +368,22 @@ void br_fdb_delete_by_port(struct net_bridge *br,
u16 vid,
int do_all)
{
- int i;
+ struct net_bridge_fdb_entry *f;
+ struct hlist_node *tmp;
spin_lock_bh(&br->hash_lock);
- for (i = 0; i < BR_HASH_SIZE; i++) {
- struct hlist_node *h, *g;
+ hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
+ if (f->dst != p)
+ continue;
- hlist_for_each_safe(h, g, &br->hash[i]) {
- struct net_bridge_fdb_entry *f
- = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
- if (f->dst != p)
+ if (!do_all)
+ if (f->is_static || (vid && f->key.vlan_id != vid))
continue;
- if (!do_all)
- if (f->is_static || (vid && f->vlan_id != vid))
- continue;
-
- if (f->is_local)
- fdb_delete_local(br, p, f);
- else
- fdb_delete(br, f);
- }
+ if (f->is_local)
+ fdb_delete_local(br, p, f);
+ else
+ fdb_delete(br, f);
}
spin_unlock_bh(&br->hash_lock);
}
@@ -433,52 +419,48 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
int br_fdb_fillbuf(struct net_bridge *br, void *buf,
unsigned long maxnum, unsigned long skip)
{
- struct __fdb_entry *fe = buf;
- int i, num = 0;
struct net_bridge_fdb_entry *f;
+ struct __fdb_entry *fe = buf;
+ int num = 0;
memset(buf, 0, maxnum*sizeof(struct __fdb_entry));
rcu_read_lock();
- for (i = 0; i < BR_HASH_SIZE; i++) {
- hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
- if (num >= maxnum)
- goto out;
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ if (num >= maxnum)
+ break;
- if (has_expired(br, f))
- continue;
+ if (has_expired(br, f))
+ continue;
- /* ignore pseudo entry for local MAC address */
- if (!f->dst)
- continue;
+ /* ignore pseudo entry for local MAC address */
+ if (!f->dst)
+ continue;
- if (skip) {
- --skip;
- continue;
- }
+ if (skip) {
+ --skip;
+ continue;
+ }
- /* convert from internal format to API */
- memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN);
+ /* convert from internal format to API */
+ memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN);
- /* due to ABI compat need to split into hi/lo */
- fe->port_no = f->dst->port_no;
- fe->port_hi = f->dst->port_no >> 8;
+ /* due to ABI compat need to split into hi/lo */
+ fe->port_no = f->dst->port_no;
+ fe->port_hi = f->dst->port_no >> 8;
- fe->is_local = f->is_local;
- if (!f->is_static)
- fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
- ++fe;
- ++num;
- }
+ fe->is_local = f->is_local;
+ if (!f->is_static)
+ fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
+ ++fe;
+ ++num;
}
-
- out:
rcu_read_unlock();
return num;
}
-static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
+static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
struct net_bridge_port *source,
const unsigned char *addr,
__u16 vid,
@@ -489,16 +471,23 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
if (fdb) {
- memcpy(fdb->addr.addr, addr, ETH_ALEN);
+ memcpy(fdb->key.addr.addr, addr, ETH_ALEN);
fdb->dst = source;
- fdb->vlan_id = vid;
+ fdb->key.vlan_id = vid;
fdb->is_local = is_local;
fdb->is_static = is_static;
fdb->added_by_user = 0;
fdb->added_by_external_learn = 0;
fdb->offloaded = 0;
fdb->updated = fdb->used = jiffies;
- hlist_add_head_rcu(&fdb->hlist, head);
+ if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl,
+ &fdb->rhnode,
+ br_fdb_rht_params)) {
+ kmem_cache_free(br_fdb_cache, fdb);
+ fdb = NULL;
+ } else {
+ hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list);
+ }
}
return fdb;
}
@@ -506,7 +495,6 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr, u16 vid)
{
- struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
struct net_bridge_fdb_entry *fdb;
if (!is_valid_ether_addr(addr))
@@ -524,7 +512,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
fdb_delete(br, fdb);
}
- fdb = fdb_create(head, source, addr, vid, 1, 1);
+ fdb = fdb_create(br, source, addr, vid, 1, 1);
if (!fdb)
return -ENOMEM;
@@ -548,7 +536,6 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr, u16 vid, bool added_by_user)
{
- struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
struct net_bridge_fdb_entry *fdb;
bool fdb_modified = false;
@@ -561,7 +548,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
source->state == BR_STATE_FORWARDING))
return;
- fdb = fdb_find_rcu(head, addr, vid);
+ fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
if (likely(fdb)) {
/* attempt to update an entry for a local interface */
if (unlikely(fdb->is_local)) {
@@ -590,14 +577,13 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
}
} else {
spin_lock(&br->hash_lock);
- if (likely(!fdb_find_rcu(head, addr, vid))) {
- fdb = fdb_create(head, source, addr, vid, 0, 0);
- if (fdb) {
- if (unlikely(added_by_user))
- fdb->added_by_user = 1;
- trace_br_fdb_update(br, source, addr, vid, added_by_user);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
- }
+ fdb = fdb_create(br, source, addr, vid, 0, 0);
+ if (fdb) {
+ if (unlikely(added_by_user))
+ fdb->added_by_user = 1;
+ trace_br_fdb_update(br, source, addr, vid,
+ added_by_user);
+ fdb_notify(br, fdb, RTM_NEWNEIGH);
}
/* else we lose race and someone else inserts
* it first, don't bother updating
@@ -646,7 +632,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
if (fdb->added_by_external_learn)
ndm->ndm_flags |= NTF_EXT_LEARNED;
- if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr))
+ if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
goto nla_put_failure;
if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex))
goto nla_put_failure;
@@ -657,7 +643,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
goto nla_put_failure;
- if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id))
+ if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16),
+ &fdb->key.vlan_id))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -711,54 +698,48 @@ int br_fdb_dump(struct sk_buff *skb,
int *idx)
{
struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_fdb_entry *f;
int err = 0;
- int i;
if (!(dev->priv_flags & IFF_EBRIDGE))
- goto out;
+ return err;
if (!filter_dev) {
err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
if (err < 0)
- goto out;
+ return err;
}
- for (i = 0; i < BR_HASH_SIZE; i++) {
- struct net_bridge_fdb_entry *f;
-
- hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
-
- if (*idx < cb->args[2])
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ if (*idx < cb->args[2])
+ goto skip;
+ if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) {
+ if (filter_dev != dev)
goto skip;
-
- if (filter_dev &&
- (!f->dst || f->dst->dev != filter_dev)) {
- if (filter_dev != dev)
- goto skip;
- /* !f->dst is a special case for bridge
- * It means the MAC belongs to the bridge
- * Therefore need a little more filtering
- * we only want to dump the !f->dst case
- */
- if (f->dst)
- goto skip;
- }
- if (!filter_dev && f->dst)
+ /* !f->dst is a special case for bridge
+ * It means the MAC belongs to the bridge
+ * Therefore need a little more filtering
+ * we only want to dump the !f->dst case
+ */
+ if (f->dst)
goto skip;
-
- err = fdb_fill_info(skb, br, f,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- NLM_F_MULTI);
- if (err < 0)
- goto out;
-skip:
- *idx += 1;
}
+ if (!filter_dev && f->dst)
+ goto skip;
+
+ err = fdb_fill_info(skb, br, f,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI);
+ if (err < 0)
+ break;
+skip:
+ *idx += 1;
}
+ rcu_read_unlock();
-out:
return err;
}
@@ -766,7 +747,6 @@ out:
static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
const __u8 *addr, __u16 state, __u16 flags, __u16 vid)
{
- struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
struct net_bridge_fdb_entry *fdb;
bool modified = false;
@@ -787,7 +767,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
if (!(flags & NLM_F_CREATE))
return -ENOENT;
- fdb = fdb_create(head, source, addr, vid, 0, 0);
+ fdb = fdb_create(br, source, addr, vid, 0, 0);
if (!fdb)
return -ENOMEM;
@@ -1012,65 +992,60 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p)
{
- struct net_bridge_fdb_entry *fdb, *tmp;
- int i;
+ struct net_bridge_fdb_entry *f, *tmp;
int err;
ASSERT_RTNL();
- for (i = 0; i < BR_HASH_SIZE; i++) {
- hlist_for_each_entry(fdb, &br->hash[i], hlist) {
- /* We only care for static entries */
- if (!fdb->is_static)
- continue;
-
- err = dev_uc_add(p->dev, fdb->addr.addr);
- if (err)
- goto rollback;
- }
+ /* the key here is that static entries change only under rtnl */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ /* We only care for static entries */
+ if (!f->is_static)
+ continue;
+ err = dev_uc_add(p->dev, f->key.addr.addr);
+ if (err)
+ goto rollback;
}
- return 0;
+done:
+ rcu_read_unlock();
-rollback:
- for (i = 0; i < BR_HASH_SIZE; i++) {
- hlist_for_each_entry(tmp, &br->hash[i], hlist) {
- /* If we reached the fdb that failed, we can stop */
- if (tmp == fdb)
- break;
-
- /* We only care for static entries */
- if (!tmp->is_static)
- continue;
+ return err;
- dev_uc_del(p->dev, tmp->addr.addr);
- }
+rollback:
+ hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) {
+ /* We only care for static entries */
+ if (!tmp->is_static)
+ continue;
+ if (tmp == f)
+ break;
+ dev_uc_del(p->dev, tmp->key.addr.addr);
}
- return err;
+
+ goto done;
}
void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
{
- struct net_bridge_fdb_entry *fdb;
- int i;
+ struct net_bridge_fdb_entry *f;
ASSERT_RTNL();
- for (i = 0; i < BR_HASH_SIZE; i++) {
- hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) {
- /* We only care for static entries */
- if (!fdb->is_static)
- continue;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ /* We only care for static entries */
+ if (!f->is_static)
+ continue;
- dev_uc_del(p->dev, fdb->addr.addr);
- }
+ dev_uc_del(p->dev, f->key.addr.addr);
}
+ rcu_read_unlock();
}
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid)
{
struct net_bridge_fdb_entry *fdb;
- struct hlist_head *head;
bool modified = false;
int err = 0;
@@ -1078,10 +1053,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
spin_lock_bh(&br->hash_lock);
- head = &br->hash[br_mac_hash(addr, vid)];
fdb = br_fdb_find(br, addr, vid);
if (!fdb) {
- fdb = fdb_create(head, p, addr, vid, 0, 0);
+ fdb = fdb_create(br, p, addr, vid, 0, 0);
if (!fdb) {
err = -ENOMEM;
goto err_unlock;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index b0f4c734900b..6d9f48bd374a 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -760,9 +760,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
void br_mdb_init(void)
{
- rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0);
- rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0);
+ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0);
}
void br_mdb_uninit(void)
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
index 20cbb727df4d..8e2d7cfa4e16 100644
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -78,7 +78,6 @@ void br_netfilter_rtable_init(struct net_bridge *br)
atomic_set(&rt->dst.__refcnt, 1);
rt->dst.dev = br->dev;
- rt->dst.path = &rt->dst;
dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
rt->dst.ops = &fake_dst_ops;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1312b8d20ec3..80559fd11b7e 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -168,12 +168,17 @@ struct net_bridge_vlan_group {
u16 pvid;
};
+struct net_bridge_fdb_key {
+ mac_addr addr;
+ u16 vlan_id;
+};
+
struct net_bridge_fdb_entry {
- struct hlist_node hlist;
+ struct rhash_head rhnode;
struct net_bridge_port *dst;
- mac_addr addr;
- __u16 vlan_id;
+ struct net_bridge_fdb_key key;
+ struct hlist_node fdb_node;
unsigned char is_local:1,
is_static:1,
added_by_user:1,
@@ -315,7 +320,7 @@ struct net_bridge {
struct net_bridge_vlan_group __rcu *vlgrp;
#endif
- struct hlist_head hash[BR_HASH_SIZE];
+ struct rhashtable fdb_hash_tbl;
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
union {
struct rtable fake_rtable;
@@ -405,6 +410,7 @@ struct net_bridge {
int offload_fwd_mark;
#endif
bool neigh_suppress_enabled;
+ struct hlist_head fdb_list;
};
struct br_input_skb_cb {
@@ -515,6 +521,8 @@ static inline void br_netpoll_disable(struct net_bridge_port *p)
/* br_fdb.c */
int br_fdb_init(void);
void br_fdb_fini(void);
+int br_fdb_hash_init(struct net_bridge *br);
+void br_fdb_hash_fini(struct net_bridge *br);
void br_fdb_flush(struct net_bridge *br);
void br_fdb_find_delete_local(struct net_bridge *br,
const struct net_bridge_port *p,
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 9700e0f3307b..ee775f4ff76c 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -121,13 +121,13 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
switch (type) {
case RTM_DELNEIGH:
- br_switchdev_fdb_call_notifiers(false, fdb->addr.addr,
- fdb->vlan_id,
+ br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
+ fdb->key.vlan_id,
fdb->dst->dev);
break;
case RTM_NEWNEIGH:
- br_switchdev_fdb_call_notifiers(true, fdb->addr.addr,
- fdb->vlan_id,
+ br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
+ fdb->key.vlan_id,
fdb->dst->dev);
break;
}
diff --git a/net/can/gw.c b/net/can/gw.c
index 73a02af4b5d7..398dd0395ad9 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1014,6 +1014,8 @@ static struct pernet_operations cangw_pernet_ops = {
static __init int cgw_module_init(void)
{
+ int ret;
+
/* sanitize given module parameter */
max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS);
@@ -1031,15 +1033,19 @@ static __init int cgw_module_init(void)
notifier.notifier_call = cgw_notifier;
register_netdevice_notifier(&notifier);
- if (__rtnl_register(PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, 0)) {
+ ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE,
+ NULL, cgw_dump_jobs, 0);
+ if (ret) {
unregister_netdevice_notifier(&notifier);
kmem_cache_destroy(cgw_cache);
return -ENOBUFS;
}
- /* Only the first call to __rtnl_register can fail */
- __rtnl_register(PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, 0);
- __rtnl_register(PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, 0);
+ /* Only the first call to rtnl_register_module can fail */
+ rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE,
+ cgw_create_job, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE,
+ cgw_remove_job, NULL, 0);
return 0;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index f47e96b62308..b0eee49a2489 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2803,7 +2803,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
segs = skb_mac_gso_segment(skb, features);
- if (unlikely(skb_needs_check(skb, tx_path)))
+ if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
skb_warn_bad_offload(skb);
return segs;
@@ -3162,6 +3162,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
int rc;
qdisc_calculate_pkt_len(skb, q);
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+ __qdisc_drop(skb, &to_free);
+ rc = NET_XMIT_DROP;
+ } else {
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ __qdisc_run(q);
+ }
+
+ if (unlikely(to_free))
+ kfree_skb_list(to_free);
+ return rc;
+ }
+
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
@@ -3192,9 +3207,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
contended = false;
}
__qdisc_run(q);
- } else
- qdisc_run_end(q);
+ }
+ qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
@@ -3204,6 +3219,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
contended = false;
}
__qdisc_run(q);
+ qdisc_run_end(q);
}
}
spin_unlock(root_lock);
@@ -4143,19 +4159,22 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
while (head) {
struct Qdisc *q = head;
- spinlock_t *root_lock;
+ spinlock_t *root_lock = NULL;
head = head->next_sched;
- root_lock = qdisc_lock(q);
- spin_lock(root_lock);
+ if (!(q->flags & TCQ_F_NOLOCK)) {
+ root_lock = qdisc_lock(q);
+ spin_lock(root_lock);
+ }
/* We need to make sure head->next_sched is read
* before clearing __QDISC_STATE_SCHED
*/
smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED, &q->state);
qdisc_run(q);
- spin_unlock(root_lock);
+ if (root_lock)
+ spin_unlock(root_lock);
}
}
}
@@ -7073,17 +7092,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
EXPORT_SYMBOL(dev_change_proto_down);
-u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id)
+void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+ struct netdev_bpf *xdp)
{
- struct netdev_bpf xdp;
-
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG;
+ memset(xdp, 0, sizeof(*xdp));
+ xdp->command = XDP_QUERY_PROG;
/* Query must always succeed. */
- WARN_ON(bpf_op(dev, &xdp) < 0);
- if (prog_id)
- *prog_id = xdp.prog_id;
+ WARN_ON(bpf_op(dev, xdp) < 0);
+}
+
+static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
+{
+ struct netdev_bpf xdp;
+
+ __dev_xdp_query(dev, bpf_op, &xdp);
return xdp.prog_attached;
}
@@ -7106,6 +7129,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
return bpf_op(dev, &xdp);
}
+static void dev_xdp_uninstall(struct net_device *dev)
+{
+ struct netdev_bpf xdp;
+ bpf_op_t ndo_bpf;
+
+ /* Remove generic XDP */
+ WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
+
+ /* Remove from the driver */
+ ndo_bpf = dev->netdev_ops->ndo_bpf;
+ if (!ndo_bpf)
+ return;
+
+ __dev_xdp_query(dev, ndo_bpf, &xdp);
+ if (xdp.prog_attached == XDP_ATTACHED_NONE)
+ return;
+
+ /* Program removal should always succeed */
+ WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+}
+
/**
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
@@ -7134,10 +7178,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_chk = generic_xdp_install;
if (fd >= 0) {
- if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL))
+ if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
return -EEXIST;
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
- __dev_xdp_attached(dev, bpf_op, NULL))
+ __dev_xdp_attached(dev, bpf_op))
return -EBUSY;
prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
@@ -7236,6 +7280,7 @@ static void rollback_registered_many(struct list_head *head)
/* Shutdown queueing discipline. */
dev_shutdown(dev);
+ dev_xdp_uninstall(dev);
/* Notify protocols, that we are about to destroy
* this device. They should clean all the things.
@@ -8195,7 +8240,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
void free_netdev(struct net_device *dev)
{
struct napi_struct *p, *n;
- struct bpf_prog *prog;
might_sleep();
netif_free_tx_queues(dev);
@@ -8214,12 +8258,6 @@ void free_netdev(struct net_device *dev)
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
- prog = rcu_dereference_protected(dev->xdp_prog, 1);
- if (prog) {
- bpf_prog_put(prog);
- static_key_slow_dec(&generic_xdp_needed);
- }
-
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
netdev_freemem(dev);
diff --git a/net/core/dst.c b/net/core/dst.c
index 662a2d4a3d19..007aa0b08291 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -21,6 +21,7 @@
#include <linux/sched.h>
#include <linux/prefetch.h>
#include <net/lwtunnel.h>
+#include <net/xfrm.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
@@ -62,15 +63,12 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
struct net_device *dev, int initial_ref, int initial_obsolete,
unsigned short flags)
{
- dst->child = NULL;
dst->dev = dev;
if (dev)
dev_hold(dev);
dst->ops = ops;
dst_init_metrics(dst, dst_default_metrics.metrics, true);
dst->expires = 0UL;
- dst->path = dst;
- dst->from = NULL;
#ifdef CONFIG_XFRM
dst->xfrm = NULL;
#endif
@@ -88,7 +86,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
dst->__use = 0;
dst->lastuse = jiffies;
dst->flags = flags;
- dst->next = NULL;
if (!(flags & DST_NOCOUNT))
dst_entries_add(ops, 1);
}
@@ -116,12 +113,17 @@ EXPORT_SYMBOL(dst_alloc);
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
- struct dst_entry *child;
+ struct dst_entry *child = NULL;
smp_rmb();
- child = dst->child;
+#ifdef CONFIG_XFRM
+ if (dst->xfrm) {
+ struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
+ child = xdst->child;
+ }
+#endif
if (!(dst->flags & DST_NOCOUNT))
dst_entries_add(dst->ops, -1);
diff --git a/net/core/filter.c b/net/core/filter.c
index 6a85e67fafce..754abe1041b7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3013,6 +3013,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
if (flags & BPF_F_DONT_FRAGMENT)
info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+ if (flags & BPF_F_ZERO_CSUM_TX)
+ info->key.tun_flags &= ~TUNNEL_CSUM;
info->key.tun_id = cpu_to_be64(from->tunnel_id);
info->key.tos = from->tunnel_tos;
@@ -3026,8 +3028,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
IPV6_FLOWLABEL_MASK;
} else {
info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
- if (flags & BPF_F_ZERO_CSUM_TX)
- info->key.tun_flags &= ~TUNNEL_CSUM;
}
return 0;
@@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_num));
break;
+
+ case offsetof(struct bpf_sock_ops, is_fullsock):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern,
+ is_fullsock),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ is_fullsock));
+ break;
+
+/* Helper macro for adding read access to tcp_sock fields. */
+#define SOCK_OPS_GET_TCP32(FIELD_NAME) \
+ do { \
+ BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_fullsock), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_fullsock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \
+ offsetof(struct tcp_sock, FIELD_NAME)); \
+ } while (0)
+
+ case offsetof(struct bpf_sock_ops, snd_cwnd):
+ SOCK_OPS_GET_TCP32(snd_cwnd);
+ break;
+
+ case offsetof(struct bpf_sock_ops, srtt_us):
+ SOCK_OPS_GET_TCP32(srtt_us);
+ break;
}
return insn - insn_buf;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 15ce30063765..cc75488d3653 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -133,10 +133,10 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
ctrl->addr_type = type;
}
-static void
-__skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
- struct flow_dissector *flow_dissector,
- void *target_container)
+void
+skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
{
struct ip_tunnel_info *info;
struct ip_tunnel_key *key;
@@ -212,6 +212,7 @@ __skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
tp->dst = key->tp_dst;
}
}
+EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
@@ -576,9 +577,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
FLOW_DISSECTOR_KEY_BASIC,
target_container);
- __skb_flow_dissect_tunnel_info(skb, flow_dissector,
- target_container);
-
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 87f28557b329..b2b2323bdc84 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
}
}
-static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
- const struct gnet_stats_queue __percpu *cpu,
- const struct gnet_stats_queue *q,
- __u32 qlen)
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *cpu,
+ const struct gnet_stats_queue *q,
+ __u32 qlen)
{
if (cpu) {
__gnet_stats_copy_queue_cpu(qstats, cpu);
@@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
qstats->qlen = qlen;
}
+EXPORT_SYMBOL(__gnet_stats_copy_queue);
/**
* gnet_stats_copy_queue - copy queue statistics into statistics TLV
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f95a15086225..b9ce241cd28c 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -399,7 +399,7 @@ struct pktgen_dev {
__u8 ipsmode; /* IPSEC mode (config) */
__u8 ipsproto; /* IPSEC type (config) */
__u32 spi;
- struct dst_entry dst;
+ struct xfrm_dst xdst;
struct dst_ops dstops;
#endif
char result[512];
@@ -2609,7 +2609,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
* supports both transport/tunnel mode + ESP/AH type.
*/
if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0))
- skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF;
+ skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF;
rcu_read_lock_bh();
err = x->outer_mode->output(x, skb);
@@ -3742,10 +3742,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
* performance under such circumstance.
*/
pkt_dev->dstops.family = AF_INET;
- pkt_dev->dst.dev = pkt_dev->odev;
- dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false);
- pkt_dev->dst.child = &pkt_dev->dst;
- pkt_dev->dst.ops = &pkt_dev->dstops;
+ pkt_dev->xdst.u.dst.dev = pkt_dev->odev;
+ dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false);
+ pkt_dev->xdst.child = &pkt_dev->xdst.u.dst;
+ pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops;
#endif
return add_dev_to_thread(t, pkt_dev);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..c688dc564b11 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -62,7 +62,9 @@
struct rtnl_link {
rtnl_doit_func doit;
rtnl_dumpit_func dumpit;
+ struct module *owner;
unsigned int flags;
+ struct rcu_head rcu;
};
static DEFINE_MUTEX(rtnl_mutex);
@@ -127,8 +129,7 @@ bool lockdep_rtnl_is_held(void)
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */
-static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
-static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1];
+static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
static inline int rtm_msgindex(int msgtype)
{
@@ -144,72 +145,127 @@ static inline int rtm_msgindex(int msgtype)
return msgindex;
}
-/**
- * __rtnl_register - Register a rtnetlink message type
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
- *
- * Registers the specified function pointers (at least one of them has
- * to be non-NULL) to be called whenever a request message for the
- * specified protocol family and message type is received.
- *
- * The special protocol family PF_UNSPEC may be used to define fallback
- * function pointers for the case when no entry for the specific protocol
- * family exists.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_register(int protocol, int msgtype,
- rtnl_doit_func doit, rtnl_dumpit_func dumpit,
- unsigned int flags)
+static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
+{
+ struct rtnl_link **tab;
+
+ if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
+ protocol = PF_UNSPEC;
+
+ tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]);
+ if (!tab)
+ tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);
+
+ return tab[msgtype];
+}
+
+static int rtnl_register_internal(struct module *owner,
+ int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ unsigned int flags)
{
- struct rtnl_link *tab;
+ struct rtnl_link *link, *old;
+ struct rtnl_link __rcu **tab;
int msgindex;
+ int ret = -ENOBUFS;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
msgindex = rtm_msgindex(msgtype);
- tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]);
+ rtnl_lock();
+ tab = rtnl_msg_handlers[protocol];
if (tab == NULL) {
- tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL);
- if (tab == NULL)
- return -ENOBUFS;
+ tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
+ if (!tab)
+ goto unlock;
+ /* ensures we see the 0 stores */
rcu_assign_pointer(rtnl_msg_handlers[protocol], tab);
}
+ old = rtnl_dereference(tab[msgindex]);
+ if (old) {
+ link = kmemdup(old, sizeof(*old), GFP_KERNEL);
+ if (!link)
+ goto unlock;
+ } else {
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ goto unlock;
+ }
+
+ WARN_ON(link->owner && link->owner != owner);
+ link->owner = owner;
+
+ WARN_ON(doit && link->doit && link->doit != doit);
if (doit)
- tab[msgindex].doit = doit;
+ link->doit = doit;
+ WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit);
if (dumpit)
- tab[msgindex].dumpit = dumpit;
- tab[msgindex].flags |= flags;
+ link->dumpit = dumpit;
- return 0;
+ link->flags |= flags;
+
+ /* publish protocol:msgtype */
+ rcu_assign_pointer(tab[msgindex], link);
+ ret = 0;
+ if (old)
+ kfree_rcu(old, rcu);
+unlock:
+ rtnl_unlock();
+ return ret;
+}
+
+/**
+ * rtnl_register_module - Register a rtnetlink message type
+ *
+ * @owner: module registering the hook (THIS_MODULE)
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ *
+ * Like rtnl_register, but for use by removable modules.
+ */
+int rtnl_register_module(struct module *owner,
+ int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ unsigned int flags)
+{
+ return rtnl_register_internal(owner, protocol, msgtype,
+ doit, dumpit, flags);
}
-EXPORT_SYMBOL_GPL(__rtnl_register);
+EXPORT_SYMBOL_GPL(rtnl_register_module);
/**
* rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
*
- * Identical to __rtnl_register() but panics on failure. This is useful
- * as failure of this function is very unlikely, it can only happen due
- * to lack of memory when allocating the chain to store all message
- * handlers for a protocol. Meant for use in init functions where lack
- * of memory implies no sense in continuing.
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
*/
void rtnl_register(int protocol, int msgtype,
rtnl_doit_func doit, rtnl_dumpit_func dumpit,
unsigned int flags)
{
- if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0)
- panic("Unable to register rtnetlink message handler, "
- "protocol = %d, message type = %d\n",
- protocol, msgtype);
+ int err;
+
+ err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
+ flags);
+ if (err)
+ pr_err("Unable to register rtnetlink message handler, "
+ "protocol = %d, message type = %d\n", protocol, msgtype);
}
-EXPORT_SYMBOL_GPL(rtnl_register);
/**
* rtnl_unregister - Unregister a rtnetlink message type
@@ -220,24 +276,25 @@ EXPORT_SYMBOL_GPL(rtnl_register);
*/
int rtnl_unregister(int protocol, int msgtype)
{
- struct rtnl_link *handlers;
+ struct rtnl_link **tab, *link;
int msgindex;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
msgindex = rtm_msgindex(msgtype);
rtnl_lock();
- handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
- if (!handlers) {
+ tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
+ if (!tab) {
rtnl_unlock();
return -ENOENT;
}
- handlers[msgindex].doit = NULL;
- handlers[msgindex].dumpit = NULL;
- handlers[msgindex].flags = 0;
+ link = tab[msgindex];
+ rcu_assign_pointer(tab[msgindex], NULL);
rtnl_unlock();
+ kfree_rcu(link, rcu);
+
return 0;
}
EXPORT_SYMBOL_GPL(rtnl_unregister);
@@ -251,20 +308,27 @@ EXPORT_SYMBOL_GPL(rtnl_unregister);
*/
void rtnl_unregister_all(int protocol)
{
- struct rtnl_link *handlers;
+ struct rtnl_link **tab, *link;
+ int msgindex;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
rtnl_lock();
- handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
+ tab = rtnl_msg_handlers[protocol];
RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
+ for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
+ link = tab[msgindex];
+ if (!link)
+ continue;
+
+ rcu_assign_pointer(tab[msgindex], NULL);
+ kfree_rcu(link, rcu);
+ }
rtnl_unlock();
synchronize_net();
- while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1)
- schedule();
- kfree(handlers);
+ kfree(tab);
}
EXPORT_SYMBOL_GPL(rtnl_unregister_all);
@@ -1261,6 +1325,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
{
const struct net_device_ops *ops = dev->netdev_ops;
const struct bpf_prog *generic_xdp_prog;
+ struct netdev_bpf xdp;
ASSERT_RTNL();
@@ -1273,7 +1338,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
if (!ops->ndo_bpf)
return XDP_ATTACHED_NONE;
- return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id);
+ __dev_xdp_query(dev, ops->ndo_bpf, &xdp);
+ *prog_id = xdp.prog_id;
+
+ return xdp.prog_attached;
}
static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1569,6 +1637,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PROMISCUITY] = { .type = NLA_U32 },
[IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 },
+ [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 },
[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
@@ -2219,6 +2289,34 @@ static int do_setlink(const struct sk_buff *skb,
}
}
+ if (tb[IFLA_GSO_MAX_SIZE]) {
+ u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);
+
+ if (max_size > GSO_MAX_SIZE) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (dev->gso_max_size ^ max_size) {
+ netif_set_gso_max_size(dev, max_size);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_GSO_MAX_SEGS]) {
+ u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+
+ if (max_segs > GSO_MAX_SEGS) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (dev->gso_max_segs ^ max_segs) {
+ dev->gso_max_segs = max_segs;
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
if (tb[IFLA_OPERSTATE])
set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
@@ -2583,6 +2681,10 @@ struct net_device *rtnl_create_link(struct net *net,
dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
if (tb[IFLA_GROUP])
dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ if (tb[IFLA_GSO_MAX_SIZE])
+ netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
+ if (tb[IFLA_GSO_MAX_SEGS])
+ dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
return dev;
}
@@ -2973,18 +3075,26 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
s_idx = 1;
for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
+ struct rtnl_link **tab;
int type = cb->nlh->nlmsg_type-RTM_BASE;
- struct rtnl_link *handlers;
+ struct rtnl_link *link;
rtnl_dumpit_func dumpit;
if (idx < s_idx || idx == PF_PACKET)
continue;
- handlers = rtnl_dereference(rtnl_msg_handlers[idx]);
- if (!handlers)
+ if (type < 0 || type >= RTM_NR_MSGTYPES)
continue;
- dumpit = READ_ONCE(handlers[type].dumpit);
+ tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]);
+ if (!tab)
+ continue;
+
+ link = tab[type];
+ if (!link)
+ continue;
+
+ dumpit = link->dumpit;
if (!dumpit)
continue;
@@ -4314,7 +4424,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
- struct rtnl_link *handlers;
+ struct rtnl_link *link;
+ struct module *owner;
int err = -EOPNOTSUPP;
rtnl_doit_func doit;
unsigned int flags;
@@ -4338,79 +4449,85 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
return -EPERM;
- if (family >= ARRAY_SIZE(rtnl_msg_handlers))
- family = PF_UNSPEC;
-
rcu_read_lock();
- handlers = rcu_dereference(rtnl_msg_handlers[family]);
- if (!handlers) {
- family = PF_UNSPEC;
- handlers = rcu_dereference(rtnl_msg_handlers[family]);
- }
-
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
u16 min_dump_alloc = 0;
- dumpit = READ_ONCE(handlers[type].dumpit);
- if (!dumpit) {
+ link = rtnl_get_link(family, type);
+ if (!link || !link->dumpit) {
family = PF_UNSPEC;
- handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]);
- if (!handlers)
- goto err_unlock;
-
- dumpit = READ_ONCE(handlers[type].dumpit);
- if (!dumpit)
+ link = rtnl_get_link(family, type);
+ if (!link || !link->dumpit)
goto err_unlock;
}
-
- refcount_inc(&rtnl_msg_handlers_ref[family]);
+ owner = link->owner;
+ dumpit = link->dumpit;
if (type == RTM_GETLINK - RTM_BASE)
min_dump_alloc = rtnl_calcit(skb, nlh);
+ err = 0;
+ /* need to do this before rcu_read_unlock() */
+ if (!try_module_get(owner))
+ err = -EPROTONOSUPPORT;
+
rcu_read_unlock();
rtnl = net->rtnl;
- {
+ if (err == 0) {
struct netlink_dump_control c = {
.dump = dumpit,
.min_dump_alloc = min_dump_alloc,
+ .module = owner,
};
err = netlink_dump_start(rtnl, skb, nlh, &c);
+ /* netlink_dump_start() will keep a reference on
+ * module if dump is still in progress.
+ */
+ module_put(owner);
}
- refcount_dec(&rtnl_msg_handlers_ref[family]);
return err;
}
- doit = READ_ONCE(handlers[type].doit);
- if (!doit) {
+ link = rtnl_get_link(family, type);
+ if (!link || !link->doit) {
family = PF_UNSPEC;
- handlers = rcu_dereference(rtnl_msg_handlers[family]);
+ link = rtnl_get_link(PF_UNSPEC, type);
+ if (!link || !link->doit)
+ goto out_unlock;
}
- flags = READ_ONCE(handlers[type].flags);
+ owner = link->owner;
+ if (!try_module_get(owner)) {
+ err = -EPROTONOSUPPORT;
+ goto out_unlock;
+ }
+
+ flags = link->flags;
if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
- refcount_inc(&rtnl_msg_handlers_ref[family]);
- doit = READ_ONCE(handlers[type].doit);
+ doit = link->doit;
rcu_read_unlock();
if (doit)
err = doit(skb, nlh, extack);
- refcount_dec(&rtnl_msg_handlers_ref[family]);
+ module_put(owner);
return err;
}
-
rcu_read_unlock();
rtnl_lock();
- handlers = rtnl_dereference(rtnl_msg_handlers[family]);
- if (handlers) {
- doit = READ_ONCE(handlers[type].doit);
- if (doit)
- err = doit(skb, nlh, extack);
- }
+ link = rtnl_get_link(family, type);
+ if (link && link->doit)
+ err = link->doit(skb, nlh, extack);
rtnl_unlock();
+
+ module_put(owner);
+
+ return err;
+
+out_unlock:
+ rcu_read_unlock();
return err;
err_unlock:
@@ -4498,11 +4615,6 @@ static struct pernet_operations rtnetlink_net_ops = {
void __init rtnetlink_init(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++)
- refcount_set(&rtnl_msg_handlers_ref[i], 1);
-
if (register_pernet_subsys(&rtnetlink_net_ops))
panic("rtnetlink_init: cannot initialize rtnetlink\n");
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 5eeb1d20cc38..c5bb52bc73a1 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -235,7 +235,9 @@ struct sock *reuseport_select_sock(struct sock *sk,
if (prog && skb)
sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
- else
+
+ /* no bpf or invalid bpf result: fall back to hash usage */
+ if (!sk2)
sk2 = reuse->socks[reciprocal_scale(hash, socks)];
}
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 178bb9833311..37ccbe62eb1a 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -63,9 +63,10 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
*/
local_bh_disable();
inet_twsk_schedule(tw, timeo);
- /* Linkage updates. */
- __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
- inet_twsk_put(tw);
+ /* Linkage updates.
+ * Note that access to tw after this point is illegal.
+ */
+ inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
local_bh_enable();
} else {
/* Sorry, if we're out of memory, just CLOSE this
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 9153247dad28..d1885cf59319 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -1418,9 +1418,12 @@ void __init dn_dev_init(void)
dn_dev_devices_on();
- rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL, 0);
- rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, 0);
- rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, 0);
+ rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWADDR,
+ dn_nl_newaddr, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELADDR,
+ dn_nl_deladdr, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR,
+ NULL, dn_nl_dump_ifaddr, 0);
proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops);
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index b37a1b833c77..fce94cbd4378 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -792,8 +792,10 @@ void __init dn_fib_init(void)
register_dnaddr_notifier(&dn_fib_dnaddr_notifier);
- rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL, 0);
- rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWROUTE,
+ dn_fib_rtm_newroute, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE,
+ dn_fib_rtm_delroute, NULL, 0);
}
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 324cb9f2f551..73160d4aebbe 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -199,11 +199,11 @@ static void dn_dst_check_expire(struct timer_list *unused)
lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
if (atomic_read(&rt->dst.__refcnt) > 1 ||
(now - rt->dst.lastuse) < expire) {
- rtp = &rt->dst.dn_next;
+ rtp = &rt->dn_next;
continue;
}
- *rtp = rt->dst.dn_next;
- rt->dst.dn_next = NULL;
+ *rtp = rt->dn_next;
+ rt->dn_next = NULL;
dst_dev_put(&rt->dst);
dst_release(&rt->dst);
}
@@ -233,11 +233,11 @@ static int dn_dst_gc(struct dst_ops *ops)
lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
if (atomic_read(&rt->dst.__refcnt) > 1 ||
(now - rt->dst.lastuse) < expire) {
- rtp = &rt->dst.dn_next;
+ rtp = &rt->dn_next;
continue;
}
- *rtp = rt->dst.dn_next;
- rt->dst.dn_next = NULL;
+ *rtp = rt->dn_next;
+ rt->dn_next = NULL;
dst_dev_put(&rt->dst);
dst_release(&rt->dst);
break;
@@ -333,8 +333,8 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) {
if (compare_keys(&rth->fld, &rt->fld)) {
/* Put it first */
- *rthp = rth->dst.dn_next;
- rcu_assign_pointer(rth->dst.dn_next,
+ *rthp = rth->dn_next;
+ rcu_assign_pointer(rth->dn_next,
dn_rt_hash_table[hash].chain);
rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth);
@@ -345,10 +345,10 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
*rp = rth;
return 0;
}
- rthp = &rth->dst.dn_next;
+ rthp = &rth->dn_next;
}
- rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain);
+ rcu_assign_pointer(rt->dn_next, dn_rt_hash_table[hash].chain);
rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt);
dst_hold_and_use(&rt->dst, now);
@@ -369,8 +369,8 @@ static void dn_run_flush(struct timer_list *unused)
goto nothing_to_declare;
for(; rt; rt = next) {
- next = rcu_dereference_raw(rt->dst.dn_next);
- RCU_INIT_POINTER(rt->dst.dn_next, NULL);
+ next = rcu_dereference_raw(rt->dn_next);
+ RCU_INIT_POINTER(rt->dn_next, NULL);
dst_dev_put(&rt->dst);
dst_release(&rt->dst);
}
@@ -1183,6 +1183,7 @@ make_route:
if (rt == NULL)
goto e_nobufs;
+ rt->dn_next = NULL;
memset(&rt->fld, 0, sizeof(rt->fld));
rt->fld.saddr = oldflp->saddr;
rt->fld.daddr = oldflp->daddr;
@@ -1252,7 +1253,7 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *
if (!(flags & MSG_TRYHARD)) {
rcu_read_lock_bh();
for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt;
- rt = rcu_dereference_bh(rt->dst.dn_next)) {
+ rt = rcu_dereference_bh(rt->dn_next)) {
if ((flp->daddr == rt->fld.daddr) &&
(flp->saddr == rt->fld.saddr) &&
(flp->flowidn_mark == rt->fld.flowidn_mark) &&
@@ -1448,6 +1449,7 @@ make_route:
if (rt == NULL)
goto e_nobufs;
+ rt->dn_next = NULL;
memset(&rt->fld, 0, sizeof(rt->fld));
rt->rt_saddr = fld.saddr;
rt->rt_daddr = fld.daddr;
@@ -1529,7 +1531,7 @@ static int dn_route_input(struct sk_buff *skb)
rcu_read_lock();
for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
- rt = rcu_dereference(rt->dst.dn_next)) {
+ rt = rcu_dereference(rt->dn_next)) {
if ((rt->fld.saddr == cb->src) &&
(rt->fld.daddr == cb->dst) &&
(rt->fld.flowidn_oif == 0) &&
@@ -1749,7 +1751,7 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock_bh();
for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0;
rt;
- rt = rcu_dereference_bh(rt->dst.dn_next), idx++) {
+ rt = rcu_dereference_bh(rt->dn_next), idx++) {
if (idx < s_idx)
continue;
skb_dst_set(skb, dst_clone(&rt->dst));
@@ -1795,7 +1797,7 @@ static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_rou
{
struct dn_rt_cache_iter_state *s = seq->private;
- rt = rcu_dereference_bh(rt->dst.dn_next);
+ rt = rcu_dereference_bh(rt->dn_next);
while (!rt) {
rcu_read_unlock_bh();
if (--s->bucket < 0)
@@ -1921,11 +1923,11 @@ void __init dn_route_init(void)
&dn_rt_cache_seq_fops);
#ifdef CONFIG_DECNET_ROUTER
- rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
- dn_fib_dump, 0);
+ rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE,
+ dn_cache_getroute, dn_fib_dump, 0);
#else
- rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
- dn_cache_dump, 0);
+ rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE,
+ dn_cache_getroute, dn_cache_dump, 0);
#endif
}
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 03c3bdf25468..bbf2c82cf7b2 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -16,6 +16,15 @@ config NET_DSA
if NET_DSA
+config NET_DSA_LEGACY
+ bool "Support for older platform device and Device Tree registration"
+ default y
+ ---help---
+ Say Y if you want to enable support for the older platform device and
+ deprecated Device Tree binding registration.
+
+ This feature is scheduled for removal in 4.17.
+
# tagging formats
config NET_DSA_TAG_BRCM
bool
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 0e13c1f95d13..9e4d3536f977 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
# the core
obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o
+dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o
+dsa_core-$(CONFIG_NET_DSA_LEGACY) += legacy.o
# tagging formats
dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 1e287420ff49..21f9bed11988 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -241,7 +241,7 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
for (port = 0; port < ds->num_ports; port++) {
dp = &ds->ports[port];
- if (dsa_port_is_user(dp))
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
dp->cpu_dp = dst->cpu_dp;
}
}
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7d036696e8c4..b03665e8fb4e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -97,8 +97,17 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
bool dsa_schedule_work(struct work_struct *work);
/* legacy.c */
+#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
int dsa_legacy_register(void);
void dsa_legacy_unregister(void);
+#else
+static inline int dsa_legacy_register(void)
+{
+ return -ENODEV;
+}
+
+static inline void dsa_legacy_unregister(void) { }
+#endif
int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr, u16 vid,
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index 84611d7fcfa2..aa56d3fb5da4 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -718,26 +718,6 @@ static int dsa_resume(struct device *d)
}
#endif
-/* legacy way, bypassing the bridge *****************************************/
-int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid,
- u16 flags)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return dsa_port_fdb_add(dp, addr, vid);
-}
-
-int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return dsa_port_fdb_del(dp, addr, vid);
-}
-
static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume);
static const struct of_device_id dsa_of_match_table[] = {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a95a55f79137..f52307296de4 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -708,14 +708,12 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_mall_tc_entry *mall_tc_entry;
__be16 protocol = cls->common.protocol;
- struct net *net = dev_net(dev);
struct dsa_switch *ds = dp->ds;
struct net_device *to_dev;
const struct tc_action *a;
struct dsa_port *to_dp;
int err = -EOPNOTSUPP;
LIST_HEAD(actions);
- int ifindex;
if (!ds->ops->port_mirror_add)
return err;
@@ -729,8 +727,7 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
struct dsa_mall_mirror_tc_entry *mirror;
- ifindex = tcf_mirred_ifindex(a);
- to_dev = __dev_get_by_index(net, ifindex);
+ to_dev = tcf_mirred_dev(a);
if (!to_dev)
return -EINVAL;
@@ -943,6 +940,26 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.set_rxnfc = dsa_slave_set_rxnfc,
};
+/* legacy way, bypassing the bridge *****************************************/
+int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid,
+ u16 flags)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ return dsa_port_fdb_add(dp, addr, vid);
+}
+
+int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ return dsa_port_fdb_del(dp, addr, vid);
+}
+
static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 29608d087a7c..b93511726069 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -83,29 +83,52 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
static int dsa_switch_fdb_add(struct dsa_switch *ds,
struct dsa_notifier_fdb_info *info)
{
- /* Do not care yet about other switch chips of the fabric */
- if (ds->index != info->sw_index)
- return 0;
+ int port = dsa_towards_port(ds, info->sw_index, info->port);
if (!ds->ops->port_fdb_add)
return -EOPNOTSUPP;
- return ds->ops->port_fdb_add(ds, info->port, info->addr,
- info->vid);
+ return ds->ops->port_fdb_add(ds, port, info->addr, info->vid);
}
static int dsa_switch_fdb_del(struct dsa_switch *ds,
struct dsa_notifier_fdb_info *info)
{
- /* Do not care yet about other switch chips of the fabric */
- if (ds->index != info->sw_index)
- return 0;
+ int port = dsa_towards_port(ds, info->sw_index, info->port);
if (!ds->ops->port_fdb_del)
return -EOPNOTSUPP;
- return ds->ops->port_fdb_del(ds, info->port, info->addr,
- info->vid);
+ return ds->ops->port_fdb_del(ds, port, info->addr, info->vid);
+}
+
+static int
+dsa_switch_mdb_prepare_bitmap(struct dsa_switch *ds,
+ const struct switchdev_obj_port_mdb *mdb,
+ const unsigned long *bitmap)
+{
+ int port, err;
+
+ if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
+ return -EOPNOTSUPP;
+
+ for_each_set_bit(port, bitmap, ds->num_ports) {
+ err = ds->ops->port_mdb_prepare(ds, port, mdb);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds,
+ const struct switchdev_obj_port_mdb *mdb,
+ const unsigned long *bitmap)
+{
+ int port;
+
+ for_each_set_bit(port, bitmap, ds->num_ports)
+ ds->ops->port_mdb_add(ds, port, mdb);
}
static int dsa_switch_mdb_add(struct dsa_switch *ds,
@@ -114,7 +137,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
const struct switchdev_obj_port_mdb *mdb = info->mdb;
struct switchdev_trans *trans = info->trans;
DECLARE_BITMAP(group, ds->num_ports);
- int port, err;
+ int port;
/* Build a mask of Multicast group members */
bitmap_zero(group, ds->num_ports);
@@ -124,21 +147,10 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
if (dsa_is_dsa_port(ds, port))
set_bit(port, group);
- if (switchdev_trans_ph_prepare(trans)) {
- if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
- return -EOPNOTSUPP;
-
- for_each_set_bit(port, group, ds->num_ports) {
- err = ds->ops->port_mdb_prepare(ds, port, mdb, trans);
- if (err)
- return err;
- }
-
- return 0;
- }
+ if (switchdev_trans_ph_prepare(trans))
+ return dsa_switch_mdb_prepare_bitmap(ds, mdb, group);
- for_each_set_bit(port, group, ds->num_ports)
- ds->ops->port_mdb_add(ds, port, mdb, trans);
+ dsa_switch_mdb_add_bitmap(ds, mdb, group);
return 0;
}
@@ -157,13 +169,43 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds,
return 0;
}
+static int
+dsa_switch_vlan_prepare_bitmap(struct dsa_switch *ds,
+ const struct switchdev_obj_port_vlan *vlan,
+ const unsigned long *bitmap)
+{
+ int port, err;
+
+ if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
+ return -EOPNOTSUPP;
+
+ for_each_set_bit(port, bitmap, ds->num_ports) {
+ err = ds->ops->port_vlan_prepare(ds, port, vlan);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static void
+dsa_switch_vlan_add_bitmap(struct dsa_switch *ds,
+ const struct switchdev_obj_port_vlan *vlan,
+ const unsigned long *bitmap)
+{
+ int port;
+
+ for_each_set_bit(port, bitmap, ds->num_ports)
+ ds->ops->port_vlan_add(ds, port, vlan);
+}
+
static int dsa_switch_vlan_add(struct dsa_switch *ds,
struct dsa_notifier_vlan_info *info)
{
const struct switchdev_obj_port_vlan *vlan = info->vlan;
struct switchdev_trans *trans = info->trans;
DECLARE_BITMAP(members, ds->num_ports);
- int port, err;
+ int port;
/* Build a mask of VLAN members */
bitmap_zero(members, ds->num_ports);
@@ -173,21 +215,10 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
set_bit(port, members);
- if (switchdev_trans_ph_prepare(trans)) {
- if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
- return -EOPNOTSUPP;
-
- for_each_set_bit(port, members, ds->num_ports) {
- err = ds->ops->port_vlan_prepare(ds, port, vlan, trans);
- if (err)
- return err;
- }
-
- return 0;
- }
+ if (switchdev_trans_ph_prepare(trans))
+ return dsa_switch_vlan_prepare_bitmap(ds, vlan, members);
- for_each_set_bit(port, members, ds->num_ports)
- ds->ops->port_vlan_add(ds, port, vlan, trans);
+ dsa_switch_vlan_add_bitmap(ds, vlan, members);
return 0;
}
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 8475434af7d5..11535bc70743 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -13,10 +13,13 @@
*/
#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
#include "dsa_priv.h"
#define MTK_HDR_LEN 4
+#define MTK_HDR_XMIT_UNTAGGED 0
+#define MTK_HDR_XMIT_TAGGED_TPID_8100 1
#define MTK_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0)
#define MTK_HDR_XMIT_DP_BIT_MASK GENMASK(5, 0)
@@ -25,20 +28,37 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
{
struct dsa_port *dp = dsa_slave_to_port(dev);
u8 *mtk_tag;
+ bool is_vlan_skb = true;
- if (skb_cow_head(skb, MTK_HDR_LEN) < 0)
- return NULL;
-
- skb_push(skb, MTK_HDR_LEN);
+ /* Build the special tag after the MAC Source Address. If VLAN header
+ * is present, it's required that VLAN header and special tag is
+ * being combined. Only in this way we can allow the switch can parse
+ * the both special and VLAN tag at the same time and then look up VLAN
+ * table with VID.
+ */
+ if (!skb_vlan_tagged(skb)) {
+ if (skb_cow_head(skb, MTK_HDR_LEN) < 0)
+ return NULL;
- memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN);
+ skb_push(skb, MTK_HDR_LEN);
+ memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN);
+ is_vlan_skb = false;
+ }
- /* Build the tag after the MAC Source Address */
mtk_tag = skb->data + 2 * ETH_ALEN;
- mtk_tag[0] = 0;
+
+ /* Mark tag attribute on special tag insertion to notify hardware
+ * whether that's a combined special tag with 802.1Q header.
+ */
+ mtk_tag[0] = is_vlan_skb ? MTK_HDR_XMIT_TAGGED_TPID_8100 :
+ MTK_HDR_XMIT_UNTAGGED;
mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK;
- mtk_tag[2] = 0;
- mtk_tag[3] = 0;
+
+ /* Tag control information is kept for 802.1Q */
+ if (!is_vlan_skb) {
+ mtk_tag[2] = 0;
+ mtk_tag[3] = 0;
+ }
return skb;
}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e7d15fb0d94d..f6f58108b4c5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -19,6 +19,7 @@
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
+#include <linux/bootmem.h>
#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
@@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
+static struct inet_listen_hashbucket *
+inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+ u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ hash = ipv6_portaddr_hash(sock_net(sk),
+ &sk->sk_v6_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ else
+#endif
+ hash = ipv4_portaddr_hash(sock_net(sk),
+ inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ return inet_lhash2_bucket(h, hash);
+}
+
+static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
+{
+ struct inet_listen_hashbucket *ilb2;
+
+ if (!h->lhash2)
+ return;
+
+ ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+ spin_lock(&ilb2->lock);
+ if (sk->sk_reuseport && sk->sk_family == AF_INET6)
+ hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+ &ilb2->head);
+ else
+ hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+ &ilb2->head);
+ ilb2->count++;
+ spin_unlock(&ilb2->lock);
+}
+
+static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
+{
+ struct inet_listen_hashbucket *ilb2;
+
+ if (!h->lhash2 ||
+ WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
+ return;
+
+ ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+ spin_lock(&ilb2->lock);
+ hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
+ ilb2->count--;
+ spin_unlock(&ilb2->lock);
+}
+
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
const int dif, const int sdif, bool exact_dif)
@@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
*/
/* called with rcu_read_lock() : No refcount taken on the socket */
+static struct sock *inet_lhash2_lookup(struct net *net,
+ struct inet_listen_hashbucket *ilb2,
+ struct sk_buff *skb, int doff,
+ const __be32 saddr, __be16 sport,
+ const __be32 daddr, const unsigned short hnum,
+ const int dif, const int sdif)
+{
+ bool exact_dif = inet_exact_dif_match(net, skb);
+ struct inet_connection_sock *icsk;
+ struct sock *sk, *result = NULL;
+ int score, hiscore = 0;
+ u32 phash = 0;
+
+ inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+ sk = (struct sock *)icsk;
+ score = compute_score(sk, net, hnum, daddr,
+ dif, sdif, exact_dif);
+ if (score > hiscore) {
+ if (sk->sk_reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ result = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (result)
+ return result;
+ }
+ result = sk;
+ hiscore = score;
+ }
+ }
+
+ return result;
+}
+
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -216,32 +305,57 @@ struct sock *__inet_lookup_listener(struct net *net,
{
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore = 0, matches = 0, reuseport = 0;
bool exact_dif = inet_exact_dif_match(net, skb);
+ struct inet_listen_hashbucket *ilb2;
struct sock *sk, *result = NULL;
+ int score, hiscore = 0;
+ unsigned int hash2;
u32 phash = 0;
+ if (ilb->count <= 10 || !hashinfo->lhash2)
+ goto port_lookup;
+
+ /* Too many sk in the ilb bucket (which is hashed by port alone).
+ * Try lhash2 (which is hashed by port and addr) instead.
+ */
+
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ result = inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ if (result)
+ return result;
+
+ /* Lookup lhash2 with INADDR_ANY */
+
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ return inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+
+port_lookup:
sk_for_each_rcu(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr,
dif, sdif, exact_dif);
if (score > hiscore) {
- reuseport = sk->sk_reuseport;
- if (reuseport) {
+ if (sk->sk_reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
return result;
- matches = 1;
}
result = sk;
hiscore = score;
- } else if (score == hiscore && reuseport) {
- matches++;
- if (reciprocal_scale(phash, matches) == 0)
- result = sk;
- phash = next_pseudo_random32(phash);
}
}
return result;
@@ -483,6 +597,8 @@ int __inet_hash(struct sock *sk, struct sock *osk)
hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
else
hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+ inet_hash2(hashinfo, sk);
+ ilb->count++;
sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
@@ -509,28 +625,35 @@ EXPORT_SYMBOL_GPL(inet_hash);
void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_listen_hashbucket *ilb;
spinlock_t *lock;
bool listener = false;
- int done;
if (sk_unhashed(sk))
return;
if (sk->sk_state == TCP_LISTEN) {
- lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ lock = &ilb->lock;
listener = true;
} else {
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
}
spin_lock_bh(lock);
+ if (sk_unhashed(sk))
+ goto unlock;
+
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
- if (listener)
- done = __sk_del_node_init(sk);
- else
- done = __sk_nulls_del_node_init_rcu(sk);
- if (done)
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ if (listener) {
+ inet_unhash2(hashinfo, sk);
+ __sk_del_node_init(sk);
+ ilb->count--;
+ } else {
+ __sk_nulls_del_node_init_rcu(sk);
+ }
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+unlock:
spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
@@ -665,10 +788,37 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_HEAD(&h->listening_hash[i].head);
+ h->listening_hash[i].count = 0;
}
+
+ h->lhash2 = NULL;
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+ unsigned long numentries, int scale,
+ unsigned long low_limit,
+ unsigned long high_limit)
+{
+ unsigned int i;
+
+ h->lhash2 = alloc_large_system_hash(name,
+ sizeof(*h->lhash2),
+ numentries,
+ scale,
+ 0,
+ NULL,
+ &h->lhash2_mask,
+ low_limit,
+ high_limit);
+
+ for (i = 0; i <= h->lhash2_mask; i++) {
+ spin_lock_init(&h->lhash2[i].lock);
+ INIT_HLIST_HEAD(&h->lhash2[i].head);
+ h->lhash2[i].count = 0;
+ }
+}
+
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
unsigned int locksz = sizeof(spinlock_t);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index b563e0c46bac..277ff69a312d 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -97,7 +97,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
* Essentially we whip up a timewait bucket, copy the relevant info into it
* from the SK, and mess with hash chains and list linkage.
*/
-void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
struct inet_hashinfo *hashinfo)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -119,18 +119,6 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
spin_lock(lock);
- /*
- * Step 2: Hash TW into tcp ehash chain.
- * Notes :
- * - tw_refcnt is set to 4 because :
- * - We have one reference from bhash chain.
- * - We have one reference from ehash chain.
- * - We have one reference from timer.
- * - One reference for ourself (our caller will release it).
- * We can use atomic_set() because prior spin_lock()/spin_unlock()
- * committed into memory all tw fields.
- */
- refcount_set(&tw->tw_refcnt, 4);
inet_twsk_add_node_rcu(tw, &ehead->chain);
/* Step 3: Remove SK from hash chain */
@@ -138,8 +126,19 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock(lock);
+
+ /* tw_refcnt is set to 3 because we have :
+ * - one reference for bhash chain.
+ * - one reference for ehash chain.
+ * - one reference for timer.
+ * We can use atomic_set() because prior spin_lock()/spin_unlock()
+ * committed into memory all tw fields.
+ * Also note that after this point, we lost our implicit reference
+ * so we are not allowed to use tw anymore.
+ */
+ refcount_set(&tw->tw_refcnt, 3);
}
-EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
static void tw_timer_handler(struct timer_list *t)
{
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9c1735632c8c..9a80d84fc182 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -114,7 +114,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
- __be32 id, u32 index, bool truncate);
+ __be32 id, u32 index,
+ bool truncate, bool is_ipv4);
static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
@@ -255,34 +256,41 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
{
struct net *net = dev_net(skb->dev);
struct metadata_dst *tun_dst = NULL;
+ struct erspan_base_hdr *ershdr;
+ struct erspan_metadata *pkt_md;
struct ip_tunnel_net *itn;
struct ip_tunnel *tunnel;
- struct erspanhdr *ershdr;
const struct iphdr *iph;
- __be32 index;
+ int ver;
int len;
itn = net_generic(net, erspan_net_id);
len = gre_hdr_len + sizeof(*ershdr);
+ /* Check based hdr len */
if (unlikely(!pskb_may_pull(skb, len)))
return PACKET_REJECT;
iph = ip_hdr(skb);
- ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
+ ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
+ ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
/* The original GRE header does not have key field,
* Use ERSPAN 10-bit session ID as key.
*/
tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
- index = ershdr->md.index;
+ pkt_md = (struct erspan_metadata *)(ershdr + 1);
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
tpi->flags | TUNNEL_KEY,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
+ len = gre_hdr_len + erspan_hdr_len(ver);
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
if (__iptunnel_pull_header(skb,
- gre_hdr_len + sizeof(*ershdr),
+ len,
htons(ETH_P_TEB),
false, false) < 0)
goto drop;
@@ -306,12 +314,27 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (!md)
return PACKET_REJECT;
- md->index = index;
+ memcpy(md, pkt_md, sizeof(*md));
+ md->version = ver;
+
info = &tun_dst->u.tun_info;
info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
info->options_len = sizeof(*md);
} else {
- tunnel->index = ntohl(index);
+ tunnel->erspan_ver = ver;
+ if (ver == 1) {
+ tunnel->index = ntohl(pkt_md->u.index);
+ } else {
+ u16 md2_flags;
+ u16 dir, hwid;
+
+ md2_flags = ntohs(pkt_md->u.md2.flags);
+ dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+ hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+ tunnel->dir = dir;
+ tunnel->hwid = hwid;
+ }
+
}
skb_reset_mac_header(skb);
@@ -405,7 +428,8 @@ static int gre_rcv(struct sk_buff *skb)
if (hdr_len < 0)
goto drop;
- if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+ if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+ tpi.proto == htons(ETH_P_ERSPAN2))) {
if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
return 0;
}
@@ -560,6 +584,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
bool truncate = false;
struct flowi4 fl;
int tunnel_hlen;
+ int version;
__be16 df;
tun_info = skb_tunnel_info(skb);
@@ -568,9 +593,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
goto err_free_skb;
key = &tun_info->key;
+ md = ip_tunnel_info_opts(tun_info);
+ if (!md)
+ goto err_free_rt;
/* ERSPAN has fixed 8 byte GRE header */
- tunnel_hlen = 8 + sizeof(struct erspanhdr);
+ version = md->version;
+ tunnel_hlen = 8 + erspan_hdr_len(version);
rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
if (!rt)
@@ -584,12 +613,23 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
truncate = true;
}
- md = ip_tunnel_info_opts(tun_info);
- if (!md)
- goto err_free_rt;
+ if (version == 1) {
+ erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
+ ntohl(md->u.index), truncate, true);
+ } else if (version == 2) {
+ u16 md2_flags;
+ u8 direction;
+ u16 hwid;
- erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
- ntohl(md->index), truncate);
+ md2_flags = ntohs(md->u.md2.flags);
+ direction = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+ hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+
+ erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id),
+ direction, hwid, truncate, true);
+ } else {
+ goto err_free_rt;
+ }
gre_build_header(skb, 8, TUNNEL_SEQ,
htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
@@ -668,52 +708,6 @@ free_skb:
return NETDEV_TX_OK;
}
-static inline u8 tos_to_cos(u8 tos)
-{
- u8 dscp, cos;
-
- dscp = tos >> 2;
- cos = dscp >> 3;
- return cos;
-}
-
-static void erspan_build_header(struct sk_buff *skb,
- __be32 id, u32 index, bool truncate)
-{
- struct iphdr *iphdr = ip_hdr(skb);
- struct ethhdr *eth = eth_hdr(skb);
- enum erspan_encap_type enc_type;
- struct erspanhdr *ershdr;
- struct qtag_prefix {
- __be16 eth_type;
- __be16 tci;
- } *qp;
- u16 vlan_tci = 0;
-
- enc_type = ERSPAN_ENCAP_NOVLAN;
-
- /* If mirrored packet has vlan tag, extract tci and
- * perserve vlan header in the mirrored frame.
- */
- if (eth->h_proto == htons(ETH_P_8021Q)) {
- qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
- vlan_tci = ntohs(qp->tci);
- enc_type = ERSPAN_ENCAP_INFRAME;
- }
-
- skb_push(skb, sizeof(*ershdr));
- ershdr = (struct erspanhdr *)skb->data;
- memset(ershdr, 0, sizeof(*ershdr));
-
- ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
- (ERSPAN_VERSION << VER_OFFSET));
- ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
- ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
- (enc_type << EN_OFFSET & EN_MASK) |
- ((truncate << T_OFFSET) & T_MASK));
- ershdr->md.index = htonl(index & INDEX_MASK);
-}
-
static netdev_tx_t erspan_xmit(struct sk_buff *skb,
struct net_device *dev)
{
@@ -737,7 +731,14 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
}
/* Push ERSPAN header */
- erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
+ if (tunnel->erspan_ver == 1)
+ erspan_build_header(skb, tunnel->parms.o_key, tunnel->index,
+ truncate, true);
+ else
+ erspan_build_header_v2(skb, tunnel->parms.o_key,
+ tunnel->dir, tunnel->hwid,
+ truncate, true);
+
tunnel->parms.o_flags &= ~TUNNEL_KEY;
__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
return NETDEV_TX_OK;
@@ -1209,13 +1210,32 @@ static int ipgre_netlink_parms(struct net_device *dev,
if (data[IFLA_GRE_FWMARK])
*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
- if (data[IFLA_GRE_ERSPAN_INDEX]) {
- t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+ if (data[IFLA_GRE_ERSPAN_VER]) {
+ t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
- if (t->index & ~INDEX_MASK)
+ if (t->erspan_ver != 1 && t->erspan_ver != 2)
return -EINVAL;
}
+ if (t->erspan_ver == 1) {
+ if (data[IFLA_GRE_ERSPAN_INDEX]) {
+ t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+ if (t->index & ~INDEX_MASK)
+ return -EINVAL;
+ }
+ } else if (t->erspan_ver == 2) {
+ if (data[IFLA_GRE_ERSPAN_DIR]) {
+ t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+ if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
+ return -EINVAL;
+ }
+ if (data[IFLA_GRE_ERSPAN_HWID]) {
+ t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+ if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -1282,7 +1302,7 @@ static int erspan_tunnel_init(struct net_device *dev)
tunnel->tun_hlen = 8;
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
- sizeof(struct erspanhdr);
+ erspan_hdr_len(tunnel->erspan_ver);
t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
@@ -1412,6 +1432,12 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(4) +
/* IFLA_GRE_ERSPAN_INDEX */
nla_total_size(4) +
+ /* IFLA_GRE_ERSPAN_VER */
+ nla_total_size(1) +
+ /* IFLA_GRE_ERSPAN_DIR */
+ nla_total_size(1) +
+ /* IFLA_GRE_ERSPAN_HWID */
+ nla_total_size(2) +
0;
}
@@ -1454,9 +1480,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
goto nla_put_failure;
}
- if (t->index)
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
+ goto nla_put_failure;
+
+ if (t->erspan_ver == 1) {
if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
goto nla_put_failure;
+ } else if (t->erspan_ver == 2) {
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+ goto nla_put_failure;
+ }
return 0;
@@ -1492,6 +1527,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 },
[IFLA_GRE_FWMARK] = { .type = NLA_U32 },
[IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 43b69af242e1..f0ed031f3594 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1106,7 +1106,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
new = true;
}
- __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
+ __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
if (!dst_check(&rt->dst, 0)) {
if (new)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f08eebe60446..c470fec9062f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3578,6 +3578,9 @@ void __init tcp_init(void)
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
inet_hashinfo_init(&tcp_hashinfo);
+ inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+ thash_entries, 21, /* one slot per 2 MB*/
+ 0, 64 * 1024);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 78c192ee03a4..018a48477355 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -379,18 +379,9 @@ fastopen:
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie)
{
- unsigned long last_syn_loss = 0;
const struct dst_entry *dst;
- int syn_loss = 0;
- tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
-
- /* Recurring FO SYN losses: no cookie or data in SYN */
- if (syn_loss > 1 &&
- time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
- cookie->len = -1;
- return false;
- }
+ tcp_fastopen_cache_get(sk, mss, cookie);
/* Firewall blackhole issue check */
if (tcp_fastopen_active_should_disable(sk)) {
@@ -448,6 +439,8 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect);
* following circumstances:
* 1. client side TFO socket receives out of order FIN
* 2. client side TFO socket receives out of order RST
+ * 3. client side TFO socket has timed out three times consecutively during
+ * or after handshake
* We disable active side TFO globally for 1hr at first. Then if it
* happens again, we disable it for 2h, then 4h, 8h, ...
* And we reset the timeout back to 1hr when we see a successful active
@@ -524,3 +517,20 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
dst_release(dst);
}
}
+
+void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired)
+{
+ u32 timeouts = inet_csk(sk)->icsk_retransmits;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Broken middle-boxes may black-hole Fast Open connection during or
+ * even after the handshake. Be extremely conservative and pause
+ * Fast Open globally after hitting the third consecutive timeout or
+ * exceeding the configured timeout limit.
+ */
+ if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) &&
+ (timeouts == 2 || (timeouts < 2 && expired))) {
+ tcp_fastopen_active_disable(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
+}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 45f750e85714..4d55c4b338ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -578,8 +578,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 copied;
int time;
- int copied;
tcp_mstamp_refresh(tp);
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
@@ -602,38 +602,31 @@ void tcp_rcv_space_adjust(struct sock *sk)
if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvwin, rcvmem, rcvbuf;
+ int rcvmem, rcvbuf;
+ u64 rcvwin, grow;
/* minimal window to cope with packet losses, assuming
* steady state. Add some cushion because of small variations.
*/
- rcvwin = (copied << 1) + 16 * tp->advmss;
+ rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
- /* If rate increased by 25%,
- * assume slow start, rcvwin = 3 * copied
- * If rate increased by 50%,
- * assume sender can use 2x growth, rcvwin = 4 * copied
- */
- if (copied >=
- tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
- if (copied >=
- tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
- rcvwin <<= 1;
- else
- rcvwin += (rcvwin >> 1);
- }
+ /* Accommodate for sender rate increase (eg. slow start) */
+ grow = rcvwin * (copied - tp->rcvq_space.space);
+ do_div(grow, tp->rcvq_space.space);
+ rcvwin += (grow << 1);
rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
rcvmem += 128;
- rcvbuf = min(rcvwin / tp->advmss * rcvmem,
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+ do_div(rcvwin, tp->advmss);
+ rcvbuf = min_t(u64, rcvwin * rcvmem,
+ sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
sk->sk_rcvbuf = rcvbuf;
/* Make the window clamp follow along. */
- tp->window_clamp = rcvwin;
+ tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
}
}
tp->rcvq_space.space = copied;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 7097f92d16e5..759e6bc8327b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -546,8 +546,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
static DEFINE_SEQLOCK(fastopen_seqlock);
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
- struct tcp_fastopen_cookie *cookie,
- int *syn_loss, unsigned long *last_syn_loss)
+ struct tcp_fastopen_cookie *cookie)
{
struct tcp_metrics_block *tm;
@@ -564,8 +563,6 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
*cookie = tfom->cookie;
if (cookie->len <= 0 && tfom->try_exp == 1)
cookie->exp = true;
- *syn_loss = tfom->syn_loss;
- *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
} while (read_seqretry(&fastopen_seqlock, seq));
}
rcu_read_unlock();
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b079b619b60c..a8384b0c11f8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -316,9 +316,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
*/
local_bh_disable();
inet_twsk_schedule(tw, timeo);
- /* Linkage updates. */
- __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
- inet_twsk_put(tw);
+ /* Linkage updates.
+ * Note that access to tw after this point is illegal.
+ */
+ inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
local_bh_enable();
} else {
/* Sorry, if we're out of memory, just CLOSE this
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c7b506..04be9f833927 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2414,15 +2414,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
/* Schedule a loss probe in 2*RTT for SACK capable connections
- * in Open state, that are either limited by cwnd or application.
+ * not in loss recovery, that are either limited by cwnd or application.
*/
if ((early_retrans != 3 && early_retrans != 4) ||
!tp->packets_out || !tcp_is_sack(tp) ||
- icsk->icsk_ca_state != TCP_CA_Open)
- return false;
-
- if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
- !tcp_write_queue_empty(sk))
+ (icsk->icsk_ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_CWR))
return false;
/* Probe timeout is 2*rtt. Add minimum RTO to account
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 968fda198376..6db3124cdbda 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -183,11 +183,6 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits) {
dst_negative_advice(sk);
- if (tp->syn_fastopen || tp->syn_data)
- tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
- if (tp->syn_data && icsk->icsk_retransmits == 1)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
} else if (!tp->syn_data && !tp->syn_fastopen) {
sk_rethink_txhash(sk);
}
@@ -195,17 +190,6 @@ static int tcp_write_timeout(struct sock *sk)
expired = icsk->icsk_retransmits >= retry_until;
} else {
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
- /* Some middle-boxes may black-hole Fast Open _after_
- * the handshake. Therefore we conservatively disable
- * Fast Open on this path on recurring timeouts after
- * successful Fast Open.
- */
- if (tp->syn_data_acked) {
- tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
- if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
- }
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -228,6 +212,7 @@ static int tcp_write_timeout(struct sock *sk)
expired = retransmits_timed_out(sk, retry_until,
icsk->icsk_user_timeout);
}
+ tcp_fastopen_active_detect_blackhole(sk, expired);
if (expired) {
/* Has it gone just too far? */
tcp_write_err(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e4ff25c947c5..e9c0d1e1772e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -357,18 +357,12 @@ fail:
}
EXPORT_SYMBOL(udp_lib_get_port);
-static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
- unsigned int port)
-{
- return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
-}
-
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
unsigned int hash2_nulladdr =
- udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
+ ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
unsigned int hash2_partial =
- udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+ ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
/* precompute partial secondary hash */
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -445,7 +439,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
struct sk_buff *skb)
{
struct sock *sk, *result;
- int score, badness, matches = 0, reuseport = 0;
+ int score, badness;
u32 hash = 0;
result = NULL;
@@ -454,23 +448,16 @@ static struct sock *udp4_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
- reuseport = sk->sk_reuseport;
- if (reuseport) {
+ if (sk->sk_reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
if (result)
return result;
- matches = 1;
}
badness = score;
result = sk;
- } else if (score == badness && reuseport) {
- matches++;
- if (reciprocal_scale(hash, matches) == 0)
- result = sk;
- hash = next_pseudo_random32(hash);
}
}
return result;
@@ -488,11 +475,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
bool exact_dif = udp_lib_exact_dif_match(net, skb);
- int score, badness, matches = 0, reuseport = 0;
+ int score, badness;
u32 hash = 0;
if (hslot->count > 10) {
- hash2 = udp4_portaddr_hash(net, daddr, hnum);
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
@@ -503,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
- hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
/* avoid searching the same slot again. */
if (unlikely(slot2 == old_slot2))
@@ -526,23 +513,16 @@ begin:
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
- reuseport = sk->sk_reuseport;
- if (reuseport) {
+ if (sk->sk_reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
if (result)
return result;
- matches = 1;
}
result = sk;
badness = score;
- } else if (score == badness && reuseport) {
- matches++;
- if (reciprocal_scale(hash, matches) == 0)
- result = sk;
- hash = next_pseudo_random32(hash);
}
}
return result;
@@ -1775,7 +1755,7 @@ EXPORT_SYMBOL(udp_lib_rehash);
static void udp_v4_rehash(struct sock *sk)
{
- u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+ u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
inet_sk(sk)->inet_rcv_saddr,
inet_sk(sk)->inet_num);
udp_lib_rehash(sk, new_hash);
@@ -1966,9 +1946,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct sk_buff *nskb;
if (use_hash2) {
- hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+ hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
udptable->mask;
- hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -2200,7 +2180,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+ unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index e6265e2c274e..7d885a44dc9d 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -62,7 +62,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
- top_iph->ttl = ip4_dst_hoplimit(dst->child);
+ top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
top_iph->saddr = x->props.saddr.a4;
top_iph->daddr = x->id.daddr.a4;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f49bd7897e95..ed06b1190f05 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6595,27 +6595,45 @@ int __init addrconf_init(void)
rtnl_af_register(&inet6_ops);
- err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo,
- 0);
+ err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK,
+ NULL, inet6_dump_ifinfo, 0);
if (err < 0)
goto errout;
- /* Only the first call to __rtnl_register can fail */
- __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0);
- __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0);
- __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr,
- inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED);
- __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL,
- inet6_dump_ifmcaddr, 0);
- __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
- inet6_dump_ifacaddr, 0);
- __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf,
- inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED);
-
- ipv6_addr_label_rtnl_register();
+ err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR,
+ inet6_rtm_newaddr, NULL, 0);
+ if (err < 0)
+ goto errout;
+ err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR,
+ inet6_rtm_deladdr, NULL, 0);
+ if (err < 0)
+ goto errout;
+ err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR,
+ inet6_rtm_getaddr, inet6_dump_ifaddr,
+ RTNL_FLAG_DOIT_UNLOCKED);
+ if (err < 0)
+ goto errout;
+ err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST,
+ NULL, inet6_dump_ifmcaddr, 0);
+ if (err < 0)
+ goto errout;
+ err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST,
+ NULL, inet6_dump_ifacaddr, 0);
+ if (err < 0)
+ goto errout;
+ err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF,
+ inet6_netconf_get_devconf,
+ inet6_netconf_dump_devconf,
+ RTNL_FLAG_DOIT_UNLOCKED);
+ if (err < 0)
+ goto errout;
+ err = ipv6_addr_label_rtnl_register();
+ if (err < 0)
+ goto errout;
return 0;
errout:
+ rtnl_unregister_all(PF_INET6);
rtnl_af_unregister(&inet6_ops);
unregister_netdevice_notifier(&ipv6_dev_notf);
errlo:
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 00e1f8ee08f8..1d6ced37ad71 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -547,13 +547,22 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
return err;
}
-void __init ipv6_addr_label_rtnl_register(void)
+int __init ipv6_addr_label_rtnl_register(void)
{
- __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel,
- NULL, RTNL_FLAG_DOIT_UNLOCKED);
- __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel,
- NULL, RTNL_FLAG_DOIT_UNLOCKED);
- __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get,
- ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
-}
+ int ret;
+ ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL,
+ ip6addrlbl_newdel,
+ NULL, RTNL_FLAG_DOIT_UNLOCKED);
+ if (ret < 0)
+ return ret;
+ ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL,
+ ip6addrlbl_newdel,
+ NULL, RTNL_FLAG_DOIT_UNLOCKED);
+ if (ret < 0)
+ return ret;
+ ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL,
+ ip6addrlbl_get,
+ ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
+ return ret;
+}
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 6eb5e68f112a..44c39c5f0638 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -512,9 +512,7 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
struct ila_map *ila;
int ret;
- ret = rhashtable_walk_start(rhiter);
- if (ret && ret != -EAGAIN)
- goto done;
+ rhashtable_walk_start(rhiter);
for (;;) {
ila = rhashtable_walk_next(rhiter);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b01858f5deb1..2febe26de6a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
}
/* called with rcu_read_lock() */
+static struct sock *inet6_lhash2_lookup(struct net *net,
+ struct inet_listen_hashbucket *ilb2,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ const __be16 sport, const struct in6_addr *daddr,
+ const unsigned short hnum, const int dif, const int sdif)
+{
+ bool exact_dif = inet6_exact_dif_match(net, skb);
+ struct inet_connection_sock *icsk;
+ struct sock *sk, *result = NULL;
+ int score, hiscore = 0;
+ u32 phash = 0;
+
+ inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+ sk = (struct sock *)icsk;
+ score = compute_score(sk, net, hnum, daddr, dif, sdif,
+ exact_dif);
+ if (score > hiscore) {
+ if (sk->sk_reuseport) {
+ phash = inet6_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ result = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (result)
+ return result;
+ }
+ result = sk;
+ hiscore = score;
+ }
+ }
+
+ return result;
+}
+
struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -134,31 +168,56 @@ struct sock *inet6_lookup_listener(struct net *net,
{
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore = 0, matches = 0, reuseport = 0;
bool exact_dif = inet6_exact_dif_match(net, skb);
+ struct inet_listen_hashbucket *ilb2;
struct sock *sk, *result = NULL;
+ int score, hiscore = 0;
+ unsigned int hash2;
u32 phash = 0;
+ if (ilb->count <= 10 || !hashinfo->lhash2)
+ goto port_lookup;
+
+ /* Too many sk in the ilb bucket (which is hashed by port alone).
+ * Try lhash2 (which is hashed by port and addr) instead.
+ */
+
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ if (result)
+ return result;
+
+ /* Lookup lhash2 with in6addr_any */
+
+ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ return inet6_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+
+port_lookup:
sk_for_each(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
if (score > hiscore) {
- reuseport = sk->sk_reuseport;
- if (reuseport) {
+ if (sk->sk_reuseport) {
phash = inet6_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
return result;
- matches = 1;
}
result = sk;
hiscore = score;
- } else if (score == hiscore && reuseport) {
- matches++;
- if (reciprocal_scale(phash, matches) == 0)
- result = sk;
- phash = next_pseudo_random32(phash);
}
}
return result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f5285f4e1d08..a64d559fa513 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -893,7 +893,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
ins = &fn->leaf;
for (iter = leaf; iter;
- iter = rcu_dereference_protected(iter->dst.rt6_next,
+ iter = rcu_dereference_protected(iter->rt6_next,
lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
/*
* Search for duplicates
@@ -950,7 +950,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
break;
next_iter:
- ins = &iter->dst.rt6_next;
+ ins = &iter->rt6_next;
}
if (fallback_ins && !found) {
@@ -979,7 +979,7 @@ next_iter:
&sibling->rt6i_siblings);
break;
}
- sibling = rcu_dereference_protected(sibling->dst.rt6_next,
+ sibling = rcu_dereference_protected(sibling->rt6_next,
lockdep_is_held(&rt->rt6i_table->tb6_lock));
}
/* For each sibling in the list, increment the counter of
@@ -1009,7 +1009,7 @@ add:
if (err)
return err;
- rcu_assign_pointer(rt->dst.rt6_next, iter);
+ rcu_assign_pointer(rt->rt6_next, iter);
atomic_inc(&rt->rt6i_ref);
rcu_assign_pointer(rt->rt6i_node, fn);
rcu_assign_pointer(*ins, rt);
@@ -1040,7 +1040,7 @@ add:
atomic_inc(&rt->rt6i_ref);
rcu_assign_pointer(rt->rt6i_node, fn);
- rt->dst.rt6_next = iter->dst.rt6_next;
+ rt->rt6_next = iter->rt6_next;
rcu_assign_pointer(*ins, rt);
call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
rt, extack);
@@ -1059,14 +1059,14 @@ add:
if (nsiblings) {
/* Replacing an ECMP route, remove all siblings */
- ins = &rt->dst.rt6_next;
+ ins = &rt->rt6_next;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->rt6i_table->tb6_lock));
while (iter) {
if (iter->rt6i_metric > rt->rt6i_metric)
break;
if (rt6_qualify_for_ecmp(iter)) {
- *ins = iter->dst.rt6_next;
+ *ins = iter->rt6_next;
iter->rt6i_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter)
@@ -1075,7 +1075,7 @@ add:
nsiblings--;
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
} else {
- ins = &iter->dst.rt6_next;
+ ins = &iter->rt6_next;
}
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->rt6i_table->tb6_lock));
@@ -1644,7 +1644,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
/* Unlink it */
- *rtp = rt->dst.rt6_next;
+ *rtp = rt->rt6_next;
rt->rt6i_node = NULL;
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
@@ -1672,7 +1672,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
FOR_WALKERS(net, w) {
if (w->state == FWS_C && w->leaf == rt) {
RT6_TRACE("walker %p adjusted by delroute\n", w);
- w->leaf = rcu_dereference_protected(rt->dst.rt6_next,
+ w->leaf = rcu_dereference_protected(rt->rt6_next,
lockdep_is_held(&table->tb6_lock));
if (!w->leaf)
w->state = FWS_U;
@@ -1731,7 +1731,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
fib6_del_route(table, fn, rtp, info);
return 0;
}
- rtp_next = &cur->dst.rt6_next;
+ rtp_next = &cur->rt6_next;
}
return -ENOENT;
}
@@ -2142,8 +2142,8 @@ int __init fib6_init(void)
if (ret)
goto out_kmem_cache_create;
- ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
- 0);
+ ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
+ inet6_dump_fib, 0);
if (ret)
goto out_unregister_subsys;
@@ -2208,7 +2208,7 @@ static int ipv6_route_yield(struct fib6_walker *w)
do {
iter->w.leaf = rcu_dereference_protected(
- iter->w.leaf->dst.rt6_next,
+ iter->w.leaf->rt6_next,
lockdep_is_held(&iter->tbl->tb6_lock));
iter->skip--;
if (!iter->skip && iter->w.leaf)
@@ -2274,7 +2274,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (!v)
goto iter_table;
- n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next);
+ n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next);
if (n) {
++*pos;
return n;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4cfd8e0696fe..5c9c65f1d5c2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -55,6 +55,8 @@
#include <net/ip6_route.h>
#include <net/ip6_tunnel.h>
#include <net/gre.h>
+#include <net/erspan.h>
+#include <net/dst_metadata.h>
static bool log_ecn_error = true;
@@ -68,11 +70,13 @@ static unsigned int ip6gre_net_id __read_mostly;
struct ip6gre_net {
struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
+ struct ip6_tnl __rcu *collect_md_tun;
struct net_device *fb_tunnel_dev;
};
static struct rtnl_link_ops ip6gre_link_ops __read_mostly;
static struct rtnl_link_ops ip6gre_tap_ops __read_mostly;
+static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly;
static int ip6gre_tunnel_init(struct net_device *dev);
static void ip6gre_tunnel_setup(struct net_device *dev);
static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t);
@@ -121,7 +125,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
unsigned int h1 = HASH_KEY(key);
struct ip6_tnl *t, *cand = NULL;
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
- int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
+ int dev_type = (gre_proto == htons(ETH_P_TEB) ||
+ gre_proto == htons(ETH_P_ERSPAN)) ?
ARPHRD_ETHER : ARPHRD_IP6GRE;
int score, cand_score = 4;
@@ -226,6 +231,10 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
if (cand)
return cand;
+ t = rcu_dereference(ign->collect_md_tun);
+ if (t && t->dev->flags & IFF_UP)
+ return t;
+
dev = ign->fb_tunnel_dev;
if (dev->flags & IFF_UP)
return netdev_priv(dev);
@@ -261,6 +270,9 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t)
{
struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t);
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ign->collect_md_tun, t);
+
rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
}
@@ -270,6 +282,9 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t)
struct ip6_tnl __rcu **tp;
struct ip6_tnl *iter;
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ign->collect_md_tun, NULL);
+
for (tp = ip6gre_bucket(ign, t);
(iter = rtnl_dereference(*tp)) != NULL;
tp = &iter->next) {
@@ -460,7 +475,108 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
&ipv6h->saddr, &ipv6h->daddr, tpi->key,
tpi->proto);
if (tunnel) {
- ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+ if (tunnel->parms.collect_md) {
+ struct metadata_dst *tun_dst;
+ __be64 tun_id;
+ __be16 flags;
+
+ flags = tpi->flags;
+ tun_id = key32_to_tunnel_id(tpi->key);
+
+ tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0);
+ if (!tun_dst)
+ return PACKET_REJECT;
+
+ ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+ } else {
+ ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+ }
+
+ return PACKET_RCVD;
+ }
+
+ return PACKET_REJECT;
+}
+
+static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
+ struct tnl_ptk_info *tpi)
+{
+ struct erspan_base_hdr *ershdr;
+ struct erspan_metadata *pkt_md;
+ const struct ipv6hdr *ipv6h;
+ struct ip6_tnl *tunnel;
+ u8 ver;
+
+ ipv6h = ipv6_hdr(skb);
+ ershdr = (struct erspan_base_hdr *)skb->data;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr))))
+ return PACKET_REJECT;
+
+ ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
+ tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
+ pkt_md = (struct erspan_metadata *)(ershdr + 1);
+
+ tunnel = ip6gre_tunnel_lookup(skb->dev,
+ &ipv6h->saddr, &ipv6h->daddr, tpi->key,
+ tpi->proto);
+ if (tunnel) {
+ int len = erspan_hdr_len(ver);
+
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
+ if (__iptunnel_pull_header(skb, len,
+ htons(ETH_P_TEB),
+ false, false) < 0)
+ return PACKET_REJECT;
+
+ if (tunnel->parms.collect_md) {
+ struct metadata_dst *tun_dst;
+ struct ip_tunnel_info *info;
+ struct erspan_metadata *md;
+ __be64 tun_id;
+ __be16 flags;
+
+ tpi->flags |= TUNNEL_KEY;
+ flags = tpi->flags;
+ tun_id = key32_to_tunnel_id(tpi->key);
+
+ tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id,
+ sizeof(*md));
+ if (!tun_dst)
+ return PACKET_REJECT;
+
+ info = &tun_dst->u.tun_info;
+ md = ip_tunnel_info_opts(info);
+ if (!md)
+ return PACKET_REJECT;
+
+ memcpy(md, pkt_md, sizeof(*md));
+ md->version = ver;
+ info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ info->options_len = sizeof(*md);
+
+ ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+
+ } else {
+ tunnel->parms.erspan_ver = ver;
+
+ if (ver == 1) {
+ tunnel->parms.index = ntohl(pkt_md->u.index);
+ } else {
+ u16 md2_flags;
+ u16 dir, hwid;
+
+ md2_flags = ntohs(pkt_md->u.md2.flags);
+ dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+ hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+ tunnel->parms.dir = dir;
+ tunnel->parms.hwid = hwid;
+ }
+
+ ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+ }
return PACKET_RCVD;
}
@@ -481,6 +597,13 @@ static int gre_rcv(struct sk_buff *skb)
if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
goto drop;
+ if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+ tpi.proto == htons(ETH_P_ERSPAN2))) {
+ if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD)
+ return 0;
+ goto drop;
+ }
+
if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD)
return 0;
@@ -496,6 +619,78 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum)
csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
+static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi6 *fl6, __u8 *dsfield,
+ int *encap_limit)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ *encap_limit = t->parms.encap_limit;
+
+ memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ *dsfield = ipv4_get_dsfield(iph);
+ else
+ *dsfield = ip6_tclass(t->parms.flowinfo);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6->flowi6_mark = skb->mark;
+ else
+ fl6->flowi6_mark = t->parms.fwmark;
+
+ fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+}
+
+static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi6 *fl6, __u8 *dsfield,
+ int *encap_limit)
+{
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct ip6_tnl *t = netdev_priv(dev);
+ __u16 offset;
+
+ offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+
+ tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ *encap_limit = tel->encap_limit - 1;
+ } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+ *encap_limit = t->parms.encap_limit;
+ }
+
+ memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ *dsfield = ipv6_get_dsfield(ipv6h);
+ else
+ *dsfield = ip6_tclass(t->parms.flowinfo);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6->flowlabel |= ip6_flowlabel(ipv6h);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6->flowi6_mark = skb->mark;
+ else
+ fl6->flowi6_mark = t->parms.fwmark;
+
+ fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+ return 0;
+}
+
static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
struct net_device *dev, __u8 dsfield,
struct flowi6 *fl6, int encap_limit,
@@ -517,8 +712,38 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
/* Push GRE header. */
protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
- gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
- protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno));
+
+ if (tunnel->parms.collect_md) {
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ __be16 flags;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info ||
+ !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET6))
+ return -EINVAL;
+
+ key = &tun_info->key;
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->flowi6_proto = IPPROTO_GRE;
+ fl6->daddr = key->u.ipv6.dst;
+ fl6->flowlabel = key->label;
+ fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+ dsfield = key->tos;
+ flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ tunnel->tun_hlen = gre_calc_hlen(flags);
+
+ gre_build_header(skb, tunnel->tun_hlen,
+ flags, protocol,
+ tunnel_id_to_key32(tun_info->key.tun_id), 0);
+
+ } else {
+ gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+ protocol, tunnel->parms.o_key,
+ htonl(tunnel->o_seqno));
+ }
return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
NEXTHDR_GRE);
@@ -527,30 +752,17 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- const struct iphdr *iph = ip_hdr(skb);
int encap_limit = -1;
struct flowi6 fl6;
- __u8 dsfield;
+ __u8 dsfield = 0;
__u32 mtu;
int err;
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- encap_limit = t->parms.encap_limit;
-
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- dsfield = ipv4_get_dsfield(iph);
- else
- dsfield = ip6_tclass(t->parms.flowinfo);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
- fl6.flowi6_mark = skb->mark;
- else
- fl6.flowi6_mark = t->parms.fwmark;
-
- fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ if (!t->parms.collect_md)
+ prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+ &dsfield, &encap_limit);
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
@@ -574,46 +786,17 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
struct ip6_tnl *t = netdev_priv(dev);
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
int encap_limit = -1;
- __u16 offset;
struct flowi6 fl6;
- __u8 dsfield;
+ __u8 dsfield = 0;
__u32 mtu;
int err;
if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
return -1;
- offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
- /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
- ipv6h = ipv6_hdr(skb);
-
- if (offset > 0) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
- if (tel->encap_limit == 0) {
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_HDR_FIELD, offset + 2);
- return -1;
- }
- encap_limit = tel->encap_limit - 1;
- } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- encap_limit = t->parms.encap_limit;
-
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- dsfield = ipv6_get_dsfield(ipv6h);
- else
- dsfield = ip6_tclass(t->parms.flowinfo);
-
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
- fl6.flowlabel |= ip6_flowlabel(ipv6h);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
- fl6.flowi6_mark = skb->mark;
- else
- fl6.flowi6_mark = t->parms.fwmark;
-
- fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ if (!t->parms.collect_md &&
+ prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit))
+ return -1;
if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
return -1;
@@ -660,7 +843,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ if (!t->parms.collect_md)
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
@@ -705,6 +889,141 @@ tx_err:
return NETDEV_TX_OK;
}
+static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device_stats *stats;
+ bool truncate = false;
+ int encap_limit = -1;
+ __u8 dsfield = false;
+ struct flowi6 fl6;
+ int err = -EINVAL;
+ __u32 mtu;
+
+ if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
+ goto tx_err;
+
+ if (gre_handle_offloads(skb, false))
+ goto tx_err;
+
+ if (skb->len > dev->mtu + dev->hard_header_len) {
+ pskb_trim(skb, dev->mtu + dev->hard_header_len);
+ truncate = true;
+ }
+
+ t->parms.o_flags &= ~TUNNEL_KEY;
+ IPCB(skb)->flags = 0;
+
+ /* For collect_md mode, derive fl6 from the tunnel key,
+ * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}.
+ */
+ if (t->parms.collect_md) {
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ struct erspan_metadata *md;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info ||
+ !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET6))
+ return -EINVAL;
+
+ key = &tun_info->key;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_GRE;
+ fl6.daddr = key->u.ipv6.dst;
+ fl6.flowlabel = key->label;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+ dsfield = key->tos;
+ md = ip_tunnel_info_opts(tun_info);
+ if (!md)
+ goto tx_err;
+
+ if (md->version == 1) {
+ erspan_build_header(skb,
+ tunnel_id_to_key32(key->tun_id),
+ ntohl(md->u.index), truncate,
+ false);
+ } else if (md->version == 2) {
+ u16 md2_flags;
+ u16 dir, hwid;
+
+ md2_flags = ntohs(md->u.md2.flags);
+ dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+ hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+
+ erspan_build_header_v2(skb,
+ tunnel_id_to_key32(key->tun_id),
+ dir, hwid, truncate,
+ false);
+ }
+ } else {
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+ &dsfield, &encap_limit);
+ break;
+ case htons(ETH_P_IPV6):
+ if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
+ goto tx_err;
+ if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6,
+ &dsfield, &encap_limit))
+ goto tx_err;
+ break;
+ default:
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ break;
+ }
+
+ if (t->parms.erspan_ver == 1)
+ erspan_build_header(skb, t->parms.o_key,
+ t->parms.index,
+ truncate, false);
+ else
+ erspan_build_header_v2(skb, t->parms.o_key,
+ t->parms.dir,
+ t->parms.hwid,
+ truncate, false);
+ fl6.daddr = t->parms.raddr;
+ }
+
+ /* Push GRE header. */
+ gre_build_header(skb, 8, TUNNEL_SEQ,
+ htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++));
+
+ /* TooBig packet may have updated dst->dev's mtu */
+ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu)
+ dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu);
+
+ err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
+ NEXTHDR_GRE);
+ if (err != 0) {
+ /* XXX: send ICMP error even if DF is not set. */
+ if (err == -EMSGSIZE) {
+ if (skb->protocol == htons(ETH_P_IP))
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
+ else
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ }
+
+ goto tx_err;
+ }
+ return NETDEV_TX_OK;
+
+tx_err:
+ stats = &t->dev->stats;
+ stats->tx_errors++;
+ stats->tx_dropped++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
{
struct net_device *dev = t->dev;
@@ -1048,6 +1367,11 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
+ if (tunnel->parms.collect_md) {
+ dev->features |= NETIF_F_NETNS_LOCAL;
+ netif_keep_dst(dev);
+ }
+
return 0;
}
@@ -1062,6 +1386,9 @@ static int ip6gre_tunnel_init(struct net_device *dev)
tunnel = netdev_priv(dev);
+ if (tunnel->parms.collect_md)
+ return 0;
+
memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));
@@ -1084,7 +1411,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
dev_hold(dev);
}
-
static struct inet6_protocol ip6gre_protocol __read_mostly = {
.handler = gre_rcv,
.err_handler = ip6gre_err,
@@ -1099,7 +1425,8 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
for_each_netdev_safe(net, dev, aux)
if (dev->rtnl_link_ops == &ip6gre_link_ops ||
- dev->rtnl_link_ops == &ip6gre_tap_ops)
+ dev->rtnl_link_ops == &ip6gre_tap_ops ||
+ dev->rtnl_link_ops == &ip6erspan_tap_ops)
unregister_netdevice_queue(dev, head);
for (prio = 0; prio < 4; prio++) {
@@ -1221,6 +1548,70 @@ out:
return ip6gre_tunnel_validate(tb, data, extack);
}
+static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ __be16 flags = 0;
+ int ret, ver = 0;
+
+ if (!data)
+ return 0;
+
+ ret = ip6gre_tap_validate(tb, data, extack);
+ if (ret)
+ return ret;
+
+ /* ERSPAN should only have GRE sequence and key flag */
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (!data[IFLA_GRE_COLLECT_METADATA] &&
+ flags != (GRE_SEQ | GRE_KEY))
+ return -EINVAL;
+
+ /* ERSPAN Session ID only has 10-bit. Since we reuse
+ * 32-bit key field as ID, check it's range.
+ */
+ if (data[IFLA_GRE_IKEY] &&
+ (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+ return -EINVAL;
+
+ if (data[IFLA_GRE_OKEY] &&
+ (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+ return -EINVAL;
+
+ if (data[IFLA_GRE_ERSPAN_VER]) {
+ ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+ if (ver != 1 && ver != 2)
+ return -EINVAL;
+ }
+
+ if (ver == 1) {
+ if (data[IFLA_GRE_ERSPAN_INDEX]) {
+ u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+
+ if (index & ~INDEX_MASK)
+ return -EINVAL;
+ }
+ } else if (ver == 2) {
+ if (data[IFLA_GRE_ERSPAN_DIR]) {
+ u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+
+ if (dir & ~(DIR_MASK >> DIR_OFFSET))
+ return -EINVAL;
+ }
+
+ if (data[IFLA_GRE_ERSPAN_HWID]) {
+ u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+
+ if (hwid & ~(HWID_MASK >> HWID_OFFSET))
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
static void ip6gre_netlink_parms(struct nlattr *data[],
struct __ip6_tnl_parm *parms)
@@ -1267,6 +1658,22 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
if (data[IFLA_GRE_FWMARK])
parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+
+ if (data[IFLA_GRE_COLLECT_METADATA])
+ parms->collect_md = true;
+
+ if (data[IFLA_GRE_ERSPAN_VER])
+ parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+
+ if (parms->erspan_ver == 1) {
+ if (data[IFLA_GRE_ERSPAN_INDEX])
+ parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+ } else if (parms->erspan_ver == 2) {
+ if (data[IFLA_GRE_ERSPAN_DIR])
+ parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+ if (data[IFLA_GRE_ERSPAN_HWID])
+ parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+ }
}
static int ip6gre_tap_init(struct net_device *dev)
@@ -1303,6 +1710,59 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
NETIF_F_HIGHDMA | \
NETIF_F_HW_CSUM)
+static int ip6erspan_tap_init(struct net_device *dev)
+{
+ struct ip6_tnl *tunnel;
+ int t_hlen;
+ int ret;
+
+ tunnel = netdev_priv(dev);
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strcpy(tunnel->parms.name, dev->name);
+
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+ if (ret) {
+ free_percpu(dev->tstats);
+ dev->tstats = NULL;
+ return ret;
+ }
+
+ tunnel->tun_hlen = 8;
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+ erspan_hdr_len(tunnel->parms.erspan_ver);
+ t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+
+ dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ dev->mtu = ETH_DATA_LEN - t_hlen;
+ if (dev->type == ARPHRD_ETHER)
+ dev->mtu -= ETH_HLEN;
+ if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
+
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ tunnel = netdev_priv(dev);
+ ip6gre_tnl_link_config(tunnel, 1);
+
+ return 0;
+}
+
+static const struct net_device_ops ip6erspan_netdev_ops = {
+ .ndo_init = ip6erspan_tap_init,
+ .ndo_uninit = ip6gre_tunnel_uninit,
+ .ndo_start_xmit = ip6erspan_tunnel_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip6_tnl_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
static void ip6gre_tap_setup(struct net_device *dev)
{
@@ -1372,8 +1832,13 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
ip6gre_netlink_parms(data, &nt->parms);
- if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
- return -EEXIST;
+ if (nt->parms.collect_md) {
+ if (rtnl_dereference(ign->collect_md_tun))
+ return -EEXIST;
+ } else {
+ if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+ return -EEXIST;
+ }
if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
eth_hw_addr_random(dev);
@@ -1492,8 +1957,12 @@ static size_t ip6gre_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_GRE_ENCAP_DPORT */
nla_total_size(2) +
+ /* IFLA_GRE_COLLECT_METADATA */
+ nla_total_size(0) +
/* IFLA_GRE_FWMARK */
nla_total_size(4) +
+ /* IFLA_GRE_ERSPAN_INDEX */
+ nla_total_size(4) +
0;
}
@@ -1515,7 +1984,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) ||
nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) ||
- nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark))
+ nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) ||
+ nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
goto nla_put_failure;
if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
@@ -1528,6 +1998,24 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
t->encap.flags))
goto nla_put_failure;
+ if (p->collect_md) {
+ if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver))
+ goto nla_put_failure;
+
+ if (p->erspan_ver == 1) {
+ if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
+ goto nla_put_failure;
+ } else if (p->erspan_ver == 2) {
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid))
+ goto nla_put_failure;
+ }
+
return 0;
nla_put_failure:
@@ -1550,9 +2038,28 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
[IFLA_GRE_FWMARK] = { .type = NLA_U32 },
+ [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 },
};
+static void ip6erspan_tap_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->netdev_ops = &ip6erspan_netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6gre_dev_free;
+
+ dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netif_keep_dst(dev);
+}
+
static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
.kind = "ip6gre",
.maxtype = IFLA_GRE_MAX,
@@ -1582,6 +2089,20 @@ static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
.get_link_net = ip6_tnl_get_link_net,
};
+static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = {
+ .kind = "ip6erspan",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ip6gre_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = ip6erspan_tap_setup,
+ .validate = ip6erspan_tap_validate,
+ .newlink = ip6gre_newlink,
+ .changelink = ip6gre_changelink,
+ .get_size = ip6gre_get_size,
+ .fill_info = ip6gre_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
+
/*
* And now the modules code and kernel interface.
*/
@@ -1610,9 +2131,15 @@ static int __init ip6gre_init(void)
if (err < 0)
goto tap_ops_failed;
+ err = rtnl_link_register(&ip6erspan_tap_ops);
+ if (err < 0)
+ goto erspan_link_failed;
+
out:
return err;
+erspan_link_failed:
+ rtnl_link_unregister(&ip6gre_tap_ops);
tap_ops_failed:
rtnl_link_unregister(&ip6gre_link_ops);
rtnl_link_failed:
@@ -1626,6 +2153,7 @@ static void __exit ip6gre_fini(void)
{
rtnl_link_unregister(&ip6gre_tap_ops);
rtnl_link_unregister(&ip6gre_link_ops);
+ rtnl_link_unregister(&ip6erspan_tap_ops);
inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
unregister_pernet_device(&ip6gre_net_ops);
}
@@ -1637,4 +2165,5 @@ MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
MODULE_DESCRIPTION("GRE over IPv6 tunneling device");
MODULE_ALIAS_RTNL_LINK("ip6gre");
MODULE_ALIAS_RTNL_LINK("ip6gretap");
+MODULE_ALIAS_RTNL_LINK("ip6erspan");
MODULE_ALIAS_NETDEV("ip6gre0");
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5110a418cc4d..176d74fb3b4d 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1201,13 +1201,13 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
rt->dst.dev->mtu : dst_mtu(&rt->dst);
else
mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
- rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+ rt->dst.dev->mtu : dst_mtu(xfrm_dst_path(&rt->dst));
if (np->frag_size < mtu) {
if (np->frag_size)
mtu = np->frag_size;
}
cork->base.fragsize = mtu;
- if (dst_allfrag(rt->dst.path))
+ if (dst_allfrag(xfrm_dst_path(&rt->dst)))
cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index db84f523656d..6ff2f21ae3fc 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -861,7 +861,7 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
struct metadata_dst *tun_dst,
bool log_ecn_err)
{
- return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate,
+ return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate,
log_ecn_err);
}
EXPORT_SYMBOL(ip6_tnl_rcv);
@@ -979,6 +979,9 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
int ret = 0;
struct net *net = t->net;
+ if (t->parms.collect_md)
+ return 1;
+
if ((p->flags & IP6_TNL_F_CAP_XMIT) ||
((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
(ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a2e1a864eb46..890f9bda06a4 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1425,10 +1425,13 @@ int __init ip6_mr_init(void)
goto add_proto_fail;
}
#endif
- rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL,
- ip6mr_rtm_dumproute, 0);
- return 0;
+ err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE,
+ NULL, ip6mr_rtm_dumproute, 0);
+ if (err == 0)
+ return 0;
+
#ifdef CONFIG_IPV6_PIMSM_V2
+ inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
add_proto_fail:
unregister_netdevice_notifier(&ip6_mr_notifier);
#endif
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7a8d1500d374..b3f4d19b3ca5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -186,7 +186,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
- return dst_metrics_write_ptr(rt->dst.from);
+ return dst_metrics_write_ptr(&rt->from->dst);
}
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
@@ -391,7 +391,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *)dst;
struct rt6_exception_bucket *bucket;
- struct dst_entry *from = dst->from;
+ struct rt6_info *from = rt->from;
struct inet6_dev *idev;
dst_destroy_metrics_generic(dst);
@@ -409,8 +409,8 @@ static void ip6_dst_destroy(struct dst_entry *dst)
kfree(bucket);
}
- dst->from = NULL;
- dst_release(from);
+ rt->from = NULL;
+ dst_release(&from->dst);
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -443,9 +443,9 @@ static bool rt6_check_expired(const struct rt6_info *rt)
if (rt->rt6i_flags & RTF_EXPIRES) {
if (time_after(jiffies, rt->dst.expires))
return true;
- } else if (rt->dst.from) {
+ } else if (rt->from) {
return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
- rt6_check_expired((struct rt6_info *)rt->dst.from);
+ rt6_check_expired(rt->from);
}
return false;
}
@@ -502,7 +502,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
if (!oif && ipv6_addr_any(saddr))
goto out;
- for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
+ for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
struct net_device *dev = sprt->dst.dev;
if (oif) {
@@ -721,7 +721,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
match = NULL;
cont = NULL;
- for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
+ for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
if (rt->rt6i_metric != metric) {
cont = rt;
break;
@@ -731,7 +731,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
}
for (rt = leaf; rt && rt != rr_head;
- rt = rcu_dereference(rt->dst.rt6_next)) {
+ rt = rcu_dereference(rt->rt6_next)) {
if (rt->rt6i_metric != metric) {
cont = rt;
break;
@@ -743,7 +743,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
if (match || !cont)
return match;
- for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
+ for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
match = find_match(rt, oif, strict, &mpri, match, do_rr);
return match;
@@ -781,7 +781,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
&do_rr);
if (do_rr) {
- struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
+ struct rt6_info *next = rcu_dereference(rt0->rt6_next);
/* no entries matched; do round-robin */
if (!next || next->rt6i_metric != rt0->rt6i_metric)
@@ -1054,7 +1054,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
*/
if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
- ort = (struct rt6_info *)ort->dst.from;
+ ort = ort->from;
rcu_read_lock();
dev = ip6_rt_get_dev_rcu(ort);
@@ -1274,7 +1274,7 @@ static int rt6_insert_exception(struct rt6_info *nrt,
/* ort can't be a cache or pcpu route */
if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
- ort = (struct rt6_info *)ort->dst.from;
+ ort = ort->from;
WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
spin_lock_bh(&rt6_exception_lock);
@@ -1415,8 +1415,8 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
/* Remove the passed in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
- struct rt6_info *from = (struct rt6_info *)rt->dst.from;
struct rt6_exception_bucket *bucket;
+ struct rt6_info *from = rt->from;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
int err;
@@ -1460,8 +1460,8 @@ int rt6_remove_exception_rt(struct rt6_info *rt)
*/
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
- struct rt6_info *from = (struct rt6_info *)rt->dst.from;
struct rt6_exception_bucket *bucket;
+ struct rt6_info *from = rt->from;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
@@ -1929,9 +1929,9 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
- if (rt->dst.from &&
- dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
- dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
+ if (rt->from &&
+ dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
+ dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
}
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
@@ -1951,7 +1951,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
if (!__rt6_check_expired(rt) &&
rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
- rt6_check((struct rt6_info *)(rt->dst.from), cookie))
+ rt6_check(rt->from, cookie))
return &rt->dst;
else
return NULL;
@@ -1971,7 +1971,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
rt6_dst_from_metrics_check(rt);
if (rt->rt6i_flags & RTF_PCPU ||
- (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
+ (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
return rt6_dst_from_check(rt, cookie);
else
return rt6_check(rt, cookie);
@@ -3055,11 +3055,11 @@ out:
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
- BUG_ON(from->dst.from);
+ BUG_ON(from->from);
rt->rt6i_flags &= ~RTF_EXPIRES;
dst_hold(&from->dst);
- rt->dst.from = &from->dst;
+ rt->from = from;
dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
@@ -4596,8 +4596,6 @@ static int __net_init ip6_route_net_init(struct net *net)
GFP_KERNEL);
if (!net->ipv6.ip6_null_entry)
goto out_ip6_dst_entries;
- net->ipv6.ip6_null_entry->dst.path =
- (struct dst_entry *)net->ipv6.ip6_null_entry;
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
ip6_template_metrics, true);
@@ -4609,8 +4607,6 @@ static int __net_init ip6_route_net_init(struct net *net)
GFP_KERNEL);
if (!net->ipv6.ip6_prohibit_entry)
goto out_ip6_null_entry;
- net->ipv6.ip6_prohibit_entry->dst.path =
- (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
ip6_template_metrics, true);
@@ -4620,8 +4616,6 @@ static int __net_init ip6_route_net_init(struct net *net)
GFP_KERNEL);
if (!net->ipv6.ip6_blk_hole_entry)
goto out_ip6_prohibit_entry;
- net->ipv6.ip6_blk_hole_entry->dst.path =
- (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
@@ -4778,11 +4772,20 @@ int __init ip6_route_init(void)
if (ret)
goto fib6_rules_init;
- ret = -ENOBUFS;
- if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
- __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
- __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
- RTNL_FLAG_DOIT_UNLOCKED))
+ ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
+ inet6_rtm_newroute, NULL, 0);
+ if (ret < 0)
+ goto out_register_late_subsys;
+
+ ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
+ inet6_rtm_delroute, NULL, 0);
+ if (ret < 0)
+ goto out_register_late_subsys;
+
+ ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
+ inet6_rtm_getroute, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED);
+ if (ret < 0)
goto out_register_late_subsys;
ret = register_netdevice_notifier(&ip6_route_dev_notifier);
@@ -4800,6 +4803,7 @@ out:
return ret;
out_register_late_subsys:
+ rtnl_unregister_all(PF_INET6);
unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
fib6_rules_cleanup();
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index c81407770956..7f5621d09571 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -306,9 +306,7 @@ static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
struct seg6_hmac_info *hinfo;
int ret;
- ret = rhashtable_walk_start(iter);
- if (ret && ret != -EAGAIN)
- goto done;
+ rhashtable_walk_start(iter);
for (;;) {
hinfo = rhashtable_walk_next(iter);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3f30fa313bf2..eecf9f0faf29 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -89,28 +89,12 @@ static u32 udp6_ehashfn(const struct net *net,
udp_ipv6_hash_secret + net_hash_mix(net));
}
-static u32 udp6_portaddr_hash(const struct net *net,
- const struct in6_addr *addr6,
- unsigned int port)
-{
- unsigned int hash, mix = net_hash_mix(net);
-
- if (ipv6_addr_any(addr6))
- hash = jhash_1word(0, mix);
- else if (ipv6_addr_v4mapped(addr6))
- hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
- else
- hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
-
- return hash ^ port;
-}
-
int udp_v6_get_port(struct sock *sk, unsigned short snum)
{
unsigned int hash2_nulladdr =
- udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
+ ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
unsigned int hash2_partial =
- udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
+ ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
/* precompute partial secondary hash */
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -119,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
static void udp_v6_rehash(struct sock *sk)
{
- u16 new_hash = udp6_portaddr_hash(sock_net(sk),
+ u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
&sk->sk_v6_rcv_saddr,
inet_sk(sk)->inet_num);
@@ -184,7 +168,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
struct udp_hslot *hslot2, struct sk_buff *skb)
{
struct sock *sk, *result;
- int score, badness, matches = 0, reuseport = 0;
+ int score, badness;
u32 hash = 0;
result = NULL;
@@ -193,8 +177,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
- reuseport = sk->sk_reuseport;
- if (reuseport) {
+ if (sk->sk_reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
@@ -202,15 +185,9 @@ static struct sock *udp6_lib_lookup2(struct net *net,
sizeof(struct udphdr));
if (result)
return result;
- matches = 1;
}
result = sk;
badness = score;
- } else if (score == badness && reuseport) {
- matches++;
- if (reciprocal_scale(hash, matches) == 0)
- result = sk;
- hash = next_pseudo_random32(hash);
}
}
return result;
@@ -228,11 +205,11 @@ struct sock *__udp6_lib_lookup(struct net *net,
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
bool exact_dif = udp6_lib_exact_dif_match(net, skb);
- int score, badness, matches = 0, reuseport = 0;
+ int score, badness;
u32 hash = 0;
if (hslot->count > 10) {
- hash2 = udp6_portaddr_hash(net, daddr, hnum);
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
@@ -243,7 +220,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
- hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
+ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
slot2 = hash2 & udptable->mask;
/* avoid searching the same slot again. */
if (unlikely(slot2 == old_slot2))
@@ -267,23 +244,16 @@ begin:
score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
sdif, exact_dif);
if (score > badness) {
- reuseport = sk->sk_reuseport;
- if (reuseport) {
+ if (sk->sk_reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
if (result)
return result;
- matches = 1;
}
result = sk;
badness = score;
- } else if (score == badness && reuseport) {
- matches++;
- if (reciprocal_scale(hash, matches) == 0)
- result = sk;
- hash = next_pseudo_random32(hash);
}
}
return result;
@@ -719,9 +689,9 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct sk_buff *nskb;
if (use_hash2) {
- hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+ hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) &
udptable->mask;
- hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask;
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -909,7 +879,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
+ unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
index 02556e356f87..e66b94f46532 100644
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -59,7 +59,7 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
if (x->props.flags & XFRM_STATE_NOECN)
dsfield &= ~INET_ECN_MASK;
ipv6_change_dsfield(top_iph, 0, dsfield);
- top_iph->hop_limit = ip6_dst_hoplimit(dst->child);
+ top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
return 0;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 885ade234a49..09fb44ee3b45 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -265,7 +265,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
in6_dev_put(xdst->u.rt6.rt6i_idev);
xdst->u.rt6.rt6i_idev = loopback_idev;
in6_dev_hold(loopback_idev);
- xdst = (struct xfrm_dst *)xdst->u.dst.child;
+ xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst);
} while (xdst->u.dst.xfrm);
__in6_dev_put(loopback_idev);
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 86c8dfef56a4..a5125624a76d 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -257,9 +257,7 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx)
if (ret)
return NULL;
- ret = rhashtable_walk_start(&iter);
- if (ret && ret != -EAGAIN)
- goto err;
+ rhashtable_walk_start(&iter);
while ((mpath = rhashtable_walk_next(&iter))) {
if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -269,7 +267,6 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx)
if (i++ == idx)
break;
}
-err:
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
@@ -513,9 +510,7 @@ void mesh_plink_broken(struct sta_info *sta)
if (ret)
return;
- ret = rhashtable_walk_start(&iter);
- if (ret && ret != -EAGAIN)
- goto out;
+ rhashtable_walk_start(&iter);
while ((mpath = rhashtable_walk_next(&iter))) {
if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -535,7 +530,6 @@ void mesh_plink_broken(struct sta_info *sta)
WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast);
}
}
-out:
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
}
@@ -584,9 +578,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
if (ret)
return;
- ret = rhashtable_walk_start(&iter);
- if (ret && ret != -EAGAIN)
- goto out;
+ rhashtable_walk_start(&iter);
while ((mpath = rhashtable_walk_next(&iter))) {
if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -597,7 +589,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
if (rcu_access_pointer(mpath->next_hop) == sta)
__mesh_path_del(tbl, mpath);
}
-out:
+
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
}
@@ -614,9 +606,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
if (ret)
return;
- ret = rhashtable_walk_start(&iter);
- if (ret && ret != -EAGAIN)
- goto out;
+ rhashtable_walk_start(&iter);
while ((mpath = rhashtable_walk_next(&iter))) {
if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -627,7 +617,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
if (ether_addr_equal(mpath->mpp, proxy))
__mesh_path_del(tbl, mpath);
}
-out:
+
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
}
@@ -642,9 +632,7 @@ static void table_flush_by_iface(struct mesh_table *tbl)
if (ret)
return;
- ret = rhashtable_walk_start(&iter);
- if (ret && ret != -EAGAIN)
- goto out;
+ rhashtable_walk_start(&iter);
while ((mpath = rhashtable_walk_next(&iter))) {
if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -653,7 +641,7 @@ static void table_flush_by_iface(struct mesh_table *tbl)
break;
__mesh_path_del(tbl, mpath);
}
-out:
+
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
}
@@ -873,9 +861,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata,
if (ret)
return;
- ret = rhashtable_walk_start(&iter);
- if (ret && ret != -EAGAIN)
- goto out;
+ rhashtable_walk_start(&iter);
while ((mpath = rhashtable_walk_next(&iter))) {
if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -887,7 +873,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata,
time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
__mesh_path_del(tbl, mpath);
}
-out:
+
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 8ca9915befc8..5dce8336d33f 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2510,12 +2510,15 @@ static int __init mpls_init(void)
rtnl_af_register(&mpls_af_ops);
- rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0);
- rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0);
- rtnl_register(PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes,
- 0);
- rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
- mpls_netconf_dump_devconf, 0);
+ rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_NEWROUTE,
+ mpls_rtm_newroute, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_DELROUTE,
+ mpls_rtm_delroute, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETROUTE,
+ mpls_getroute, mpls_dump_routes, 0);
+ rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETNETCONF,
+ mpls_netconf_get_devconf,
+ mpls_netconf_dump_devconf, 0);
err = ipgre_tunnel_encap_add_mpls_ops();
if (err)
pr_err("Can't add mpls over gre tunnel ops\n");
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index f8166c1d5430..3f1624ee056f 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -251,11 +251,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
if (err)
return;
- err = rhashtable_walk_start(&hti);
- if (err && err != -EAGAIN) {
- iter->err = err;
- goto out;
- }
+ rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
@@ -306,9 +302,7 @@ static void nft_rhash_gc(struct work_struct *work)
if (err)
goto schedule;
- err = rhashtable_walk_start(&hti);
- if (err && err != -EAGAIN)
- goto out;
+ rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index 2b4ab189bba7..5639fb03bdd9 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -93,7 +93,8 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info,
if (dst->xfrm == NULL)
return -1;
- for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+ for (i = 0; dst && dst->xfrm;
+ dst = ((struct xfrm_dst *)dst)->child, i++) {
pos = strict ? i : 0;
if (pos >= info->len)
return 0;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 79cc1bf36e4a..972bfe113043 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -65,6 +65,7 @@
#include <linux/net_namespace.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>
@@ -145,8 +146,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
static BLOCKING_NOTIFIER_HEAD(netlink_chain);
-static DEFINE_SPINLOCK(netlink_tap_lock);
-static struct list_head netlink_tap_all __read_mostly;
static const struct rhashtable_params netlink_rhashtable_params;
@@ -173,14 +172,24 @@ static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
return new;
}
+static unsigned int netlink_tap_net_id;
+
+struct netlink_tap_net {
+ struct list_head netlink_tap_all;
+ struct mutex netlink_tap_lock;
+};
+
int netlink_add_tap(struct netlink_tap *nt)
{
+ struct net *net = dev_net(nt->dev);
+ struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
+
if (unlikely(nt->dev->type != ARPHRD_NETLINK))
return -EINVAL;
- spin_lock(&netlink_tap_lock);
- list_add_rcu(&nt->list, &netlink_tap_all);
- spin_unlock(&netlink_tap_lock);
+ mutex_lock(&nn->netlink_tap_lock);
+ list_add_rcu(&nt->list, &nn->netlink_tap_all);
+ mutex_unlock(&nn->netlink_tap_lock);
__module_get(nt->module);
@@ -190,12 +199,14 @@ EXPORT_SYMBOL_GPL(netlink_add_tap);
static int __netlink_remove_tap(struct netlink_tap *nt)
{
+ struct net *net = dev_net(nt->dev);
+ struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
bool found = false;
struct netlink_tap *tmp;
- spin_lock(&netlink_tap_lock);
+ mutex_lock(&nn->netlink_tap_lock);
- list_for_each_entry(tmp, &netlink_tap_all, list) {
+ list_for_each_entry(tmp, &nn->netlink_tap_all, list) {
if (nt == tmp) {
list_del_rcu(&nt->list);
found = true;
@@ -205,7 +216,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt)
pr_warn("__netlink_remove_tap: %p not found\n", nt);
out:
- spin_unlock(&netlink_tap_lock);
+ mutex_unlock(&nn->netlink_tap_lock);
if (found)
module_put(nt->module);
@@ -224,6 +235,26 @@ int netlink_remove_tap(struct netlink_tap *nt)
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);
+static __net_init int netlink_tap_init_net(struct net *net)
+{
+ struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
+
+ INIT_LIST_HEAD(&nn->netlink_tap_all);
+ mutex_init(&nn->netlink_tap_lock);
+ return 0;
+}
+
+static void __net_exit netlink_tap_exit_net(struct net *net)
+{
+}
+
+static struct pernet_operations netlink_tap_net_ops = {
+ .init = netlink_tap_init_net,
+ .exit = netlink_tap_exit_net,
+ .id = &netlink_tap_net_id,
+ .size = sizeof(struct netlink_tap_net),
+};
+
static bool netlink_filter_tap(const struct sk_buff *skb)
{
struct sock *sk = skb->sk;
@@ -277,7 +308,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
return ret;
}
-static void __netlink_deliver_tap(struct sk_buff *skb)
+static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn)
{
int ret;
struct netlink_tap *tmp;
@@ -285,19 +316,21 @@ static void __netlink_deliver_tap(struct sk_buff *skb)
if (!netlink_filter_tap(skb))
return;
- list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
+ list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
ret = __netlink_deliver_tap_skb(skb, tmp->dev);
if (unlikely(ret))
break;
}
}
-static void netlink_deliver_tap(struct sk_buff *skb)
+static void netlink_deliver_tap(struct net *net, struct sk_buff *skb)
{
+ struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
+
rcu_read_lock();
- if (unlikely(!list_empty(&netlink_tap_all)))
- __netlink_deliver_tap(skb);
+ if (unlikely(!list_empty(&nn->netlink_tap_all)))
+ __netlink_deliver_tap(skb, nn);
rcu_read_unlock();
}
@@ -306,7 +339,7 @@ static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
struct sk_buff *skb)
{
if (!(netlink_is_kernel(dst) && netlink_is_kernel(src)))
- netlink_deliver_tap(skb);
+ netlink_deliver_tap(sock_net(dst), skb);
}
static void netlink_overrun(struct sock *sk)
@@ -1216,7 +1249,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
int len = skb->len;
- netlink_deliver_tap(skb);
+ netlink_deliver_tap(sock_net(sk), skb);
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk);
@@ -2481,8 +2514,9 @@ static int netlink_walk_start(struct nl_seq_iter *iter)
return err;
}
- err = rhashtable_walk_start(&iter->hti);
- return err == -EAGAIN ? 0 : err;
+ rhashtable_walk_start(&iter->hti);
+
+ return 0;
}
static void netlink_walk_stop(struct nl_seq_iter *iter)
@@ -2733,12 +2767,11 @@ static int __init netlink_proto_init(void)
}
}
- INIT_LIST_HEAD(&netlink_tap_all);
-
netlink_add_usersock_entry();
sock_register(&netlink_family_ops);
register_pernet_subsys(&netlink_net_ops);
+ register_pernet_subsys(&netlink_tap_net_ops);
/* The netlink device handler may be needed early. */
rtnetlink_init();
out:
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 8faa20b4d457..7dda33b9b784 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -115,11 +115,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
if (!s_num)
rhashtable_walk_enter(&tbl->hash, hti);
- ret = rhashtable_walk_start(hti);
- if (ret == -EAGAIN)
- ret = 0;
- if (ret)
- goto stop;
+ rhashtable_walk_start(hti);
while ((nlsk = rhashtable_walk_next(hti))) {
if (IS_ERR(nlsk)) {
@@ -146,8 +142,8 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
}
}
-stop:
rhashtable_walk_stop(hti);
+
if (ret)
goto done;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index dbe2379329c5..76d050aba7a4 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -56,12 +56,12 @@
u64 ovs_flow_used_time(unsigned long flow_jiffies)
{
- struct timespec cur_ts;
+ struct timespec64 cur_ts;
u64 cur_ms, idle_ms;
- ktime_get_ts(&cur_ts);
+ ktime_get_ts64(&cur_ts);
idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
- cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
+ cur_ms = (u64)(u32)cur_ts.tv_sec * MSEC_PER_SEC +
cur_ts.tv_nsec / NSEC_PER_MSEC;
return cur_ms - idle_ms;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 624ea74353dd..bce1f78b0de5 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -644,12 +644,12 @@ static int erspan_tun_opt_from_nlattr(const struct nlattr *attr,
BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
memset(&opts, 0, sizeof(opts));
- opts.index = nla_get_be32(attr);
+ opts.u.index = nla_get_be32(attr);
/* Index has only 20-bit */
- if (ntohl(opts.index) & ~INDEX_MASK) {
+ if (ntohl(opts.u.index) & ~INDEX_MASK) {
OVS_NLERR(log, "ERSPAN index number %x too large.",
- ntohl(opts.index));
+ ntohl(opts.u.index));
return -EINVAL;
}
@@ -907,7 +907,7 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
return -EMSGSIZE;
else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
- ((struct erspan_metadata *)tun_opts)->index))
+ ((struct erspan_metadata *)tun_opts)->u.index))
return -EMSGSIZE;
}
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 04a3128adcf0..3e7747549f90 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -126,18 +126,12 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
}
}
-static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
-{
- dev->needed_headroom = new_hr < 0 ? 0 : new_hr;
-}
-
static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_get_stats64 = internal_get_stats,
- .ndo_set_rx_headroom = internal_set_rx_headroom,
};
static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -154,7 +148,7 @@ static void do_setup(struct net_device *netdev)
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
- IFF_PHONY_HEADROOM | IFF_NO_QUEUE;
+ IFF_NO_QUEUE;
netdev->needs_free_netdev = true;
netdev->priv_destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
@@ -195,7 +189,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
err = -ENOMEM;
goto error_free_netdev;
}
- vport->dev->needed_headroom = vport->dp->max_headroom;
dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
internal_dev = internal_dev_priv(vport->dev);
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index da754fc926e7..871eaf2cb85e 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -299,16 +299,21 @@ out:
int __init phonet_netlink_register(void)
{
- int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit,
- NULL, 0);
+ int err = rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWADDR,
+ addr_doit, NULL, 0);
if (err)
return err;
- /* Further __rtnl_register() cannot fail */
- __rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0);
- __rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0);
- __rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0);
- __rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0);
- __rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, 0);
+ /* Further rtnl_register_module() cannot fail */
+ rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELADDR,
+ addr_doit, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETADDR,
+ NULL, getaddr_dumpit, 0);
+ rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWROUTE,
+ route_doit, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELROUTE,
+ route_doit, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETROUTE,
+ NULL, route_dumpit, 0);
return 0;
}
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 77ab05e23001..5fb3929e3d7d 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1116,9 +1116,13 @@ static int __init qrtr_proto_init(void)
return rc;
}
- rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0);
+ rc = rtnl_register_module(THIS_MODULE, PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0);
+ if (rc) {
+ sock_unregister(qrtr_family.family);
+ proto_unregister(&qrtr_proto);
+ }
- return 0;
+ return rc;
}
postcore_initcall(qrtr_proto_init);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7ee2d5d68b78..6492c0b608a4 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -230,8 +230,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
conn, &laddr, &faddr,
- trans->t_name ? trans->t_name : "[unknown]",
- is_outgoing ? "(outgoing)" : "");
+ strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name :
+ "[unknown]", is_outgoing ? "(outgoing)" : "");
/*
* Since we ran without holding the conn lock, someone could
@@ -366,6 +366,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
* to the conn hash, so we never trigger a reconnect on this
* conn - the reconnect is always triggered by the active peer. */
cancel_delayed_work_sync(&cp->cp_conn_w);
+ if (conn->c_destroy_in_prog)
+ return;
rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock();
@@ -445,7 +447,6 @@ void rds_conn_destroy(struct rds_connection *conn)
*/
rds_cong_remove_conn(conn);
- put_net(conn->c_net);
kfree(conn->c_path);
kmem_cache_free(rds_conn_slab, conn);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c349c71babff..d09f6c1facb4 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -150,7 +150,7 @@ struct rds_connection {
/* Protocol version */
unsigned int c_version;
- struct net *c_net;
+ possible_net_t c_net;
struct list_head c_map_item;
unsigned long c_map_queued;
@@ -165,13 +165,13 @@ struct rds_connection {
static inline
struct net *rds_conn_net(struct rds_connection *conn)
{
- return conn->c_net;
+ return read_pnet(&conn->c_net);
}
static inline
void rds_conn_net_set(struct rds_connection *conn, struct net *net)
{
- conn->c_net = get_net(net);
+ write_pnet(&conn->c_net, net);
}
#define RDS_FLAG_CONG_BITMAP 0x01
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 6b7ee71f40c6..39f502d47969 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -306,7 +306,8 @@ static void rds_tcp_conn_free(void *arg)
rdsdebug("freeing tc %p\n", tc);
spin_lock_irqsave(&rds_tcp_conn_lock, flags);
- list_del(&tc->t_tcp_node);
+ if (!tc->t_tcp_node_detached)
+ list_del(&tc->t_tcp_node);
spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
kmem_cache_free(rds_tcp_conn_slab, tc);
@@ -495,27 +496,6 @@ static struct pernet_operations rds_tcp_net_ops = {
.size = sizeof(struct rds_tcp_net),
};
-/* explicitly send a RST on each socket, thereby releasing any socket refcnts
- * that may otherwise hold up netns deletion.
- */
-static void rds_tcp_conn_paths_destroy(struct rds_connection *conn)
-{
- struct rds_conn_path *cp;
- struct rds_tcp_connection *tc;
- int i;
- struct sock *sk;
-
- for (i = 0; i < RDS_MPATH_WORKERS; i++) {
- cp = &conn->c_path[i];
- tc = cp->cp_transport_data;
- if (!tc->t_sock)
- continue;
- sk = tc->t_sock->sk;
- sk->sk_prot->disconnect(sk, 0);
- tcp_done(sk);
- }
-}
-
static void rds_tcp_kill_sock(struct net *net)
{
struct rds_tcp_connection *tc, *_tc;
@@ -527,18 +507,20 @@ static void rds_tcp_kill_sock(struct net *net)
rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
- struct net *c_net = tc->t_cpath->cp_conn->c_net;
+ struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
if (net != c_net || !tc->t_sock)
continue;
- if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn))
+ if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) {
list_move_tail(&tc->t_tcp_node, &tmp_list);
+ } else {
+ list_del(&tc->t_tcp_node);
+ tc->t_tcp_node_detached = true;
+ }
}
spin_unlock_irq(&rds_tcp_conn_lock);
- list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
- rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn);
+ list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
rds_conn_destroy(tc->t_cpath->cp_conn);
- }
}
void *rds_tcp_listen_sock_def_readable(struct net *net)
@@ -586,7 +568,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
- struct net *c_net = tc->t_cpath->cp_conn->c_net;
+ struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
if (net != c_net || !tc->t_sock)
continue;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 1aafbf7c3011..e7858ee8ed8b 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -12,6 +12,7 @@ struct rds_tcp_incoming {
struct rds_tcp_connection {
struct list_head t_tcp_node;
+ bool t_tcp_node_detached;
struct rds_conn_path *t_cpath;
/* t_conn_path_lock synchronizes the connection establishment between
* rds_tcp_accept_one and rds_tcp_conn_path_connect
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4d33a50a8a6d..52622a3d2517 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -99,7 +99,7 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
p->tcfa_refcnt--;
if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
if (p->ops->cleanup)
- p->ops->cleanup(p, bind);
+ p->ops->cleanup(p);
tcf_idr_remove(p->idrinfo, p);
ret = ACT_P_DELETED;
}
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 5ef8ce8c83d4..b3f2c15affa7 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -357,7 +357,7 @@ out:
return ret;
}
-static void tcf_bpf_cleanup(struct tc_action *act, int bind)
+static void tcf_bpf_cleanup(struct tc_action *act)
{
struct tcf_bpf_cfg tmp;
@@ -401,16 +401,14 @@ static __net_init int bpf_init_net(struct net *net)
return tc_action_net_init(tn, &act_bpf_ops);
}
-static void __net_exit bpf_exit_net(struct net *net)
+static void __net_exit bpf_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, bpf_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, bpf_net_id);
}
static struct pernet_operations bpf_net_ops = {
.init = bpf_init_net,
- .exit = bpf_exit_net,
+ .exit_batch = bpf_exit_net,
.id = &bpf_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 10b7a8855a6c..2b15ba84e0c8 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -209,16 +209,14 @@ static __net_init int connmark_init_net(struct net *net)
return tc_action_net_init(tn, &act_connmark_ops);
}
-static void __net_exit connmark_exit_net(struct net *net)
+static void __net_exit connmark_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, connmark_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, connmark_net_id);
}
static struct pernet_operations connmark_net_ops = {
.init = connmark_init_net,
- .exit = connmark_exit_net,
+ .exit_batch = connmark_exit_net,
.id = &connmark_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index d836f998117b..af4b8ec60d9a 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -635,16 +635,14 @@ static __net_init int csum_init_net(struct net *net)
return tc_action_net_init(tn, &act_csum_ops);
}
-static void __net_exit csum_exit_net(struct net *net)
+static void __net_exit csum_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, csum_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, csum_net_id);
}
static struct pernet_operations csum_net_ops = {
.init = csum_init_net,
- .exit = csum_exit_net,
+ .exit_batch = csum_exit_net,
.id = &csum_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e29a48ef7fc3..9d632e92cad0 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -235,16 +235,14 @@ static __net_init int gact_init_net(struct net *net)
return tc_action_net_init(tn, &act_gact_ops);
}
-static void __net_exit gact_exit_net(struct net *net)
+static void __net_exit gact_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, gact_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, gact_net_id);
}
static struct pernet_operations gact_net_ops = {
.init = gact_init_net,
- .exit = gact_exit_net,
+ .exit_batch = gact_exit_net,
.id = &gact_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 3007cb1310ea..5954e992685a 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -387,7 +387,7 @@ out_nlmsg_trim:
}
/* under ife->tcf_lock */
-static void _tcf_ife_cleanup(struct tc_action *a, int bind)
+static void _tcf_ife_cleanup(struct tc_action *a)
{
struct tcf_ife_info *ife = to_ife(a);
struct tcf_meta_info *e, *n;
@@ -405,13 +405,13 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind)
}
}
-static void tcf_ife_cleanup(struct tc_action *a, int bind)
+static void tcf_ife_cleanup(struct tc_action *a)
{
struct tcf_ife_info *ife = to_ife(a);
struct tcf_ife_params *p;
spin_lock_bh(&ife->tcf_lock);
- _tcf_ife_cleanup(a, bind);
+ _tcf_ife_cleanup(a);
spin_unlock_bh(&ife->tcf_lock);
p = rcu_dereference_protected(ife->params, 1);
@@ -546,7 +546,7 @@ metadata_parse_err:
if (exists)
tcf_idr_release(*a, bind);
if (ret == ACT_P_CREATED)
- _tcf_ife_cleanup(*a, bind);
+ _tcf_ife_cleanup(*a);
if (exists)
spin_unlock_bh(&ife->tcf_lock);
@@ -567,7 +567,7 @@ metadata_parse_err:
err = use_all_metadata(ife);
if (err) {
if (ret == ACT_P_CREATED)
- _tcf_ife_cleanup(*a, bind);
+ _tcf_ife_cleanup(*a);
if (exists)
spin_unlock_bh(&ife->tcf_lock);
@@ -858,16 +858,14 @@ static __net_init int ife_init_net(struct net *net)
return tc_action_net_init(tn, &act_ife_ops);
}
-static void __net_exit ife_exit_net(struct net *net)
+static void __net_exit ife_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, ife_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, ife_net_id);
}
static struct pernet_operations ife_net_ops = {
.init = ife_init_net,
- .exit = ife_exit_net,
+ .exit_batch = ife_exit_net,
.id = &ife_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d9e399a7e3d5..06e380ae0928 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -77,7 +77,7 @@ static void ipt_destroy_target(struct xt_entry_target *t)
module_put(par.target->me);
}
-static void tcf_ipt_release(struct tc_action *a, int bind)
+static void tcf_ipt_release(struct tc_action *a)
{
struct tcf_ipt *ipt = to_ipt(a);
ipt_destroy_target(ipt->tcfi_t);
@@ -337,16 +337,14 @@ static __net_init int ipt_init_net(struct net *net)
return tc_action_net_init(tn, &act_ipt_ops);
}
-static void __net_exit ipt_exit_net(struct net *net)
+static void __net_exit ipt_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, ipt_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, ipt_net_id);
}
static struct pernet_operations ipt_net_ops = {
.init = ipt_init_net,
- .exit = ipt_exit_net,
+ .exit_batch = ipt_exit_net,
.id = &ipt_net_id,
.size = sizeof(struct tc_action_net),
};
@@ -387,16 +385,14 @@ static __net_init int xt_init_net(struct net *net)
return tc_action_net_init(tn, &act_xt_ops);
}
-static void __net_exit xt_exit_net(struct net *net)
+static void __net_exit xt_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, xt_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, xt_net_id);
}
static struct pernet_operations xt_net_ops = {
.init = xt_init_net,
- .exit = xt_exit_net,
+ .exit_batch = xt_exit_net,
.id = &xt_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8b3e59388480..37e5e4decbd6 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -29,7 +29,6 @@
#include <net/tc_act/tc_mirred.h>
static LIST_HEAD(mirred_list);
-static DEFINE_SPINLOCK(mirred_list_lock);
static bool tcf_mirred_is_act_redirect(int action)
{
@@ -50,18 +49,15 @@ static bool tcf_mirred_act_wants_ingress(int action)
}
}
-static void tcf_mirred_release(struct tc_action *a, int bind)
+static void tcf_mirred_release(struct tc_action *a)
{
struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
- /* We could be called either in a RCU callback or with RTNL lock held. */
- spin_lock_bh(&mirred_list_lock);
list_del(&m->tcfm_list);
- dev = rcu_dereference_protected(m->tcfm_dev, 1);
+ dev = rtnl_dereference(m->tcfm_dev);
if (dev)
dev_put(dev);
- spin_unlock_bh(&mirred_list_lock);
}
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -139,8 +135,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
m->tcf_action = parm->action;
m->tcfm_eaction = parm->eaction;
if (dev != NULL) {
- m->tcfm_ifindex = parm->ifindex;
- m->net = net;
if (ret != ACT_P_CREATED)
dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
dev_hold(dev);
@@ -149,9 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
}
if (ret == ACT_P_CREATED) {
- spin_lock_bh(&mirred_list_lock);
list_add(&m->tcfm_list, &mirred_list);
- spin_unlock_bh(&mirred_list_lock);
tcf_idr_insert(tn, *a);
}
@@ -247,13 +239,14 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_mirred *m = to_mirred(a);
+ struct net_device *dev = rtnl_dereference(m->tcfm_dev);
struct tc_mirred opt = {
.index = m->tcf_index,
.action = m->tcf_action,
.refcnt = m->tcf_refcnt - ref,
.bindcnt = m->tcf_bindcnt - bind,
.eaction = m->tcfm_eaction,
- .ifindex = m->tcfm_ifindex,
+ .ifindex = dev ? dev->ifindex : 0,
};
struct tcf_t t;
@@ -294,7 +287,6 @@ static int mirred_device_event(struct notifier_block *unused,
ASSERT_RTNL();
if (event == NETDEV_UNREGISTER) {
- spin_lock_bh(&mirred_list_lock);
list_for_each_entry(m, &mirred_list, tcfm_list) {
if (rcu_access_pointer(m->tcfm_dev) == dev) {
dev_put(dev);
@@ -304,7 +296,6 @@ static int mirred_device_event(struct notifier_block *unused,
RCU_INIT_POINTER(m->tcfm_dev, NULL);
}
}
- spin_unlock_bh(&mirred_list_lock);
}
return NOTIFY_DONE;
@@ -318,7 +309,7 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
{
struct tcf_mirred *m = to_mirred(a);
- return __dev_get_by_index(m->net, m->tcfm_ifindex);
+ return rtnl_dereference(m->tcfm_dev);
}
static struct tc_action_ops act_mirred_ops = {
@@ -343,16 +334,14 @@ static __net_init int mirred_init_net(struct net *net)
return tc_action_net_init(tn, &act_mirred_ops);
}
-static void __net_exit mirred_exit_net(struct net *net)
+static void __net_exit mirred_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, mirred_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, mirred_net_id);
}
static struct pernet_operations mirred_net_ops = {
.init = mirred_init_net,
- .exit = mirred_exit_net,
+ .exit_batch = mirred_exit_net,
.id = &mirred_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index c365d01b99c8..98c6a4b2f523 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -310,16 +310,14 @@ static __net_init int nat_init_net(struct net *net)
return tc_action_net_init(tn, &act_nat_ops);
}
-static void __net_exit nat_exit_net(struct net *net)
+static void __net_exit nat_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, nat_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, nat_net_id);
}
static struct pernet_operations nat_net_ops = {
.init = nat_init_net,
- .exit = nat_exit_net,
+ .exit_batch = nat_exit_net,
.id = &nat_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 491fe5deb09e..349beaffb29e 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -216,7 +216,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
return ret;
}
-static void tcf_pedit_cleanup(struct tc_action *a, int bind)
+static void tcf_pedit_cleanup(struct tc_action *a)
{
struct tcf_pedit *p = to_pedit(a);
struct tc_pedit_key *keys = p->tcfp_keys;
@@ -453,16 +453,14 @@ static __net_init int pedit_init_net(struct net *net)
return tc_action_net_init(tn, &act_pedit_ops);
}
-static void __net_exit pedit_exit_net(struct net *net)
+static void __net_exit pedit_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, pedit_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, pedit_net_id);
}
static struct pernet_operations pedit_net_ops = {
.init = pedit_init_net,
- .exit = pedit_exit_net,
+ .exit_batch = pedit_exit_net,
.id = &pedit_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 3bb2ebf9e9ae..bf483db993a1 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -334,16 +334,14 @@ static __net_init int police_init_net(struct net *net)
return tc_action_net_init(tn, &act_police_ops);
}
-static void __net_exit police_exit_net(struct net *net)
+static void __net_exit police_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, police_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, police_net_id);
}
static struct pernet_operations police_net_ops = {
.init = police_init_net,
- .exit = police_exit_net,
+ .exit_batch = police_exit_net,
.id = &police_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 9438969290a6..1ba0df238756 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -96,7 +96,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
return ret;
}
-static void tcf_sample_cleanup(struct tc_action *a, int bind)
+static void tcf_sample_cleanup(struct tc_action *a)
{
struct tcf_sample *s = to_sample(a);
struct psample_group *psample_group;
@@ -236,16 +236,14 @@ static __net_init int sample_init_net(struct net *net)
return tc_action_net_init(tn, &act_sample_ops);
}
-static void __net_exit sample_exit_net(struct net *net)
+static void __net_exit sample_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, sample_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, sample_net_id);
}
static struct pernet_operations sample_net_ops = {
.init = sample_init_net,
- .exit = sample_exit_net,
+ .exit_batch = sample_exit_net,
.id = &sample_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index e7b57e5071a3..425eac11f6da 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -47,7 +47,7 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
return d->tcf_action;
}
-static void tcf_simp_release(struct tc_action *a, int bind)
+static void tcf_simp_release(struct tc_action *a)
{
struct tcf_defact *d = to_defact(a);
kfree(d->tcfd_defdata);
@@ -204,16 +204,14 @@ static __net_init int simp_init_net(struct net *net)
return tc_action_net_init(tn, &act_simp_ops);
}
-static void __net_exit simp_exit_net(struct net *net)
+static void __net_exit simp_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, simp_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, simp_net_id);
}
static struct pernet_operations simp_net_ops = {
.init = simp_init_net,
- .exit = simp_exit_net,
+ .exit_batch = simp_exit_net,
.id = &simp_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 59949d61f20d..5a3f691bb545 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -241,16 +241,14 @@ static __net_init int skbedit_init_net(struct net *net)
return tc_action_net_init(tn, &act_skbedit_ops);
}
-static void __net_exit skbedit_exit_net(struct net *net)
+static void __net_exit skbedit_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, skbedit_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, skbedit_net_id);
}
static struct pernet_operations skbedit_net_ops = {
.init = skbedit_init_net,
- .exit = skbedit_exit_net,
+ .exit_batch = skbedit_exit_net,
.id = &skbedit_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index b642ad3d39dd..fa975262dbac 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -184,7 +184,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
return ret;
}
-static void tcf_skbmod_cleanup(struct tc_action *a, int bind)
+static void tcf_skbmod_cleanup(struct tc_action *a)
{
struct tcf_skbmod *d = to_skbmod(a);
struct tcf_skbmod_params *p;
@@ -266,16 +266,14 @@ static __net_init int skbmod_init_net(struct net *net)
return tc_action_net_init(tn, &act_skbmod_ops);
}
-static void __net_exit skbmod_exit_net(struct net *net)
+static void __net_exit skbmod_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, skbmod_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, skbmod_net_id);
}
static struct pernet_operations skbmod_net_ops = {
.init = skbmod_init_net,
- .exit = skbmod_exit_net,
+ .exit_batch = skbmod_exit_net,
.id = &skbmod_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 30c96274c638..0e23aac09ad6 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -201,7 +201,7 @@ err_out:
return ret;
}
-static void tunnel_key_release(struct tc_action *a, int bind)
+static void tunnel_key_release(struct tc_action *a)
{
struct tcf_tunnel_key *t = to_tunnel_key(a);
struct tcf_tunnel_key_params *params;
@@ -325,16 +325,14 @@ static __net_init int tunnel_key_init_net(struct net *net)
return tc_action_net_init(tn, &act_tunnel_key_ops);
}
-static void __net_exit tunnel_key_exit_net(struct net *net)
+static void __net_exit tunnel_key_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, tunnel_key_net_id);
}
static struct pernet_operations tunnel_key_net_ops = {
.init = tunnel_key_init_net,
- .exit = tunnel_key_exit_net,
+ .exit_batch = tunnel_key_exit_net,
.id = &tunnel_key_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 97f717a13ad5..e1a1b3f3983a 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -219,7 +219,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
return ret;
}
-static void tcf_vlan_cleanup(struct tc_action *a, int bind)
+static void tcf_vlan_cleanup(struct tc_action *a)
{
struct tcf_vlan *v = to_vlan(a);
struct tcf_vlan_params *p;
@@ -301,16 +301,14 @@ static __net_init int vlan_init_net(struct net *net)
return tc_action_net_init(tn, &act_vlan_ops);
}
-static void __net_exit vlan_exit_net(struct net *net)
+static void __net_exit vlan_exit_net(struct list_head *net_list)
{
- struct tc_action_net *tn = net_generic(net, vlan_net_id);
-
- tc_action_net_exit(tn);
+ tc_action_net_exit(net_list, vlan_net_id);
}
static struct pernet_operations vlan_net_ops = {
.init = vlan_init_net,
- .exit = vlan_exit_net,
+ .exit_batch = vlan_exit_net,
.id = &vlan_net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b91ea03e3afa..32b1ea7cf863 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -217,8 +217,12 @@ static void tcf_chain_flush(struct tcf_chain *chain)
static void tcf_chain_destroy(struct tcf_chain *chain)
{
+ struct tcf_block *block = chain->block;
+
list_del(&chain->list);
kfree(chain);
+ if (list_empty(&block->chain_list))
+ kfree(block);
}
static void tcf_chain_hold(struct tcf_chain *chain)
@@ -329,49 +333,34 @@ int tcf_block_get(struct tcf_block **p_block,
}
EXPORT_SYMBOL(tcf_block_get);
-static void tcf_block_put_final(struct work_struct *work)
-{
- struct tcf_block *block = container_of(work, struct tcf_block, work);
- struct tcf_chain *chain, *tmp;
-
- rtnl_lock();
-
- /* At this point, all the chains should have refcnt == 1. */
- list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
- tcf_chain_put(chain);
- rtnl_unlock();
- kfree(block);
-}
-
/* XXX: Standalone actions are not allowed to jump to any chain, and bound
* actions should be all removed after flushing.
*/
void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
struct tcf_block_ext_info *ei)
{
- struct tcf_chain *chain;
+ struct tcf_chain *chain, *tmp;
- if (!block)
- return;
- /* Hold a refcnt for all chains, except 0, so that they don't disappear
+ /* Hold a refcnt for all chains, so that they don't disappear
* while we are iterating.
*/
+ if (!block)
+ return;
list_for_each_entry(chain, &block->chain_list, list)
- if (chain->index)
- tcf_chain_hold(chain);
+ tcf_chain_hold(chain);
list_for_each_entry(chain, &block->chain_list, list)
tcf_chain_flush(chain);
tcf_block_offload_unbind(block, q, ei);
- INIT_WORK(&block->work, tcf_block_put_final);
- /* Wait for existing RCU callbacks to cool down, make sure their works
- * have been queued before this. We can not flush pending works here
- * because we are holding the RTNL lock.
- */
- rcu_barrier();
- tcf_queue_work(&block->work);
+ /* At this point, all the chains should have refcnt >= 1. */
+ list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+ tcf_chain_put(chain);
+
+ /* Finally, put chain 0 and allow block to be freed. */
+ chain = list_first_entry(&block->chain_list, struct tcf_chain, list);
+ tcf_chain_put(chain);
}
EXPORT_SYMBOL(tcf_block_put_ext);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 543a3e875d05..6132a7317efa 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -166,6 +166,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
* so do it rather here.
*/
skb_key.basic.n_proto = skb->protocol;
+ skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key);
skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 0f1eab99ff4e..74c22b4e365e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -799,7 +799,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
goto nla_put_failure;
if (q->ops->dump && q->ops->dump(q, skb) < 0)
goto nla_put_failure;
- qlen = q->q.qlen;
+
+ qlen = qdisc_qlen_sum(q);
stab = rtnl_dereference(q->stab);
if (stab && qdisc_dump_stab(skb, stab) < 0)
@@ -956,6 +957,11 @@ skip:
} else {
const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+ /* Only support running class lockless if parent is lockless */
+ if (new && (new->flags & TCQ_F_NOLOCK) &&
+ parent && !(parent->flags & TCQ_F_NOLOCK))
+ new->flags &= ~TCQ_F_NOLOCK;
+
err = -EOPNOTSUPP;
if (cops && cops->graft) {
unsigned long cl = cops->find(parent, classid);
@@ -1022,7 +1028,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
#endif
err = -ENOENT;
- if (ops == NULL)
+ if (!ops)
goto err_out;
sch = qdisc_alloc(dev_queue, ops);
@@ -1062,54 +1068,60 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
}
- if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
- if (qdisc_is_percpu_stats(sch)) {
- sch->cpu_bstats =
- netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
- if (!sch->cpu_bstats)
- goto err_out4;
+ if (ops->init) {
+ err = ops->init(sch, tca[TCA_OPTIONS]);
+ if (err != 0)
+ goto err_out5;
+ }
- sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
- if (!sch->cpu_qstats)
- goto err_out4;
- }
+ if (qdisc_is_percpu_stats(sch)) {
+ sch->cpu_bstats =
+ netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!sch->cpu_bstats)
+ goto err_out4;
- if (tca[TCA_STAB]) {
- stab = qdisc_get_stab(tca[TCA_STAB]);
- if (IS_ERR(stab)) {
- err = PTR_ERR(stab);
- goto err_out4;
- }
- rcu_assign_pointer(sch->stab, stab);
- }
- if (tca[TCA_RATE]) {
- seqcount_t *running;
-
- err = -EOPNOTSUPP;
- if (sch->flags & TCQ_F_MQROOT)
- goto err_out4;
+ sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+ if (!sch->cpu_qstats)
+ goto err_out4;
+ }
- if ((sch->parent != TC_H_ROOT) &&
- !(sch->flags & TCQ_F_INGRESS) &&
- (!p || !(p->flags & TCQ_F_MQROOT)))
- running = qdisc_root_sleeping_running(sch);
- else
- running = &sch->running;
-
- err = gen_new_estimator(&sch->bstats,
- sch->cpu_bstats,
- &sch->rate_est,
- NULL,
- running,
- tca[TCA_RATE]);
- if (err)
- goto err_out4;
+ if (tca[TCA_STAB]) {
+ stab = qdisc_get_stab(tca[TCA_STAB]);
+ if (IS_ERR(stab)) {
+ err = PTR_ERR(stab);
+ goto err_out4;
}
+ rcu_assign_pointer(sch->stab, stab);
+ }
+ if (tca[TCA_RATE]) {
+ seqcount_t *running;
- qdisc_hash_add(sch, false);
+ err = -EOPNOTSUPP;
+ if (sch->flags & TCQ_F_MQROOT)
+ goto err_out4;
- return sch;
+ if (sch->parent != TC_H_ROOT &&
+ !(sch->flags & TCQ_F_INGRESS) &&
+ (!p || !(p->flags & TCQ_F_MQROOT)))
+ running = qdisc_root_sleeping_running(sch);
+ else
+ running = &sch->running;
+
+ err = gen_new_estimator(&sch->bstats,
+ sch->cpu_bstats,
+ &sch->rate_est,
+ NULL,
+ running,
+ tca[TCA_RATE]);
+ if (err)
+ goto err_out4;
}
+
+ qdisc_hash_add(sch, false);
+
+ return sch;
+
+err_out5:
/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
if (ops->destroy)
ops->destroy(sch);
@@ -1141,7 +1153,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
int err = 0;
if (tca[TCA_OPTIONS]) {
- if (sch->ops->change == NULL)
+ if (!sch->ops->change)
return -EINVAL;
err = sch->ops->change(sch, tca[TCA_OPTIONS]);
if (err)
@@ -1346,7 +1358,8 @@ replay:
goto create_n_graft;
if (n->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
- if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+ if (tca[TCA_KIND] &&
+ nla_strcmp(tca[TCA_KIND], q->ops->id))
return -EINVAL;
if (q == p ||
(p && check_loop(q, p, 0)))
@@ -1391,7 +1404,7 @@ replay:
}
/* Change qdisc parameters */
- if (q == NULL)
+ if (!q)
return -ENOENT;
if (n->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cd1b200acae7..981c08fe810b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -26,6 +26,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
+#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
@@ -47,9 +48,70 @@ EXPORT_SYMBOL(default_qdisc_ops);
* - updates to tree and tree walking are only done under the rtnl mutex.
*/
-static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
+{
+ const struct netdev_queue *txq = q->dev_queue;
+ spinlock_t *lock = NULL;
+ struct sk_buff *skb;
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ lock = qdisc_lock(q);
+ spin_lock(lock);
+ }
+
+ skb = skb_peek(&q->skb_bad_txq);
+ if (skb) {
+ /* check the reason of requeuing without tx lock first */
+ txq = skb_get_tx_queue(txq->dev, skb);
+ if (!netif_xmit_frozen_or_stopped(txq)) {
+ skb = __skb_dequeue(&q->skb_bad_txq);
+ if (qdisc_is_percpu_stats(q)) {
+ qdisc_qstats_cpu_backlog_dec(q, skb);
+ qdisc_qstats_cpu_qlen_dec(q);
+ } else {
+ qdisc_qstats_backlog_dec(q, skb);
+ q->q.qlen--;
+ }
+ } else {
+ skb = NULL;
+ }
+ }
+
+ if (lock)
+ spin_unlock(lock);
+
+ return skb;
+}
+
+static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
+{
+ struct sk_buff *skb = skb_peek(&q->skb_bad_txq);
+
+ if (unlikely(skb))
+ skb = __skb_dequeue_bad_txq(q);
+
+ return skb;
+}
+
+static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
+ struct sk_buff *skb)
{
- q->gso_skb = skb;
+ spinlock_t *lock = NULL;
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ lock = qdisc_lock(q);
+ spin_lock(lock);
+ }
+
+ __skb_queue_tail(&q->skb_bad_txq, skb);
+
+ if (lock)
+ spin_unlock(lock);
+}
+
+static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+ __skb_queue_head(&q->gso_skb, skb);
q->qstats.requeues++;
qdisc_qstats_backlog_inc(q, skb);
q->q.qlen++; /* it's still part of the queue */
@@ -58,6 +120,30 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
return 0;
}
+static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
+{
+ spinlock_t *lock = qdisc_lock(q);
+
+ spin_lock(lock);
+ __skb_queue_tail(&q->gso_skb, skb);
+ spin_unlock(lock);
+
+ qdisc_qstats_cpu_requeues_inc(q);
+ qdisc_qstats_cpu_backlog_inc(q, skb);
+ qdisc_qstats_cpu_qlen_inc(q);
+ __netif_schedule(q);
+
+ return 0;
+}
+
+static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+ if (q->flags & TCQ_F_NOLOCK)
+ return dev_requeue_skb_locked(skb, q);
+ else
+ return __dev_requeue_skb(skb, q);
+}
+
static void try_bulk_dequeue_skb(struct Qdisc *q,
struct sk_buff *skb,
const struct netdev_queue *txq,
@@ -95,9 +181,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
if (!nskb)
break;
if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
- q->skb_bad_txq = nskb;
- qdisc_qstats_backlog_inc(q, nskb);
- q->q.qlen++;
+ qdisc_enqueue_skb_bad_txq(q, nskb);
+
+ if (qdisc_is_percpu_stats(q)) {
+ qdisc_qstats_cpu_backlog_inc(q, nskb);
+ qdisc_qstats_cpu_qlen_inc(q);
+ } else {
+ qdisc_qstats_backlog_inc(q, nskb);
+ q->q.qlen++;
+ }
break;
}
skb->next = nskb;
@@ -113,40 +205,60 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
int *packets)
{
- struct sk_buff *skb = q->gso_skb;
const struct netdev_queue *txq = q->dev_queue;
+ struct sk_buff *skb = NULL;
*packets = 1;
- if (unlikely(skb)) {
+ if (unlikely(!skb_queue_empty(&q->gso_skb))) {
+ spinlock_t *lock = NULL;
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ lock = qdisc_lock(q);
+ spin_lock(lock);
+ }
+
+ skb = skb_peek(&q->gso_skb);
+
+ /* skb may be null if another cpu pulls gso_skb off in between
+ * empty check and lock.
+ */
+ if (!skb) {
+ if (lock)
+ spin_unlock(lock);
+ goto validate;
+ }
+
/* skb in gso_skb were already validated */
*validate = false;
/* check the reason of requeuing without tx lock first */
txq = skb_get_tx_queue(txq->dev, skb);
if (!netif_xmit_frozen_or_stopped(txq)) {
- q->gso_skb = NULL;
- qdisc_qstats_backlog_dec(q, skb);
- q->q.qlen--;
- } else
+ skb = __skb_dequeue(&q->gso_skb);
+ if (qdisc_is_percpu_stats(q)) {
+ qdisc_qstats_cpu_backlog_dec(q, skb);
+ qdisc_qstats_cpu_qlen_dec(q);
+ } else {
+ qdisc_qstats_backlog_dec(q, skb);
+ q->q.qlen--;
+ }
+ } else {
skb = NULL;
- goto trace;
- }
- *validate = true;
- skb = q->skb_bad_txq;
- if (unlikely(skb)) {
- /* check the reason of requeuing without tx lock first */
- txq = skb_get_tx_queue(txq->dev, skb);
- if (!netif_xmit_frozen_or_stopped(txq)) {
- q->skb_bad_txq = NULL;
- qdisc_qstats_backlog_dec(q, skb);
- q->q.qlen--;
- goto bulk;
}
- skb = NULL;
+ if (lock)
+ spin_unlock(lock);
goto trace;
}
- if (!(q->flags & TCQ_F_ONETXQUEUE) ||
- !netif_xmit_frozen_or_stopped(txq))
- skb = q->dequeue(q);
+validate:
+ *validate = true;
+
+ if ((q->flags & TCQ_F_ONETXQUEUE) &&
+ netif_xmit_frozen_or_stopped(txq))
+ return skb;
+
+ skb = qdisc_dequeue_skb_bad_txq(q);
+ if (unlikely(skb))
+ goto bulk;
+ skb = q->dequeue(q);
if (skb) {
bulk:
if (qdisc_may_bulk(q))
@@ -165,17 +277,18 @@ trace:
* only one CPU can execute this function.
*
* Returns to the caller:
- * 0 - queue is empty or throttled.
- * >0 - queue is not empty.
+ * false - hardware queue frozen backoff
+ * true - feel free to send more pkts
*/
-int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
- struct net_device *dev, struct netdev_queue *txq,
- spinlock_t *root_lock, bool validate)
+bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+ struct net_device *dev, struct netdev_queue *txq,
+ spinlock_t *root_lock, bool validate)
{
int ret = NETDEV_TX_BUSY;
/* And release qdisc */
- spin_unlock(root_lock);
+ if (root_lock)
+ spin_unlock(root_lock);
/* Note that we validate skb (GSO, checksum, ...) outside of locks */
if (validate)
@@ -188,27 +301,28 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
HARD_TX_UNLOCK(dev, txq);
} else {
- spin_lock(root_lock);
- return qdisc_qlen(q);
+ if (root_lock)
+ spin_lock(root_lock);
+ return true;
}
- spin_lock(root_lock);
- if (dev_xmit_complete(ret)) {
- /* Driver sent out skb successfully or skb was consumed */
- ret = qdisc_qlen(q);
- } else {
+ if (root_lock)
+ spin_lock(root_lock);
+
+ if (!dev_xmit_complete(ret)) {
/* Driver returned NETDEV_TX_BUSY - requeue skb */
if (unlikely(ret != NETDEV_TX_BUSY))
net_warn_ratelimited("BUG %s code %d qlen %d\n",
dev->name, ret, q->q.qlen);
- ret = dev_requeue_skb(skb, q);
+ dev_requeue_skb(skb, q);
+ return false;
}
if (ret && netif_xmit_frozen_or_stopped(txq))
- ret = 0;
+ return false;
- return ret;
+ return true;
}
/*
@@ -230,20 +344,22 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
* >0 - queue is not empty.
*
*/
-static inline int qdisc_restart(struct Qdisc *q, int *packets)
+static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
+ spinlock_t *root_lock = NULL;
struct netdev_queue *txq;
struct net_device *dev;
- spinlock_t *root_lock;
struct sk_buff *skb;
bool validate;
/* Dequeue packet */
skb = dequeue_skb(q, &validate, packets);
if (unlikely(!skb))
- return 0;
+ return false;
+
+ if (!(q->flags & TCQ_F_NOLOCK))
+ root_lock = qdisc_lock(q);
- root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = skb_get_tx_queue(dev, skb);
@@ -267,8 +383,6 @@ void __qdisc_run(struct Qdisc *q)
break;
}
}
-
- qdisc_run_end(q);
}
unsigned long dev_trans_start(struct net_device *dev)
@@ -468,93 +582,93 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
/*
* Private data for a pfifo_fast scheduler containing:
- * - queues for the three band
- * - bitmap indicating which of the bands contain skbs
+ * - rings for priority bands
*/
struct pfifo_fast_priv {
- u32 bitmap;
- struct qdisc_skb_head q[PFIFO_FAST_BANDS];
+ struct skb_array q[PFIFO_FAST_BANDS];
};
-/*
- * Convert a bitmap to the first band number where an skb is queued, where:
- * bitmap=0 means there are no skbs on any band.
- * bitmap=1 means there is an skb on band 0.
- * bitmap=7 means there are skbs on all 3 bands, etc.
- */
-static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-
-static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
- int band)
+static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
+ int band)
{
- return priv->q + band;
+ return &priv->q[band];
}
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
struct sk_buff **to_free)
{
- if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
- int band = prio2band[skb->priority & TC_PRIO_MAX];
- struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
- struct qdisc_skb_head *list = band2list(priv, band);
-
- priv->bitmap |= (1 << band);
- qdisc->q.qlen++;
- return __qdisc_enqueue_tail(skb, qdisc, list);
- }
+ int band = prio2band[skb->priority & TC_PRIO_MAX];
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+ struct skb_array *q = band2list(priv, band);
+ int err;
+
+ err = skb_array_produce(q, skb);
- return qdisc_drop(skb, qdisc, to_free);
+ if (unlikely(err))
+ return qdisc_drop_cpu(skb, qdisc, to_free);
+
+ qdisc_qstats_cpu_qlen_inc(qdisc);
+ qdisc_qstats_cpu_backlog_inc(qdisc, skb);
+ return NET_XMIT_SUCCESS;
}
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
- int band = bitmap2band[priv->bitmap];
-
- if (likely(band >= 0)) {
- struct qdisc_skb_head *qh = band2list(priv, band);
- struct sk_buff *skb = __qdisc_dequeue_head(qh);
+ struct sk_buff *skb = NULL;
+ int band;
- if (likely(skb != NULL)) {
- qdisc_qstats_backlog_dec(qdisc, skb);
- qdisc_bstats_update(qdisc, skb);
- }
+ for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+ struct skb_array *q = band2list(priv, band);
- qdisc->q.qlen--;
- if (qh->qlen == 0)
- priv->bitmap &= ~(1 << band);
+ if (__skb_array_empty(q))
+ continue;
- return skb;
+ skb = skb_array_consume_bh(q);
+ }
+ if (likely(skb)) {
+ qdisc_qstats_cpu_backlog_dec(qdisc, skb);
+ qdisc_bstats_cpu_update(qdisc, skb);
+ qdisc_qstats_cpu_qlen_dec(qdisc);
}
- return NULL;
+ return skb;
}
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
- int band = bitmap2band[priv->bitmap];
+ struct sk_buff *skb = NULL;
+ int band;
- if (band >= 0) {
- struct qdisc_skb_head *qh = band2list(priv, band);
+ for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+ struct skb_array *q = band2list(priv, band);
- return qh->head;
+ skb = __skb_array_peek(q);
}
- return NULL;
+ return skb;
}
static void pfifo_fast_reset(struct Qdisc *qdisc)
{
- int prio;
+ int i, band;
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
- for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
- __qdisc_reset_queue(band2list(priv, prio));
+ for (band = 0; band < PFIFO_FAST_BANDS; band++) {
+ struct skb_array *q = band2list(priv, band);
+ struct sk_buff *skb;
- priv->bitmap = 0;
- qdisc->qstats.backlog = 0;
- qdisc->q.qlen = 0;
+ while ((skb = skb_array_consume_bh(q)) != NULL)
+ kfree_skb(skb);
+ }
+
+ for_each_possible_cpu(i) {
+ struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
+
+ q->backlog = 0;
+ q->qlen = 0;
+ }
}
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
@@ -572,17 +686,48 @@ nla_put_failure:
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
- int prio;
+ unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+ int prio;
- for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
- qdisc_skb_head_init(band2list(priv, prio));
+ /* guard against zero length rings */
+ if (!qlen)
+ return -EINVAL;
+
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+ struct skb_array *q = band2list(priv, prio);
+ int err;
+
+ err = skb_array_init(q, qlen, GFP_KERNEL);
+ if (err)
+ return -ENOMEM;
+ }
/* Can by-pass the queue discipline */
qdisc->flags |= TCQ_F_CAN_BYPASS;
return 0;
}
+static void pfifo_fast_destroy(struct Qdisc *sch)
+{
+ struct pfifo_fast_priv *priv = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+ struct skb_array *q = band2list(priv, prio);
+
+ /* NULL ring is possible if destroy path is due to a failed
+ * skb_array_init() in pfifo_fast_init() case.
+ */
+ if (!&q->ring.queue)
+ continue;
+ /* Destroy ring but no need to kfree_skb because a call to
+ * pfifo_fast_reset() has already done that work.
+ */
+ ptr_ring_cleanup(&q->ring, NULL);
+ }
+}
+
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.id = "pfifo_fast",
.priv_size = sizeof(struct pfifo_fast_priv),
@@ -590,9 +735,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.dequeue = pfifo_fast_dequeue,
.peek = pfifo_fast_peek,
.init = pfifo_fast_init,
+ .destroy = pfifo_fast_destroy,
.reset = pfifo_fast_reset,
.dump = pfifo_fast_dump,
.owner = THIS_MODULE,
+ .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
};
EXPORT_SYMBOL(pfifo_fast_ops);
@@ -630,9 +777,24 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
sch->padded = (char *) sch - (char *) p;
}
+ __skb_queue_head_init(&sch->gso_skb);
+ __skb_queue_head_init(&sch->skb_bad_txq);
qdisc_skb_head_init(&sch->q);
spin_lock_init(&sch->q.lock);
+ if (ops->static_flags & TCQ_F_CPUSTATS) {
+ sch->cpu_bstats =
+ netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!sch->cpu_bstats)
+ goto errout1;
+
+ sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+ if (!sch->cpu_qstats) {
+ free_percpu(sch->cpu_bstats);
+ goto errout1;
+ }
+ }
+
spin_lock_init(&sch->busylock);
lockdep_set_class(&sch->busylock,
dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
@@ -642,6 +804,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
dev->qdisc_running_key ?: &qdisc_running_key);
sch->ops = ops;
+ sch->flags = ops->static_flags;
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev_queue = dev_queue;
@@ -649,6 +812,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
refcount_set(&sch->refcnt, 1);
return sch;
+errout1:
+ kfree(p);
errout:
return ERR_PTR(err);
}
@@ -682,17 +847,21 @@ EXPORT_SYMBOL(qdisc_create_dflt);
void qdisc_reset(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
+ struct sk_buff *skb, *tmp;
if (ops->reset)
ops->reset(qdisc);
- kfree_skb(qdisc->skb_bad_txq);
- qdisc->skb_bad_txq = NULL;
+ skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
+ __skb_unlink(skb, &qdisc->gso_skb);
+ kfree_skb_list(skb);
+ }
- if (qdisc->gso_skb) {
- kfree_skb_list(qdisc->gso_skb);
- qdisc->gso_skb = NULL;
+ skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
+ __skb_unlink(skb, &qdisc->skb_bad_txq);
+ kfree_skb_list(skb);
}
+
qdisc->q.qlen = 0;
qdisc->qstats.backlog = 0;
}
@@ -711,6 +880,7 @@ static void qdisc_free(struct Qdisc *qdisc)
void qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
+ struct sk_buff *skb, *tmp;
if (qdisc->flags & TCQ_F_BUILTIN ||
!refcount_dec_and_test(&qdisc->refcnt))
@@ -730,8 +900,16 @@ void qdisc_destroy(struct Qdisc *qdisc)
module_put(ops->owner);
dev_put(qdisc_dev(qdisc));
- kfree_skb_list(qdisc->gso_skb);
- kfree_skb(qdisc->skb_bad_txq);
+ skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
+ __skb_unlink(skb, &qdisc->gso_skb);
+ kfree_skb_list(skb);
+ }
+
+ skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
+ __skb_unlink(skb, &qdisc->skb_bad_txq);
+ kfree_skb_list(skb);
+ }
+
qdisc_free(qdisc);
}
EXPORT_SYMBOL(qdisc_destroy);
@@ -746,10 +924,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
root_lock = qdisc_lock(oqdisc);
spin_lock_bh(root_lock);
- /* Prune old scheduler */
- if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1)
- qdisc_reset(oqdisc);
-
/* ... and graft new one */
if (qdisc == NULL)
qdisc = &noop_qdisc;
@@ -885,14 +1059,18 @@ static bool some_qdisc_is_busy(struct net_device *dev)
dev_queue = netdev_get_tx_queue(dev, i);
q = dev_queue->qdisc_sleeping;
- root_lock = qdisc_lock(q);
- spin_lock_bh(root_lock);
+ if (q->flags & TCQ_F_NOLOCK) {
+ val = test_bit(__QDISC_STATE_SCHED, &q->state);
+ } else {
+ root_lock = qdisc_lock(q);
+ spin_lock_bh(root_lock);
- val = (qdisc_is_running(q) ||
- test_bit(__QDISC_STATE_SCHED, &q->state));
+ val = (qdisc_is_running(q) ||
+ test_bit(__QDISC_STATE_SCHED, &q->state));
- spin_unlock_bh(root_lock);
+ spin_unlock_bh(root_lock);
+ }
if (val)
return true;
@@ -900,6 +1078,16 @@ static bool some_qdisc_is_busy(struct net_device *dev)
return false;
}
+static void dev_qdisc_reset(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *none)
+{
+ struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+
+ if (qdisc)
+ qdisc_reset(qdisc);
+}
+
/**
* dev_deactivate_many - deactivate transmissions on several devices
* @head: list of devices to deactivate
@@ -910,7 +1098,6 @@ static bool some_qdisc_is_busy(struct net_device *dev)
void dev_deactivate_many(struct list_head *head)
{
struct net_device *dev;
- bool sync_needed = false;
list_for_each_entry(dev, head, close_list) {
netdev_for_each_tx_queue(dev, dev_deactivate_queue,
@@ -920,20 +1107,25 @@ void dev_deactivate_many(struct list_head *head)
&noop_qdisc);
dev_watchdog_down(dev);
- sync_needed |= !dev->dismantle;
}
/* Wait for outstanding qdisc-less dev_queue_xmit calls.
* This is avoided if all devices are in dismantle phase :
* Caller will call synchronize_net() for us
*/
- if (sync_needed)
- synchronize_net();
+ synchronize_net();
/* Wait for outstanding qdisc_run calls. */
- list_for_each_entry(dev, head, close_list)
+ list_for_each_entry(dev, head, close_list) {
while (some_qdisc_is_busy(dev))
yield();
+ /* The new qdisc is assigned at this point so we can safely
+ * unwind stale skb lists and qdisc statistics
+ */
+ netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
+ if (dev_ingress_queue(dev))
+ dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
+ }
}
void dev_deactivate(struct net_device *dev)
@@ -954,6 +1146,8 @@ static void dev_init_scheduler_queue(struct net_device *dev,
rcu_assign_pointer(dev_queue->qdisc, qdisc);
dev_queue->qdisc_sleeping = qdisc;
+ __skb_queue_head_init(&qdisc->gso_skb);
+ __skb_queue_head_init(&qdisc->skb_bad_txq);
}
void dev_init_scheduler(struct net_device *dev)
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 213b586a06a0..8cbb5c829d59 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -17,6 +17,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
struct mq_sched {
struct Qdisc **qdiscs;
@@ -97,23 +98,42 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
struct net_device *dev = qdisc_dev(sch);
struct Qdisc *qdisc;
unsigned int ntx;
+ __u32 qlen = 0;
sch->q.qlen = 0;
memset(&sch->bstats, 0, sizeof(sch->bstats));
memset(&sch->qstats, 0, sizeof(sch->qstats));
+ /* MQ supports lockless qdiscs. However, statistics accounting needs
+ * to account for all, none, or a mix of locked and unlocked child
+ * qdiscs. Percpu stats are added to counters in-band and locking
+ * qdisc totals are added at end.
+ */
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
spin_lock_bh(qdisc_lock(qdisc));
- sch->q.qlen += qdisc->q.qlen;
- sch->bstats.bytes += qdisc->bstats.bytes;
- sch->bstats.packets += qdisc->bstats.packets;
- sch->qstats.backlog += qdisc->qstats.backlog;
- sch->qstats.drops += qdisc->qstats.drops;
- sch->qstats.requeues += qdisc->qstats.requeues;
- sch->qstats.overlimits += qdisc->qstats.overlimits;
+
+ if (qdisc_is_percpu_stats(qdisc)) {
+ qlen = qdisc_qlen_sum(qdisc);
+ __gnet_stats_copy_basic(NULL, &sch->bstats,
+ qdisc->cpu_bstats,
+ &qdisc->bstats);
+ __gnet_stats_copy_queue(&sch->qstats,
+ qdisc->cpu_qstats,
+ &qdisc->qstats, qlen);
+ } else {
+ sch->q.qlen += qdisc->q.qlen;
+ sch->bstats.bytes += qdisc->bstats.bytes;
+ sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.backlog += qdisc->qstats.backlog;
+ sch->qstats.drops += qdisc->qstats.drops;
+ sch->qstats.requeues += qdisc->qstats.requeues;
+ sch->qstats.overlimits += qdisc->qstats.overlimits;
+ }
+
spin_unlock_bh(qdisc_lock(qdisc));
}
+
return 0;
}
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b85885a9d8a1..8622745f3cd9 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -388,22 +388,40 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
struct tc_mqprio_qopt opt = { 0 };
struct Qdisc *qdisc;
- unsigned int i;
+ unsigned int ntx, tc;
sch->q.qlen = 0;
memset(&sch->bstats, 0, sizeof(sch->bstats));
memset(&sch->qstats, 0, sizeof(sch->qstats));
- for (i = 0; i < dev->num_tx_queues; i++) {
- qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
+ /* MQ supports lockless qdiscs. However, statistics accounting needs
+ * to account for all, none, or a mix of locked and unlocked child
+ * qdiscs. Percpu stats are added to counters in-band and locking
+ * qdisc totals are added at end.
+ */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
spin_lock_bh(qdisc_lock(qdisc));
- sch->q.qlen += qdisc->q.qlen;
- sch->bstats.bytes += qdisc->bstats.bytes;
- sch->bstats.packets += qdisc->bstats.packets;
- sch->qstats.backlog += qdisc->qstats.backlog;
- sch->qstats.drops += qdisc->qstats.drops;
- sch->qstats.requeues += qdisc->qstats.requeues;
- sch->qstats.overlimits += qdisc->qstats.overlimits;
+
+ if (qdisc_is_percpu_stats(qdisc)) {
+ __u32 qlen = qdisc_qlen_sum(qdisc);
+
+ __gnet_stats_copy_basic(NULL, &sch->bstats,
+ qdisc->cpu_bstats,
+ &qdisc->bstats);
+ __gnet_stats_copy_queue(&sch->qstats,
+ qdisc->cpu_qstats,
+ &qdisc->qstats, qlen);
+ } else {
+ sch->q.qlen += qdisc->q.qlen;
+ sch->bstats.bytes += qdisc->bstats.bytes;
+ sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.backlog += qdisc->qstats.backlog;
+ sch->qstats.drops += qdisc->qstats.drops;
+ sch->qstats.requeues += qdisc->qstats.requeues;
+ sch->qstats.overlimits += qdisc->qstats.overlimits;
+ }
+
spin_unlock_bh(qdisc_lock(qdisc));
}
@@ -411,9 +429,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
opt.hw = priv->hw_offload;
- for (i = 0; i < netdev_get_num_tc(dev); i++) {
- opt.count[i] = dev->tc_to_txq[i].count;
- opt.offset[i] = dev->tc_to_txq[i].offset;
+ for (tc = 0; tc < netdev_get_num_tc(dev); tc++) {
+ opt.count[tc] = dev->tc_to_txq[tc].count;
+ opt.offset[tc] = dev->tc_to_txq[tc].offset;
}
if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt))
@@ -495,7 +513,6 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
if (cl >= TC_H_MIN_PRIORITY) {
int i;
__u32 qlen = 0;
- struct Qdisc *qdisc;
struct gnet_stats_queue qstats = {0};
struct gnet_stats_basic_packed bstats = {0};
struct net_device *dev = qdisc_dev(sch);
@@ -511,18 +528,26 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
for (i = tc.offset; i < tc.offset + tc.count; i++) {
struct netdev_queue *q = netdev_get_tx_queue(dev, i);
+ struct Qdisc *qdisc = rtnl_dereference(q->qdisc);
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+ struct gnet_stats_queue __percpu *cpu_qstats = NULL;
- qdisc = rtnl_dereference(q->qdisc);
spin_lock_bh(qdisc_lock(qdisc));
- qlen += qdisc->q.qlen;
- bstats.bytes += qdisc->bstats.bytes;
- bstats.packets += qdisc->bstats.packets;
- qstats.backlog += qdisc->qstats.backlog;
- qstats.drops += qdisc->qstats.drops;
- qstats.requeues += qdisc->qstats.requeues;
- qstats.overlimits += qdisc->qstats.overlimits;
+ if (qdisc_is_percpu_stats(qdisc)) {
+ cpu_bstats = qdisc->cpu_bstats;
+ cpu_qstats = qdisc->cpu_qstats;
+ }
+
+ qlen = qdisc_qlen_sum(qdisc);
+ __gnet_stats_copy_basic(NULL, &sch->bstats,
+ cpu_bstats, &qdisc->bstats);
+ __gnet_stats_copy_queue(&sch->qstats,
+ cpu_qstats,
+ &qdisc->qstats,
+ qlen);
spin_unlock_bh(qdisc_lock(qdisc));
}
+
/* Reclaim root sleeping lock before completing stats */
if (d->lock)
spin_lock_bh(d->lock);
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 1ca84a288443..54bd9c1a8aa1 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -14,7 +14,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
tsnmap.o bind_addr.o socket.o primitive.o \
output.o input.o debug.o stream.o auth.o \
offload.o stream_sched.o stream_sched_prio.o \
- stream_sched_rr.o
+ stream_sched_rr.o stream_interleave.o
sctp_probe-y := probe.o
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 69394f4d6091..837806dd5799 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -861,7 +861,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
0, spc_state, error, GFP_ATOMIC);
if (event)
- sctp_ulpq_tail_event(&asoc->ulpq, event);
+ asoc->stream.si->enqueue_event(&asoc->ulpq, event);
}
/* Select new active and retran paths. */
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 7f8baa48e7c2..991a530c6b31 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -124,7 +124,7 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent,
error, GFP_ATOMIC);
if (ev)
- sctp_ulpq_tail_event(&asoc->ulpq, ev);
+ asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
}
sctp_chunk_put(chunk);
@@ -191,7 +191,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
*/
max_data = asoc->pathmtu -
sctp_sk(asoc->base.sk)->pf->af->net_header_len -
- sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
+ sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream);
max_data = SCTP_TRUNC4(max_data);
/* If the the peer requested that we authenticate DATA chunks
@@ -264,8 +264,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
frag |= SCTP_DATA_SACK_IMM;
}
- chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
- 0, GFP_KERNEL);
+ chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag,
+ GFP_KERNEL);
if (!chunk) {
err = -ENOMEM;
goto errout;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 4a865cd06d76..01a26ee051e3 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -313,6 +313,7 @@ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet,
/* We believe that this chunk is OK to add to the packet */
switch (chunk->chunk_hdr->type) {
case SCTP_CID_DATA:
+ case SCTP_CID_I_DATA:
/* Account for the data being in the packet */
sctp_packet_append_data(packet, chunk);
/* Disallow SACK bundling after DATA. */
@@ -724,7 +725,7 @@ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet,
* or delay in hopes of bundling a full sized packet.
*/
if (chunk->skb->len + q->out_qlen > transport->pathmtu -
- packet->overhead - sizeof(struct sctp_data_chunk) - 4)
+ packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4)
/* Enough data queued to fill a packet */
return SCTP_XMIT_OK;
@@ -759,7 +760,7 @@ static void sctp_packet_append_data(struct sctp_packet *packet,
asoc->peer.rwnd = rwnd;
sctp_chunk_assign_tsn(chunk);
- sctp_chunk_assign_ssn(chunk);
+ asoc->stream.si->assign_number(chunk);
}
static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet,
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 7d67feeeffc1..af9b5ebcae50 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -67,8 +67,6 @@ static void sctp_mark_missing(struct sctp_outq *q,
__u32 highest_new_tsn,
int count_of_newacks);
-static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
-
static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
/* Add data to the front of the queue. */
@@ -591,7 +589,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
* following the procedures outlined in C1 - C5.
*/
if (reason == SCTP_RTXR_T3_RTX)
- sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point);
+ q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point);
/* Flush the queues only on timeout, since fast_rtx is only
* triggered during sack processing and the queue
@@ -942,6 +940,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
case SCTP_CID_ECN_ECNE:
case SCTP_CID_ASCONF:
case SCTP_CID_FWD_TSN:
+ case SCTP_CID_I_FWD_TSN:
case SCTP_CID_RECONF:
status = sctp_packet_transmit_chunk(packet, chunk,
one_packet, gfp);
@@ -956,7 +955,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
* sender MUST assure that at least one T3-rtx
* timer is running.
*/
- if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
+ if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN ||
+ chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) {
sctp_transport_reset_t3_rtx(transport);
transport->last_time_sent = jiffies;
}
@@ -1372,7 +1372,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
asoc->peer.rwnd = sack_a_rwnd;
- sctp_generate_fwdtsn(q, sack_ctsn);
+ asoc->stream.si->generate_ftsn(q, sack_ctsn);
pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn);
pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, "
@@ -1795,7 +1795,7 @@ static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist,
}
/* Create and add a fwdtsn chunk to the outq's control queue if needed. */
-static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
+void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
{
struct sctp_association *asoc = q->asoc;
struct sctp_chunk *ftsn_chunk = NULL;
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 26b4be6b4172..4545bc2aff84 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -288,12 +288,8 @@ struct sctp_ht_iter {
static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
{
struct sctp_ht_iter *iter = seq->private;
- int err = sctp_transport_walk_start(&iter->hti);
- if (err) {
- iter->start_fail = 1;
- return ERR_PTR(err);
- }
+ sctp_transport_walk_start(&iter->hti);
iter->start_fail = 0;
return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9bf575f2e8ed..b9b269cf615e 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -228,7 +228,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
struct sctp_inithdr init;
union sctp_params addrs;
struct sctp_sock *sp;
- __u8 extensions[4];
+ __u8 extensions[5];
size_t chunksize;
__be16 types[2];
int num_ext = 0;
@@ -278,6 +278,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
if (sp->adaptation_ind)
chunksize += sizeof(aiparam);
+ if (sp->strm_interleave) {
+ extensions[num_ext] = SCTP_CID_I_DATA;
+ num_ext += 1;
+ }
+
chunksize += vparam_len;
/* Account for AUTH related parameters */
@@ -392,7 +397,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
struct sctp_inithdr initack;
union sctp_params addrs;
struct sctp_sock *sp;
- __u8 extensions[4];
+ __u8 extensions[5];
size_t chunksize;
int num_ext = 0;
int cookie_len;
@@ -442,6 +447,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
if (sp->adaptation_ind)
chunksize += sizeof(aiparam);
+ if (asoc->intl_enable) {
+ extensions[num_ext] = SCTP_CID_I_DATA;
+ num_ext += 1;
+ }
+
if (asoc->peer.auth_capable) {
auth_random = (struct sctp_paramhdr *)asoc->c.auth_random;
chunksize += ntohs(auth_random->length);
@@ -711,38 +721,31 @@ nodata:
/* Make a DATA chunk for the given association from the provided
* parameters. However, do not populate the data payload.
*/
-struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
+struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
const struct sctp_sndrcvinfo *sinfo,
- int data_len, __u8 flags, __u16 ssn,
- gfp_t gfp)
+ int len, __u8 flags, gfp_t gfp)
{
struct sctp_chunk *retval;
struct sctp_datahdr dp;
- int chunk_len;
/* We assign the TSN as LATE as possible, not here when
* creating the chunk.
*/
- dp.tsn = 0;
+ memset(&dp, 0, sizeof(dp));
+ dp.ppid = sinfo->sinfo_ppid;
dp.stream = htons(sinfo->sinfo_stream);
- dp.ppid = sinfo->sinfo_ppid;
/* Set the flags for an unordered send. */
- if (sinfo->sinfo_flags & SCTP_UNORDERED) {
+ if (sinfo->sinfo_flags & SCTP_UNORDERED)
flags |= SCTP_DATA_UNORDERED;
- dp.ssn = 0;
- } else
- dp.ssn = htons(ssn);
- chunk_len = sizeof(dp) + data_len;
- retval = sctp_make_data(asoc, flags, chunk_len, gfp);
+ retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp);
if (!retval)
- goto nodata;
+ return NULL;
retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
-nodata:
return retval;
}
@@ -1415,6 +1418,12 @@ static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp);
}
+struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc,
+ __u8 flags, int paylen, gfp_t gfp)
+{
+ return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp);
+}
+
static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
__u8 type, __u8 flags, int paylen,
gfp_t gfp)
@@ -2032,6 +2041,10 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
if (net->sctp.addip_enable)
asoc->peer.asconf_capable = 1;
break;
+ case SCTP_CID_I_DATA:
+ if (sctp_sk(asoc->base.sk)->strm_interleave)
+ asoc->intl_enable = 1;
+ break;
default:
break;
}
@@ -3523,6 +3536,30 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
return retval;
}
+struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc,
+ __u32 new_cum_tsn, size_t nstreams,
+ struct sctp_ifwdtsn_skip *skiplist)
+{
+ struct sctp_chunk *retval = NULL;
+ struct sctp_ifwdtsn_hdr ftsn_hdr;
+ size_t hint;
+
+ hint = (nstreams + 1) * sizeof(__u32);
+
+ retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint,
+ GFP_ATOMIC);
+ if (!retval)
+ return NULL;
+
+ ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn);
+ retval->subh.ifwdtsn_hdr =
+ sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr);
+
+ sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist);
+
+ return retval;
+}
+
/* RE-CONFIG 3.1 (RE-CONFIG chunk)
* 0 1 2 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index df94d77401e7..16ddf2ca1438 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -632,7 +632,7 @@ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
struct sctp_chunk *abort;
/* Cancel any partial delivery in progress. */
- sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+ asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC);
if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT)
event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST,
@@ -972,7 +972,7 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
if (!ev)
return;
- sctp_ulpq_tail_event(&asoc->ulpq, ev);
+ asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
switch (err_hdr->cause) {
case SCTP_ERROR_UNKNOWN_CHUNK:
@@ -1007,18 +1007,6 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
}
}
-/* Process variable FWDTSN chunk information. */
-static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq,
- struct sctp_chunk *chunk)
-{
- struct sctp_fwdtsn_skip *skip;
-
- /* Walk through all the skipped SSNs */
- sctp_walk_fwdtsn(skip, chunk) {
- sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn));
- }
-}
-
/* Helper function to remove the association non-primary peer
* transports.
*/
@@ -1058,7 +1046,7 @@ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands,
asoc->c.sinit_max_instreams,
NULL, GFP_ATOMIC);
if (ev)
- sctp_ulpq_tail_event(&asoc->ulpq, ev);
+ asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
}
/* Helper function to generate an adaptation indication event */
@@ -1070,7 +1058,7 @@ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands,
ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC);
if (ev)
- sctp_ulpq_tail_event(&asoc->ulpq, ev);
+ asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
}
@@ -1368,18 +1356,12 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
break;
case SCTP_CMD_REPORT_FWDTSN:
- /* Move the Cumulattive TSN Ack ahead. */
- sctp_tsnmap_skip(&asoc->peer.tsn_map, cmd->obj.u32);
-
- /* purge the fragmentation queue */
- sctp_ulpq_reasm_flushtsn(&asoc->ulpq, cmd->obj.u32);
-
- /* Abort any in progress partial delivery. */
- sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+ asoc->stream.si->report_ftsn(&asoc->ulpq, cmd->obj.u32);
break;
case SCTP_CMD_PROCESS_FWDTSN:
- sctp_cmd_process_fwdtsn(&asoc->ulpq, cmd->obj.chunk);
+ asoc->stream.si->handle_ftsn(&asoc->ulpq,
+ cmd->obj.chunk);
break;
case SCTP_CMD_GEN_SACK:
@@ -1483,8 +1465,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n",
__func__, cmd->obj.chunk, &asoc->ulpq);
- sctp_ulpq_tail_data(&asoc->ulpq, cmd->obj.chunk,
- GFP_ATOMIC);
+ asoc->stream.si->ulpevent_data(&asoc->ulpq,
+ cmd->obj.chunk,
+ GFP_ATOMIC);
break;
case SCTP_CMD_EVENT_ULP:
@@ -1492,7 +1475,8 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n",
__func__, cmd->obj.ulpevent, &asoc->ulpq);
- sctp_ulpq_tail_event(&asoc->ulpq, cmd->obj.ulpevent);
+ asoc->stream.si->enqueue_event(&asoc->ulpq,
+ cmd->obj.ulpevent);
break;
case SCTP_CMD_REPLY:
@@ -1729,12 +1713,13 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
break;
case SCTP_CMD_PART_DELIVER:
- sctp_ulpq_partial_delivery(&asoc->ulpq, GFP_ATOMIC);
+ asoc->stream.si->start_pd(&asoc->ulpq, GFP_ATOMIC);
break;
case SCTP_CMD_RENEGE:
- sctp_ulpq_renege(&asoc->ulpq, cmd->obj.chunk,
- GFP_ATOMIC);
+ asoc->stream.si->renege_events(&asoc->ulpq,
+ cmd->obj.chunk,
+ GFP_ATOMIC);
break;
case SCTP_CMD_SETUP_T4:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8f8ccded13e4..541f34735346 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3013,7 +3013,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net,
return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}
- if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk)))
+ if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream)))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
commands);
@@ -3034,7 +3034,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net,
case SCTP_IERROR_PROTO_VIOLATION:
return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
(u8 *)chunk->subh.data_hdr,
- sizeof(struct sctp_datahdr));
+ sctp_datahdr_len(&asoc->stream));
default:
BUG();
}
@@ -3133,7 +3133,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4(
return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}
- if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk)))
+ if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream)))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
commands);
@@ -3150,7 +3150,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4(
case SCTP_IERROR_PROTO_VIOLATION:
return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
(u8 *)chunk->subh.data_hdr,
- sizeof(struct sctp_datahdr));
+ sctp_datahdr_len(&asoc->stream));
default:
BUG();
}
@@ -3957,7 +3957,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
{
struct sctp_fwdtsn_hdr *fwdtsn_hdr;
struct sctp_chunk *chunk = arg;
- struct sctp_fwdtsn_skip *skip;
__u16 len;
__u32 tsn;
@@ -3971,7 +3970,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
/* Make sure that the FORWARD_TSN chunk has valid length. */
- if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+ if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream)))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
commands);
@@ -3990,14 +3989,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0)
goto discard_noforce;
- /* Silently discard the chunk if stream-id is not valid */
- sctp_walk_fwdtsn(skip, chunk) {
- if (ntohs(skip->stream) >= asoc->stream.incnt)
- goto discard_noforce;
- }
+ if (!asoc->stream.si->validate_ftsn(chunk))
+ goto discard_noforce;
sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn));
- if (len > sizeof(struct sctp_fwdtsn_hdr))
+ if (len > sctp_ftsnhdr_len(&asoc->stream))
sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
SCTP_CHUNK(chunk));
@@ -4028,7 +4024,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
{
struct sctp_fwdtsn_hdr *fwdtsn_hdr;
struct sctp_chunk *chunk = arg;
- struct sctp_fwdtsn_skip *skip;
__u16 len;
__u32 tsn;
@@ -4042,7 +4037,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
/* Make sure that the FORWARD_TSN chunk has a valid length. */
- if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+ if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream)))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
commands);
@@ -4061,14 +4056,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0)
goto gen_shutdown;
- /* Silently discard the chunk if stream-id is not valid */
- sctp_walk_fwdtsn(skip, chunk) {
- if (ntohs(skip->stream) >= asoc->stream.incnt)
- goto gen_shutdown;
- }
+ if (!asoc->stream.si->validate_ftsn(chunk))
+ goto gen_shutdown;
sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn));
- if (len > sizeof(struct sctp_fwdtsn_hdr))
+ if (len > sctp_ftsnhdr_len(&asoc->stream))
sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
SCTP_CHUNK(chunk));
@@ -6244,14 +6236,12 @@ static int sctp_eat_data(const struct sctp_association *asoc,
struct sctp_chunk *err;
enum sctp_verb deliver;
size_t datalen;
- u8 ordered = 0;
- u16 ssn, sid;
__u32 tsn;
int tmp;
data_hdr = (struct sctp_datahdr *)chunk->skb->data;
chunk->subh.data_hdr = data_hdr;
- skb_pull(chunk->skb, sizeof(*data_hdr));
+ skb_pull(chunk->skb, sctp_datahdr_len(&asoc->stream));
tsn = ntohl(data_hdr->tsn);
pr_debug("%s: TSN 0x%x\n", __func__, tsn);
@@ -6299,7 +6289,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
* Actually, allow a little bit of overflow (up to a MTU).
*/
datalen = ntohs(chunk->chunk_hdr->length);
- datalen -= sizeof(struct sctp_data_chunk);
+ datalen -= sctp_datachk_len(&asoc->stream);
deliver = SCTP_CMD_CHUNK_ULP;
@@ -6394,7 +6384,6 @@ static int sctp_eat_data(const struct sctp_association *asoc,
SCTP_INC_STATS(net, SCTP_MIB_INORDERCHUNKS);
if (chunk->asoc)
chunk->asoc->stats.iodchunks++;
- ordered = 1;
}
/* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
@@ -6405,8 +6394,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
* with cause set to "Invalid Stream Identifier" (See Section 3.3.10)
* and discard the DATA chunk.
*/
- sid = ntohs(data_hdr->stream);
- if (sid >= asoc->stream.incnt) {
+ if (ntohs(data_hdr->stream) >= asoc->stream.incnt) {
/* Mark tsn as received even though we drop it */
sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
@@ -6427,8 +6415,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
* SSN is smaller then the next expected one. If it is, it wrapped
* and is invalid.
*/
- ssn = ntohs(data_hdr->ssn);
- if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->stream, in, sid)))
+ if (!asoc->stream.si->validate_data(chunk))
return SCTP_IERROR_PROTO_VIOLATION;
/* Send the data up to the user. Note: Schedule the
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 79b6bee5b768..691d9dc620e3 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -985,11 +985,14 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup(
if (state > SCTP_STATE_MAX)
return &bug;
+ if (cid == SCTP_CID_I_DATA)
+ cid = SCTP_CID_DATA;
+
if (cid <= SCTP_CID_BASE_MAX)
return &chunk_event_table[cid][state];
if (net->sctp.prsctp_enable) {
- if (cid == SCTP_CID_FWD_TSN)
+ if (cid == SCTP_CID_FWD_TSN || cid == SCTP_CID_I_FWD_TSN)
return &prsctp_chunk_event_table[0][state];
}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3253f724a995..5e4100df7bae 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -201,6 +201,22 @@ static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
cb(chunk);
}
+static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk,
+ void (*cb)(struct sk_buff *, struct sock *))
+
+{
+ struct sk_buff *skb, *tmp;
+
+ sctp_skb_for_each(skb, &asoc->ulpq.lobby, tmp)
+ cb(skb, sk);
+
+ sctp_skb_for_each(skb, &asoc->ulpq.reasm, tmp)
+ cb(skb, sk);
+
+ sctp_skb_for_each(skb, &asoc->ulpq.reasm_uo, tmp)
+ cb(skb, sk);
+}
+
/* Verify that this is a valid address. */
static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
int len)
@@ -1554,6 +1570,7 @@ static void sctp_close(struct sock *sk, long timeout)
if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) ||
!skb_queue_empty(&asoc->ulpq.reasm) ||
+ !skb_queue_empty(&asoc->ulpq.reasm_uo) ||
(sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) {
struct sctp_chunk *chunk;
@@ -2002,7 +2019,20 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
if (err < 0)
goto out_free;
- wait_connect = true;
+ /* If stream interleave is enabled, wait_connect has to be
+ * done earlier than data enqueue, as it needs to make data
+ * or idata according to asoc->intl_enable which is set
+ * after connection is done.
+ */
+ if (sctp_sk(asoc->base.sk)->strm_interleave) {
+ timeo = sock_sndtimeo(sk, 0);
+ err = sctp_wait_for_connect(asoc, &timeo);
+ if (err)
+ goto out_unlock;
+ } else {
+ wait_connect = true;
+ }
+
pr_debug("%s: we associated primitively\n", __func__);
}
@@ -2281,7 +2311,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
if (!event)
return -ENOMEM;
- sctp_ulpq_tail_event(&asoc->ulpq, event);
+ asoc->stream.si->enqueue_event(&asoc->ulpq, event);
}
}
@@ -3180,7 +3210,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
if (val == 0) {
val = asoc->pathmtu - sp->pf->af->net_header_len;
val -= sizeof(struct sctphdr) +
- sizeof(struct sctp_data_chunk);
+ sctp_datachk_len(&asoc->stream);
}
asoc->user_frag = val;
asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
@@ -3350,7 +3380,10 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk,
if (get_user(val, (int __user *)optval))
return -EFAULT;
- sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1;
+ sctp_sk(sk)->frag_interleave = !!val;
+
+ if (!sctp_sk(sk)->frag_interleave)
+ sctp_sk(sk)->strm_interleave = 0;
return 0;
}
@@ -4023,6 +4056,40 @@ out:
return retval;
}
+static int sctp_setsockopt_interleaving_supported(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct net *net = sock_net(sk);
+ struct sctp_assoc_value params;
+ int retval = -EINVAL;
+
+ if (optlen < sizeof(params))
+ goto out;
+
+ optlen = sizeof(params);
+ if (copy_from_user(&params, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ if (params.assoc_id)
+ goto out;
+
+ if (!net->sctp.intl_enable || !sp->frag_interleave) {
+ retval = -EPERM;
+ goto out;
+ }
+
+ sp->strm_interleave = !!params.assoc_value;
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4210,6 +4277,10 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
case SCTP_STREAM_SCHEDULER_VALUE:
retval = sctp_setsockopt_scheduler_value(sk, optval, optlen);
break;
+ case SCTP_INTERLEAVING_SUPPORTED:
+ retval = sctp_setsockopt_interleaving_supported(sk, optval,
+ optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -4680,20 +4751,11 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
/* use callback to avoid exporting the core structure */
-int sctp_transport_walk_start(struct rhashtable_iter *iter)
+void sctp_transport_walk_start(struct rhashtable_iter *iter)
{
- int err;
-
rhltable_walk_enter(&sctp_transport_hashtable, iter);
- err = rhashtable_walk_start(iter);
- if (err && err != -EAGAIN) {
- rhashtable_walk_stop(iter);
- rhashtable_walk_exit(iter);
- return err;
- }
-
- return 0;
+ rhashtable_walk_start(iter);
}
void sctp_transport_walk_stop(struct rhashtable_iter *iter)
@@ -4784,12 +4846,10 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
struct net *net, int *pos, void *p) {
struct rhashtable_iter hti;
struct sctp_transport *tsp;
- int ret;
+ int ret = 0;
again:
- ret = sctp_transport_walk_start(&hti);
- if (ret)
- return ret;
+ sctp_transport_walk_start(&hti);
tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
@@ -6984,6 +7044,47 @@ out:
return retval;
}
+static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EFAULT;
+
+ if (len < sizeof(params)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(params);
+ if (copy_from_user(&params, optval, len))
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (asoc) {
+ params.assoc_value = asoc->intl_enable;
+ } else if (!params.assoc_id) {
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ params.assoc_value = sp->strm_interleave;
+ } else {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (put_user(len, optlen))
+ goto out;
+
+ if (copy_to_user(optval, &params, len))
+ goto out;
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
static int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -7174,6 +7275,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_scheduler_value(sk, len, optval,
optlen);
break;
+ case SCTP_INTERLEAVING_SUPPORTED:
+ retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
+ optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -8411,11 +8516,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
}
- sctp_skb_for_each(skb, &assoc->ulpq.reasm, tmp)
- sctp_skb_set_owner_r_frag(skb, newsk);
-
- sctp_skb_for_each(skb, &assoc->ulpq.lobby, tmp)
- sctp_skb_set_owner_r_frag(skb, newsk);
+ sctp_for_each_rx_skb(assoc, newsk, sctp_skb_set_owner_r_frag);
/* Set the type of socket to indicate that it is peeled off from the
* original UDP-style socket or created with the accept() call on a
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 76ea66be0bbe..06b644dd858c 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -167,6 +167,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
sched->init(stream);
in:
+ sctp_stream_interleave_init(stream);
if (!incnt)
goto out;
@@ -215,11 +216,13 @@ void sctp_stream_clear(struct sctp_stream *stream)
{
int i;
- for (i = 0; i < stream->outcnt; i++)
- stream->out[i].ssn = 0;
+ for (i = 0; i < stream->outcnt; i++) {
+ stream->out[i].mid = 0;
+ stream->out[i].mid_uo = 0;
+ }
for (i = 0; i < stream->incnt; i++)
- stream->in[i].ssn = 0;
+ stream->in[i].mid = 0;
}
void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
@@ -606,10 +609,10 @@ struct sctp_chunk *sctp_process_strreset_outreq(
}
for (i = 0; i < nums; i++)
- stream->in[ntohs(str_p[i])].ssn = 0;
+ stream->in[ntohs(str_p[i])].mid = 0;
} else {
for (i = 0; i < stream->incnt; i++)
- stream->in[i].ssn = 0;
+ stream->in[i].mid = 0;
}
result = SCTP_STRRESET_PERFORMED;
@@ -753,8 +756,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
* performed.
*/
max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map);
- sctp_ulpq_reasm_flushtsn(&asoc->ulpq, max_tsn_seen);
- sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+ asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen);
/* G1: Compute an appropriate value for the Receiver's Next TSN -- the
* TSN that the peer should use to send the next DATA chunk. The
@@ -783,10 +785,12 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
/* G5: The next expected and outgoing SSNs MUST be reset to 0 for all
* incoming and outgoing streams.
*/
- for (i = 0; i < stream->outcnt; i++)
- stream->out[i].ssn = 0;
+ for (i = 0; i < stream->outcnt; i++) {
+ stream->out[i].mid = 0;
+ stream->out[i].mid_uo = 0;
+ }
for (i = 0; i < stream->incnt; i++)
- stream->in[i].ssn = 0;
+ stream->in[i].mid = 0;
result = SCTP_STRRESET_PERFORMED;
@@ -976,11 +980,15 @@ struct sctp_chunk *sctp_process_strreset_resp(
if (result == SCTP_STRRESET_PERFORMED) {
if (nums) {
- for (i = 0; i < nums; i++)
- stream->out[ntohs(str_p[i])].ssn = 0;
+ for (i = 0; i < nums; i++) {
+ stream->out[ntohs(str_p[i])].mid = 0;
+ stream->out[ntohs(str_p[i])].mid_uo = 0;
+ }
} else {
- for (i = 0; i < stream->outcnt; i++)
- stream->out[i].ssn = 0;
+ for (i = 0; i < stream->outcnt; i++) {
+ stream->out[i].mid = 0;
+ stream->out[i].mid_uo = 0;
+ }
}
flags = SCTP_STREAM_RESET_OUTGOING_SSN;
@@ -1023,8 +1031,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
&asoc->peer.tsn_map);
LIST_HEAD(temp);
- sctp_ulpq_reasm_flushtsn(&asoc->ulpq, mtsn);
- sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+ asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn);
sctp_tsnmap_init(&asoc->peer.tsn_map,
SCTP_TSN_MAP_INITIAL,
@@ -1042,10 +1049,12 @@ struct sctp_chunk *sctp_process_strreset_resp(
asoc->ctsn_ack_point = asoc->next_tsn - 1;
asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
- for (i = 0; i < stream->outcnt; i++)
- stream->out[i].ssn = 0;
+ for (i = 0; i < stream->outcnt; i++) {
+ stream->out[i].mid = 0;
+ stream->out[i].mid_uo = 0;
+ }
for (i = 0; i < stream->incnt; i++)
- stream->in[i].ssn = 0;
+ stream->in[i].mid = 0;
}
for (i = 0; i < stream->outcnt; i++)
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
new file mode 100644
index 000000000000..8c7cf8f08711
--- /dev/null
+++ b/net/sctp/stream_interleave.c
@@ -0,0 +1,1334 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresched(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Xin Long <lucien.xin@gmail.com>
+ */
+
+#include <net/busy_poll.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/ulpevent.h>
+#include <linux/sctp.h>
+
+static struct sctp_chunk *sctp_make_idatafrag_empty(
+ const struct sctp_association *asoc,
+ const struct sctp_sndrcvinfo *sinfo,
+ int len, __u8 flags, gfp_t gfp)
+{
+ struct sctp_chunk *retval;
+ struct sctp_idatahdr dp;
+
+ memset(&dp, 0, sizeof(dp));
+ dp.stream = htons(sinfo->sinfo_stream);
+
+ if (sinfo->sinfo_flags & SCTP_UNORDERED)
+ flags |= SCTP_DATA_UNORDERED;
+
+ retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp);
+ if (!retval)
+ return NULL;
+
+ retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
+ memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
+
+ return retval;
+}
+
+static void sctp_chunk_assign_mid(struct sctp_chunk *chunk)
+{
+ struct sctp_stream *stream;
+ struct sctp_chunk *lchunk;
+ __u32 cfsn = 0;
+ __u16 sid;
+
+ if (chunk->has_mid)
+ return;
+
+ sid = sctp_chunk_stream_no(chunk);
+ stream = &chunk->asoc->stream;
+
+ list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) {
+ struct sctp_idatahdr *hdr;
+ __u32 mid;
+
+ lchunk->has_mid = 1;
+
+ hdr = lchunk->subh.idata_hdr;
+
+ if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)
+ hdr->ppid = lchunk->sinfo.sinfo_ppid;
+ else
+ hdr->fsn = htonl(cfsn++);
+
+ if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
+ mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ?
+ sctp_mid_uo_next(stream, out, sid) :
+ sctp_mid_uo_peek(stream, out, sid);
+ } else {
+ mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ?
+ sctp_mid_next(stream, out, sid) :
+ sctp_mid_peek(stream, out, sid);
+ }
+ hdr->mid = htonl(mid);
+ }
+}
+
+static bool sctp_validate_data(struct sctp_chunk *chunk)
+{
+ const struct sctp_stream *stream;
+ __u16 sid, ssn;
+
+ if (chunk->chunk_hdr->type != SCTP_CID_DATA)
+ return false;
+
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+ return true;
+
+ stream = &chunk->asoc->stream;
+ sid = sctp_chunk_stream_no(chunk);
+ ssn = ntohs(chunk->subh.data_hdr->ssn);
+
+ return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid));
+}
+
+static bool sctp_validate_idata(struct sctp_chunk *chunk)
+{
+ struct sctp_stream *stream;
+ __u32 mid;
+ __u16 sid;
+
+ if (chunk->chunk_hdr->type != SCTP_CID_I_DATA)
+ return false;
+
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+ return true;
+
+ stream = &chunk->asoc->stream;
+ sid = sctp_chunk_stream_no(chunk);
+ mid = ntohl(chunk->subh.idata_hdr->mid);
+
+ return !MID_lt(mid, sctp_mid_peek(stream, in, sid));
+}
+
+static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sctp_ulpevent *cevent;
+ struct sk_buff *pos;
+
+ pos = skb_peek_tail(&ulpq->reasm);
+ if (!pos) {
+ __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+ return;
+ }
+
+ cevent = sctp_skb2event(pos);
+
+ if (event->stream == cevent->stream &&
+ event->mid == cevent->mid &&
+ (cevent->msg_flags & SCTP_DATA_FIRST_FRAG ||
+ (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+ event->fsn > cevent->fsn))) {
+ __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+ return;
+ }
+
+ if ((event->stream == cevent->stream &&
+ MID_lt(cevent->mid, event->mid)) ||
+ event->stream > cevent->stream) {
+ __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+ return;
+ }
+
+ skb_queue_walk(&ulpq->reasm, pos) {
+ cevent = sctp_skb2event(pos);
+
+ if (event->stream < cevent->stream ||
+ (event->stream == cevent->stream &&
+ MID_lt(event->mid, cevent->mid)))
+ break;
+
+ if (event->stream == cevent->stream &&
+ event->mid == cevent->mid &&
+ !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+ (event->msg_flags & SCTP_DATA_FIRST_FRAG ||
+ event->fsn < cevent->fsn))
+ break;
+ }
+
+ __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event));
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_partial(
+ struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sk_buff *first_frag = NULL;
+ struct sk_buff *last_frag = NULL;
+ struct sctp_ulpevent *retval;
+ struct sctp_stream_in *sin;
+ struct sk_buff *pos;
+ __u32 next_fsn = 0;
+ int is_last = 0;
+
+ sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+ skb_queue_walk(&ulpq->reasm, pos) {
+ struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+ if (cevent->stream < event->stream)
+ continue;
+
+ if (cevent->stream > event->stream ||
+ cevent->mid != sin->mid)
+ break;
+
+ switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+ case SCTP_DATA_FIRST_FRAG:
+ goto out;
+ case SCTP_DATA_MIDDLE_FRAG:
+ if (!first_frag) {
+ if (cevent->fsn == sin->fsn) {
+ first_frag = pos;
+ last_frag = pos;
+ next_fsn = cevent->fsn + 1;
+ }
+ } else if (cevent->fsn == next_fsn) {
+ last_frag = pos;
+ next_fsn++;
+ } else {
+ goto out;
+ }
+ break;
+ case SCTP_DATA_LAST_FRAG:
+ if (!first_frag) {
+ if (cevent->fsn == sin->fsn) {
+ first_frag = pos;
+ last_frag = pos;
+ next_fsn = 0;
+ is_last = 1;
+ }
+ } else if (cevent->fsn == next_fsn) {
+ last_frag = pos;
+ next_fsn = 0;
+ is_last = 1;
+ }
+ goto out;
+ default:
+ goto out;
+ }
+ }
+
+out:
+ if (!first_frag)
+ return NULL;
+
+ retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ &ulpq->reasm, first_frag,
+ last_frag);
+ if (retval) {
+ sin->fsn = next_fsn;
+ if (is_last) {
+ retval->msg_flags |= MSG_EOR;
+ sin->pd_mode = 0;
+ }
+ }
+
+ return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_reassembled(
+ struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sctp_association *asoc = ulpq->asoc;
+ struct sk_buff *pos, *first_frag = NULL;
+ struct sctp_ulpevent *retval = NULL;
+ struct sk_buff *pd_first = NULL;
+ struct sk_buff *pd_last = NULL;
+ struct sctp_stream_in *sin;
+ __u32 next_fsn = 0;
+ __u32 pd_point = 0;
+ __u32 pd_len = 0;
+ __u32 mid = 0;
+
+ sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+ skb_queue_walk(&ulpq->reasm, pos) {
+ struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+ if (cevent->stream < event->stream)
+ continue;
+ if (cevent->stream > event->stream)
+ break;
+
+ if (MID_lt(cevent->mid, event->mid))
+ continue;
+ if (MID_lt(event->mid, cevent->mid))
+ break;
+
+ switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+ case SCTP_DATA_FIRST_FRAG:
+ if (cevent->mid == sin->mid) {
+ pd_first = pos;
+ pd_last = pos;
+ pd_len = pos->len;
+ }
+
+ first_frag = pos;
+ next_fsn = 0;
+ mid = cevent->mid;
+ break;
+
+ case SCTP_DATA_MIDDLE_FRAG:
+ if (first_frag && cevent->mid == mid &&
+ cevent->fsn == next_fsn) {
+ next_fsn++;
+ if (pd_first) {
+ pd_last = pos;
+ pd_len += pos->len;
+ }
+ } else {
+ first_frag = NULL;
+ }
+ break;
+
+ case SCTP_DATA_LAST_FRAG:
+ if (first_frag && cevent->mid == mid &&
+ cevent->fsn == next_fsn)
+ goto found;
+ else
+ first_frag = NULL;
+ break;
+ }
+ }
+
+ if (!pd_first)
+ goto out;
+
+ pd_point = sctp_sk(asoc->base.sk)->pd_point;
+ if (pd_point && pd_point <= pd_len) {
+ retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+ &ulpq->reasm,
+ pd_first, pd_last);
+ if (retval) {
+ sin->fsn = next_fsn;
+ sin->pd_mode = 1;
+ }
+ }
+ goto out;
+
+found:
+ retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+ &ulpq->reasm,
+ first_frag, pos);
+ if (retval)
+ retval->msg_flags |= MSG_EOR;
+
+out:
+ return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sctp_ulpevent *retval = NULL;
+ struct sctp_stream_in *sin;
+
+ if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) {
+ event->msg_flags |= MSG_EOR;
+ return event;
+ }
+
+ sctp_intl_store_reasm(ulpq, event);
+
+ sin = sctp_stream_in(ulpq->asoc, event->stream);
+ if (sin->pd_mode && event->mid == sin->mid &&
+ event->fsn == sin->fsn)
+ retval = sctp_intl_retrieve_partial(ulpq, event);
+
+ if (!retval)
+ retval = sctp_intl_retrieve_reassembled(ulpq, event);
+
+ return retval;
+}
+
+static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sctp_ulpevent *cevent;
+ struct sk_buff *pos;
+
+ pos = skb_peek_tail(&ulpq->lobby);
+ if (!pos) {
+ __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+ return;
+ }
+
+ cevent = (struct sctp_ulpevent *)pos->cb;
+ if (event->stream == cevent->stream &&
+ MID_lt(cevent->mid, event->mid)) {
+ __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+ return;
+ }
+
+ if (event->stream > cevent->stream) {
+ __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+ return;
+ }
+
+ skb_queue_walk(&ulpq->lobby, pos) {
+ cevent = (struct sctp_ulpevent *)pos->cb;
+
+ if (cevent->stream > event->stream)
+ break;
+
+ if (cevent->stream == event->stream &&
+ MID_lt(event->mid, cevent->mid))
+ break;
+ }
+
+ __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event));
+}
+
+static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sk_buff_head *event_list;
+ struct sctp_stream *stream;
+ struct sk_buff *pos, *tmp;
+ __u16 sid = event->stream;
+
+ stream = &ulpq->asoc->stream;
+ event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev;
+
+ sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
+ struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb;
+
+ if (cevent->stream > sid)
+ break;
+
+ if (cevent->stream < sid)
+ continue;
+
+ if (cevent->mid != sctp_mid_peek(stream, in, sid))
+ break;
+
+ sctp_mid_next(stream, in, sid);
+
+ __skb_unlink(pos, &ulpq->lobby);
+
+ __skb_queue_tail(event_list, pos);
+ }
+}
+
+static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sctp_stream *stream;
+ __u16 sid;
+
+ stream = &ulpq->asoc->stream;
+ sid = event->stream;
+
+ if (event->mid != sctp_mid_peek(stream, in, sid)) {
+ sctp_intl_store_ordered(ulpq, event);
+ return NULL;
+ }
+
+ sctp_mid_next(stream, in, sid);
+
+ sctp_intl_retrieve_ordered(ulpq, event);
+
+ return event;
+}
+
+static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sk_buff *skb = sctp_event2skb(event);
+ struct sock *sk = ulpq->asoc->base.sk;
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sk_buff_head *skb_list;
+
+ skb_list = (struct sk_buff_head *)skb->prev;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN &&
+ (sk->sk_shutdown & SEND_SHUTDOWN ||
+ !sctp_ulpevent_is_notification(event)))
+ goto out_free;
+
+ if (!sctp_ulpevent_is_notification(event)) {
+ sk_mark_napi_id(sk, skb);
+ sk_incoming_cpu_update(sk);
+ }
+
+ if (!sctp_ulpevent_is_enabled(event, &sp->subscribe))
+ goto out_free;
+
+ if (skb_list)
+ skb_queue_splice_tail_init(skb_list,
+ &sk->sk_receive_queue);
+ else
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+
+ if (!sp->data_ready_signalled) {
+ sp->data_ready_signalled = 1;
+ sk->sk_data_ready(sk);
+ }
+
+ return 1;
+
+out_free:
+ if (skb_list)
+ sctp_queue_purge_ulpevents(skb_list);
+ else
+ sctp_ulpevent_free(event);
+
+ return 0;
+}
+
+static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sctp_ulpevent *cevent;
+ struct sk_buff *pos;
+
+ pos = skb_peek_tail(&ulpq->reasm_uo);
+ if (!pos) {
+ __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
+ return;
+ }
+
+ cevent = sctp_skb2event(pos);
+
+ if (event->stream == cevent->stream &&
+ event->mid == cevent->mid &&
+ (cevent->msg_flags & SCTP_DATA_FIRST_FRAG ||
+ (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+ event->fsn > cevent->fsn))) {
+ __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
+ return;
+ }
+
+ if ((event->stream == cevent->stream &&
+ MID_lt(cevent->mid, event->mid)) ||
+ event->stream > cevent->stream) {
+ __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
+ return;
+ }
+
+ skb_queue_walk(&ulpq->reasm_uo, pos) {
+ cevent = sctp_skb2event(pos);
+
+ if (event->stream < cevent->stream ||
+ (event->stream == cevent->stream &&
+ MID_lt(event->mid, cevent->mid)))
+ break;
+
+ if (event->stream == cevent->stream &&
+ event->mid == cevent->mid &&
+ !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+ (event->msg_flags & SCTP_DATA_FIRST_FRAG ||
+ event->fsn < cevent->fsn))
+ break;
+ }
+
+ __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event));
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo(
+ struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sk_buff *first_frag = NULL;
+ struct sk_buff *last_frag = NULL;
+ struct sctp_ulpevent *retval;
+ struct sctp_stream_in *sin;
+ struct sk_buff *pos;
+ __u32 next_fsn = 0;
+ int is_last = 0;
+
+ sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+ skb_queue_walk(&ulpq->reasm_uo, pos) {
+ struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+ if (cevent->stream < event->stream)
+ continue;
+ if (cevent->stream > event->stream)
+ break;
+
+ if (MID_lt(cevent->mid, sin->mid_uo))
+ continue;
+ if (MID_lt(sin->mid_uo, cevent->mid))
+ break;
+
+ switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+ case SCTP_DATA_FIRST_FRAG:
+ goto out;
+ case SCTP_DATA_MIDDLE_FRAG:
+ if (!first_frag) {
+ if (cevent->fsn == sin->fsn_uo) {
+ first_frag = pos;
+ last_frag = pos;
+ next_fsn = cevent->fsn + 1;
+ }
+ } else if (cevent->fsn == next_fsn) {
+ last_frag = pos;
+ next_fsn++;
+ } else {
+ goto out;
+ }
+ break;
+ case SCTP_DATA_LAST_FRAG:
+ if (!first_frag) {
+ if (cevent->fsn == sin->fsn_uo) {
+ first_frag = pos;
+ last_frag = pos;
+ next_fsn = 0;
+ is_last = 1;
+ }
+ } else if (cevent->fsn == next_fsn) {
+ last_frag = pos;
+ next_fsn = 0;
+ is_last = 1;
+ }
+ goto out;
+ default:
+ goto out;
+ }
+ }
+
+out:
+ if (!first_frag)
+ return NULL;
+
+ retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ &ulpq->reasm_uo, first_frag,
+ last_frag);
+ if (retval) {
+ sin->fsn_uo = next_fsn;
+ if (is_last) {
+ retval->msg_flags |= MSG_EOR;
+ sin->pd_mode_uo = 0;
+ }
+ }
+
+ return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo(
+ struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sctp_association *asoc = ulpq->asoc;
+ struct sk_buff *pos, *first_frag = NULL;
+ struct sctp_ulpevent *retval = NULL;
+ struct sk_buff *pd_first = NULL;
+ struct sk_buff *pd_last = NULL;
+ struct sctp_stream_in *sin;
+ __u32 next_fsn = 0;
+ __u32 pd_point = 0;
+ __u32 pd_len = 0;
+ __u32 mid = 0;
+
+ sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+ skb_queue_walk(&ulpq->reasm_uo, pos) {
+ struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+ if (cevent->stream < event->stream)
+ continue;
+ if (cevent->stream > event->stream)
+ break;
+
+ if (MID_lt(cevent->mid, event->mid))
+ continue;
+ if (MID_lt(event->mid, cevent->mid))
+ break;
+
+ switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+ case SCTP_DATA_FIRST_FRAG:
+ if (!sin->pd_mode_uo) {
+ sin->mid_uo = cevent->mid;
+ pd_first = pos;
+ pd_last = pos;
+ pd_len = pos->len;
+ }
+
+ first_frag = pos;
+ next_fsn = 0;
+ mid = cevent->mid;
+ break;
+
+ case SCTP_DATA_MIDDLE_FRAG:
+ if (first_frag && cevent->mid == mid &&
+ cevent->fsn == next_fsn) {
+ next_fsn++;
+ if (pd_first) {
+ pd_last = pos;
+ pd_len += pos->len;
+ }
+ } else {
+ first_frag = NULL;
+ }
+ break;
+
+ case SCTP_DATA_LAST_FRAG:
+ if (first_frag && cevent->mid == mid &&
+ cevent->fsn == next_fsn)
+ goto found;
+ else
+ first_frag = NULL;
+ break;
+ }
+ }
+
+ if (!pd_first)
+ goto out;
+
+ pd_point = sctp_sk(asoc->base.sk)->pd_point;
+ if (pd_point && pd_point <= pd_len) {
+ retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+ &ulpq->reasm_uo,
+ pd_first, pd_last);
+ if (retval) {
+ sin->fsn_uo = next_fsn;
+ sin->pd_mode_uo = 1;
+ }
+ }
+ goto out;
+
+found:
+ retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+ &ulpq->reasm_uo,
+ first_frag, pos);
+ if (retval)
+ retval->msg_flags |= MSG_EOR;
+
+out:
+ return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sctp_ulpevent *retval = NULL;
+ struct sctp_stream_in *sin;
+
+ if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) {
+ event->msg_flags |= MSG_EOR;
+ return event;
+ }
+
+ sctp_intl_store_reasm_uo(ulpq, event);
+
+ sin = sctp_stream_in(ulpq->asoc, event->stream);
+ if (sin->pd_mode_uo && event->mid == sin->mid_uo &&
+ event->fsn == sin->fsn_uo)
+ retval = sctp_intl_retrieve_partial_uo(ulpq, event);
+
+ if (!retval)
+ retval = sctp_intl_retrieve_reassembled_uo(ulpq, event);
+
+ return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq)
+{
+ struct sctp_stream_in *csin, *sin = NULL;
+ struct sk_buff *first_frag = NULL;
+ struct sk_buff *last_frag = NULL;
+ struct sctp_ulpevent *retval;
+ struct sk_buff *pos;
+ __u32 next_fsn = 0;
+ __u16 sid = 0;
+
+ skb_queue_walk(&ulpq->reasm_uo, pos) {
+ struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+ csin = sctp_stream_in(ulpq->asoc, cevent->stream);
+ if (csin->pd_mode_uo)
+ continue;
+
+ switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+ case SCTP_DATA_FIRST_FRAG:
+ if (first_frag)
+ goto out;
+ first_frag = pos;
+ last_frag = pos;
+ next_fsn = 0;
+ sin = csin;
+ sid = cevent->stream;
+ sin->mid_uo = cevent->mid;
+ break;
+ case SCTP_DATA_MIDDLE_FRAG:
+ if (!first_frag)
+ break;
+ if (cevent->stream == sid &&
+ cevent->mid == sin->mid_uo &&
+ cevent->fsn == next_fsn) {
+ next_fsn++;
+ last_frag = pos;
+ } else {
+ goto out;
+ }
+ break;
+ case SCTP_DATA_LAST_FRAG:
+ if (first_frag)
+ goto out;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!first_frag)
+ return NULL;
+
+out:
+ retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ &ulpq->reasm_uo, first_frag,
+ last_frag);
+ if (retval) {
+ sin->fsn_uo = next_fsn;
+ sin->pd_mode_uo = 1;
+ }
+
+ return retval;
+}
+
+static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq,
+ struct sctp_chunk *chunk, gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sk_buff_head temp;
+ int event_eor = 0;
+
+ event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp);
+ if (!event)
+ return -ENOMEM;
+
+ event->mid = ntohl(chunk->subh.idata_hdr->mid);
+ if (event->msg_flags & SCTP_DATA_FIRST_FRAG)
+ event->ppid = chunk->subh.idata_hdr->ppid;
+ else
+ event->fsn = ntohl(chunk->subh.idata_hdr->fsn);
+
+ if (!(event->msg_flags & SCTP_DATA_UNORDERED)) {
+ event = sctp_intl_reasm(ulpq, event);
+ if (event && event->msg_flags & MSG_EOR) {
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+
+ event = sctp_intl_order(ulpq, event);
+ }
+ } else {
+ event = sctp_intl_reasm_uo(ulpq, event);
+ }
+
+ if (event) {
+ event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0;
+ sctp_enqueue_event(ulpq, event);
+ }
+
+ return event_eor;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq)
+{
+ struct sctp_stream_in *csin, *sin = NULL;
+ struct sk_buff *first_frag = NULL;
+ struct sk_buff *last_frag = NULL;
+ struct sctp_ulpevent *retval;
+ struct sk_buff *pos;
+ __u32 next_fsn = 0;
+ __u16 sid = 0;
+
+ skb_queue_walk(&ulpq->reasm, pos) {
+ struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+ csin = sctp_stream_in(ulpq->asoc, cevent->stream);
+ if (csin->pd_mode)
+ continue;
+
+ switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+ case SCTP_DATA_FIRST_FRAG:
+ if (first_frag)
+ goto out;
+ if (cevent->mid == csin->mid) {
+ first_frag = pos;
+ last_frag = pos;
+ next_fsn = 0;
+ sin = csin;
+ sid = cevent->stream;
+ }
+ break;
+ case SCTP_DATA_MIDDLE_FRAG:
+ if (!first_frag)
+ break;
+ if (cevent->stream == sid &&
+ cevent->mid == sin->mid &&
+ cevent->fsn == next_fsn) {
+ next_fsn++;
+ last_frag = pos;
+ } else {
+ goto out;
+ }
+ break;
+ case SCTP_DATA_LAST_FRAG:
+ if (first_frag)
+ goto out;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!first_frag)
+ return NULL;
+
+out:
+ retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ &ulpq->reasm, first_frag,
+ last_frag);
+ if (retval) {
+ sin->fsn = next_fsn;
+ sin->pd_mode = 1;
+ }
+
+ return retval;
+}
+
+static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+
+ if (!skb_queue_empty(&ulpq->reasm)) {
+ do {
+ event = sctp_intl_retrieve_first(ulpq);
+ if (event)
+ sctp_enqueue_event(ulpq, event);
+ } while (event);
+ }
+
+ if (!skb_queue_empty(&ulpq->reasm_uo)) {
+ do {
+ event = sctp_intl_retrieve_first_uo(ulpq);
+ if (event)
+ sctp_enqueue_event(ulpq, event);
+ } while (event);
+ }
+}
+
+static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
+ gfp_t gfp)
+{
+ struct sctp_association *asoc = ulpq->asoc;
+ __u32 freed = 0;
+ __u16 needed;
+
+ if (chunk) {
+ needed = ntohs(chunk->chunk_hdr->length);
+ needed -= sizeof(struct sctp_idata_chunk);
+ } else {
+ needed = SCTP_DEFAULT_MAXWINDOW;
+ }
+
+ if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) {
+ freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed);
+ if (freed < needed)
+ freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm,
+ needed);
+ if (freed < needed)
+ freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo,
+ needed);
+ }
+
+ if (chunk && freed >= needed)
+ if (sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0)
+ sctp_intl_start_pd(ulpq, gfp);
+
+ sk_mem_reclaim(asoc->base.sk);
+}
+
+static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid,
+ __u32 mid, __u16 flags, gfp_t gfp)
+{
+ struct sock *sk = ulpq->asoc->base.sk;
+ struct sctp_ulpevent *ev = NULL;
+
+ if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
+ &sctp_sk(sk)->subscribe))
+ return;
+
+ ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED,
+ sid, mid, flags, gfp);
+ if (ev) {
+ __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
+
+ if (!sctp_sk(sk)->data_ready_signalled) {
+ sctp_sk(sk)->data_ready_signalled = 1;
+ sk->sk_data_ready(sk);
+ }
+ }
+}
+
+static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
+{
+ struct sctp_stream *stream = &ulpq->asoc->stream;
+ struct sctp_ulpevent *cevent, *event = NULL;
+ struct sk_buff_head *lobby = &ulpq->lobby;
+ struct sk_buff *pos, *tmp;
+ struct sk_buff_head temp;
+ __u16 csid;
+ __u32 cmid;
+
+ skb_queue_head_init(&temp);
+ sctp_skb_for_each(pos, lobby, tmp) {
+ cevent = (struct sctp_ulpevent *)pos->cb;
+ csid = cevent->stream;
+ cmid = cevent->mid;
+
+ if (csid > sid)
+ break;
+
+ if (csid < sid)
+ continue;
+
+ if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid)))
+ break;
+
+ __skb_unlink(pos, lobby);
+ if (!event)
+ event = sctp_skb2event(pos);
+
+ __skb_queue_tail(&temp, pos);
+ }
+
+ if (!event && pos != (struct sk_buff *)lobby) {
+ cevent = (struct sctp_ulpevent *)pos->cb;
+ csid = cevent->stream;
+ cmid = cevent->mid;
+
+ if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) {
+ sctp_mid_next(stream, in, csid);
+ __skb_unlink(pos, lobby);
+ __skb_queue_tail(&temp, pos);
+ event = sctp_skb2event(pos);
+ }
+ }
+
+ if (event) {
+ sctp_intl_retrieve_ordered(ulpq, event);
+ sctp_enqueue_event(ulpq, event);
+ }
+}
+
+static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
+{
+ struct sctp_stream *stream = &ulpq->asoc->stream;
+ __u16 sid;
+
+ for (sid = 0; sid < stream->incnt; sid++) {
+ struct sctp_stream_in *sin = &stream->in[sid];
+ __u32 mid;
+
+ if (sin->pd_mode_uo) {
+ sin->pd_mode_uo = 0;
+
+ mid = sin->mid_uo;
+ sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp);
+ }
+
+ if (sin->pd_mode) {
+ sin->pd_mode = 0;
+
+ mid = sin->mid;
+ sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp);
+ sctp_mid_skip(stream, in, sid, mid);
+
+ sctp_intl_reap_ordered(ulpq, sid);
+ }
+ }
+
+ /* intl abort pd happens only when all data needs to be cleaned */
+ sctp_ulpq_flush(ulpq);
+}
+
+static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist,
+ int nskips, __be16 stream, __u8 flags)
+{
+ int i;
+
+ for (i = 0; i < nskips; i++)
+ if (skiplist[i].stream == stream &&
+ skiplist[i].flags == flags)
+ return i;
+
+ return i;
+}
+
+#define SCTP_FTSN_U_BIT 0x1
+static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn)
+{
+ struct sctp_ifwdtsn_skip ftsn_skip_arr[10];
+ struct sctp_association *asoc = q->asoc;
+ struct sctp_chunk *ftsn_chunk = NULL;
+ struct list_head *lchunk, *temp;
+ int nskips = 0, skip_pos;
+ struct sctp_chunk *chunk;
+ __u32 tsn;
+
+ if (!asoc->peer.prsctp_capable)
+ return;
+
+ if (TSN_lt(asoc->adv_peer_ack_point, ctsn))
+ asoc->adv_peer_ack_point = ctsn;
+
+ list_for_each_safe(lchunk, temp, &q->abandoned) {
+ chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list);
+ tsn = ntohl(chunk->subh.data_hdr->tsn);
+
+ if (TSN_lte(tsn, ctsn)) {
+ list_del_init(lchunk);
+ sctp_chunk_free(chunk);
+ } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) {
+ __be16 sid = chunk->subh.idata_hdr->stream;
+ __be32 mid = chunk->subh.idata_hdr->mid;
+ __u8 flags = 0;
+
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+ flags |= SCTP_FTSN_U_BIT;
+
+ asoc->adv_peer_ack_point = tsn;
+ skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips,
+ sid, flags);
+ ftsn_skip_arr[skip_pos].stream = sid;
+ ftsn_skip_arr[skip_pos].reserved = 0;
+ ftsn_skip_arr[skip_pos].flags = flags;
+ ftsn_skip_arr[skip_pos].mid = mid;
+ if (skip_pos == nskips)
+ nskips++;
+ if (nskips == 10)
+ break;
+ } else {
+ break;
+ }
+ }
+
+ if (asoc->adv_peer_ack_point > ctsn)
+ ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point,
+ nskips, &ftsn_skip_arr[0]);
+
+ if (ftsn_chunk) {
+ list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
+ SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS);
+ }
+}
+
+#define _sctp_walk_ifwdtsn(pos, chunk, end) \
+ for (pos = chunk->subh.ifwdtsn_hdr->skip; \
+ (void *)pos < (void *)chunk->subh.ifwdtsn_hdr->skip + (end); pos++)
+
+#define sctp_walk_ifwdtsn(pos, ch) \
+ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \
+ sizeof(struct sctp_ifwdtsn_chunk))
+
+static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk)
+{
+ struct sctp_fwdtsn_skip *skip;
+ __u16 incnt;
+
+ if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN)
+ return false;
+
+ incnt = chunk->asoc->stream.incnt;
+ sctp_walk_fwdtsn(skip, chunk)
+ if (ntohs(skip->stream) >= incnt)
+ return false;
+
+ return true;
+}
+
+static bool sctp_validate_iftsn(struct sctp_chunk *chunk)
+{
+ struct sctp_ifwdtsn_skip *skip;
+ __u16 incnt;
+
+ if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN)
+ return false;
+
+ incnt = chunk->asoc->stream.incnt;
+ sctp_walk_ifwdtsn(skip, chunk)
+ if (ntohs(skip->stream) >= incnt)
+ return false;
+
+ return true;
+}
+
+static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn)
+{
+ /* Move the Cumulattive TSN Ack ahead. */
+ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn);
+ /* purge the fragmentation queue */
+ sctp_ulpq_reasm_flushtsn(ulpq, ftsn);
+ /* Abort any in progress partial delivery. */
+ sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC);
+}
+
+static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn)
+{
+ struct sk_buff *pos, *tmp;
+
+ skb_queue_walk_safe(&ulpq->reasm, pos, tmp) {
+ struct sctp_ulpevent *event = sctp_skb2event(pos);
+ __u32 tsn = event->tsn;
+
+ if (TSN_lte(tsn, ftsn)) {
+ __skb_unlink(pos, &ulpq->reasm);
+ sctp_ulpevent_free(event);
+ }
+ }
+
+ skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) {
+ struct sctp_ulpevent *event = sctp_skb2event(pos);
+ __u32 tsn = event->tsn;
+
+ if (TSN_lte(tsn, ftsn)) {
+ __skb_unlink(pos, &ulpq->reasm_uo);
+ sctp_ulpevent_free(event);
+ }
+ }
+}
+
+static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn)
+{
+ /* Move the Cumulattive TSN Ack ahead. */
+ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn);
+ /* purge the fragmentation queue */
+ sctp_intl_reasm_flushtsn(ulpq, ftsn);
+ /* abort only when it's for all data */
+ if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map))
+ sctp_intl_abort_pd(ulpq, GFP_ATOMIC);
+}
+
+static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
+{
+ struct sctp_fwdtsn_skip *skip;
+
+ /* Walk through all the skipped SSNs */
+ sctp_walk_fwdtsn(skip, chunk)
+ sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn));
+}
+
+static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid,
+ __u8 flags)
+{
+ struct sctp_stream_in *sin = sctp_stream_in(ulpq->asoc, sid);
+ struct sctp_stream *stream = &ulpq->asoc->stream;
+
+ if (flags & SCTP_FTSN_U_BIT) {
+ if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) {
+ sin->pd_mode_uo = 0;
+ sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1,
+ GFP_ATOMIC);
+ }
+ return;
+ }
+
+ if (MID_lt(mid, sctp_mid_peek(stream, in, sid)))
+ return;
+
+ if (sin->pd_mode) {
+ sin->pd_mode = 0;
+ sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC);
+ }
+
+ sctp_mid_skip(stream, in, sid, mid);
+
+ sctp_intl_reap_ordered(ulpq, sid);
+}
+
+static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
+{
+ struct sctp_ifwdtsn_skip *skip;
+
+ /* Walk through all the skipped MIDs and abort stream pd if possible */
+ sctp_walk_ifwdtsn(skip, chunk)
+ sctp_intl_skip(ulpq, ntohs(skip->stream),
+ ntohl(skip->mid), skip->flags);
+}
+
+static struct sctp_stream_interleave sctp_stream_interleave_0 = {
+ .data_chunk_len = sizeof(struct sctp_data_chunk),
+ .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk),
+ /* DATA process functions */
+ .make_datafrag = sctp_make_datafrag_empty,
+ .assign_number = sctp_chunk_assign_ssn,
+ .validate_data = sctp_validate_data,
+ .ulpevent_data = sctp_ulpq_tail_data,
+ .enqueue_event = sctp_ulpq_tail_event,
+ .renege_events = sctp_ulpq_renege,
+ .start_pd = sctp_ulpq_partial_delivery,
+ .abort_pd = sctp_ulpq_abort_pd,
+ /* FORWARD-TSN process functions */
+ .generate_ftsn = sctp_generate_fwdtsn,
+ .validate_ftsn = sctp_validate_fwdtsn,
+ .report_ftsn = sctp_report_fwdtsn,
+ .handle_ftsn = sctp_handle_fwdtsn,
+};
+
+static struct sctp_stream_interleave sctp_stream_interleave_1 = {
+ .data_chunk_len = sizeof(struct sctp_idata_chunk),
+ .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk),
+ /* I-DATA process functions */
+ .make_datafrag = sctp_make_idatafrag_empty,
+ .assign_number = sctp_chunk_assign_mid,
+ .validate_data = sctp_validate_idata,
+ .ulpevent_data = sctp_ulpevent_idata,
+ .enqueue_event = sctp_enqueue_event,
+ .renege_events = sctp_renege_events,
+ .start_pd = sctp_intl_start_pd,
+ .abort_pd = sctp_intl_abort_pd,
+ /* I-FORWARD-TSN process functions */
+ .generate_ftsn = sctp_generate_iftsn,
+ .validate_ftsn = sctp_validate_iftsn,
+ .report_ftsn = sctp_report_iftsn,
+ .handle_ftsn = sctp_handle_iftsn,
+};
+
+void sctp_stream_interleave_init(struct sctp_stream *stream)
+{
+ struct sctp_association *asoc;
+
+ asoc = container_of(stream, struct sctp_association, stream);
+ stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
+ : &sctp_stream_interleave_0;
+}
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index d8c162a4089c..f5fcd425232a 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -242,7 +242,8 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
{
- if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) {
+ if (!list_is_last(&ch->frag_list, &ch->msg->chunks) &&
+ !q->asoc->intl_enable) {
struct sctp_stream_out *sout;
__u16 sid;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index ef7ca44d6e6a..33ca5b73cdb3 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -289,6 +289,13 @@ static struct ctl_table sctp_net_table[] = {
.proc_handler = proc_sctp_do_auth,
},
{
+ .procname = "intl_enable",
+ .data = &init_net.sctp.intl_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "addr_scope_policy",
.data = &init_net.sctp.scope_policy,
.maxlen = sizeof(int),
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 5447228bf1a0..84207ad33e8e 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -443,8 +443,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
goto fail;
/* Pull off the common chunk header and DATA header. */
- skb_pull(skb, sizeof(struct sctp_data_chunk));
- len -= sizeof(struct sctp_data_chunk);
+ skb_pull(skb, sctp_datachk_len(&asoc->stream));
+ len -= sctp_datachk_len(&asoc->stream);
/* Embed the event fields inside the cloned skb. */
event = sctp_skb2event(skb);
@@ -705,8 +705,6 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
sctp_ulpevent_receive_data(event, asoc);
event->stream = ntohs(chunk->subh.data_hdr->stream);
- event->ssn = ntohs(chunk->subh.data_hdr->ssn);
- event->ppid = chunk->subh.data_hdr->ppid;
if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
event->flags |= SCTP_UNORDERED;
event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
@@ -732,8 +730,9 @@ fail:
* various events.
*/
struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
- const struct sctp_association *asoc, __u32 indication,
- gfp_t gfp)
+ const struct sctp_association *asoc,
+ __u32 indication, __u32 sid, __u32 seq,
+ __u32 flags, gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_pdapi_event *pd;
@@ -754,7 +753,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
* Currently unused.
*/
pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT;
- pd->pdapi_flags = 0;
+ pd->pdapi_flags = flags;
+ pd->pdapi_stream = sid;
+ pd->pdapi_seq = seq;
/* pdapi_length: 32 bits (unsigned integer)
*
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index a71be33f3afe..97fae53310e0 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -60,6 +60,7 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq,
ulpq->asoc = asoc;
skb_queue_head_init(&ulpq->reasm);
+ skb_queue_head_init(&ulpq->reasm_uo);
skb_queue_head_init(&ulpq->lobby);
ulpq->pd_mode = 0;
@@ -83,6 +84,10 @@ void sctp_ulpq_flush(struct sctp_ulpq *ulpq)
sctp_ulpevent_free(event);
}
+ while ((skb = __skb_dequeue(&ulpq->reasm_uo)) != NULL) {
+ event = sctp_skb2event(skb);
+ sctp_ulpevent_free(event);
+ }
}
/* Dispose of a ulpqueue. */
@@ -104,6 +109,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
if (!event)
return -ENOMEM;
+ event->ssn = ntohs(chunk->subh.data_hdr->ssn);
+ event->ppid = chunk->subh.data_hdr->ppid;
+
/* Do reassembly if needed. */
event = sctp_ulpq_reasm(ulpq, event);
@@ -328,9 +336,10 @@ static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
* payload was fragmented on the way and ip had to reassemble them.
* We add the rest of skb's to the first skb's fraglist.
*/
-static struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net,
- struct sk_buff_head *queue, struct sk_buff *f_frag,
- struct sk_buff *l_frag)
+struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net,
+ struct sk_buff_head *queue,
+ struct sk_buff *f_frag,
+ struct sk_buff *l_frag)
{
struct sk_buff *pos;
struct sk_buff *new = NULL;
@@ -853,7 +862,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
struct sctp_stream *stream;
/* Check if this message needs ordering. */
- if (SCTP_DATA_UNORDERED & event->msg_flags)
+ if (event->msg_flags & SCTP_DATA_UNORDERED)
return event;
/* Note: The stream ID must be verified before this routine. */
@@ -974,8 +983,8 @@ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
sctp_ulpq_reap_ordered(ulpq, sid);
}
-static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq,
- struct sk_buff_head *list, __u16 needed)
+__u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list,
+ __u16 needed)
{
__u16 freed = 0;
__u32 tsn, last_tsn;
@@ -1140,7 +1149,7 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
&sctp_sk(sk)->subscribe))
ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
SCTP_PARTIAL_DELIVERY_ABORTED,
- gfp);
+ 0, 0, 0, gfp);
if (ev)
__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 6451c5013e06..daf8075f5a4c 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -520,7 +520,7 @@ decline_rdma:
smc->use_fallback = true;
if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
rc = smc_clc_send_decline(smc, reason_code);
- if (rc < sizeof(struct smc_clc_msg_decline))
+ if (rc < 0)
goto out_err;
}
goto out_connected;
@@ -751,14 +751,16 @@ static void smc_listen_work(struct work_struct *work)
{
struct smc_sock *new_smc = container_of(work, struct smc_sock,
smc_listen_work);
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
struct socket *newclcsock = new_smc->clcsock;
struct smc_sock *lsmc = new_smc->listen_smc;
struct smc_clc_msg_accept_confirm cclc;
int local_contact = SMC_REUSE_CONTACT;
struct sock *newsmcsk = &new_smc->sk;
- struct smc_clc_msg_proposal pclc;
+ struct smc_clc_msg_proposal *pclc;
struct smc_ib_device *smcibdev;
struct sockaddr_in peeraddr;
+ u8 buf[SMC_CLC_MAX_LEN];
struct smc_link *link;
int reason_code = 0;
int rc = 0, len;
@@ -775,7 +777,7 @@ static void smc_listen_work(struct work_struct *work)
/* do inband token exchange -
*wait for and receive SMC Proposal CLC message
*/
- reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
+ reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
SMC_CLC_PROPOSAL);
if (reason_code < 0)
goto out_err;
@@ -804,8 +806,11 @@ static void smc_listen_work(struct work_struct *work)
reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
goto decline_rdma;
}
- if ((pclc.outgoing_subnet != subnet) ||
- (pclc.prefix_len != prefix_len)) {
+
+ pclc = (struct smc_clc_msg_proposal *)&buf;
+ pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+ if (pclc_prfx->outgoing_subnet != subnet ||
+ pclc_prfx->prefix_len != prefix_len) {
reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
goto decline_rdma;
}
@@ -816,7 +821,7 @@ static void smc_listen_work(struct work_struct *work)
/* allocate connection / link group */
mutex_lock(&smc_create_lgr_pending);
local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
- smcibdev, ibport, &pclc.lcl, 0);
+ smcibdev, ibport, &pclc->lcl, 0);
if (local_contact < 0) {
rc = local_contact;
if (rc == -ENOMEM)
@@ -879,11 +884,9 @@ static void smc_listen_work(struct work_struct *work)
}
/* QP confirmation over RoCE fabric */
reason_code = smc_serv_conf_first_link(new_smc);
- if (reason_code < 0) {
+ if (reason_code < 0)
/* peer is not aware of a problem */
- rc = reason_code;
goto out_err_unlock;
- }
if (reason_code > 0)
goto decline_rdma_unlock;
}
@@ -916,8 +919,7 @@ decline_rdma:
smc_conn_free(&new_smc->conn);
new_smc->use_fallback = true;
if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
- rc = smc_clc_send_decline(new_smc, reason_code);
- if (rc < sizeof(struct smc_clc_msg_decline))
+ if (smc_clc_send_decline(new_smc, reason_code) < 0)
goto out_err;
}
goto out_connected;
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 87f7bede6eab..d4155ff6acde 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -213,6 +213,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
smp_mb__after_atomic();
smc->sk.sk_data_ready(&smc->sk);
+ } else if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+ (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) {
+ smc->sk.sk_data_ready(&smc->sk);
}
if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
@@ -234,15 +237,6 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
/* trigger socket release if connection closed */
smc_close_wake_tx_prepared(smc);
}
-
- /* socket connected but not accepted */
- if (!smc->sk.sk_socket)
- return;
-
- /* data available */
- if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
- (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
- smc_tx_consumer_update(conn);
}
/* called under tasklet context */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 1800e16b2a02..abf7ceb6690b 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -22,6 +22,54 @@
#include "smc_clc.h"
#include "smc_ib.h"
+/* check if received message has a correct header length and contains valid
+ * heading and trailing eyecatchers
+ */
+static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
+{
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct smc_clc_msg_accept_confirm *clc;
+ struct smc_clc_msg_proposal *pclc;
+ struct smc_clc_msg_decline *dclc;
+ struct smc_clc_msg_trail *trl;
+
+ if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+ return false;
+ switch (clcm->type) {
+ case SMC_CLC_PROPOSAL:
+ pclc = (struct smc_clc_msg_proposal *)clcm;
+ pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+ if (ntohs(pclc->hdr.length) !=
+ sizeof(*pclc) + ntohs(pclc->iparea_offset) +
+ sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(struct smc_clc_ipv6_prefix) +
+ sizeof(*trl))
+ return false;
+ trl = (struct smc_clc_msg_trail *)
+ ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl));
+ break;
+ case SMC_CLC_ACCEPT:
+ case SMC_CLC_CONFIRM:
+ clc = (struct smc_clc_msg_accept_confirm *)clcm;
+ if (ntohs(clc->hdr.length) != sizeof(*clc))
+ return false;
+ trl = &clc->trl;
+ break;
+ case SMC_CLC_DECLINE:
+ dclc = (struct smc_clc_msg_decline *)clcm;
+ if (ntohs(dclc->hdr.length) != sizeof(*dclc))
+ return false;
+ trl = &dclc->trl;
+ break;
+ default:
+ return false;
+ }
+ if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+ return false;
+ return true;
+}
+
/* Wait for data on the tcp-socket, analyze received data
* Returns:
* 0 if success and it was not a decline that we received.
@@ -72,9 +120,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
}
datlen = ntohs(clcm->length);
if ((len < sizeof(struct smc_clc_msg_hdr)) ||
- (datlen < sizeof(struct smc_clc_msg_decline)) ||
- (datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
- memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
+ (datlen > buflen) ||
((clcm->type != SMC_CLC_DECLINE) &&
(clcm->type != expected_type))) {
smc->sk.sk_err = EPROTO;
@@ -89,7 +135,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
krflags = MSG_WAITALL;
smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
- if (len < datlen) {
+ if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) {
smc->sk.sk_err = EPROTO;
reason_code = -EPROTO;
goto out;
@@ -133,7 +179,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
smc->sk.sk_err = EPROTO;
if (len < 0)
smc->sk.sk_err = -len;
- return len;
+ return sock_error(&smc->sk);
}
/* send CLC PROPOSAL message across internal TCP socket */
@@ -141,33 +187,43 @@ int smc_clc_send_proposal(struct smc_sock *smc,
struct smc_ib_device *smcibdev,
u8 ibport)
{
+ struct smc_clc_msg_proposal_prefix pclc_prfx;
struct smc_clc_msg_proposal pclc;
+ struct smc_clc_msg_trail trl;
int reason_code = 0;
+ struct kvec vec[3];
struct msghdr msg;
- struct kvec vec;
- int len, rc;
+ int len, plen, rc;
/* send SMC Proposal CLC message */
+ plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl);
memset(&pclc, 0, sizeof(pclc));
memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
pclc.hdr.type = SMC_CLC_PROPOSAL;
- pclc.hdr.length = htons(sizeof(pclc));
+ pclc.hdr.length = htons(plen);
pclc.hdr.version = SMC_CLC_V1; /* SMC version */
memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
+ pclc.iparea_offset = htons(0);
+ memset(&pclc_prfx, 0, sizeof(pclc_prfx));
/* determine subnet and mask from internal TCP socket */
- rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
- &pclc.prefix_len);
+ rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet,
+ &pclc_prfx.prefix_len);
if (rc)
return SMC_CLC_DECL_CNFERR; /* configuration error */
- memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ pclc_prfx.ipv6_prefixes_cnt = 0;
+ memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
memset(&msg, 0, sizeof(msg));
- vec.iov_base = &pclc;
- vec.iov_len = sizeof(pclc);
+ vec[0].iov_base = &pclc;
+ vec[0].iov_len = sizeof(pclc);
+ vec[1].iov_base = &pclc_prfx;
+ vec[1].iov_len = sizeof(pclc_prfx);
+ vec[2].iov_base = &trl;
+ vec[2].iov_len = sizeof(trl);
/* due to the few bytes needed for clc-handshake this cannot block */
- len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
+ len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen);
if (len < sizeof(pclc)) {
if (len >= 0) {
reason_code = -ENETUNREACH;
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 12a9af1539a2..c145a0f36a68 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -44,7 +44,7 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */
#if defined(__BIG_ENDIAN_BITFIELD)
u8 version : 4,
flag : 1,
- rsvd : 3;
+ rsvd : 3;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
u8 rsvd : 3,
flag : 1,
@@ -62,17 +62,31 @@ struct smc_clc_msg_local { /* header2 of clc messages */
u8 mac[6]; /* mac of ib_device port */
};
-struct smc_clc_msg_proposal { /* clc proposal message */
- struct smc_clc_msg_hdr hdr;
- struct smc_clc_msg_local lcl;
- __be16 iparea_offset; /* offset to IP address information area */
+struct smc_clc_ipv6_prefix {
+ u8 prefix[4];
+ u8 prefix_len;
+} __packed;
+
+struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
__be32 outgoing_subnet; /* subnet mask */
u8 prefix_len; /* number of significant bits in mask */
u8 reserved[2];
u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
- struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
} __aligned(4);
+struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
+ struct smc_clc_msg_hdr hdr;
+ struct smc_clc_msg_local lcl;
+ __be16 iparea_offset; /* offset to IP address information area */
+} __aligned(4);
+
+#define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28
+#define SMC_CLC_PROPOSAL_MAX_PREFIX (8 * sizeof(struct smc_clc_ipv6_prefix))
+#define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \
+ SMC_CLC_PROPOSAL_MAX_OFFSET + \
+ SMC_CLC_PROPOSAL_MAX_PREFIX + \
+ sizeof(struct smc_clc_msg_trail))
+
struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
struct smc_clc_msg_hdr hdr;
struct smc_clc_msg_local lcl;
@@ -102,6 +116,14 @@ struct smc_clc_msg_decline { /* clc decline message */
struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
} __aligned(4);
+/* determine start of the prefix area within the proposal message */
+static inline struct smc_clc_msg_proposal_prefix *
+smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
+{
+ return (struct smc_clc_msg_proposal_prefix *)
+ ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
+}
+
struct smc_sock;
struct smc_ib_device;
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 48615d2ac4aa..e194c6cc308a 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -113,7 +113,7 @@ static int smc_close_abort(struct smc_connection *conn)
/* terminate smc socket abnormally - active abort
* RDMA communication no longer possible
*/
-void smc_close_active_abort(struct smc_sock *smc)
+static void smc_close_active_abort(struct smc_sock *smc)
{
struct smc_cdc_conn_state_flags *txflags =
&smc->conn.local_tx_ctrl.conn_state_flags;
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
index ed82506b1b0a..8c498885d758 100644
--- a/net/smc/smc_close.h
+++ b/net/smc/smc_close.h
@@ -20,7 +20,6 @@
#define SMC_CLOSE_SOCK_PUT_DELAY HZ
void smc_close_wake_tx_prepared(struct smc_sock *smc);
-void smc_close_active_abort(struct smc_sock *smc);
int smc_close_active(struct smc_sock *smc);
void smc_close_sock_put_work(struct work_struct *work);
int smc_close_shutdown_write(struct smc_sock *smc);
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index cbf58637ee14..9dc392ca06bf 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -65,7 +65,6 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
rc = sk_wait_event(sk, timeo,
sk->sk_err ||
sk->sk_shutdown & RCV_SHUTDOWN ||
- sock_flag(sk, SOCK_DONE) ||
atomic_read(&conn->bytes_to_rcv) ||
smc_cdc_rxed_any_close_or_senddone(conn),
&wait);
@@ -116,7 +115,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
if (read_done) {
if (sk->sk_err ||
sk->sk_state == SMC_CLOSED ||
- (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ sk->sk_shutdown & RCV_SHUTDOWN ||
!timeo ||
signal_pending(current) ||
smc_cdc_rxed_any_close_or_senddone(conn) ||
@@ -124,8 +123,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
peer_conn_abort)
break;
} else {
- if (sock_flag(sk, SOCK_DONE))
- break;
if (sk->sk_err) {
read_done = sock_error(sk);
break;
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index c48dc2d5fd3a..2e50fddf8ce9 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -104,14 +104,12 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
if (atomic_read(&conn->sndbuf_space))
break; /* at least 1 byte of free space available */
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- sk->sk_write_pending++;
sk_wait_event(sk, &timeo,
sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
smc_cdc_rxed_any_close_or_senddone(conn) ||
atomic_read(&conn->sndbuf_space),
&wait);
- sk->sk_write_pending--;
}
remove_wait_queue(sk_sleep(sk), &wait);
return rc;
@@ -450,9 +448,7 @@ static void smc_tx_work(struct work_struct *work)
void smc_tx_consumer_update(struct smc_connection *conn)
{
union smc_host_cursor cfed, cons;
- struct smc_cdc_tx_pend *pend;
- struct smc_wr_buf *wr_buf;
- int to_confirm, rc;
+ int to_confirm;
smc_curs_write(&cons,
smc_curs_read(&conn->local_tx_ctrl.cons, conn),
@@ -466,10 +462,7 @@ void smc_tx_consumer_update(struct smc_connection *conn)
((to_confirm > conn->rmbe_update_limit) &&
((to_confirm > (conn->rmbe_size / 2)) ||
conn->local_rx_ctrl.prod_flags.write_blocked))) {
- rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
- if (!rc)
- rc = smc_cdc_msg_send(conn, wr_buf, pend);
- if (rc < 0) {
+ if (smc_cdc_get_slot_and_msg_send(conn) < 0) {
schedule_delayed_work(&conn->tx_work,
SMC_TX_WORK_DELAY);
return;
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 329325bd553e..37892b3909af 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -1,7 +1,7 @@
/*
* net/tipc/bcast.c: TIPC broadcast code
*
- * Copyright (c) 2004-2006, 2014-2016, Ericsson AB
+ * Copyright (c) 2004-2006, 2014-2017, Ericsson AB
* Copyright (c) 2004, Intel Corporation.
* Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
@@ -42,8 +42,8 @@
#include "link.h"
#include "name_table.h"
-#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */
-#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */
+#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */
+#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */
const char tipc_bclink_name[] = "broadcast-link";
@@ -74,6 +74,10 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net)
return tipc_net(net)->bcbase;
}
+/* tipc_bcast_get_mtu(): -get the MTU currently used by broadcast link
+ * Note: the MTU is decremented to give room for a tunnel header, in
+ * case the message needs to be sent as replicast
+ */
int tipc_bcast_get_mtu(struct net *net)
{
return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
@@ -515,7 +519,7 @@ int tipc_bcast_init(struct net *net)
spin_lock_init(&tipc_net(net)->bclock);
if (!tipc_link_bc_create(net, 0, 0,
- U16_MAX,
+ FB_MTU,
BCLINK_WIN_DEFAULT,
0,
&bb->inputq,
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 6bce0b1117bd..2d6b2aed30e0 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -483,7 +483,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
/**
* tipc_link_bc_create - create new link to be used for broadcast
* @n: pointer to associated node
- * @mtu: mtu to be used
+ * @mtu: mtu to be used initially if no peers
* @window: send window to be used
* @inputq: queue to put messages ready for delivery
* @namedq: queue to put binding table update messages ready for delivery
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index b0d07b35909d..55d8ba92291d 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -251,20 +251,23 @@ bool tipc_msg_validate(struct sk_buff **_skb)
* @pktmax: Max packet size that can be used
* @list: Buffer or chain of buffers to be returned to caller
*
+ * Note that the recursive call we are making here is safe, since it can
+ * logically go only one further level down.
+ *
* Returns message data size or errno: -ENOMEM, -EFAULT
*/
-int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
- int offset, int dsz, int pktmax, struct sk_buff_head *list)
+int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
+ int dsz, int pktmax, struct sk_buff_head *list)
{
int mhsz = msg_hdr_sz(mhdr);
+ struct tipc_msg pkthdr;
int msz = mhsz + dsz;
- int pktno = 1;
- int pktsz;
int pktrem = pktmax;
- int drem = dsz;
- struct tipc_msg pkthdr;
struct sk_buff *skb;
+ int drem = dsz;
+ int pktno = 1;
char *pktpos;
+ int pktsz;
int rc;
msg_set_size(mhdr, msz);
@@ -272,8 +275,18 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
/* No fragmentation needed? */
if (likely(msz <= pktmax)) {
skb = tipc_buf_acquire(msz, GFP_KERNEL);
- if (unlikely(!skb))
+
+ /* Fall back to smaller MTU if node local message */
+ if (unlikely(!skb)) {
+ if (pktmax != MAX_MSG_SIZE)
+ return -ENOMEM;
+ rc = tipc_msg_build(mhdr, m, offset, dsz, FB_MTU, list);
+ if (rc != dsz)
+ return rc;
+ if (tipc_msg_assemble(list))
+ return dsz;
return -ENOMEM;
+ }
skb_orphan(skb);
__skb_queue_tail(list, skb);
skb_copy_to_linear_data(skb, mhdr, mhsz);
@@ -589,6 +602,30 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
return true;
}
+/* tipc_msg_assemble() - assemble chain of fragments into one message
+ */
+bool tipc_msg_assemble(struct sk_buff_head *list)
+{
+ struct sk_buff *skb, *tmp = NULL;
+
+ if (skb_queue_len(list) == 1)
+ return true;
+
+ while ((skb = __skb_dequeue(list))) {
+ skb->next = NULL;
+ if (tipc_buf_append(&tmp, &skb)) {
+ __skb_queue_tail(list, skb);
+ return true;
+ }
+ if (!tmp)
+ break;
+ }
+ __skb_queue_purge(list);
+ __skb_queue_head_init(list);
+ pr_warn("Failed do assemble buffer\n");
+ return false;
+}
+
/* tipc_msg_reassemble() - clone a buffer chain of fragments and
* reassemble the clones into one message
*/
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 3e4384c222f7..b4ba1b4f9ae7 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -98,7 +98,7 @@ struct plist;
#define MAX_H_SIZE 60 /* Largest possible TIPC header size */
#define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE)
-
+#define FB_MTU 3744
#define TIPC_MEDIA_INFO_OFFSET 5
struct tipc_skb_cb {
@@ -943,6 +943,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos);
int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
int offset, int dsz, int mtu, struct sk_buff_head *list);
bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
+bool tipc_msg_assemble(struct sk_buff_head *list);
bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
struct sk_buff_head *cpy);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 41127d0b925e..0cdf5c2ad881 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2640,9 +2640,7 @@ void tipc_sk_reinit(struct net *net)
rhashtable_walk_enter(&tn->sk_rht, &iter);
do {
- tsk = ERR_PTR(rhashtable_walk_start(&iter));
- if (IS_ERR(tsk))
- goto walk_stop;
+ rhashtable_walk_start(&iter);
while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
spin_lock_bh(&tsk->sk.sk_lock.slock);
@@ -2651,7 +2649,7 @@ void tipc_sk_reinit(struct net *net)
msg_set_orignode(msg, tn->own_addr);
spin_unlock_bh(&tsk->sk.sk_lock.slock);
}
-walk_stop:
+
rhashtable_walk_stop(&iter);
} while (tsk == ERR_PTR(-EAGAIN));
}
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 30e5746085b8..00641b611aed 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -67,7 +67,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
/* We don't yet support UDP encapsulation, TFC padding and ESN. */
if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN))
- return 0;
+ return -EINVAL;
dev = dev_get_by_index(net, xuo->ifindex);
if (!dev) {
@@ -120,8 +120,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
if (!x->type_offload || x->encap)
return false;
- if ((x->xso.offload_handle && (dev == dst->path->dev)) &&
- !dst->child->xfrm && x->type->get_mtu) {
+ if ((x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev)) &&
+ !xdst->child->xfrm && x->type->get_mtu) {
mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
if (skb->len <= mtu)
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 347ab31574d5..ac277b97e0d7 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -231,7 +231,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
if (xo && (xo->flags & CRYPTO_DONE)) {
crypto_done = true;
- x = xfrm_input_state(skb);
family = XFRM_SPI_SKB_CB(skb)->family;
if (!(xo->status & CRYPTO_SUCCESS)) {
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 73ad8c8ef344..23468672a767 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -44,7 +44,7 @@ static int xfrm_skb_check_space(struct sk_buff *skb)
static struct dst_entry *skb_dst_pop(struct sk_buff *skb)
{
- struct dst_entry *child = dst_clone(skb_dst(skb)->child);
+ struct dst_entry *child = dst_clone(xfrm_dst_child(skb_dst(skb)));
skb_dst_drop(skb);
return child;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 9542975eb2f9..e3a5aca9cdda 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -54,7 +54,7 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
static struct kmem_cache *xfrm_dst_cache __read_mostly;
static __read_mostly seqcount_t xfrm_policy_hash_generation;
-static void xfrm_init_pmtu(struct dst_entry *dst);
+static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
static void xfrm_policy_queue_process(struct timer_list *t);
@@ -1251,7 +1251,7 @@ EXPORT_SYMBOL(xfrm_policy_delete);
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
- struct net *net = xp_net(pol);
+ struct net *net = sock_net(sk);
struct xfrm_policy *old_pol;
#ifdef CONFIG_XFRM_SUB_POLICY
@@ -1538,7 +1538,9 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
*/
static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
- struct xfrm_state **xfrm, int nx,
+ struct xfrm_state **xfrm,
+ struct xfrm_dst **bundle,
+ int nx,
const struct flowi *fl,
struct dst_entry *dst)
{
@@ -1546,8 +1548,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
unsigned long now = jiffies;
struct net_device *dev;
struct xfrm_mode *inner_mode;
- struct dst_entry *dst_prev = NULL;
- struct dst_entry *dst0 = NULL;
+ struct xfrm_dst *xdst_prev = NULL;
+ struct xfrm_dst *xdst0 = NULL;
int i = 0;
int err;
int header_len = 0;
@@ -1573,13 +1575,14 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
goto put_states;
}
- if (!dst_prev)
- dst0 = dst1;
+ bundle[i] = xdst;
+ if (!xdst_prev)
+ xdst0 = xdst;
else
/* Ref count is taken during xfrm_alloc_dst()
* No need to do dst_clone() on dst1
*/
- dst_prev->child = dst1;
+ xfrm_dst_set_child(xdst_prev, &xdst->u.dst);
if (xfrm[i]->sel.family == AF_UNSPEC) {
inner_mode = xfrm_ip2inner_mode(xfrm[i],
@@ -1616,8 +1619,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
dst1->input = dst_discard;
dst1->output = inner_mode->afinfo->output;
- dst1->next = dst_prev;
- dst_prev = dst1;
+ xdst_prev = xdst;
header_len += xfrm[i]->props.header_len;
if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
@@ -1625,40 +1627,39 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
trailer_len += xfrm[i]->props.trailer_len;
}
- dst_prev->child = dst;
- dst0->path = dst;
+ xfrm_dst_set_child(xdst_prev, dst);
+ xdst0->path = dst;
err = -ENODEV;
dev = dst->dev;
if (!dev)
goto free_dst;
- xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);
- xfrm_init_pmtu(dst_prev);
-
- for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) {
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev;
+ xfrm_init_path(xdst0, dst, nfheader_len);
+ xfrm_init_pmtu(bundle, nx);
- err = xfrm_fill_dst(xdst, dev, fl);
+ for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
+ xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
+ err = xfrm_fill_dst(xdst_prev, dev, fl);
if (err)
goto free_dst;
- dst_prev->header_len = header_len;
- dst_prev->trailer_len = trailer_len;
- header_len -= xdst->u.dst.xfrm->props.header_len;
- trailer_len -= xdst->u.dst.xfrm->props.trailer_len;
+ xdst_prev->u.dst.header_len = header_len;
+ xdst_prev->u.dst.trailer_len = trailer_len;
+ header_len -= xdst_prev->u.dst.xfrm->props.header_len;
+ trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len;
}
out:
- return dst0;
+ return &xdst0->u.dst;
put_states:
for (; i < nx; i++)
xfrm_state_put(xfrm[i]);
free_dst:
- if (dst0)
- dst_release_immediate(dst0);
- dst0 = ERR_PTR(err);
+ if (xdst0)
+ dst_release_immediate(&xdst0->u.dst);
+ xdst0 = ERR_PTR(err);
goto out;
}
@@ -1800,7 +1801,7 @@ static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst,
for (i = 0; i < num; i++) {
if (!dst || dst->xfrm != xfrm[i])
return false;
- dst = dst->child;
+ dst = xfrm_dst_child(dst);
}
return xfrm_bundle_ok(xdst);
@@ -1813,6 +1814,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
{
struct net *net = xp_net(pols[0]);
struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+ struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
struct xfrm_dst *xdst, *old;
struct dst_entry *dst;
int err;
@@ -1840,7 +1842,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
old = xdst;
- dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
+ dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
if (IS_ERR(dst)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
return ERR_CAST(dst);
@@ -1880,8 +1882,8 @@ static void xfrm_policy_queue_process(struct timer_list *t)
xfrm_decode_session(skb, &fl, dst->ops->family);
spin_unlock(&pq->hold_queue.lock);
- dst_hold(dst->path);
- dst = xfrm_lookup(net, dst->path, &fl, sk, 0);
+ dst_hold(xfrm_dst_path(dst));
+ dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, 0);
if (IS_ERR(dst))
goto purge_queue;
@@ -1910,8 +1912,8 @@ static void xfrm_policy_queue_process(struct timer_list *t)
skb = __skb_dequeue(&list);
xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
- dst_hold(skb_dst(skb)->path);
- dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0);
+ dst_hold(xfrm_dst_path(skb_dst(skb)));
+ dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
if (IS_ERR(dst)) {
kfree_skb(skb);
continue;
@@ -2012,8 +2014,8 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
dst1->output = xdst_queue_output;
dst_hold(dst);
- dst1->child = dst;
- dst1->path = dst;
+ xfrm_dst_set_child(xdst, dst);
+ xdst->path = dst;
xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);
@@ -2576,7 +2578,7 @@ static int stale_bundle(struct dst_entry *dst)
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
- while ((dst = dst->child) && dst->xfrm && dst->dev == dev) {
+ while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) {
dst->dev = dev_net(dev)->loopback_dev;
dev_hold(dst->dev);
dev_put(dev);
@@ -2600,13 +2602,15 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
return dst;
}
-static void xfrm_init_pmtu(struct dst_entry *dst)
+static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)
{
- do {
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ while (nr--) {
+ struct xfrm_dst *xdst = bundle[nr];
u32 pmtu, route_mtu_cached;
+ struct dst_entry *dst;
- pmtu = dst_mtu(dst->child);
+ dst = &xdst->u.dst;
+ pmtu = dst_mtu(xfrm_dst_child(dst));
xdst->child_mtu_cached = pmtu;
pmtu = xfrm_state_mtu(dst->xfrm, pmtu);
@@ -2618,7 +2622,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
pmtu = route_mtu_cached;
dst_metric_set(dst, RTAX_MTU, pmtu);
- } while ((dst = dst->next));
+ }
}
/* Check that the bundle accepts the flow and its components are
@@ -2627,19 +2631,20 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
static int xfrm_bundle_ok(struct xfrm_dst *first)
{
+ struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
struct dst_entry *dst = &first->u.dst;
- struct xfrm_dst *last;
+ struct xfrm_dst *xdst;
+ int start_from, nr;
u32 mtu;
- if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
+ if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) ||
(dst->dev && !netif_running(dst->dev)))
return 0;
if (dst->flags & DST_XFRM_QUEUE)
return 1;
- last = NULL;
-
+ start_from = nr = 0;
do {
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
@@ -2651,9 +2656,11 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
return 0;
- mtu = dst_mtu(dst->child);
+ bundle[nr++] = xdst;
+
+ mtu = dst_mtu(xfrm_dst_child(dst));
if (xdst->child_mtu_cached != mtu) {
- last = xdst;
+ start_from = nr;
xdst->child_mtu_cached = mtu;
}
@@ -2661,30 +2668,30 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
return 0;
mtu = dst_mtu(xdst->route);
if (xdst->route_mtu_cached != mtu) {
- last = xdst;
+ start_from = nr;
xdst->route_mtu_cached = mtu;
}
- dst = dst->child;
+ dst = xfrm_dst_child(dst);
} while (dst->xfrm);
- if (likely(!last))
+ if (likely(!start_from))
return 1;
- mtu = last->child_mtu_cached;
- for (;;) {
- dst = &last->u.dst;
+ xdst = bundle[start_from - 1];
+ mtu = xdst->child_mtu_cached;
+ while (start_from--) {
+ dst = &xdst->u.dst;
mtu = xfrm_state_mtu(dst->xfrm, mtu);
- if (mtu > last->route_mtu_cached)
- mtu = last->route_mtu_cached;
+ if (mtu > xdst->route_mtu_cached)
+ mtu = xdst->route_mtu_cached;
dst_metric_set(dst, RTAX_MTU, mtu);
-
- if (last == first)
+ if (!start_from)
break;
- last = (struct xfrm_dst *)last->u.dst.next;
- last->child_mtu_cached = mtu;
+ xdst = bundle[start_from - 1];
+ xdst->child_mtu_cached = mtu;
}
return 1;
@@ -2692,22 +2699,20 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
{
- return dst_metric_advmss(dst->path);
+ return dst_metric_advmss(xfrm_dst_path(dst));
}
static unsigned int xfrm_mtu(const struct dst_entry *dst)
{
unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
- return mtu ? : dst_mtu(dst->path);
+ return mtu ? : dst_mtu(xfrm_dst_path(dst));
}
static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
const void *daddr)
{
- const struct dst_entry *path = dst->path;
-
- for (; dst != path; dst = dst->child) {
+ while (dst->xfrm) {
const struct xfrm_state *xfrm = dst->xfrm;
if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
@@ -2716,6 +2721,8 @@ static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
daddr = xfrm->coaddr;
else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
daddr = &xfrm->id.daddr;
+
+ dst = xfrm_dst_child(dst);
}
return daddr;
}
@@ -2724,7 +2731,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr)
{
- const struct dst_entry *path = dst->path;
+ const struct dst_entry *path = xfrm_dst_path(dst);
if (!skb)
daddr = xfrm_get_dst_nexthop(dst, daddr);
@@ -2733,7 +2740,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
- const struct dst_entry *path = dst->path;
+ const struct dst_entry *path = xfrm_dst_path(dst);
daddr = xfrm_get_dst_nexthop(dst, daddr);
path->ops->confirm_neigh(path, daddr);
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 8b23c5bcf8e8..02501817227b 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -666,7 +666,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff
if (unlikely(oseq < replay_esn->oseq)) {
XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi;
xo->seq.hi = oseq_hi;
-
+ replay_esn->oseq_hi = oseq_hi;
if (replay_esn->oseq_hi == 0) {
replay_esn->oseq--;
replay_esn->oseq_hi--;
@@ -678,7 +678,6 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff
}
replay_esn->oseq = oseq;
- replay_esn->oseq_hi = oseq_hi;
if (xfrm_aevent_is_on(net))
x->repl->notify(x, XFRM_REPLAY_UPDATE);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 065d89606888..1b7856be3eeb 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2048,6 +2048,13 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen
struct xfrm_mgr *km;
struct xfrm_policy *pol = NULL;
+ if (!optval && !optlen) {
+ xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL);
+ xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL);
+ __sk_dst_reset(sk);
+ return 0;
+ }
+
if (optlen <= 0 || optlen > PAGE_SIZE)
return -EMSGSIZE;