diff options
Diffstat (limited to 'net')
133 files changed, 4455 insertions, 1663 deletions
diff --git a/net/atm/common.c b/net/atm/common.c index 8a4f99114cd2..5763fd241dc3 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -14,7 +14,7 @@ #include <linux/capability.h> #include <linux/mm.h> #include <linux/sched/signal.h> -#include <linux/time.h> /* struct timeval */ +#include <linux/time64.h> /* 64-bit time for seconds */ #include <linux/skbuff.h> #include <linux/bitops.h> #include <linux/init.h> diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 7c6a1cc760a2..31e0dcb970f8 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -1089,7 +1089,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc) msg->type = SND_MPOA_RES_RQST; msg->content.in_info = entry->ctrl_info; msg_to_mpoad(msg, mpc); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); mpc->in_ops->put(entry); return; } @@ -1099,7 +1099,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc) msg->type = SND_MPOA_RES_RQST; msg->content.in_info = entry->ctrl_info; msg_to_mpoad(msg, mpc); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); mpc->in_ops->put(entry); return; } @@ -1175,8 +1175,9 @@ static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc) } entry->ctrl_info = msg->content.in_info; - do_gettimeofday(&(entry->tv)); - do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */ + entry->time = ktime_get_seconds(); + /* Used in refreshing func from now on */ + entry->reply_wait = ktime_get_seconds(); entry->refresh_time = 0; ddprintk_cont("entry->shortcut = %p\n", entry->shortcut); diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c index e01450bb32d6..4bb418313720 100644 --- a/net/atm/mpoa_caches.c +++ b/net/atm/mpoa_caches.c @@ -117,7 +117,7 @@ static in_cache_entry *in_cache_add_entry(__be32 dst_ip, memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); entry->ctrl_info.in_dst_ip = dst_ip; - do_gettimeofday(&(entry->tv)); + entry->time = ktime_get_seconds(); entry->retry_time = client->parameters.mpc_p4; entry->count = 1; entry->entry_state = INGRESS_INVALID; @@ -148,7 +148,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc) if (qos != NULL) msg.qos = qos->qos; msg_to_mpoad(&msg, mpc); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); entry->entry_state = INGRESS_RESOLVING; } if (entry->shortcut != NULL) @@ -171,7 +171,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc) if (qos != NULL) msg.qos = qos->qos; msg_to_mpoad(&msg, mpc); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); } return CLOSED; @@ -227,17 +227,16 @@ static void in_cache_remove_entry(in_cache_entry *entry, static void clear_count_and_expired(struct mpoa_client *client) { in_cache_entry *entry, *next_entry; - struct timeval now; + time64_t now; - do_gettimeofday(&now); + now = ktime_get_seconds(); write_lock_bh(&client->ingress_lock); entry = client->in_cache; while (entry != NULL) { entry->count = 0; next_entry = entry->next; - if ((now.tv_sec - entry->tv.tv_sec) - > entry->ctrl_info.holding_time) { + if ((now - entry->time) > entry->ctrl_info.holding_time) { dprintk("holding time expired, ip = %pI4\n", &entry->ctrl_info.in_dst_ip); client->in_ops->remove_entry(entry, client); @@ -253,35 +252,35 @@ static void check_resolving_entries(struct mpoa_client *client) struct atm_mpoa_qos *qos; in_cache_entry *entry; - struct timeval now; + time64_t now; struct k_message msg; - do_gettimeofday(&now); + now = ktime_get_seconds(); read_lock_bh(&client->ingress_lock); entry = client->in_cache; while (entry != NULL) { if (entry->entry_state == INGRESS_RESOLVING) { - if ((now.tv_sec - entry->hold_down.tv_sec) < - client->parameters.mpc_p6) { + + if ((now - entry->hold_down) + < client->parameters.mpc_p6) { entry = entry->next; /* Entry in hold down */ continue; } - if ((now.tv_sec - entry->reply_wait.tv_sec) > - entry->retry_time) { + if ((now - entry->reply_wait) > entry->retry_time) { entry->retry_time = MPC_C1 * (entry->retry_time); /* * Retry time maximum exceeded, * put entry in hold down. */ if (entry->retry_time > client->parameters.mpc_p5) { - do_gettimeofday(&(entry->hold_down)); + entry->hold_down = ktime_get_seconds(); entry->retry_time = client->parameters.mpc_p4; entry = entry->next; continue; } /* Ask daemon to send a resolution request. */ - memset(&(entry->hold_down), 0, sizeof(struct timeval)); + memset(&entry->hold_down, 0, sizeof(time64_t)); msg.type = SND_MPOA_RES_RTRY; memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN); msg.content.in_info = entry->ctrl_info; @@ -289,7 +288,7 @@ static void check_resolving_entries(struct mpoa_client *client) if (qos != NULL) msg.qos = qos->qos; msg_to_mpoad(&msg, client); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); } } entry = entry->next; @@ -300,18 +299,18 @@ static void check_resolving_entries(struct mpoa_client *client) /* Call this every MPC-p5 seconds. */ static void refresh_entries(struct mpoa_client *client) { - struct timeval now; + time64_t now; struct in_cache_entry *entry = client->in_cache; ddprintk("refresh_entries\n"); - do_gettimeofday(&now); + now = ktime_get_seconds(); read_lock_bh(&client->ingress_lock); while (entry != NULL) { if (entry->entry_state == INGRESS_RESOLVED) { if (!(entry->refresh_time)) entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3; - if ((now.tv_sec - entry->reply_wait.tv_sec) > + if ((now - entry->reply_wait) > entry->refresh_time) { dprintk("refreshing an entry.\n"); entry->entry_state = INGRESS_REFRESHING; @@ -480,7 +479,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); entry->ctrl_info = msg->content.eg_info; - do_gettimeofday(&(entry->tv)); + entry->time = ktime_get_seconds(); entry->entry_state = EGRESS_RESOLVED; dprintk("new_eg_cache_entry cache_id %u\n", ntohl(entry->ctrl_info.cache_id)); @@ -495,7 +494,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time) { - do_gettimeofday(&(entry->tv)); + entry->time = ktime_get_seconds(); entry->entry_state = EGRESS_RESOLVED; entry->ctrl_info.holding_time = holding_time; } @@ -503,17 +502,16 @@ static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time) static void clear_expired(struct mpoa_client *client) { eg_cache_entry *entry, *next_entry; - struct timeval now; + time64_t now; struct k_message msg; - do_gettimeofday(&now); + now = ktime_get_seconds(); write_lock_irq(&client->egress_lock); entry = client->eg_cache; while (entry != NULL) { next_entry = entry->next; - if ((now.tv_sec - entry->tv.tv_sec) - > entry->ctrl_info.holding_time) { + if ((now - entry->time) > entry->ctrl_info.holding_time) { msg.type = SND_EGRESS_PURGE; msg.content.eg_info = entry->ctrl_info; dprintk("egress_cache: holding time expired, cache_id = %u.\n", diff --git a/net/atm/mpoa_caches.h b/net/atm/mpoa_caches.h index 6a266669ebf4..464c4c7f8d1f 100644 --- a/net/atm/mpoa_caches.h +++ b/net/atm/mpoa_caches.h @@ -2,6 +2,7 @@ #ifndef MPOA_CACHES_H #define MPOA_CACHES_H +#include <linux/time64.h> #include <linux/netdevice.h> #include <linux/types.h> #include <linux/atm.h> @@ -16,9 +17,9 @@ void atm_mpoa_init_cache(struct mpoa_client *mpc); typedef struct in_cache_entry { struct in_cache_entry *next; struct in_cache_entry *prev; - struct timeval tv; - struct timeval reply_wait; - struct timeval hold_down; + time64_t time; + time64_t reply_wait; + time64_t hold_down; uint32_t packets_fwded; uint16_t entry_state; uint32_t retry_time; @@ -53,7 +54,7 @@ struct in_cache_ops{ typedef struct eg_cache_entry{ struct eg_cache_entry *next; struct eg_cache_entry *prev; - struct timeval tv; + time64_t time; uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN]; struct atm_vcc *shortcut; uint32_t packets_rcvd; diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index 8a0c17e1c203..2212da9c2da2 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -8,7 +8,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/proc_fs.h> -#include <linux/time.h> +#include <linux/ktime.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/atmmpc.h> @@ -138,7 +138,7 @@ static int mpc_show(struct seq_file *m, void *v) int i; in_cache_entry *in_entry; eg_cache_entry *eg_entry; - struct timeval now; + time64_t now; unsigned char ip_string[16]; if (v == SEQ_START_TOKEN) { @@ -148,15 +148,17 @@ static int mpc_show(struct seq_file *m, void *v) seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num); seq_printf(m, "Ingress Entries:\nIP address State Holding time Packets fwded VPI VCI\n"); - do_gettimeofday(&now); + now = ktime_get_seconds(); for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) { + unsigned long seconds_delta = now - in_entry->time; + sprintf(ip_string, "%pI4", &in_entry->ctrl_info.in_dst_ip); seq_printf(m, "%-16s%s%-14lu%-12u", ip_string, ingress_state_string(in_entry->entry_state), in_entry->ctrl_info.holding_time - - (now.tv_sec-in_entry->tv.tv_sec), + seconds_delta, in_entry->packets_fwded); if (in_entry->shortcut) seq_printf(m, " %-3d %-3d", @@ -169,13 +171,14 @@ static int mpc_show(struct seq_file *m, void *v) seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id State Holding time Packets recvd Latest IP addr VPI VCI\n"); for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) { unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr; + unsigned long seconds_delta = now - eg_entry->time; + for (i = 0; i < ATM_ESA_LEN; i++) seq_printf(m, "%02x", p[i]); seq_printf(m, "\n%-16lu%s%-14lu%-15u", (unsigned long)ntohl(eg_entry->ctrl_info.cache_id), egress_state_string(eg_entry->entry_state), - (eg_entry->ctrl_info.holding_time - - (now.tv_sec-eg_entry->tv.tv_sec)), + (eg_entry->ctrl_info.holding_time - seconds_delta), eg_entry->packets_rcvd); /* latest IP address */ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index af5b8c87f590..1285ca30ab0a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -125,9 +125,16 @@ static int br_dev_init(struct net_device *dev) if (!br->stats) return -ENOMEM; + err = br_fdb_hash_init(br); + if (err) { + free_percpu(br->stats); + return err; + } + err = br_vlan_init(br); if (err) { free_percpu(br->stats); + br_fdb_hash_fini(br); return err; } @@ -135,6 +142,7 @@ static int br_dev_init(struct net_device *dev) if (err) { free_percpu(br->stats); br_vlan_flush(br); + br_fdb_hash_fini(br); } br_set_lockdep_class(dev); @@ -148,6 +156,7 @@ static void br_dev_uninit(struct net_device *dev) br_multicast_dev_del(br); br_multicast_uninit_stats(br); br_vlan_flush(br); + br_fdb_hash_fini(br); free_percpu(br->stats); } @@ -416,6 +425,7 @@ void br_dev_setup(struct net_device *dev) br->dev = dev; spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); + INIT_HLIST_HEAD(&br->fdb_list); spin_lock_init(&br->hash_lock); br->bridge_id.prio[0] = 0x80; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4ea5c8bbe286..dc87fbc9a23b 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -28,14 +28,20 @@ #include <trace/events/bridge.h> #include "br_private.h" +static const struct rhashtable_params br_fdb_rht_params = { + .head_offset = offsetof(struct net_bridge_fdb_entry, rhnode), + .key_offset = offsetof(struct net_bridge_fdb_entry, key), + .key_len = sizeof(struct net_bridge_fdb_key), + .automatic_shrinking = true, + .locks_mul = 1, +}; + static struct kmem_cache *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *, int); -static u32 fdb_salt __read_mostly; - int __init br_fdb_init(void) { br_fdb_cache = kmem_cache_create("bridge_fdb_cache", @@ -45,7 +51,6 @@ int __init br_fdb_init(void) if (!br_fdb_cache) return -ENOMEM; - get_random_bytes(&fdb_salt, sizeof(fdb_salt)); return 0; } @@ -54,6 +59,15 @@ void br_fdb_fini(void) kmem_cache_destroy(br_fdb_cache); } +int br_fdb_hash_init(struct net_bridge *br) +{ + return rhashtable_init(&br->fdb_hash_tbl, &br_fdb_rht_params); +} + +void br_fdb_hash_fini(struct net_bridge *br) +{ + rhashtable_destroy(&br->fdb_hash_tbl); +} /* if topology_changing then use forward_delay (default 15 sec) * otherwise keep longer (default 5 minutes) @@ -70,13 +84,6 @@ static inline int has_expired(const struct net_bridge *br, time_before_eq(fdb->updated + hold_time(br), jiffies); } -static inline int br_mac_hash(const unsigned char *mac, __u16 vid) -{ - /* use 1 byte of OUI and 3 bytes of NIC */ - u32 key = get_unaligned((u32 *)(mac + 2)); - return jhash_2words(key, vid, fdb_salt) & (BR_HASH_SIZE - 1); -} - static void fdb_rcu_free(struct rcu_head *head) { struct net_bridge_fdb_entry *ent @@ -84,19 +91,18 @@ static void fdb_rcu_free(struct rcu_head *head) kmem_cache_free(br_fdb_cache, ent); } -static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head, +static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl, const unsigned char *addr, __u16 vid) { - struct net_bridge_fdb_entry *f; + struct net_bridge_fdb_key key; WARN_ON_ONCE(!rcu_read_lock_held()); - hlist_for_each_entry_rcu(f, head, hlist) - if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid) - break; + key.vlan_id = vid; + memcpy(key.addr.addr, addr, sizeof(key.addr.addr)); - return f; + return rhashtable_lookup(tbl, &key, br_fdb_rht_params); } /* requires bridge hash_lock */ @@ -104,13 +110,12 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br, const unsigned char *addr, __u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; lockdep_assert_held_once(&br->hash_lock); rcu_read_lock(); - fdb = fdb_find_rcu(head, addr, vid); + fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); rcu_read_unlock(); return fdb; @@ -120,9 +125,7 @@ struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, const unsigned char *addr, __u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; - - return fdb_find_rcu(head, addr, vid); + return fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); } /* When a static FDB entry is added, the mac address from the entry is @@ -175,9 +178,11 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) trace_fdb_delete(br, f); if (f->is_static) - fdb_del_hw_addr(br, f->addr.addr); + fdb_del_hw_addr(br, f->key.addr.addr); - hlist_del_init_rcu(&f->hlist); + hlist_del_init_rcu(&f->fdb_node); + rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode, + br_fdb_rht_params); fdb_notify(br, f, RTM_DELNEIGH); call_rcu(&f->rcu, fdb_rcu_free); } @@ -187,11 +192,11 @@ static void fdb_delete_local(struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_fdb_entry *f) { - const unsigned char *addr = f->addr.addr; + const unsigned char *addr = f->key.addr.addr; struct net_bridge_vlan_group *vg; const struct net_bridge_vlan *v; struct net_bridge_port *op; - u16 vid = f->vlan_id; + u16 vid = f->key.vlan_id; /* Maybe another port has same hw addr? */ list_for_each_entry(op, &br->port_list, list) { @@ -233,31 +238,23 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge_vlan_group *vg; + struct net_bridge_fdb_entry *f; struct net_bridge *br = p->br; struct net_bridge_vlan *v; - int i; spin_lock_bh(&br->hash_lock); - vg = nbp_vlan_group(p); - /* Search all chains since old address/hash is unknown */ - for (i = 0; i < BR_HASH_SIZE; i++) { - struct hlist_node *h; - hlist_for_each(h, &br->hash[i]) { - struct net_bridge_fdb_entry *f; - - f = hlist_entry(h, struct net_bridge_fdb_entry, hlist); - if (f->dst == p && f->is_local && !f->added_by_user) { - /* delete old one */ - fdb_delete_local(br, p, f); - - /* if this port has no vlan information - * configured, we can safely be done at - * this point. - */ - if (!vg || !vg->num_vlans) - goto insert; - } + hlist_for_each_entry(f, &br->fdb_list, fdb_node) { + if (f->dst == p && f->is_local && !f->added_by_user) { + /* delete old one */ + fdb_delete_local(br, p, f); + + /* if this port has no vlan information + * configured, we can safely be done at + * this point. + */ + if (!vg || !vg->num_vlans) + goto insert; } } @@ -316,35 +313,32 @@ void br_fdb_cleanup(struct work_struct *work) { struct net_bridge *br = container_of(work, struct net_bridge, gc_work.work); + struct net_bridge_fdb_entry *f = NULL; unsigned long delay = hold_time(br); unsigned long work_delay = delay; unsigned long now = jiffies; - int i; - for (i = 0; i < BR_HASH_SIZE; i++) { - struct net_bridge_fdb_entry *f; - struct hlist_node *n; + /* this part is tricky, in order to avoid blocking learning and + * consequently forwarding, we rely on rcu to delete objects with + * delayed freeing allowing us to continue traversing + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + unsigned long this_timer; - if (!br->hash[i].first) + if (f->is_static || f->added_by_external_learn) continue; - - spin_lock_bh(&br->hash_lock); - hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { - unsigned long this_timer; - - if (f->is_static) - continue; - if (f->added_by_external_learn) - continue; - this_timer = f->updated + delay; - if (time_after(this_timer, now)) - work_delay = min(work_delay, this_timer - now); - else + this_timer = f->updated + delay; + if (time_after(this_timer, now)) { + work_delay = min(work_delay, this_timer - now); + } else { + spin_lock_bh(&br->hash_lock); + if (!hlist_unhashed(&f->fdb_node)) fdb_delete(br, f); + spin_unlock_bh(&br->hash_lock); } - spin_unlock_bh(&br->hash_lock); - cond_resched(); } + rcu_read_unlock(); /* Cleanup minimum 10 milliseconds apart */ work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10)); @@ -354,16 +348,13 @@ void br_fdb_cleanup(struct work_struct *work) /* Completely flush all dynamic entries in forwarding database.*/ void br_fdb_flush(struct net_bridge *br) { - int i; + struct net_bridge_fdb_entry *f; + struct hlist_node *tmp; spin_lock_bh(&br->hash_lock); - for (i = 0; i < BR_HASH_SIZE; i++) { - struct net_bridge_fdb_entry *f; - struct hlist_node *n; - hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { - if (!f->is_static) - fdb_delete(br, f); - } + hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { + if (!f->is_static) + fdb_delete(br, f); } spin_unlock_bh(&br->hash_lock); } @@ -377,27 +368,22 @@ void br_fdb_delete_by_port(struct net_bridge *br, u16 vid, int do_all) { - int i; + struct net_bridge_fdb_entry *f; + struct hlist_node *tmp; spin_lock_bh(&br->hash_lock); - for (i = 0; i < BR_HASH_SIZE; i++) { - struct hlist_node *h, *g; + hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { + if (f->dst != p) + continue; - hlist_for_each_safe(h, g, &br->hash[i]) { - struct net_bridge_fdb_entry *f - = hlist_entry(h, struct net_bridge_fdb_entry, hlist); - if (f->dst != p) + if (!do_all) + if (f->is_static || (vid && f->key.vlan_id != vid)) continue; - if (!do_all) - if (f->is_static || (vid && f->vlan_id != vid)) - continue; - - if (f->is_local) - fdb_delete_local(br, p, f); - else - fdb_delete(br, f); - } + if (f->is_local) + fdb_delete_local(br, p, f); + else + fdb_delete(br, f); } spin_unlock_bh(&br->hash_lock); } @@ -433,52 +419,48 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long maxnum, unsigned long skip) { - struct __fdb_entry *fe = buf; - int i, num = 0; struct net_bridge_fdb_entry *f; + struct __fdb_entry *fe = buf; + int num = 0; memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); rcu_read_lock(); - for (i = 0; i < BR_HASH_SIZE; i++) { - hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { - if (num >= maxnum) - goto out; + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + if (num >= maxnum) + break; - if (has_expired(br, f)) - continue; + if (has_expired(br, f)) + continue; - /* ignore pseudo entry for local MAC address */ - if (!f->dst) - continue; + /* ignore pseudo entry for local MAC address */ + if (!f->dst) + continue; - if (skip) { - --skip; - continue; - } + if (skip) { + --skip; + continue; + } - /* convert from internal format to API */ - memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN); + /* convert from internal format to API */ + memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN); - /* due to ABI compat need to split into hi/lo */ - fe->port_no = f->dst->port_no; - fe->port_hi = f->dst->port_no >> 8; + /* due to ABI compat need to split into hi/lo */ + fe->port_no = f->dst->port_no; + fe->port_hi = f->dst->port_no >> 8; - fe->is_local = f->is_local; - if (!f->is_static) - fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); - ++fe; - ++num; - } + fe->is_local = f->is_local; + if (!f->is_static) + fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); + ++fe; + ++num; } - - out: rcu_read_unlock(); return num; } -static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, +static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, __u16 vid, @@ -489,16 +471,23 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); if (fdb) { - memcpy(fdb->addr.addr, addr, ETH_ALEN); + memcpy(fdb->key.addr.addr, addr, ETH_ALEN); fdb->dst = source; - fdb->vlan_id = vid; + fdb->key.vlan_id = vid; fdb->is_local = is_local; fdb->is_static = is_static; fdb->added_by_user = 0; fdb->added_by_external_learn = 0; fdb->offloaded = 0; fdb->updated = fdb->used = jiffies; - hlist_add_head_rcu(&fdb->hlist, head); + if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, + &fdb->rhnode, + br_fdb_rht_params)) { + kmem_cache_free(br_fdb_cache, fdb); + fdb = NULL; + } else { + hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list); + } } return fdb; } @@ -506,7 +495,6 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; if (!is_valid_ether_addr(addr)) @@ -524,7 +512,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, fdb_delete(br, fdb); } - fdb = fdb_create(head, source, addr, vid, 1, 1); + fdb = fdb_create(br, source, addr, vid, 1, 1); if (!fdb) return -ENOMEM; @@ -548,7 +536,6 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, bool added_by_user) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; bool fdb_modified = false; @@ -561,7 +548,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->state == BR_STATE_FORWARDING)) return; - fdb = fdb_find_rcu(head, addr, vid); + fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ if (unlikely(fdb->is_local)) { @@ -590,14 +577,13 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } } else { spin_lock(&br->hash_lock); - if (likely(!fdb_find_rcu(head, addr, vid))) { - fdb = fdb_create(head, source, addr, vid, 0, 0); - if (fdb) { - if (unlikely(added_by_user)) - fdb->added_by_user = 1; - trace_br_fdb_update(br, source, addr, vid, added_by_user); - fdb_notify(br, fdb, RTM_NEWNEIGH); - } + fdb = fdb_create(br, source, addr, vid, 0, 0); + if (fdb) { + if (unlikely(added_by_user)) + fdb->added_by_user = 1; + trace_br_fdb_update(br, source, addr, vid, + added_by_user); + fdb_notify(br, fdb, RTM_NEWNEIGH); } /* else we lose race and someone else inserts * it first, don't bother updating @@ -646,7 +632,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, if (fdb->added_by_external_learn) ndm->ndm_flags |= NTF_EXT_LEARNED; - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr)) + if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) goto nla_put_failure; @@ -657,7 +643,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; - if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id)) + if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), + &fdb->key.vlan_id)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -711,54 +698,48 @@ int br_fdb_dump(struct sk_buff *skb, int *idx) { struct net_bridge *br = netdev_priv(dev); + struct net_bridge_fdb_entry *f; int err = 0; - int i; if (!(dev->priv_flags & IFF_EBRIDGE)) - goto out; + return err; if (!filter_dev) { err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); if (err < 0) - goto out; + return err; } - for (i = 0; i < BR_HASH_SIZE; i++) { - struct net_bridge_fdb_entry *f; - - hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { - - if (*idx < cb->args[2]) + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + if (*idx < cb->args[2]) + goto skip; + if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { + if (filter_dev != dev) goto skip; - - if (filter_dev && - (!f->dst || f->dst->dev != filter_dev)) { - if (filter_dev != dev) - goto skip; - /* !f->dst is a special case for bridge - * It means the MAC belongs to the bridge - * Therefore need a little more filtering - * we only want to dump the !f->dst case - */ - if (f->dst) - goto skip; - } - if (!filter_dev && f->dst) + /* !f->dst is a special case for bridge + * It means the MAC belongs to the bridge + * Therefore need a little more filtering + * we only want to dump the !f->dst case + */ + if (f->dst) goto skip; - - err = fdb_fill_info(skb, br, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI); - if (err < 0) - goto out; -skip: - *idx += 1; } + if (!filter_dev && f->dst) + goto skip; + + err = fdb_fill_info(skb, br, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI); + if (err < 0) + break; +skip: + *idx += 1; } + rcu_read_unlock(); -out: return err; } @@ -766,7 +747,6 @@ out: static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const __u8 *addr, __u16 state, __u16 flags, __u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -787,7 +767,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (!(flags & NLM_F_CREATE)) return -ENOENT; - fdb = fdb_create(head, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, 0, 0); if (!fdb) return -ENOMEM; @@ -1012,65 +992,60 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) { - struct net_bridge_fdb_entry *fdb, *tmp; - int i; + struct net_bridge_fdb_entry *f, *tmp; int err; ASSERT_RTNL(); - for (i = 0; i < BR_HASH_SIZE; i++) { - hlist_for_each_entry(fdb, &br->hash[i], hlist) { - /* We only care for static entries */ - if (!fdb->is_static) - continue; - - err = dev_uc_add(p->dev, fdb->addr.addr); - if (err) - goto rollback; - } + /* the key here is that static entries change only under rtnl */ + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + /* We only care for static entries */ + if (!f->is_static) + continue; + err = dev_uc_add(p->dev, f->key.addr.addr); + if (err) + goto rollback; } - return 0; +done: + rcu_read_unlock(); -rollback: - for (i = 0; i < BR_HASH_SIZE; i++) { - hlist_for_each_entry(tmp, &br->hash[i], hlist) { - /* If we reached the fdb that failed, we can stop */ - if (tmp == fdb) - break; - - /* We only care for static entries */ - if (!tmp->is_static) - continue; + return err; - dev_uc_del(p->dev, tmp->addr.addr); - } +rollback: + hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) { + /* We only care for static entries */ + if (!tmp->is_static) + continue; + if (tmp == f) + break; + dev_uc_del(p->dev, tmp->key.addr.addr); } - return err; + + goto done; } void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) { - struct net_bridge_fdb_entry *fdb; - int i; + struct net_bridge_fdb_entry *f; ASSERT_RTNL(); - for (i = 0; i < BR_HASH_SIZE; i++) { - hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) { - /* We only care for static entries */ - if (!fdb->is_static) - continue; + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + /* We only care for static entries */ + if (!f->is_static) + continue; - dev_uc_del(p->dev, fdb->addr.addr); - } + dev_uc_del(p->dev, f->key.addr.addr); } + rcu_read_unlock(); } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid) { struct net_bridge_fdb_entry *fdb; - struct hlist_head *head; bool modified = false; int err = 0; @@ -1078,10 +1053,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); - head = &br->hash[br_mac_hash(addr, vid)]; fdb = br_fdb_find(br, addr, vid); if (!fdb) { - fdb = fdb_create(head, p, addr, vid, 0, 0); + fdb = fdb_create(br, p, addr, vid, 0, 0); if (!fdb) { err = -ENOMEM; goto err_unlock; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index b0f4c734900b..6d9f48bd374a 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -760,9 +760,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void br_mdb_init(void) { - rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0); - rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0); } void br_mdb_uninit(void) diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index 20cbb727df4d..8e2d7cfa4e16 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -78,7 +78,6 @@ void br_netfilter_rtable_init(struct net_bridge *br) atomic_set(&rt->dst.__refcnt, 1); rt->dst.dev = br->dev; - rt->dst.path = &rt->dst; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1312b8d20ec3..80559fd11b7e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -168,12 +168,17 @@ struct net_bridge_vlan_group { u16 pvid; }; +struct net_bridge_fdb_key { + mac_addr addr; + u16 vlan_id; +}; + struct net_bridge_fdb_entry { - struct hlist_node hlist; + struct rhash_head rhnode; struct net_bridge_port *dst; - mac_addr addr; - __u16 vlan_id; + struct net_bridge_fdb_key key; + struct hlist_node fdb_node; unsigned char is_local:1, is_static:1, added_by_user:1, @@ -315,7 +320,7 @@ struct net_bridge { struct net_bridge_vlan_group __rcu *vlgrp; #endif - struct hlist_head hash[BR_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) union { struct rtable fake_rtable; @@ -405,6 +410,7 @@ struct net_bridge { int offload_fwd_mark; #endif bool neigh_suppress_enabled; + struct hlist_head fdb_list; }; struct br_input_skb_cb { @@ -515,6 +521,8 @@ static inline void br_netpoll_disable(struct net_bridge_port *p) /* br_fdb.c */ int br_fdb_init(void); void br_fdb_fini(void); +int br_fdb_hash_init(struct net_bridge *br); +void br_fdb_hash_fini(struct net_bridge *br); void br_fdb_flush(struct net_bridge *br); void br_fdb_find_delete_local(struct net_bridge *br, const struct net_bridge_port *p, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 9700e0f3307b..ee775f4ff76c 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -121,13 +121,13 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) switch (type) { case RTM_DELNEIGH: - br_switchdev_fdb_call_notifiers(false, fdb->addr.addr, - fdb->vlan_id, + br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, + fdb->key.vlan_id, fdb->dst->dev); break; case RTM_NEWNEIGH: - br_switchdev_fdb_call_notifiers(true, fdb->addr.addr, - fdb->vlan_id, + br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, + fdb->key.vlan_id, fdb->dst->dev); break; } diff --git a/net/can/gw.c b/net/can/gw.c index 73a02af4b5d7..398dd0395ad9 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1014,6 +1014,8 @@ static struct pernet_operations cangw_pernet_ops = { static __init int cgw_module_init(void) { + int ret; + /* sanitize given module parameter */ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); @@ -1031,15 +1033,19 @@ static __init int cgw_module_init(void) notifier.notifier_call = cgw_notifier; register_netdevice_notifier(¬ifier); - if (__rtnl_register(PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, 0)) { + ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE, + NULL, cgw_dump_jobs, 0); + if (ret) { unregister_netdevice_notifier(¬ifier); kmem_cache_destroy(cgw_cache); return -ENOBUFS; } - /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, 0); - __rtnl_register(PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, 0); + /* Only the first call to rtnl_register_module can fail */ + rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE, + cgw_create_job, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE, + cgw_remove_job, NULL, 0); return 0; } diff --git a/net/core/dev.c b/net/core/dev.c index f47e96b62308..b0eee49a2489 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2803,7 +2803,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); - if (unlikely(skb_needs_check(skb, tx_path))) + if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; @@ -3162,6 +3162,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, int rc; qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + } else { + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + __qdisc_run(q); + } + + if (unlikely(to_free)) + kfree_skb_list(to_free); + return rc; + } + /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3192,9 +3207,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); - } else - qdisc_run_end(q); + } + qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; @@ -3204,6 +3219,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); + qdisc_run_end(q); } } spin_unlock(root_lock); @@ -4143,19 +4159,22 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) while (head) { struct Qdisc *q = head; - spinlock_t *root_lock; + spinlock_t *root_lock = NULL; head = head->next_sched; - root_lock = qdisc_lock(q); - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); } } } @@ -7073,17 +7092,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) +void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + struct netdev_bpf *xdp) { - struct netdev_bpf xdp; - - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; + memset(xdp, 0, sizeof(*xdp)); + xdp->command = XDP_QUERY_PROG; /* Query must always succeed. */ - WARN_ON(bpf_op(dev, &xdp) < 0); - if (prog_id) - *prog_id = xdp.prog_id; + WARN_ON(bpf_op(dev, xdp) < 0); +} + +static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) +{ + struct netdev_bpf xdp; + + __dev_xdp_query(dev, bpf_op, &xdp); return xdp.prog_attached; } @@ -7106,6 +7129,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, return bpf_op(dev, &xdp); } +static void dev_xdp_uninstall(struct net_device *dev) +{ + struct netdev_bpf xdp; + bpf_op_t ndo_bpf; + + /* Remove generic XDP */ + WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + + /* Remove from the driver */ + ndo_bpf = dev->netdev_ops->ndo_bpf; + if (!ndo_bpf) + return; + + __dev_xdp_query(dev, ndo_bpf, &xdp); + if (xdp.prog_attached == XDP_ATTACHED_NONE) + return; + + /* Program removal should always succeed */ + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -7134,10 +7178,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op, NULL)) + __dev_xdp_attached(dev, bpf_op)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, @@ -7236,6 +7280,7 @@ static void rollback_registered_many(struct list_head *head) /* Shutdown queueing discipline. */ dev_shutdown(dev); + dev_xdp_uninstall(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. @@ -8195,7 +8240,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs); void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; - struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); @@ -8214,12 +8258,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; - prog = rcu_dereference_protected(dev->xdp_prog, 1); - if (prog) { - bpf_prog_put(prog); - static_key_slow_dec(&generic_xdp_needed); - } - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); diff --git a/net/core/dst.c b/net/core/dst.c index 662a2d4a3d19..007aa0b08291 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/prefetch.h> #include <net/lwtunnel.h> +#include <net/xfrm.h> #include <net/dst.h> #include <net/dst_metadata.h> @@ -62,15 +63,12 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { - dst->child = NULL; dst->dev = dev; if (dev) dev_hold(dev); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; - dst->path = dst; - dst->from = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -88,7 +86,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; - dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } @@ -116,12 +113,17 @@ EXPORT_SYMBOL(dst_alloc); struct dst_entry *dst_destroy(struct dst_entry * dst) { - struct dst_entry *child; + struct dst_entry *child = NULL; smp_rmb(); - child = dst->child; +#ifdef CONFIG_XFRM + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + child = xdst->child; + } +#endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); diff --git a/net/core/filter.c b/net/core/filter.c index 6a85e67fafce..754abe1041b7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3013,6 +3013,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; if (flags & BPF_F_DONT_FRAGMENT) info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -3026,8 +3028,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; } return 0; @@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct bpf_sock_ops, is_fullsock): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + break; + +/* Helper macro for adding read access to tcp_sock fields. */ +#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ + offsetof(struct tcp_sock, FIELD_NAME)); \ + } while (0) + + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP32(snd_cwnd); + break; + + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP32(srtt_us); + break; } return insn - insn_buf; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 15ce30063765..cc75488d3653 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -133,10 +133,10 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, ctrl->addr_type = type; } -static void -__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - void *target_container) +void +skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; @@ -212,6 +212,7 @@ __skb_flow_dissect_tunnel_info(const struct sk_buff *skb, tp->dst = key->tp_dst; } } +EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, @@ -576,9 +577,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - __skb_flow_dissect_tunnel_info(skb, flow_dissector, - target_container); - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 87f28557b329..b2b2323bdc84 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) { if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); @@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, qstats->qlen = qlen; } +EXPORT_SYMBOL(__gnet_stats_copy_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f95a15086225..b9ce241cd28c 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -399,7 +399,7 @@ struct pktgen_dev { __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ __u32 spi; - struct dst_entry dst; + struct xfrm_dst xdst; struct dst_ops dstops; #endif char result[512]; @@ -2609,7 +2609,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) * supports both transport/tunnel mode + ESP/AH type. */ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) - skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; + skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF; rcu_read_lock_bh(); err = x->outer_mode->output(x, skb); @@ -3742,10 +3742,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) * performance under such circumstance. */ pkt_dev->dstops.family = AF_INET; - pkt_dev->dst.dev = pkt_dev->odev; - dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); - pkt_dev->dst.child = &pkt_dev->dst; - pkt_dev->dst.ops = &pkt_dev->dstops; + pkt_dev->xdst.u.dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false); + pkt_dev->xdst.child = &pkt_dev->xdst.u.dst; + pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops; #endif return add_dev_to_thread(t, pkt_dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dabba2a91fc8..c688dc564b11 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -62,7 +62,9 @@ struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; + struct module *owner; unsigned int flags; + struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); @@ -127,8 +129,7 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; -static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -144,72 +145,127 @@ static inline int rtm_msgindex(int msgtype) return msgindex; } -/** - * __rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) +static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) +{ + struct rtnl_link **tab; + + if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) + protocol = PF_UNSPEC; + + tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); + if (!tab) + tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); + + return tab[msgtype]; +} + +static int rtnl_register_internal(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) { - struct rtnl_link *tab; + struct rtnl_link *link, *old; + struct rtnl_link __rcu **tab; int msgindex; + int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); - tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]); + rtnl_lock(); + tab = rtnl_msg_handlers[protocol]; if (tab == NULL) { - tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); - if (tab == NULL) - return -ENOBUFS; + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); + if (!tab) + goto unlock; + /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } + old = rtnl_dereference(tab[msgindex]); + if (old) { + link = kmemdup(old, sizeof(*old), GFP_KERNEL); + if (!link) + goto unlock; + } else { + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + goto unlock; + } + + WARN_ON(link->owner && link->owner != owner); + link->owner = owner; + + WARN_ON(doit && link->doit && link->doit != doit); if (doit) - tab[msgindex].doit = doit; + link->doit = doit; + WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) - tab[msgindex].dumpit = dumpit; - tab[msgindex].flags |= flags; + link->dumpit = dumpit; - return 0; + link->flags |= flags; + + /* publish protocol:msgtype */ + rcu_assign_pointer(tab[msgindex], link); + ret = 0; + if (old) + kfree_rcu(old, rcu); +unlock: + rtnl_unlock(); + return ret; +} + +/** + * rtnl_register_module - Register a rtnetlink message type + * + * @owner: module registering the hook (THIS_MODULE) + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * + * Like rtnl_register, but for use by removable modules. + */ +int rtnl_register_module(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + return rtnl_register_internal(owner, protocol, msgtype, + doit, dumpit, flags); } -EXPORT_SYMBOL_GPL(__rtnl_register); +EXPORT_SYMBOL_GPL(rtnl_register_module); /** * rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions * - * Identical to __rtnl_register() but panics on failure. This is useful - * as failure of this function is very unlikely, it can only happen due - * to lack of memory when allocating the chain to store all message - * handlers for a protocol. Meant for use in init functions where lack - * of memory implies no sense in continuing. + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. + * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { - if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0) - panic("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", - protocol, msgtype); + int err; + + err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, + flags); + if (err) + pr_err("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", protocol, msgtype); } -EXPORT_SYMBOL_GPL(rtnl_register); /** * rtnl_unregister - Unregister a rtnetlink message type @@ -220,24 +276,25 @@ EXPORT_SYMBOL_GPL(rtnl_register); */ int rtnl_unregister(int protocol, int msgtype) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); - if (!handlers) { + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (!tab) { rtnl_unlock(); return -ENOENT; } - handlers[msgindex].doit = NULL; - handlers[msgindex].dumpit = NULL; - handlers[msgindex].flags = 0; + link = tab[msgindex]; + rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); + kfree_rcu(link, rcu); + return 0; } EXPORT_SYMBOL_GPL(rtnl_unregister); @@ -251,20 +308,27 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; + int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); + tab = rtnl_msg_handlers[protocol]; RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); + for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { + link = tab[msgindex]; + if (!link) + continue; + + rcu_assign_pointer(tab[msgindex], NULL); + kfree_rcu(link, rcu); + } rtnl_unlock(); synchronize_net(); - while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1) - schedule(); - kfree(handlers); + kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); @@ -1261,6 +1325,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) { const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; + struct netdev_bpf xdp; ASSERT_RTNL(); @@ -1273,7 +1338,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); + __dev_xdp_query(dev, ops->ndo_bpf, &xdp); + *prog_id = xdp.prog_id; + + return xdp.prog_attached; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) @@ -1569,6 +1637,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, @@ -2219,6 +2289,34 @@ static int do_setlink(const struct sk_buff *skb, } } + if (tb[IFLA_GSO_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); + + if (max_size > GSO_MAX_SIZE) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_size ^ max_size) { + netif_set_gso_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GSO_MAX_SEGS]) { + u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + + if (max_segs > GSO_MAX_SEGS) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_segs ^ max_segs) { + dev->gso_max_segs = max_segs; + status |= DO_SETLINK_MODIFIED; + } + } + if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); @@ -2583,6 +2681,10 @@ struct net_device *rtnl_create_link(struct net *net, dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + if (tb[IFLA_GSO_MAX_SIZE]) + netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); + if (tb[IFLA_GSO_MAX_SEGS]) + dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); return dev; } @@ -2973,18 +3075,26 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { + struct rtnl_link **tab; int type = cb->nlh->nlmsg_type-RTM_BASE; - struct rtnl_link *handlers; + struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; - handlers = rtnl_dereference(rtnl_msg_handlers[idx]); - if (!handlers) + if (type < 0 || type >= RTM_NR_MSGTYPES) continue; - dumpit = READ_ONCE(handlers[type].dumpit); + tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); + if (!tab) + continue; + + link = tab[type]; + if (!link) + continue; + + dumpit = link->dumpit; if (!dumpit) continue; @@ -4314,7 +4424,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct rtnl_link *handlers; + struct rtnl_link *link; + struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; @@ -4338,79 +4449,85 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (family >= ARRAY_SIZE(rtnl_msg_handlers)) - family = PF_UNSPEC; - rcu_read_lock(); - handlers = rcu_dereference(rtnl_msg_handlers[family]); - if (!handlers) { - family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); - } - if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { struct sock *rtnl; rtnl_dumpit_func dumpit; u16 min_dump_alloc = 0; - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) { + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]); - if (!handlers) - goto err_unlock; - - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) goto err_unlock; } - - refcount_inc(&rtnl_msg_handlers_ref[family]); + owner = link->owner; + dumpit = link->dumpit; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); + err = 0; + /* need to do this before rcu_read_unlock() */ + if (!try_module_get(owner)) + err = -EPROTONOSUPPORT; + rcu_read_unlock(); rtnl = net->rtnl; - { + if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, + .module = owner, }; err = netlink_dump_start(rtnl, skb, nlh, &c); + /* netlink_dump_start() will keep a reference on + * module if dump is still in progress. + */ + module_put(owner); } - refcount_dec(&rtnl_msg_handlers_ref[family]); return err; } - doit = READ_ONCE(handlers[type].doit); - if (!doit) { + link = rtnl_get_link(family, type); + if (!link || !link->doit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); + link = rtnl_get_link(PF_UNSPEC, type); + if (!link || !link->doit) + goto out_unlock; } - flags = READ_ONCE(handlers[type].flags); + owner = link->owner; + if (!try_module_get(owner)) { + err = -EPROTONOSUPPORT; + goto out_unlock; + } + + flags = link->flags; if (flags & RTNL_FLAG_DOIT_UNLOCKED) { - refcount_inc(&rtnl_msg_handlers_ref[family]); - doit = READ_ONCE(handlers[type].doit); + doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); - refcount_dec(&rtnl_msg_handlers_ref[family]); + module_put(owner); return err; } - rcu_read_unlock(); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[family]); - if (handlers) { - doit = READ_ONCE(handlers[type].doit); - if (doit) - err = doit(skb, nlh, extack); - } + link = rtnl_get_link(family, type); + if (link && link->doit) + err = link->doit(skb, nlh, extack); rtnl_unlock(); + + module_put(owner); + + return err; + +out_unlock: + rcu_read_unlock(); return err; err_unlock: @@ -4498,11 +4615,6 @@ static struct pernet_operations rtnetlink_net_ops = { void __init rtnetlink_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++) - refcount_set(&rtnl_msg_handlers_ref[i], 1); - if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5eeb1d20cc38..c5bb52bc73a1 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -235,7 +235,9 @@ struct sock *reuseport_select_sock(struct sock *sk, if (prog && skb) sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); - else + + /* no bpf or invalid bpf result: fall back to hash usage */ + if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 178bb9833311..37ccbe62eb1a 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -63,9 +63,10 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) */ local_bh_disable(); inet_twsk_schedule(tw, timeo); - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); - inet_twsk_put(tw); + /* Linkage updates. + * Note that access to tw after this point is illegal. + */ + inet_twsk_hashdance(tw, sk, &dccp_hashinfo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 9153247dad28..d1885cf59319 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1418,9 +1418,12 @@ void __init dn_dev_init(void) dn_dev_devices_on(); - rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL, 0); - rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, 0); - rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWADDR, + dn_nl_newaddr, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELADDR, + dn_nl_deladdr, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR, + NULL, dn_nl_dump_ifaddr, 0); proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops); diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index b37a1b833c77..fce94cbd4378 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -792,8 +792,10 @@ void __init dn_fib_init(void) register_dnaddr_notifier(&dn_fib_dnaddr_notifier); - rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL, 0); - rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWROUTE, + dn_fib_rtm_newroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE, + dn_fib_rtm_delroute, NULL, 0); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 324cb9f2f551..73160d4aebbe 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -199,11 +199,11 @@ static void dn_dst_check_expire(struct timer_list *unused) lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { if (atomic_read(&rt->dst.__refcnt) > 1 || (now - rt->dst.lastuse) < expire) { - rtp = &rt->dst.dn_next; + rtp = &rt->dn_next; continue; } - *rtp = rt->dst.dn_next; - rt->dst.dn_next = NULL; + *rtp = rt->dn_next; + rt->dn_next = NULL; dst_dev_put(&rt->dst); dst_release(&rt->dst); } @@ -233,11 +233,11 @@ static int dn_dst_gc(struct dst_ops *ops) lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { if (atomic_read(&rt->dst.__refcnt) > 1 || (now - rt->dst.lastuse) < expire) { - rtp = &rt->dst.dn_next; + rtp = &rt->dn_next; continue; } - *rtp = rt->dst.dn_next; - rt->dst.dn_next = NULL; + *rtp = rt->dn_next; + rt->dn_next = NULL; dst_dev_put(&rt->dst); dst_release(&rt->dst); break; @@ -333,8 +333,8 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) { if (compare_keys(&rth->fld, &rt->fld)) { /* Put it first */ - *rthp = rth->dst.dn_next; - rcu_assign_pointer(rth->dst.dn_next, + *rthp = rth->dn_next; + rcu_assign_pointer(rth->dn_next, dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); @@ -345,10 +345,10 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou *rp = rth; return 0; } - rthp = &rth->dst.dn_next; + rthp = &rth->dn_next; } - rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain); + rcu_assign_pointer(rt->dn_next, dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); dst_hold_and_use(&rt->dst, now); @@ -369,8 +369,8 @@ static void dn_run_flush(struct timer_list *unused) goto nothing_to_declare; for(; rt; rt = next) { - next = rcu_dereference_raw(rt->dst.dn_next); - RCU_INIT_POINTER(rt->dst.dn_next, NULL); + next = rcu_dereference_raw(rt->dn_next); + RCU_INIT_POINTER(rt->dn_next, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } @@ -1183,6 +1183,7 @@ make_route: if (rt == NULL) goto e_nobufs; + rt->dn_next = NULL; memset(&rt->fld, 0, sizeof(rt->fld)); rt->fld.saddr = oldflp->saddr; rt->fld.daddr = oldflp->daddr; @@ -1252,7 +1253,7 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn * if (!(flags & MSG_TRYHARD)) { rcu_read_lock_bh(); for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt; - rt = rcu_dereference_bh(rt->dst.dn_next)) { + rt = rcu_dereference_bh(rt->dn_next)) { if ((flp->daddr == rt->fld.daddr) && (flp->saddr == rt->fld.saddr) && (flp->flowidn_mark == rt->fld.flowidn_mark) && @@ -1448,6 +1449,7 @@ make_route: if (rt == NULL) goto e_nobufs; + rt->dn_next = NULL; memset(&rt->fld, 0, sizeof(rt->fld)); rt->rt_saddr = fld.saddr; rt->rt_daddr = fld.daddr; @@ -1529,7 +1531,7 @@ static int dn_route_input(struct sk_buff *skb) rcu_read_lock(); for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL; - rt = rcu_dereference(rt->dst.dn_next)) { + rt = rcu_dereference(rt->dn_next)) { if ((rt->fld.saddr == cb->src) && (rt->fld.daddr == cb->dst) && (rt->fld.flowidn_oif == 0) && @@ -1749,7 +1751,7 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock_bh(); for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference_bh(rt->dst.dn_next), idx++) { + rt = rcu_dereference_bh(rt->dn_next), idx++) { if (idx < s_idx) continue; skb_dst_set(skb, dst_clone(&rt->dst)); @@ -1795,7 +1797,7 @@ static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_rou { struct dn_rt_cache_iter_state *s = seq->private; - rt = rcu_dereference_bh(rt->dst.dn_next); + rt = rcu_dereference_bh(rt->dn_next); while (!rt) { rcu_read_unlock_bh(); if (--s->bucket < 0) @@ -1921,11 +1923,11 @@ void __init dn_route_init(void) &dn_rt_cache_seq_fops); #ifdef CONFIG_DECNET_ROUTER - rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, - dn_fib_dump, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE, + dn_cache_getroute, dn_fib_dump, 0); #else - rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, - dn_cache_dump, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE, + dn_cache_getroute, dn_cache_dump, 0); #endif } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 03c3bdf25468..bbf2c82cf7b2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -16,6 +16,15 @@ config NET_DSA if NET_DSA +config NET_DSA_LEGACY + bool "Support for older platform device and Device Tree registration" + default y + ---help--- + Say Y if you want to enable support for the older platform device and + deprecated Device Tree binding registration. + + This feature is scheduled for removal in 4.17. + # tagging formats config NET_DSA_TAG_BRCM bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 0e13c1f95d13..9e4d3536f977 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o +dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o +dsa_core-$(CONFIG_NET_DSA_LEGACY) += legacy.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 1e287420ff49..21f9bed11988 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -241,7 +241,7 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) for (port = 0; port < ds->num_ports; port++) { dp = &ds->ports[port]; - if (dsa_port_is_user(dp)) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) dp->cpu_dp = dst->cpu_dp; } } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 7d036696e8c4..b03665e8fb4e 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -97,8 +97,17 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); bool dsa_schedule_work(struct work_struct *work); /* legacy.c */ +#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) int dsa_legacy_register(void); void dsa_legacy_unregister(void); +#else +static inline int dsa_legacy_register(void) +{ + return -ENODEV; +} + +static inline void dsa_legacy_unregister(void) { } +#endif int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 84611d7fcfa2..aa56d3fb5da4 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -718,26 +718,6 @@ static int dsa_resume(struct device *d) } #endif -/* legacy way, bypassing the bridge *****************************************/ -int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid, - u16 flags) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return dsa_port_fdb_add(dp, addr, vid); -} - -int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return dsa_port_fdb_del(dp, addr, vid); -} - static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); static const struct of_device_id dsa_of_match_table[] = { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a95a55f79137..f52307296de4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -708,14 +708,12 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; __be16 protocol = cls->common.protocol; - struct net *net = dev_net(dev); struct dsa_switch *ds = dp->ds; struct net_device *to_dev; const struct tc_action *a; struct dsa_port *to_dp; int err = -EOPNOTSUPP; LIST_HEAD(actions); - int ifindex; if (!ds->ops->port_mirror_add) return err; @@ -729,8 +727,7 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) { struct dsa_mall_mirror_tc_entry *mirror; - ifindex = tcf_mirred_ifindex(a); - to_dev = __dev_get_by_index(net, ifindex); + to_dev = tcf_mirred_dev(a); if (!to_dev) return -EINVAL; @@ -943,6 +940,26 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .set_rxnfc = dsa_slave_set_rxnfc, }; +/* legacy way, bypassing the bridge *****************************************/ +int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid, + u16 flags) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return dsa_port_fdb_add(dp, addr, vid); +} + +int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return dsa_port_fdb_del(dp, addr, vid); +} + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 29608d087a7c..b93511726069 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -83,29 +83,52 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, static int dsa_switch_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { - /* Do not care yet about other switch chips of the fabric */ - if (ds->index != info->sw_index) - return 0; + int port = dsa_towards_port(ds, info->sw_index, info->port); if (!ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->ops->port_fdb_add(ds, info->port, info->addr, - info->vid); + return ds->ops->port_fdb_add(ds, port, info->addr, info->vid); } static int dsa_switch_fdb_del(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { - /* Do not care yet about other switch chips of the fabric */ - if (ds->index != info->sw_index) - return 0; + int port = dsa_towards_port(ds, info->sw_index, info->port); if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - return ds->ops->port_fdb_del(ds, info->port, info->addr, - info->vid); + return ds->ops->port_fdb_del(ds, port, info->addr, info->vid); +} + +static int +dsa_switch_mdb_prepare_bitmap(struct dsa_switch *ds, + const struct switchdev_obj_port_mdb *mdb, + const unsigned long *bitmap) +{ + int port, err; + + if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) + return -EOPNOTSUPP; + + for_each_set_bit(port, bitmap, ds->num_ports) { + err = ds->ops->port_mdb_prepare(ds, port, mdb); + if (err) + return err; + } + + return 0; +} + +static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds, + const struct switchdev_obj_port_mdb *mdb, + const unsigned long *bitmap) +{ + int port; + + for_each_set_bit(port, bitmap, ds->num_ports) + ds->ops->port_mdb_add(ds, port, mdb); } static int dsa_switch_mdb_add(struct dsa_switch *ds, @@ -114,7 +137,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, const struct switchdev_obj_port_mdb *mdb = info->mdb; struct switchdev_trans *trans = info->trans; DECLARE_BITMAP(group, ds->num_ports); - int port, err; + int port; /* Build a mask of Multicast group members */ bitmap_zero(group, ds->num_ports); @@ -124,21 +147,10 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, if (dsa_is_dsa_port(ds, port)) set_bit(port, group); - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) - return -EOPNOTSUPP; - - for_each_set_bit(port, group, ds->num_ports) { - err = ds->ops->port_mdb_prepare(ds, port, mdb, trans); - if (err) - return err; - } - - return 0; - } + if (switchdev_trans_ph_prepare(trans)) + return dsa_switch_mdb_prepare_bitmap(ds, mdb, group); - for_each_set_bit(port, group, ds->num_ports) - ds->ops->port_mdb_add(ds, port, mdb, trans); + dsa_switch_mdb_add_bitmap(ds, mdb, group); return 0; } @@ -157,13 +169,43 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds, return 0; } +static int +dsa_switch_vlan_prepare_bitmap(struct dsa_switch *ds, + const struct switchdev_obj_port_vlan *vlan, + const unsigned long *bitmap) +{ + int port, err; + + if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) + return -EOPNOTSUPP; + + for_each_set_bit(port, bitmap, ds->num_ports) { + err = ds->ops->port_vlan_prepare(ds, port, vlan); + if (err) + return err; + } + + return 0; +} + +static void +dsa_switch_vlan_add_bitmap(struct dsa_switch *ds, + const struct switchdev_obj_port_vlan *vlan, + const unsigned long *bitmap) +{ + int port; + + for_each_set_bit(port, bitmap, ds->num_ports) + ds->ops->port_vlan_add(ds, port, vlan); +} + static int dsa_switch_vlan_add(struct dsa_switch *ds, struct dsa_notifier_vlan_info *info) { const struct switchdev_obj_port_vlan *vlan = info->vlan; struct switchdev_trans *trans = info->trans; DECLARE_BITMAP(members, ds->num_ports); - int port, err; + int port; /* Build a mask of VLAN members */ bitmap_zero(members, ds->num_ports); @@ -173,21 +215,10 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) set_bit(port, members); - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) - return -EOPNOTSUPP; - - for_each_set_bit(port, members, ds->num_ports) { - err = ds->ops->port_vlan_prepare(ds, port, vlan, trans); - if (err) - return err; - } - - return 0; - } + if (switchdev_trans_ph_prepare(trans)) + return dsa_switch_vlan_prepare_bitmap(ds, vlan, members); - for_each_set_bit(port, members, ds->num_ports) - ds->ops->port_vlan_add(ds, port, vlan, trans); + dsa_switch_vlan_add_bitmap(ds, vlan, members); return 0; } diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 8475434af7d5..11535bc70743 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -13,10 +13,13 @@ */ #include <linux/etherdevice.h> +#include <linux/if_vlan.h> #include "dsa_priv.h" #define MTK_HDR_LEN 4 +#define MTK_HDR_XMIT_UNTAGGED 0 +#define MTK_HDR_XMIT_TAGGED_TPID_8100 1 #define MTK_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0) #define MTK_HDR_XMIT_DP_BIT_MASK GENMASK(5, 0) @@ -25,20 +28,37 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, { struct dsa_port *dp = dsa_slave_to_port(dev); u8 *mtk_tag; + bool is_vlan_skb = true; - if (skb_cow_head(skb, MTK_HDR_LEN) < 0) - return NULL; - - skb_push(skb, MTK_HDR_LEN); + /* Build the special tag after the MAC Source Address. If VLAN header + * is present, it's required that VLAN header and special tag is + * being combined. Only in this way we can allow the switch can parse + * the both special and VLAN tag at the same time and then look up VLAN + * table with VID. + */ + if (!skb_vlan_tagged(skb)) { + if (skb_cow_head(skb, MTK_HDR_LEN) < 0) + return NULL; - memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); + skb_push(skb, MTK_HDR_LEN); + memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); + is_vlan_skb = false; + } - /* Build the tag after the MAC Source Address */ mtk_tag = skb->data + 2 * ETH_ALEN; - mtk_tag[0] = 0; + + /* Mark tag attribute on special tag insertion to notify hardware + * whether that's a combined special tag with 802.1Q header. + */ + mtk_tag[0] = is_vlan_skb ? MTK_HDR_XMIT_TAGGED_TPID_8100 : + MTK_HDR_XMIT_UNTAGGED; mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; - mtk_tag[2] = 0; - mtk_tag[3] = 0; + + /* Tag control information is kept for 802.1Q */ + if (!is_vlan_skb) { + mtk_tag[2] = 0; + mtk_tag[3] = 0; + } return skb; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e7d15fb0d94d..f6f58108b4c5 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,6 +19,7 @@ #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> +#include <linux/bootmem.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> @@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) } EXPORT_SYMBOL_GPL(__inet_inherit_port); +static struct inet_listen_hashbucket * +inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) +{ + u32 hash; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + hash = ipv6_portaddr_hash(sock_net(sk), + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_num); + else +#endif + hash = ipv4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + return inet_lhash2_bucket(h, hash); +} + +static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + if (sk->sk_reuseport && sk->sk_family == AF_INET6) + hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + else + hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + ilb2->count++; + spin_unlock(&ilb2->lock); +} + +static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2 || + WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); + ilb2->count--; + spin_unlock(&ilb2->lock); +} + static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif, bool exact_dif) @@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net, */ /* called with rcu_read_lock() : No refcount taken on the socket */ +static struct sock *inet_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const __be32 saddr, __be16 sport, + const __be32 daddr, const unsigned short hnum, + const int dif, const int sdif) +{ + bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, + dif, sdif, exact_dif); + if (score > hiscore) { + if (sk->sk_reuseport) { + phash = inet_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -216,32 +305,57 @@ struct sock *__inet_lookup_listener(struct net *net, { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; + int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv4_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with INADDR_ANY */ + + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, phash, skb, doff); if (result) return result; - matches = 1; } result = sk; hiscore = score; - } else if (score == hiscore && reuseport) { - matches++; - if (reciprocal_scale(phash, matches) == 0) - result = sk; - phash = next_pseudo_random32(phash); } } return result; @@ -483,6 +597,8 @@ int __inet_hash(struct sock *sk, struct sock *osk) hlist_add_tail_rcu(&sk->sk_node, &ilb->head); else hlist_add_head_rcu(&sk->sk_node, &ilb->head); + inet_hash2(hashinfo, sk); + ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: @@ -509,28 +625,35 @@ EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb; spinlock_t *lock; bool listener = false; - int done; if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { - lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &ilb->lock; listener = true; } else { lock = inet_ehash_lockp(hashinfo, sk->sk_hash); } spin_lock_bh(lock); + if (sk_unhashed(sk)) + goto unlock; + if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (listener) - done = __sk_del_node_init(sk); - else - done = __sk_nulls_del_node_init_rcu(sk); - if (done) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + if (listener) { + inet_unhash2(hashinfo, sk); + __sk_del_node_init(sk); + ilb->count--; + } else { + __sk_nulls_del_node_init_rcu(sk); + } + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +unlock: spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(inet_unhash); @@ -665,10 +788,37 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_HEAD(&h->listening_hash[i].head); + h->listening_hash[i].count = 0; } + + h->lhash2 = NULL; } EXPORT_SYMBOL_GPL(inet_hashinfo_init); +void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, + unsigned long numentries, int scale, + unsigned long low_limit, + unsigned long high_limit) +{ + unsigned int i; + + h->lhash2 = alloc_large_system_hash(name, + sizeof(*h->lhash2), + numentries, + scale, + 0, + NULL, + &h->lhash2_mask, + low_limit, + high_limit); + + for (i = 0; i <= h->lhash2_mask; i++) { + spin_lock_init(&h->lhash2[i].lock); + INIT_HLIST_HEAD(&h->lhash2[i].head); + h->lhash2[i].count = 0; + } +} + int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index b563e0c46bac..277ff69a312d 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -97,7 +97,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ -void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, +void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo) { const struct inet_sock *inet = inet_sk(sk); @@ -119,18 +119,6 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); - /* - * Step 2: Hash TW into tcp ehash chain. - * Notes : - * - tw_refcnt is set to 4 because : - * - We have one reference from bhash chain. - * - We have one reference from ehash chain. - * - We have one reference from timer. - * - One reference for ourself (our caller will release it). - * We can use atomic_set() because prior spin_lock()/spin_unlock() - * committed into memory all tw fields. - */ - refcount_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -138,8 +126,19 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(lock); + + /* tw_refcnt is set to 3 because we have : + * - one reference for bhash chain. + * - one reference for ehash chain. + * - one reference for timer. + * We can use atomic_set() because prior spin_lock()/spin_unlock() + * committed into memory all tw fields. + * Also note that after this point, we lost our implicit reference + * so we are not allowed to use tw anymore. + */ + refcount_set(&tw->tw_refcnt, 3); } -EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); +EXPORT_SYMBOL_GPL(inet_twsk_hashdance); static void tw_timer_handler(struct timer_list *t) { diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9c1735632c8c..9a80d84fc182 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -114,7 +114,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate); + __be32 id, u32 index, + bool truncate, bool is_ipv4); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; @@ -255,34 +256,41 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, { struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; + struct erspan_base_hdr *ershdr; + struct erspan_metadata *pkt_md; struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; - struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 index; + int ver; int len; itn = net_generic(net, erspan_net_id); len = gre_hdr_len + sizeof(*ershdr); + /* Check based hdr len */ if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; iph = ip_hdr(skb); - ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET; /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); - index = ershdr->md.index; + pkt_md = (struct erspan_metadata *)(ershdr + 1); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + len = gre_hdr_len + erspan_hdr_len(ver); + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + if (__iptunnel_pull_header(skb, - gre_hdr_len + sizeof(*ershdr), + len, htons(ETH_P_TEB), false, false) < 0) goto drop; @@ -306,12 +314,27 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (!md) return PACKET_REJECT; - md->index = index; + memcpy(md, pkt_md, sizeof(*md)); + md->version = ver; + info = &tun_dst->u.tun_info; info->key.tun_flags |= TUNNEL_ERSPAN_OPT; info->options_len = sizeof(*md); } else { - tunnel->index = ntohl(index); + tunnel->erspan_ver = ver; + if (ver == 1) { + tunnel->index = ntohl(pkt_md->u.index); + } else { + u16 md2_flags; + u16 dir, hwid; + + md2_flags = ntohs(pkt_md->u.md2.flags); + dir = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + tunnel->dir = dir; + tunnel->hwid = hwid; + } + } skb_reset_mac_header(skb); @@ -405,7 +428,8 @@ static int gre_rcv(struct sk_buff *skb) if (hdr_len < 0) goto drop; - if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; } @@ -560,6 +584,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, bool truncate = false; struct flowi4 fl; int tunnel_hlen; + int version; __be16 df; tun_info = skb_tunnel_info(skb); @@ -568,9 +593,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto err_free_rt; /* ERSPAN has fixed 8 byte GRE header */ - tunnel_hlen = 8 + sizeof(struct erspanhdr); + version = md->version; + tunnel_hlen = 8 + erspan_hdr_len(version); rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); if (!rt) @@ -584,12 +613,23 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, truncate = true; } - md = ip_tunnel_info_opts(tun_info); - if (!md) - goto err_free_rt; + if (version == 1) { + erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), + ntohl(md->u.index), truncate, true); + } else if (version == 2) { + u16 md2_flags; + u8 direction; + u16 hwid; - erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), - ntohl(md->index), truncate); + md2_flags = ntohs(md->u.md2.flags); + direction = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + + erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id), + direction, hwid, truncate, true); + } else { + goto err_free_rt; + } gre_build_header(skb, 8, TUNNEL_SEQ, htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); @@ -668,52 +708,6 @@ free_skb: return NETDEV_TX_OK; } -static inline u8 tos_to_cos(u8 tos) -{ - u8 dscp, cos; - - dscp = tos >> 2; - cos = dscp >> 3; - return cos; -} - -static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate) -{ - struct iphdr *iphdr = ip_hdr(skb); - struct ethhdr *eth = eth_hdr(skb); - enum erspan_encap_type enc_type; - struct erspanhdr *ershdr; - struct qtag_prefix { - __be16 eth_type; - __be16 tci; - } *qp; - u16 vlan_tci = 0; - - enc_type = ERSPAN_ENCAP_NOVLAN; - - /* If mirrored packet has vlan tag, extract tci and - * perserve vlan header in the mirrored frame. - */ - if (eth->h_proto == htons(ETH_P_8021Q)) { - qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); - vlan_tci = ntohs(qp->tci); - enc_type = ERSPAN_ENCAP_INFRAME; - } - - skb_push(skb, sizeof(*ershdr)); - ershdr = (struct erspanhdr *)skb->data; - memset(ershdr, 0, sizeof(*ershdr)); - - ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | - (ERSPAN_VERSION << VER_OFFSET)); - ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | - ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) | - (enc_type << EN_OFFSET & EN_MASK) | - ((truncate << T_OFFSET) & T_MASK)); - ershdr->md.index = htonl(index & INDEX_MASK); -} - static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -737,7 +731,14 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate); + if (tunnel->erspan_ver == 1) + erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, + truncate, true); + else + erspan_build_header_v2(skb, tunnel->parms.o_key, + tunnel->dir, tunnel->hwid, + truncate, true); + tunnel->parms.o_flags &= ~TUNNEL_KEY; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); return NETDEV_TX_OK; @@ -1209,13 +1210,32 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); - if (data[IFLA_GRE_ERSPAN_INDEX]) { - t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (data[IFLA_GRE_ERSPAN_VER]) { + t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (t->index & ~INDEX_MASK) + if (t->erspan_ver != 1 && t->erspan_ver != 2) return -EINVAL; } + if (t->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + } else if (t->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (t->dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + if (data[IFLA_GRE_ERSPAN_HWID]) { + t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + if (t->hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + return 0; } @@ -1282,7 +1302,7 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + - sizeof(struct erspanhdr); + erspan_hdr_len(tunnel->erspan_ver); t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; @@ -1412,6 +1432,12 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_VER */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_DIR */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_HWID */ + nla_total_size(2) + 0; } @@ -1454,9 +1480,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (t->index) + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; + } else if (t->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } return 0; @@ -1492,6 +1527,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 43b69af242e1..f0ed031f3594 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1106,7 +1106,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); + __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f08eebe60446..c470fec9062f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3578,6 +3578,9 @@ void __init tcp_init(void) percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); inet_hashinfo_init(&tcp_hashinfo); + inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", + thash_entries, 21, /* one slot per 2 MB*/ + 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 78c192ee03a4..018a48477355 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -379,18 +379,9 @@ fastopen: bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { - unsigned long last_syn_loss = 0; const struct dst_entry *dst; - int syn_loss = 0; - tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); - - /* Recurring FO SYN losses: no cookie or data in SYN */ - if (syn_loss > 1 && - time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { - cookie->len = -1; - return false; - } + tcp_fastopen_cache_get(sk, mss, cookie); /* Firewall blackhole issue check */ if (tcp_fastopen_active_should_disable(sk)) { @@ -448,6 +439,8 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * following circumstances: * 1. client side TFO socket receives out of order FIN * 2. client side TFO socket receives out of order RST + * 3. client side TFO socket has timed out three times consecutively during + * or after handshake * We disable active side TFO globally for 1hr at first. Then if it * happens again, we disable it for 2h, then 4h, 8h, ... * And we reset the timeout back to 1hr when we see a successful active @@ -524,3 +517,20 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) dst_release(dst); } } + +void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired) +{ + u32 timeouts = inet_csk(sk)->icsk_retransmits; + struct tcp_sock *tp = tcp_sk(sk); + + /* Broken middle-boxes may black-hole Fast Open connection during or + * even after the handshake. Be extremely conservative and pause + * Fast Open globally after hitting the third consecutive timeout or + * exceeding the configured timeout limit. + */ + if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) && + (timeouts == 2 || (timeouts < 2 && expired))) { + tcp_fastopen_active_disable(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 45f750e85714..4d55c4b338ee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -578,8 +578,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 copied; int time; - int copied; tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); @@ -602,38 +602,31 @@ void tcp_rcv_space_adjust(struct sock *sk) if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvwin, rcvmem, rcvbuf; + int rcvmem, rcvbuf; + u64 rcvwin, grow; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. */ - rcvwin = (copied << 1) + 16 * tp->advmss; + rcvwin = ((u64)copied << 1) + 16 * tp->advmss; - /* If rate increased by 25%, - * assume slow start, rcvwin = 3 * copied - * If rate increased by 50%, - * assume sender can use 2x growth, rcvwin = 4 * copied - */ - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) - rcvwin <<= 1; - else - rcvwin += (rcvwin >> 1); - } + /* Accommodate for sender rate increase (eg. slow start) */ + grow = rcvwin * (copied - tp->rcvq_space.space); + do_div(grow, tp->rcvq_space.space); + rcvwin += (grow << 1); rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + do_div(rcvwin, tp->advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; /* Make the window clamp follow along. */ - tp->window_clamp = rcvwin; + tp->window_clamp = tcp_win_from_space(sk, rcvbuf); } } tp->rcvq_space.space = copied; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 7097f92d16e5..759e6bc8327b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -546,8 +546,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) static DEFINE_SEQLOCK(fastopen_seqlock); void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie, - int *syn_loss, unsigned long *last_syn_loss) + struct tcp_fastopen_cookie *cookie) { struct tcp_metrics_block *tm; @@ -564,8 +563,6 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, *cookie = tfom->cookie; if (cookie->len <= 0 && tfom->try_exp == 1) cookie->exp = true; - *syn_loss = tfom->syn_loss; - *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; } while (read_seqretry(&fastopen_seqlock, seq)); } rcu_read_unlock(); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b079b619b60c..a8384b0c11f8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -316,9 +316,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) */ local_bh_disable(); inet_twsk_schedule(tw, timeo); - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - inet_twsk_put(tw); + /* Linkage updates. + * Note that access to tw after this point is illegal. + */ + inet_twsk_hashdance(tw, sk, &tcp_hashinfo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a4d214c7b506..04be9f833927 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2414,15 +2414,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; /* Schedule a loss probe in 2*RTT for SACK capable connections - * in Open state, that are either limited by cwnd or application. + * not in loss recovery, that are either limited by cwnd or application. */ if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || - icsk->icsk_ca_state != TCP_CA_Open) - return false; - - if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - !tcp_write_queue_empty(sk)) + (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_CWR)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 968fda198376..6db3124cdbda 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -183,11 +183,6 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { dst_negative_advice(sk); - if (tp->syn_fastopen || tp->syn_data) - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (tp->syn_data && icsk->icsk_retransmits == 1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); } else if (!tp->syn_data && !tp->syn_fastopen) { sk_rethink_txhash(sk); } @@ -195,17 +190,6 @@ static int tcp_write_timeout(struct sock *sk) expired = icsk->icsk_retransmits >= retry_until; } else { if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { - /* Some middle-boxes may black-hole Fast Open _after_ - * the handshake. Therefore we conservatively disable - * Fast Open on this path on recurring timeouts after - * successful Fast Open. - */ - if (tp->syn_data_acked) { - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); - } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -228,6 +212,7 @@ static int tcp_write_timeout(struct sock *sk) expired = retransmits_timed_out(sk, retry_until, icsk->icsk_user_timeout); } + tcp_fastopen_active_detect_blackhole(sk, expired); if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e4ff25c947c5..e9c0d1e1772e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -357,18 +357,12 @@ fail: } EXPORT_SYMBOL(udp_lib_get_port); -static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, - unsigned int port) -{ - return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; -} - int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); + ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = - udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@ -445,7 +439,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; result = NULL; @@ -454,23 +448,16 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } badness = score; result = sk; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -488,11 +475,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; if (hslot->count > 10) { - hash2 = udp4_portaddr_hash(net, daddr, hnum); + hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@ -503,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. */ if (unlikely(slot2 == old_slot2)) @@ -526,23 +513,16 @@ begin: score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -1775,7 +1755,7 @@ EXPORT_SYMBOL(udp_lib_rehash); static void udp_v4_rehash(struct sock *sk) { - u16 new_hash = udp4_portaddr_hash(sock_net(sk), + u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); udp_lib_rehash(sk, new_hash); @@ -1966,9 +1946,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; if (use_hash2) { - hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udptable->mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@ -2200,7 +2180,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index e6265e2c274e..7d885a44dc9d 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -62,7 +62,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - top_iph->ttl = ip4_dst_hoplimit(dst->child); + top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f49bd7897e95..ed06b1190f05 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6595,27 +6595,45 @@ int __init addrconf_init(void) rtnl_af_register(&inet6_ops); - err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, - 0); + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK, + NULL, inet6_dump_ifinfo, 0); if (err < 0) goto errout; - /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0); - __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0); - __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, - inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED); - __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, - inet6_dump_ifmcaddr, 0); - __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, - inet6_dump_ifacaddr, 0); - __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED); - - ipv6_addr_label_rtnl_register(); + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR, + inet6_rtm_newaddr, NULL, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR, + inet6_rtm_deladdr, NULL, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR, + inet6_rtm_getaddr, inet6_dump_ifaddr, + RTNL_FLAG_DOIT_UNLOCKED); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST, + NULL, inet6_dump_ifmcaddr, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST, + NULL, inet6_dump_ifacaddr, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF, + inet6_netconf_get_devconf, + inet6_netconf_dump_devconf, + RTNL_FLAG_DOIT_UNLOCKED); + if (err < 0) + goto errout; + err = ipv6_addr_label_rtnl_register(); + if (err < 0) + goto errout; return 0; errout: + rtnl_unregister_all(PF_INET6); rtnl_af_unregister(&inet6_ops); unregister_netdevice_notifier(&ipv6_dev_notf); errlo: diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 00e1f8ee08f8..1d6ced37ad71 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -547,13 +547,22 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return err; } -void __init ipv6_addr_label_rtnl_register(void) +int __init ipv6_addr_label_rtnl_register(void) { - __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); -} + int ret; + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL, + ip6addrlbl_newdel, + NULL, RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) + return ret; + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL, + ip6addrlbl_newdel, + NULL, RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) + return ret; + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, + ip6addrlbl_get, + ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); + return ret; +} diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 6eb5e68f112a..44c39c5f0638 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -512,9 +512,7 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) struct ila_map *ila; int ret; - ret = rhashtable_walk_start(rhiter); - if (ret && ret != -EAGAIN) - goto done; + rhashtable_walk_start(rhiter); for (;;) { ila = rhashtable_walk_next(rhiter); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b01858f5deb1..2febe26de6a1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net, } /* called with rcu_read_lock() */ +static struct sock *inet6_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, const struct in6_addr *daddr, + const unsigned short hnum, const int dif, const int sdif) +{ + bool exact_dif = inet6_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, dif, sdif, + exact_dif); + if (score > hiscore) { + if (sk->sk_reuseport) { + phash = inet6_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -134,31 +168,56 @@ struct sock *inet6_lookup_listener(struct net *net, { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet6_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; + int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv6_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with in6addr_any */ + + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, phash, skb, doff); if (result) return result; - matches = 1; } result = sk; hiscore = score; - } else if (score == hiscore && reuseport) { - matches++; - if (reciprocal_scale(phash, matches) == 0) - result = sk; - phash = next_pseudo_random32(phash); } } return result; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f5285f4e1d08..a64d559fa513 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -893,7 +893,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; for (iter = leaf; iter; - iter = rcu_dereference_protected(iter->dst.rt6_next, + iter = rcu_dereference_protected(iter->rt6_next, lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates @@ -950,7 +950,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, break; next_iter: - ins = &iter->dst.rt6_next; + ins = &iter->rt6_next; } if (fallback_ins && !found) { @@ -979,7 +979,7 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = rcu_dereference_protected(sibling->dst.rt6_next, + sibling = rcu_dereference_protected(sibling->rt6_next, lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of @@ -1009,7 +1009,7 @@ add: if (err) return err; - rcu_assign_pointer(rt->dst.rt6_next, iter); + rcu_assign_pointer(rt->rt6_next, iter); atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); @@ -1040,7 +1040,7 @@ add: atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); - rt->dst.rt6_next = iter->dst.rt6_next; + rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); @@ -1059,14 +1059,14 @@ add: if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - ins = &rt->dst.rt6_next; + ins = &rt->rt6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; if (rt6_qualify_for_ecmp(iter)) { - *ins = iter->dst.rt6_next; + *ins = iter->rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) @@ -1075,7 +1075,7 @@ add: nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { - ins = &iter->dst.rt6_next; + ins = &iter->rt6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1644,7 +1644,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); /* Unlink it */ - *rtp = rt->dst.rt6_next; + *rtp = rt->rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1672,7 +1672,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + w->leaf = rcu_dereference_protected(rt->rt6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; @@ -1731,7 +1731,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) fib6_del_route(table, fn, rtp, info); return 0; } - rtp_next = &cur->dst.rt6_next; + rtp_next = &cur->rt6_next; } return -ENOENT; } @@ -2142,8 +2142,8 @@ int __init fib6_init(void) if (ret) goto out_kmem_cache_create; - ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, - 0); + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, + inet6_dump_fib, 0); if (ret) goto out_unregister_subsys; @@ -2208,7 +2208,7 @@ static int ipv6_route_yield(struct fib6_walker *w) do { iter->w.leaf = rcu_dereference_protected( - iter->w.leaf->dst.rt6_next, + iter->w.leaf->rt6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) @@ -2274,7 +2274,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next); + n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next); if (n) { ++*pos; return n; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4cfd8e0696fe..5c9c65f1d5c2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -55,6 +55,8 @@ #include <net/ip6_route.h> #include <net/ip6_tunnel.h> #include <net/gre.h> +#include <net/erspan.h> +#include <net/dst_metadata.h> static bool log_ecn_error = true; @@ -68,11 +70,13 @@ static unsigned int ip6gre_net_id __read_mostly; struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; + struct ip6_tnl __rcu *collect_md_tun; struct net_device *fb_tunnel_dev; }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly; static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; +static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly; static int ip6gre_tunnel_init(struct net_device *dev); static void ip6gre_tunnel_setup(struct net_device *dev); static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); @@ -121,7 +125,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, unsigned int h1 = HASH_KEY(key); struct ip6_tnl *t, *cand = NULL; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + int dev_type = (gre_proto == htons(ETH_P_TEB) || + gre_proto == htons(ETH_P_ERSPAN)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4; @@ -226,6 +231,10 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, if (cand) return cand; + t = rcu_dereference(ign->collect_md_tun); + if (t && t->dev->flags & IFF_UP) + return t; + dev = ign->fb_tunnel_dev; if (dev->flags & IFF_UP) return netdev_priv(dev); @@ -261,6 +270,9 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, t); + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@ -270,6 +282,9 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, NULL); + for (tp = ip6gre_bucket(ign, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@ -460,7 +475,108 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { - ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + __be64 tun_id; + __be16 flags; + + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); + if (!tun_dst) + return PACKET_REJECT; + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + } else { + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + } + + return PACKET_RCVD; + } + + return PACKET_REJECT; +} + +static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, + struct tnl_ptk_info *tpi) +{ + struct erspan_base_hdr *ershdr; + struct erspan_metadata *pkt_md; + const struct ipv6hdr *ipv6h; + struct ip6_tnl *tunnel; + u8 ver; + + ipv6h = ipv6_hdr(skb); + ershdr = (struct erspan_base_hdr *)skb->data; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + + ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); + pkt_md = (struct erspan_metadata *)(ershdr + 1); + + tunnel = ip6gre_tunnel_lookup(skb->dev, + &ipv6h->saddr, &ipv6h->daddr, tpi->key, + tpi->proto); + if (tunnel) { + int len = erspan_hdr_len(ver); + + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + + if (__iptunnel_pull_header(skb, len, + htons(ETH_P_TEB), + false, false) < 0) + return PACKET_REJECT; + + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + struct erspan_metadata *md; + __be64 tun_id; + __be16 flags; + + tpi->flags |= TUNNEL_KEY; + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, + sizeof(*md)); + if (!tun_dst) + return PACKET_REJECT; + + info = &tun_dst->u.tun_info; + md = ip_tunnel_info_opts(info); + if (!md) + return PACKET_REJECT; + + memcpy(md, pkt_md, sizeof(*md)); + md->version = ver; + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + info->options_len = sizeof(*md); + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + + } else { + tunnel->parms.erspan_ver = ver; + + if (ver == 1) { + tunnel->parms.index = ntohl(pkt_md->u.index); + } else { + u16 md2_flags; + u16 dir, hwid; + + md2_flags = ntohs(pkt_md->u.md2.flags); + dir = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + tunnel->parms.dir = dir; + tunnel->parms.hwid = hwid; + } + + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + } return PACKET_RCVD; } @@ -481,6 +597,13 @@ static int gre_rcv(struct sk_buff *skb) if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) goto drop; + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { + if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD) + return 0; + goto drop; + } + if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) return 0; @@ -496,6 +619,78 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum) csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } +static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) +{ + const struct iphdr *iph = ip_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + *encap_limit = t->parms.encap_limit; + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv4_get_dsfield(iph); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); +} + +static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + __u16 offset; + + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ + + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + *encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + *encap_limit = t->parms.encap_limit; + } + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv6_get_dsfield(ipv6h); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6->flowlabel |= ip6_flowlabel(ipv6h); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + return 0; +} + static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, @@ -517,8 +712,38 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; - gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, - protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + + if (tunnel->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + __be16 flags; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = IPPROTO_GRE; + fl6->daddr = key->u.ipv6.dst; + fl6->flowlabel = key->label; + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + tunnel->tun_hlen = gre_calc_hlen(flags); + + gre_build_header(skb, tunnel->tun_hlen, + flags, protocol, + tunnel_id_to_key32(tun_info->key.tun_id), 0); + + } else { + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + protocol, tunnel->parms.o_key, + htonl(tunnel->o_seqno)); + } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); @@ -527,30 +752,17 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - const struct iphdr *iph = ip_hdr(skb); int encap_limit = -1; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + if (!t->parms.collect_md) + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@ -574,46 +786,17 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; - __u16 offset; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err; if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) return -1; - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - ipv6h = ipv6_hdr(skb); - - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; - } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + if (!t->parms.collect_md && + prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) + return -1; if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) return -1; @@ -660,7 +843,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + if (!t->parms.collect_md) + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@ -705,6 +889,141 @@ tx_err: return NETDEV_TX_OK; } +static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + struct dst_entry *dst = skb_dst(skb); + struct net_device_stats *stats; + bool truncate = false; + int encap_limit = -1; + __u8 dsfield = false; + struct flowi6 fl6; + int err = -EINVAL; + __u32 mtu; + + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) + goto tx_err; + + if (gre_handle_offloads(skb, false)) + goto tx_err; + + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); + truncate = true; + } + + t->parms.o_flags &= ~TUNNEL_KEY; + IPCB(skb)->flags = 0; + + /* For collect_md mode, derive fl6 from the tunnel key, + * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}. + */ + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct erspan_metadata *md; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_GRE; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto tx_err; + + if (md->version == 1) { + erspan_build_header(skb, + tunnel_id_to_key32(key->tun_id), + ntohl(md->u.index), truncate, + false); + } else if (md->version == 2) { + u16 md2_flags; + u16 dir, hwid; + + md2_flags = ntohs(md->u.md2.flags); + dir = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + + erspan_build_header_v2(skb, + tunnel_id_to_key32(key->tun_id), + dir, hwid, truncate, + false); + } + } else { + switch (skb->protocol) { + case htons(ETH_P_IP): + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); + break; + case htons(ETH_P_IPV6): + if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + goto tx_err; + if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, + &dsfield, &encap_limit)) + goto tx_err; + break; + default: + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + break; + } + + if (t->parms.erspan_ver == 1) + erspan_build_header(skb, t->parms.o_key, + t->parms.index, + truncate, false); + else + erspan_build_header_v2(skb, t->parms.o_key, + t->parms.dir, + t->parms.hwid, + truncate, false); + fl6.daddr = t->parms.raddr; + } + + /* Push GRE header. */ + gre_build_header(skb, 8, TUNNEL_SEQ, + htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++)); + + /* TooBig packet may have updated dst->dev's mtu */ + if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); + + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + NEXTHDR_GRE); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ + if (err == -EMSGSIZE) { + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } + + goto tx_err; + } + return NETDEV_TX_OK; + +tx_err: + stats = &t->dev->stats; + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) { struct net_device *dev = t->dev; @@ -1048,6 +1367,11 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; + if (tunnel->parms.collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } + return 0; } @@ -1062,6 +1386,9 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel = netdev_priv(dev); + if (tunnel->parms.collect_md) + return 0; + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); @@ -1084,7 +1411,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) dev_hold(dev); } - static struct inet6_protocol ip6gre_protocol __read_mostly = { .handler = gre_rcv, .err_handler = ip6gre_err, @@ -1099,7 +1425,8 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6gre_link_ops || - dev->rtnl_link_ops == &ip6gre_tap_ops) + dev->rtnl_link_ops == &ip6gre_tap_ops || + dev->rtnl_link_ops == &ip6erspan_tap_ops) unregister_netdevice_queue(dev, head); for (prio = 0; prio < 4; prio++) { @@ -1221,6 +1548,70 @@ out: return ip6gre_tunnel_validate(tb, data, extack); } +static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + __be16 flags = 0; + int ret, ver = 0; + + if (!data) + return 0; + + ret = ip6gre_tap_validate(tb, data, extack); + if (ret) + return ret; + + /* ERSPAN should only have GRE sequence and key flag */ + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (!data[IFLA_GRE_COLLECT_METADATA] && + flags != (GRE_SEQ | GRE_KEY)) + return -EINVAL; + + /* ERSPAN Session ID only has 10-bit. Since we reuse + * 32-bit key field as ID, check it's range. + */ + if (data[IFLA_GRE_IKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_OKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_ERSPAN_VER]) { + ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + if (ver != 1 && ver != 2) + return -EINVAL; + } + + if (ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + + if (index & ~INDEX_MASK) + return -EINVAL; + } + } else if (ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + + if (dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + + if (data[IFLA_GRE_ERSPAN_HWID]) { + u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + + if (hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + + return 0; +} static void ip6gre_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) @@ -1267,6 +1658,22 @@ static void ip6gre_netlink_parms(struct nlattr *data[], if (data[IFLA_GRE_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + + if (data[IFLA_GRE_COLLECT_METADATA]) + parms->collect_md = true; + + if (data[IFLA_GRE_ERSPAN_VER]) + parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + + if (parms->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) + parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + } else if (parms->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) + parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (data[IFLA_GRE_ERSPAN_HWID]) + parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + } } static int ip6gre_tap_init(struct net_device *dev) @@ -1303,6 +1710,59 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { NETIF_F_HIGHDMA | \ NETIF_F_HW_CSUM) +static int ip6erspan_tap_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int t_hlen; + int ret; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + + tunnel->tun_hlen = 8; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + erspan_hdr_len(tunnel->parms.erspan_ver); + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); + + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; + if (dev->type == ARPHRD_ETHER) + dev->mtu -= ETH_HLEN; + if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + tunnel = netdev_priv(dev); + ip6gre_tnl_link_config(tunnel, 1); + + return 0; +} + +static const struct net_device_ops ip6erspan_netdev_ops = { + .ndo_init = ip6erspan_tap_init, + .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_start_xmit = ip6erspan_tunnel_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip6_tnl_get_iflink, +}; + static void ip6gre_tap_setup(struct net_device *dev) { @@ -1372,8 +1832,13 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, ip6gre_netlink_parms(data, &nt->parms); - if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; + if (nt->parms.collect_md) { + if (rtnl_dereference(ign->collect_md_tun)) + return -EEXIST; + } else { + if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + } if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@ -1492,8 +1957,12 @@ static size_t ip6gre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_INDEX */ + nla_total_size(4) + 0; } @@ -1515,7 +1984,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || - nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) + nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) || + nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -1528,6 +1998,24 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (p->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) + goto nla_put_failure; + } + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) + goto nla_put_failure; + + if (p->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) + goto nla_put_failure; + } else if (p->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) + goto nla_put_failure; + } + return 0; nla_put_failure: @@ -1550,9 +2038,28 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; +static void ip6erspan_tap_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->netdev_ops = &ip6erspan_netdev_ops; + dev->needs_free_netdev = true; + dev->priv_destructor = ip6gre_dev_free; + + dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); +} + static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .kind = "ip6gre", .maxtype = IFLA_GRE_MAX, @@ -1582,6 +2089,20 @@ static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; +static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = { + .kind = "ip6erspan", + .maxtype = IFLA_GRE_MAX, + .policy = ip6gre_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6erspan_tap_setup, + .validate = ip6erspan_tap_validate, + .newlink = ip6gre_newlink, + .changelink = ip6gre_changelink, + .get_size = ip6gre_get_size, + .fill_info = ip6gre_fill_info, + .get_link_net = ip6_tnl_get_link_net, +}; + /* * And now the modules code and kernel interface. */ @@ -1610,9 +2131,15 @@ static int __init ip6gre_init(void) if (err < 0) goto tap_ops_failed; + err = rtnl_link_register(&ip6erspan_tap_ops); + if (err < 0) + goto erspan_link_failed; + out: return err; +erspan_link_failed: + rtnl_link_unregister(&ip6gre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ip6gre_link_ops); rtnl_link_failed: @@ -1626,6 +2153,7 @@ static void __exit ip6gre_fini(void) { rtnl_link_unregister(&ip6gre_tap_ops); rtnl_link_unregister(&ip6gre_link_ops); + rtnl_link_unregister(&ip6erspan_tap_ops); inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); unregister_pernet_device(&ip6gre_net_ops); } @@ -1637,4 +2165,5 @@ MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); MODULE_ALIAS_RTNL_LINK("ip6gretap"); +MODULE_ALIAS_RTNL_LINK("ip6erspan"); MODULE_ALIAS_NETDEV("ip6gre0"); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5110a418cc4d..176d74fb3b4d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1201,13 +1201,13 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, rt->dst.dev->mtu : dst_mtu(&rt->dst); else mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + rt->dst.dev->mtu : dst_mtu(xfrm_dst_path(&rt->dst)); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } cork->base.fragsize = mtu; - if (dst_allfrag(rt->dst.path)) + if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index db84f523656d..6ff2f21ae3fc 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -861,7 +861,7 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, struct metadata_dst *tun_dst, bool log_ecn_err) { - return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate, + return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); @@ -979,6 +979,9 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, int ret = 0; struct net *net = t->net; + if (t->parms.collect_md) + return 1; + if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a2e1a864eb46..890f9bda06a4 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1425,10 +1425,13 @@ int __init ip6_mr_init(void) goto add_proto_fail; } #endif - rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, - ip6mr_rtm_dumproute, 0); - return 0; + err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, + NULL, ip6mr_rtm_dumproute, 0); + if (err == 0) + return 0; + #ifdef CONFIG_IPV6_PIMSM_V2 + inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7a8d1500d374..b3f4d19b3ca5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -186,7 +186,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) { - return dst_metrics_write_ptr(rt->dst.from); + return dst_metrics_write_ptr(&rt->from->dst); } static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) @@ -391,7 +391,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; struct rt6_exception_bucket *bucket; - struct dst_entry *from = dst->from; + struct rt6_info *from = rt->from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -409,8 +409,8 @@ static void ip6_dst_destroy(struct dst_entry *dst) kfree(bucket); } - dst->from = NULL; - dst_release(from); + rt->from = NULL; + dst_release(&from->dst); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -443,9 +443,9 @@ static bool rt6_check_expired(const struct rt6_info *rt) if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; - } else if (rt->dst.from) { + } else if (rt->from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - rt6_check_expired((struct rt6_info *)rt->dst.from); + rt6_check_expired(rt->from); } return false; } @@ -502,7 +502,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { struct net_device *dev = sprt->dst.dev; if (oif) { @@ -721,7 +721,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -731,7 +731,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, } for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->dst.rt6_next)) { + rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -743,7 +743,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) + for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -781,7 +781,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); + struct rt6_info *next = rcu_dereference(rt0->rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@ -1054,7 +1054,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, */ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); @@ -1274,7 +1274,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, /* ort can't be a cache or pcpu route */ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from; WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); spin_lock_bh(&rt6_exception_lock); @@ -1415,8 +1415,8 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, /* Remove the passed in cached rt from the hash table that contains it */ int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; int err; @@ -1460,8 +1460,8 @@ int rt6_remove_exception_rt(struct rt6_info *rt) */ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1929,9 +1929,9 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori static void rt6_dst_from_metrics_check(struct rt6_info *rt) { - if (rt->dst.from && - dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) - dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); + if (rt->from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); } static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) @@ -1951,7 +1951,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + rt6_check(rt->from, cookie)) return &rt->dst; else return NULL; @@ -1971,7 +1971,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt6_dst_from_metrics_check(rt); if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); @@ -3055,11 +3055,11 @@ out: static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - BUG_ON(from->dst.from); + BUG_ON(from->from); rt->rt6i_flags &= ~RTF_EXPIRES; dst_hold(&from->dst); - rt->dst.from = &from->dst; + rt->from = from; dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); } @@ -4596,8 +4596,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_ip6_dst_entries; - net->ipv6.ip6_null_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); @@ -4609,8 +4607,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; - net->ipv6.ip6_prohibit_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_prohibit_entry; net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); @@ -4620,8 +4616,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; - net->ipv6.ip6_blk_hole_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); @@ -4778,11 +4772,20 @@ int __init ip6_route_init(void) if (ret) goto fib6_rules_init; - ret = -ENOBUFS; - if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) || - __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) || - __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, - RTNL_FLAG_DOIT_UNLOCKED)) + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, + inet6_rtm_newroute, NULL, 0); + if (ret < 0) + goto out_register_late_subsys; + + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, + inet6_rtm_delroute, NULL, 0); + if (ret < 0) + goto out_register_late_subsys; + + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, + inet6_rtm_getroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); @@ -4800,6 +4803,7 @@ out: return ret; out_register_late_subsys: + rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index c81407770956..7f5621d09571 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -306,9 +306,7 @@ static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) struct seg6_hmac_info *hinfo; int ret; - ret = rhashtable_walk_start(iter); - if (ret && ret != -EAGAIN) - goto done; + rhashtable_walk_start(iter); for (;;) { hinfo = rhashtable_walk_next(iter); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3f30fa313bf2..eecf9f0faf29 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -89,28 +89,12 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -static u32 udp6_portaddr_hash(const struct net *net, - const struct in6_addr *addr6, - unsigned int port) -{ - unsigned int hash, mix = net_hash_mix(net); - - if (ipv6_addr_any(addr6)) - hash = jhash_1word(0, mix); - else if (ipv6_addr_v4mapped(addr6)) - hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); - else - hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); - - return hash ^ port; -} - int udp_v6_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum); + ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum); unsigned int hash2_partial = - udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); + ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@ -119,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) static void udp_v6_rehash(struct sock *sk) { - u16 new_hash = udp6_portaddr_hash(sock_net(sk), + u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); @@ -184,7 +168,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; result = NULL; @@ -193,8 +177,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); @@ -202,15 +185,9 @@ static struct sock *udp6_lib_lookup2(struct net *net, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -228,11 +205,11 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp6_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; if (hslot->count > 10) { - hash2 = udp6_portaddr_hash(net, daddr, hnum); + hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@ -243,7 +220,7 @@ struct sock *__udp6_lib_lookup(struct net *net, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. */ if (unlikely(slot2 == old_slot2)) @@ -267,23 +244,16 @@ begin: score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -719,9 +689,9 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; if (use_hash2) { - hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & + hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) & udptable->mask; - hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@ -909,7 +879,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 02556e356f87..e66b94f46532 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -59,7 +59,7 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) if (x->props.flags & XFRM_STATE_NOECN) dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->hop_limit = ip6_dst_hoplimit(dst->child); + top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = *(struct in6_addr *)&x->props.saddr; top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 885ade234a49..09fb44ee3b45 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -265,7 +265,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, in6_dev_put(xdst->u.rt6.rt6i_idev); xdst->u.rt6.rt6i_idev = loopback_idev; in6_dev_hold(loopback_idev); - xdst = (struct xfrm_dst *)xdst->u.dst.child; + xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst); } while (xdst->u.dst.xfrm); __in6_dev_put(loopback_idev); diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 86c8dfef56a4..a5125624a76d 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -257,9 +257,7 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx) if (ret) return NULL; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto err; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -269,7 +267,6 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx) if (i++ == idx) break; } -err: rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); @@ -513,9 +510,7 @@ void mesh_plink_broken(struct sta_info *sta) if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -535,7 +530,6 @@ void mesh_plink_broken(struct sta_info *sta) WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast); } } -out: rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } @@ -584,9 +578,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -597,7 +589,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) if (rcu_access_pointer(mpath->next_hop) == sta) __mesh_path_del(tbl, mpath); } -out: + rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } @@ -614,9 +606,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -627,7 +617,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, if (ether_addr_equal(mpath->mpp, proxy)) __mesh_path_del(tbl, mpath); } -out: + rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } @@ -642,9 +632,7 @@ static void table_flush_by_iface(struct mesh_table *tbl) if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -653,7 +641,7 @@ static void table_flush_by_iface(struct mesh_table *tbl) break; __mesh_path_del(tbl, mpath); } -out: + rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } @@ -873,9 +861,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata, if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -887,7 +873,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata, time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) __mesh_path_del(tbl, mpath); } -out: + rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 8ca9915befc8..5dce8336d33f 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2510,12 +2510,15 @@ static int __init mpls_init(void) rtnl_af_register(&mpls_af_ops); - rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0); - rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0); - rtnl_register(PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, - 0); - rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, - mpls_netconf_dump_devconf, 0); + rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_NEWROUTE, + mpls_rtm_newroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_DELROUTE, + mpls_rtm_delroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETROUTE, + mpls_getroute, mpls_dump_routes, 0); + rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETNETCONF, + mpls_netconf_get_devconf, + mpls_netconf_dump_devconf, 0); err = ipgre_tunnel_encap_add_mpls_ops(); if (err) pr_err("Can't add mpls over gre tunnel ops\n"); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index f8166c1d5430..3f1624ee056f 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -251,11 +251,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (err) return; - err = rhashtable_walk_start(&hti); - if (err && err != -EAGAIN) { - iter->err = err; - goto out; - } + rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { @@ -306,9 +302,7 @@ static void nft_rhash_gc(struct work_struct *work) if (err) goto schedule; - err = rhashtable_walk_start(&hti); - if (err && err != -EAGAIN) - goto out; + rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 2b4ab189bba7..5639fb03bdd9 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -93,7 +93,8 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, if (dst->xfrm == NULL) return -1; - for (i = 0; dst && dst->xfrm; dst = dst->child, i++) { + for (i = 0; dst && dst->xfrm; + dst = ((struct xfrm_dst *)dst)->child, i++) { pos = strict ? i : 0; if (pos >= info->len) return 0; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 79cc1bf36e4a..972bfe113043 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -65,6 +65,7 @@ #include <linux/net_namespace.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> @@ -145,8 +146,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); static BLOCKING_NOTIFIER_HEAD(netlink_chain); -static DEFINE_SPINLOCK(netlink_tap_lock); -static struct list_head netlink_tap_all __read_mostly; static const struct rhashtable_params netlink_rhashtable_params; @@ -173,14 +172,24 @@ static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, return new; } +static unsigned int netlink_tap_net_id; + +struct netlink_tap_net { + struct list_head netlink_tap_all; + struct mutex netlink_tap_lock; +}; + int netlink_add_tap(struct netlink_tap *nt) { + struct net *net = dev_net(nt->dev); + struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); + if (unlikely(nt->dev->type != ARPHRD_NETLINK)) return -EINVAL; - spin_lock(&netlink_tap_lock); - list_add_rcu(&nt->list, &netlink_tap_all); - spin_unlock(&netlink_tap_lock); + mutex_lock(&nn->netlink_tap_lock); + list_add_rcu(&nt->list, &nn->netlink_tap_all); + mutex_unlock(&nn->netlink_tap_lock); __module_get(nt->module); @@ -190,12 +199,14 @@ EXPORT_SYMBOL_GPL(netlink_add_tap); static int __netlink_remove_tap(struct netlink_tap *nt) { + struct net *net = dev_net(nt->dev); + struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); bool found = false; struct netlink_tap *tmp; - spin_lock(&netlink_tap_lock); + mutex_lock(&nn->netlink_tap_lock); - list_for_each_entry(tmp, &netlink_tap_all, list) { + list_for_each_entry(tmp, &nn->netlink_tap_all, list) { if (nt == tmp) { list_del_rcu(&nt->list); found = true; @@ -205,7 +216,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt) pr_warn("__netlink_remove_tap: %p not found\n", nt); out: - spin_unlock(&netlink_tap_lock); + mutex_unlock(&nn->netlink_tap_lock); if (found) module_put(nt->module); @@ -224,6 +235,26 @@ int netlink_remove_tap(struct netlink_tap *nt) } EXPORT_SYMBOL_GPL(netlink_remove_tap); +static __net_init int netlink_tap_init_net(struct net *net) +{ + struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); + + INIT_LIST_HEAD(&nn->netlink_tap_all); + mutex_init(&nn->netlink_tap_lock); + return 0; +} + +static void __net_exit netlink_tap_exit_net(struct net *net) +{ +} + +static struct pernet_operations netlink_tap_net_ops = { + .init = netlink_tap_init_net, + .exit = netlink_tap_exit_net, + .id = &netlink_tap_net_id, + .size = sizeof(struct netlink_tap_net), +}; + static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; @@ -277,7 +308,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, return ret; } -static void __netlink_deliver_tap(struct sk_buff *skb) +static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn) { int ret; struct netlink_tap *tmp; @@ -285,19 +316,21 @@ static void __netlink_deliver_tap(struct sk_buff *skb) if (!netlink_filter_tap(skb)) return; - list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { + list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } -static void netlink_deliver_tap(struct sk_buff *skb) +static void netlink_deliver_tap(struct net *net, struct sk_buff *skb) { + struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); + rcu_read_lock(); - if (unlikely(!list_empty(&netlink_tap_all))) - __netlink_deliver_tap(skb); + if (unlikely(!list_empty(&nn->netlink_tap_all))) + __netlink_deliver_tap(skb, nn); rcu_read_unlock(); } @@ -306,7 +339,7 @@ static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, struct sk_buff *skb) { if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) - netlink_deliver_tap(skb); + netlink_deliver_tap(sock_net(dst), skb); } static void netlink_overrun(struct sock *sk) @@ -1216,7 +1249,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; - netlink_deliver_tap(skb); + netlink_deliver_tap(sock_net(sk), skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); @@ -2481,8 +2514,9 @@ static int netlink_walk_start(struct nl_seq_iter *iter) return err; } - err = rhashtable_walk_start(&iter->hti); - return err == -EAGAIN ? 0 : err; + rhashtable_walk_start(&iter->hti); + + return 0; } static void netlink_walk_stop(struct nl_seq_iter *iter) @@ -2733,12 +2767,11 @@ static int __init netlink_proto_init(void) } } - INIT_LIST_HEAD(&netlink_tap_all); - netlink_add_usersock_entry(); sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); + register_pernet_subsys(&netlink_tap_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 8faa20b4d457..7dda33b9b784 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -115,11 +115,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, if (!s_num) rhashtable_walk_enter(&tbl->hash, hti); - ret = rhashtable_walk_start(hti); - if (ret == -EAGAIN) - ret = 0; - if (ret) - goto stop; + rhashtable_walk_start(hti); while ((nlsk = rhashtable_walk_next(hti))) { if (IS_ERR(nlsk)) { @@ -146,8 +142,8 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, } } -stop: rhashtable_walk_stop(hti); + if (ret) goto done; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index dbe2379329c5..76d050aba7a4 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -56,12 +56,12 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) { - struct timespec cur_ts; + struct timespec64 cur_ts; u64 cur_ms, idle_ms; - ktime_get_ts(&cur_ts); + ktime_get_ts64(&cur_ts); idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); - cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + + cur_ms = (u64)(u32)cur_ts.tv_sec * MSEC_PER_SEC + cur_ts.tv_nsec / NSEC_PER_MSEC; return cur_ms - idle_ms; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 624ea74353dd..bce1f78b0de5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -644,12 +644,12 @@ static int erspan_tun_opt_from_nlattr(const struct nlattr *attr, BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); memset(&opts, 0, sizeof(opts)); - opts.index = nla_get_be32(attr); + opts.u.index = nla_get_be32(attr); /* Index has only 20-bit */ - if (ntohl(opts.index) & ~INDEX_MASK) { + if (ntohl(opts.u.index) & ~INDEX_MASK) { OVS_NLERR(log, "ERSPAN index number %x too large.", - ntohl(opts.index)); + ntohl(opts.u.index)); return -EINVAL; } @@ -907,7 +907,7 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, return -EMSGSIZE; else if (output->tun_flags & TUNNEL_ERSPAN_OPT && nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, - ((struct erspan_metadata *)tun_opts)->index)) + ((struct erspan_metadata *)tun_opts)->u.index)) return -EMSGSIZE; } diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 04a3128adcf0..3e7747549f90 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -126,18 +126,12 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) } } -static void internal_set_rx_headroom(struct net_device *dev, int new_hr) -{ - dev->needed_headroom = new_hr < 0 ? 0 : new_hr; -} - static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_get_stats64 = internal_get_stats, - .ndo_set_rx_headroom = internal_set_rx_headroom, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -154,7 +148,7 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags &= ~IFF_TX_SKB_SHARING; netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | - IFF_PHONY_HEADROOM | IFF_NO_QUEUE; + IFF_NO_QUEUE; netdev->needs_free_netdev = true; netdev->priv_destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; @@ -195,7 +189,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_netdev; } - vport->dev->needed_headroom = vport->dp->max_headroom; dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev); diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index da754fc926e7..871eaf2cb85e 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -299,16 +299,21 @@ out: int __init phonet_netlink_register(void) { - int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit, - NULL, 0); + int err = rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWADDR, + addr_doit, NULL, 0); if (err) return err; - /* Further __rtnl_register() cannot fail */ - __rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0); - __rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0); - __rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0); - __rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0); - __rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, 0); + /* Further rtnl_register_module() cannot fail */ + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELADDR, + addr_doit, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETADDR, + NULL, getaddr_dumpit, 0); + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWROUTE, + route_doit, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELROUTE, + route_doit, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETROUTE, + NULL, route_dumpit, 0); return 0; } diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 77ab05e23001..5fb3929e3d7d 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1116,9 +1116,13 @@ static int __init qrtr_proto_init(void) return rc; } - rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0); + rc = rtnl_register_module(THIS_MODULE, PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0); + if (rc) { + sock_unregister(qrtr_family.family); + proto_unregister(&qrtr_proto); + } - return 0; + return rc; } postcore_initcall(qrtr_proto_init); diff --git a/net/rds/connection.c b/net/rds/connection.c index 7ee2d5d68b78..6492c0b608a4 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -230,8 +230,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", conn, &laddr, &faddr, - trans->t_name ? trans->t_name : "[unknown]", - is_outgoing ? "(outgoing)" : ""); + strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : + "[unknown]", is_outgoing ? "(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could @@ -366,6 +366,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp) * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ cancel_delayed_work_sync(&cp->cp_conn_w); + if (conn->c_destroy_in_prog) + return; rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); @@ -445,7 +447,6 @@ void rds_conn_destroy(struct rds_connection *conn) */ rds_cong_remove_conn(conn); - put_net(conn->c_net); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); diff --git a/net/rds/rds.h b/net/rds/rds.h index c349c71babff..d09f6c1facb4 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -150,7 +150,7 @@ struct rds_connection { /* Protocol version */ unsigned int c_version; - struct net *c_net; + possible_net_t c_net; struct list_head c_map_item; unsigned long c_map_queued; @@ -165,13 +165,13 @@ struct rds_connection { static inline struct net *rds_conn_net(struct rds_connection *conn) { - return conn->c_net; + return read_pnet(&conn->c_net); } static inline void rds_conn_net_set(struct rds_connection *conn, struct net *net) { - conn->c_net = get_net(net); + write_pnet(&conn->c_net, net); } #define RDS_FLAG_CONG_BITMAP 0x01 diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 6b7ee71f40c6..39f502d47969 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -306,7 +306,8 @@ static void rds_tcp_conn_free(void *arg) rdsdebug("freeing tc %p\n", tc); spin_lock_irqsave(&rds_tcp_conn_lock, flags); - list_del(&tc->t_tcp_node); + if (!tc->t_tcp_node_detached) + list_del(&tc->t_tcp_node); spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); kmem_cache_free(rds_tcp_conn_slab, tc); @@ -495,27 +496,6 @@ static struct pernet_operations rds_tcp_net_ops = { .size = sizeof(struct rds_tcp_net), }; -/* explicitly send a RST on each socket, thereby releasing any socket refcnts - * that may otherwise hold up netns deletion. - */ -static void rds_tcp_conn_paths_destroy(struct rds_connection *conn) -{ - struct rds_conn_path *cp; - struct rds_tcp_connection *tc; - int i; - struct sock *sk; - - for (i = 0; i < RDS_MPATH_WORKERS; i++) { - cp = &conn->c_path[i]; - tc = cp->cp_transport_data; - if (!tc->t_sock) - continue; - sk = tc->t_sock->sk; - sk->sk_prot->disconnect(sk, 0); - tcp_done(sk); - } -} - static void rds_tcp_kill_sock(struct net *net) { struct rds_tcp_connection *tc, *_tc; @@ -527,18 +507,20 @@ static void rds_tcp_kill_sock(struct net *net) rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = tc->t_cpath->cp_conn->c_net; + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; - if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) { list_move_tail(&tc->t_tcp_node, &tmp_list); + } else { + list_del(&tc->t_tcp_node); + tc->t_tcp_node_detached = true; + } } spin_unlock_irq(&rds_tcp_conn_lock); - list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { - rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn); + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) rds_conn_destroy(tc->t_cpath->cp_conn); - } } void *rds_tcp_listen_sock_def_readable(struct net *net) @@ -586,7 +568,7 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = tc->t_cpath->cp_conn->c_net; + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 1aafbf7c3011..e7858ee8ed8b 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -12,6 +12,7 @@ struct rds_tcp_incoming { struct rds_tcp_connection { struct list_head t_tcp_node; + bool t_tcp_node_detached; struct rds_conn_path *t_cpath; /* t_conn_path_lock synchronizes the connection establishment between * rds_tcp_accept_one and rds_tcp_conn_path_connect diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 4d33a50a8a6d..52622a3d2517 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -99,7 +99,7 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) p->tcfa_refcnt--; if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { if (p->ops->cleanup) - p->ops->cleanup(p, bind); + p->ops->cleanup(p); tcf_idr_remove(p->idrinfo, p); ret = ACT_P_DELETED; } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 5ef8ce8c83d4..b3f2c15affa7 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -357,7 +357,7 @@ out: return ret; } -static void tcf_bpf_cleanup(struct tc_action *act, int bind) +static void tcf_bpf_cleanup(struct tc_action *act) { struct tcf_bpf_cfg tmp; @@ -401,16 +401,14 @@ static __net_init int bpf_init_net(struct net *net) return tc_action_net_init(tn, &act_bpf_ops); } -static void __net_exit bpf_exit_net(struct net *net) +static void __net_exit bpf_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, bpf_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, bpf_net_id); } static struct pernet_operations bpf_net_ops = { .init = bpf_init_net, - .exit = bpf_exit_net, + .exit_batch = bpf_exit_net, .id = &bpf_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 10b7a8855a6c..2b15ba84e0c8 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -209,16 +209,14 @@ static __net_init int connmark_init_net(struct net *net) return tc_action_net_init(tn, &act_connmark_ops); } -static void __net_exit connmark_exit_net(struct net *net) +static void __net_exit connmark_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, connmark_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, connmark_net_id); } static struct pernet_operations connmark_net_ops = { .init = connmark_init_net, - .exit = connmark_exit_net, + .exit_batch = connmark_exit_net, .id = &connmark_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index d836f998117b..af4b8ec60d9a 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -635,16 +635,14 @@ static __net_init int csum_init_net(struct net *net) return tc_action_net_init(tn, &act_csum_ops); } -static void __net_exit csum_exit_net(struct net *net) +static void __net_exit csum_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, csum_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, csum_net_id); } static struct pernet_operations csum_net_ops = { .init = csum_init_net, - .exit = csum_exit_net, + .exit_batch = csum_exit_net, .id = &csum_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e29a48ef7fc3..9d632e92cad0 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -235,16 +235,14 @@ static __net_init int gact_init_net(struct net *net) return tc_action_net_init(tn, &act_gact_ops); } -static void __net_exit gact_exit_net(struct net *net) +static void __net_exit gact_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, gact_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, gact_net_id); } static struct pernet_operations gact_net_ops = { .init = gact_init_net, - .exit = gact_exit_net, + .exit_batch = gact_exit_net, .id = &gact_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3007cb1310ea..5954e992685a 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -387,7 +387,7 @@ out_nlmsg_trim: } /* under ife->tcf_lock */ -static void _tcf_ife_cleanup(struct tc_action *a, int bind) +static void _tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; @@ -405,13 +405,13 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) } } -static void tcf_ife_cleanup(struct tc_action *a, int bind) +static void tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); - _tcf_ife_cleanup(a, bind); + _tcf_ife_cleanup(a); spin_unlock_bh(&ife->tcf_lock); p = rcu_dereference_protected(ife->params, 1); @@ -546,7 +546,7 @@ metadata_parse_err: if (exists) tcf_idr_release(*a, bind); if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(*a, bind); + _tcf_ife_cleanup(*a); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -567,7 +567,7 @@ metadata_parse_err: err = use_all_metadata(ife); if (err) { if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(*a, bind); + _tcf_ife_cleanup(*a); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -858,16 +858,14 @@ static __net_init int ife_init_net(struct net *net) return tc_action_net_init(tn, &act_ife_ops); } -static void __net_exit ife_exit_net(struct net *net) +static void __net_exit ife_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, ife_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, ife_net_id); } static struct pernet_operations ife_net_ops = { .init = ife_init_net, - .exit = ife_exit_net, + .exit_batch = ife_exit_net, .id = &ife_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d9e399a7e3d5..06e380ae0928 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -77,7 +77,7 @@ static void ipt_destroy_target(struct xt_entry_target *t) module_put(par.target->me); } -static void tcf_ipt_release(struct tc_action *a, int bind) +static void tcf_ipt_release(struct tc_action *a) { struct tcf_ipt *ipt = to_ipt(a); ipt_destroy_target(ipt->tcfi_t); @@ -337,16 +337,14 @@ static __net_init int ipt_init_net(struct net *net) return tc_action_net_init(tn, &act_ipt_ops); } -static void __net_exit ipt_exit_net(struct net *net) +static void __net_exit ipt_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, ipt_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, ipt_net_id); } static struct pernet_operations ipt_net_ops = { .init = ipt_init_net, - .exit = ipt_exit_net, + .exit_batch = ipt_exit_net, .id = &ipt_net_id, .size = sizeof(struct tc_action_net), }; @@ -387,16 +385,14 @@ static __net_init int xt_init_net(struct net *net) return tc_action_net_init(tn, &act_xt_ops); } -static void __net_exit xt_exit_net(struct net *net) +static void __net_exit xt_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, xt_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, xt_net_id); } static struct pernet_operations xt_net_ops = { .init = xt_init_net, - .exit = xt_exit_net, + .exit_batch = xt_exit_net, .id = &xt_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 8b3e59388480..37e5e4decbd6 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -29,7 +29,6 @@ #include <net/tc_act/tc_mirred.h> static LIST_HEAD(mirred_list); -static DEFINE_SPINLOCK(mirred_list_lock); static bool tcf_mirred_is_act_redirect(int action) { @@ -50,18 +49,15 @@ static bool tcf_mirred_act_wants_ingress(int action) } } -static void tcf_mirred_release(struct tc_action *a, int bind) +static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; - /* We could be called either in a RCU callback or with RTNL lock held. */ - spin_lock_bh(&mirred_list_lock); list_del(&m->tcfm_list); - dev = rcu_dereference_protected(m->tcfm_dev, 1); + dev = rtnl_dereference(m->tcfm_dev); if (dev) dev_put(dev); - spin_unlock_bh(&mirred_list_lock); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -139,8 +135,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; if (dev != NULL) { - m->tcfm_ifindex = parm->ifindex; - m->net = net; if (ret != ACT_P_CREATED) dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); dev_hold(dev); @@ -149,9 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) { - spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); - spin_unlock_bh(&mirred_list_lock); tcf_idr_insert(tn, *a); } @@ -247,13 +239,14 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rtnl_dereference(m->tcfm_dev); struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, .refcnt = m->tcf_refcnt - ref, .bindcnt = m->tcf_bindcnt - bind, .eaction = m->tcfm_eaction, - .ifindex = m->tcfm_ifindex, + .ifindex = dev ? dev->ifindex : 0, }; struct tcf_t t; @@ -294,7 +287,6 @@ static int mirred_device_event(struct notifier_block *unused, ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) { - spin_lock_bh(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { if (rcu_access_pointer(m->tcfm_dev) == dev) { dev_put(dev); @@ -304,7 +296,6 @@ static int mirred_device_event(struct notifier_block *unused, RCU_INIT_POINTER(m->tcfm_dev, NULL); } } - spin_unlock_bh(&mirred_list_lock); } return NOTIFY_DONE; @@ -318,7 +309,7 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); - return __dev_get_by_index(m->net, m->tcfm_ifindex); + return rtnl_dereference(m->tcfm_dev); } static struct tc_action_ops act_mirred_ops = { @@ -343,16 +334,14 @@ static __net_init int mirred_init_net(struct net *net) return tc_action_net_init(tn, &act_mirred_ops); } -static void __net_exit mirred_exit_net(struct net *net) +static void __net_exit mirred_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, mirred_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, mirred_net_id); } static struct pernet_operations mirred_net_ops = { .init = mirred_init_net, - .exit = mirred_exit_net, + .exit_batch = mirred_exit_net, .id = &mirred_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index c365d01b99c8..98c6a4b2f523 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -310,16 +310,14 @@ static __net_init int nat_init_net(struct net *net) return tc_action_net_init(tn, &act_nat_ops); } -static void __net_exit nat_exit_net(struct net *net) +static void __net_exit nat_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, nat_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, nat_net_id); } static struct pernet_operations nat_net_ops = { .init = nat_init_net, - .exit = nat_exit_net, + .exit_batch = nat_exit_net, .id = &nat_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 491fe5deb09e..349beaffb29e 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -216,7 +216,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_pedit_cleanup(struct tc_action *a, int bind) +static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; @@ -453,16 +453,14 @@ static __net_init int pedit_init_net(struct net *net) return tc_action_net_init(tn, &act_pedit_ops); } -static void __net_exit pedit_exit_net(struct net *net) +static void __net_exit pedit_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, pedit_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, pedit_net_id); } static struct pernet_operations pedit_net_ops = { .init = pedit_init_net, - .exit = pedit_exit_net, + .exit_batch = pedit_exit_net, .id = &pedit_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 3bb2ebf9e9ae..bf483db993a1 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -334,16 +334,14 @@ static __net_init int police_init_net(struct net *net) return tc_action_net_init(tn, &act_police_ops); } -static void __net_exit police_exit_net(struct net *net) +static void __net_exit police_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, police_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, police_net_id); } static struct pernet_operations police_net_ops = { .init = police_init_net, - .exit = police_exit_net, + .exit_batch = police_exit_net, .id = &police_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 9438969290a6..1ba0df238756 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -96,7 +96,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_sample_cleanup(struct tc_action *a, int bind) +static void tcf_sample_cleanup(struct tc_action *a) { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; @@ -236,16 +236,14 @@ static __net_init int sample_init_net(struct net *net) return tc_action_net_init(tn, &act_sample_ops); } -static void __net_exit sample_exit_net(struct net *net) +static void __net_exit sample_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, sample_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, sample_net_id); } static struct pernet_operations sample_net_ops = { .init = sample_init_net, - .exit = sample_exit_net, + .exit_batch = sample_exit_net, .id = &sample_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index e7b57e5071a3..425eac11f6da 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -47,7 +47,7 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, return d->tcf_action; } -static void tcf_simp_release(struct tc_action *a, int bind) +static void tcf_simp_release(struct tc_action *a) { struct tcf_defact *d = to_defact(a); kfree(d->tcfd_defdata); @@ -204,16 +204,14 @@ static __net_init int simp_init_net(struct net *net) return tc_action_net_init(tn, &act_simp_ops); } -static void __net_exit simp_exit_net(struct net *net) +static void __net_exit simp_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, simp_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, simp_net_id); } static struct pernet_operations simp_net_ops = { .init = simp_init_net, - .exit = simp_exit_net, + .exit_batch = simp_exit_net, .id = &simp_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 59949d61f20d..5a3f691bb545 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -241,16 +241,14 @@ static __net_init int skbedit_init_net(struct net *net) return tc_action_net_init(tn, &act_skbedit_ops); } -static void __net_exit skbedit_exit_net(struct net *net) +static void __net_exit skbedit_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, skbedit_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, skbedit_net_id); } static struct pernet_operations skbedit_net_ops = { .init = skbedit_init_net, - .exit = skbedit_exit_net, + .exit_batch = skbedit_exit_net, .id = &skbedit_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index b642ad3d39dd..fa975262dbac 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -184,7 +184,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_skbmod_cleanup(struct tc_action *a, int bind) +static void tcf_skbmod_cleanup(struct tc_action *a) { struct tcf_skbmod *d = to_skbmod(a); struct tcf_skbmod_params *p; @@ -266,16 +266,14 @@ static __net_init int skbmod_init_net(struct net *net) return tc_action_net_init(tn, &act_skbmod_ops); } -static void __net_exit skbmod_exit_net(struct net *net) +static void __net_exit skbmod_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, skbmod_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, skbmod_net_id); } static struct pernet_operations skbmod_net_ops = { .init = skbmod_init_net, - .exit = skbmod_exit_net, + .exit_batch = skbmod_exit_net, .id = &skbmod_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 30c96274c638..0e23aac09ad6 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -201,7 +201,7 @@ err_out: return ret; } -static void tunnel_key_release(struct tc_action *a, int bind) +static void tunnel_key_release(struct tc_action *a) { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; @@ -325,16 +325,14 @@ static __net_init int tunnel_key_init_net(struct net *net) return tc_action_net_init(tn, &act_tunnel_key_ops); } -static void __net_exit tunnel_key_exit_net(struct net *net) +static void __net_exit tunnel_key_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, tunnel_key_net_id); } static struct pernet_operations tunnel_key_net_ops = { .init = tunnel_key_init_net, - .exit = tunnel_key_exit_net, + .exit_batch = tunnel_key_exit_net, .id = &tunnel_key_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 97f717a13ad5..e1a1b3f3983a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -219,7 +219,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_vlan_cleanup(struct tc_action *a, int bind) +static void tcf_vlan_cleanup(struct tc_action *a) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; @@ -301,16 +301,14 @@ static __net_init int vlan_init_net(struct net *net) return tc_action_net_init(tn, &act_vlan_ops); } -static void __net_exit vlan_exit_net(struct net *net) +static void __net_exit vlan_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, vlan_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, vlan_net_id); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, - .exit = vlan_exit_net, + .exit_batch = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b91ea03e3afa..32b1ea7cf863 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -217,8 +217,12 @@ static void tcf_chain_flush(struct tcf_chain *chain) static void tcf_chain_destroy(struct tcf_chain *chain) { + struct tcf_block *block = chain->block; + list_del(&chain->list); kfree(chain); + if (list_empty(&block->chain_list)) + kfree(block); } static void tcf_chain_hold(struct tcf_chain *chain) @@ -329,49 +333,34 @@ int tcf_block_get(struct tcf_block **p_block, } EXPORT_SYMBOL(tcf_block_get); -static void tcf_block_put_final(struct work_struct *work) -{ - struct tcf_block *block = container_of(work, struct tcf_block, work); - struct tcf_chain *chain, *tmp; - - rtnl_lock(); - - /* At this point, all the chains should have refcnt == 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) - tcf_chain_put(chain); - rtnl_unlock(); - kfree(block); -} - /* XXX: Standalone actions are not allowed to jump to any chain, and bound * actions should be all removed after flushing. */ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { - struct tcf_chain *chain; + struct tcf_chain *chain, *tmp; - if (!block) - return; - /* Hold a refcnt for all chains, except 0, so that they don't disappear + /* Hold a refcnt for all chains, so that they don't disappear * while we are iterating. */ + if (!block) + return; list_for_each_entry(chain, &block->chain_list, list) - if (chain->index) - tcf_chain_hold(chain); + tcf_chain_hold(chain); list_for_each_entry(chain, &block->chain_list, list) tcf_chain_flush(chain); tcf_block_offload_unbind(block, q, ei); - INIT_WORK(&block->work, tcf_block_put_final); - /* Wait for existing RCU callbacks to cool down, make sure their works - * have been queued before this. We can not flush pending works here - * because we are holding the RTNL lock. - */ - rcu_barrier(); - tcf_queue_work(&block->work); + /* At this point, all the chains should have refcnt >= 1. */ + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_put(chain); + + /* Finally, put chain 0 and allow block to be freed. */ + chain = list_first_entry(&block->chain_list, struct tcf_chain, list); + tcf_chain_put(chain); } EXPORT_SYMBOL(tcf_block_put_ext); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 543a3e875d05..6132a7317efa 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -166,6 +166,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, * so do it rather here. */ skb_key.basic.n_proto = skb->protocol; + skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key); skb_flow_dissect(skb, &head->dissector, &skb_key, 0); fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0f1eab99ff4e..74c22b4e365e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -799,7 +799,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; - qlen = q->q.qlen; + + qlen = qdisc_qlen_sum(q); stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0) @@ -956,6 +957,11 @@ skip: } else { const struct Qdisc_class_ops *cops = parent->ops->cl_ops; + /* Only support running class lockless if parent is lockless */ + if (new && (new->flags & TCQ_F_NOLOCK) && + parent && !(parent->flags & TCQ_F_NOLOCK)) + new->flags &= ~TCQ_F_NOLOCK; + err = -EOPNOTSUPP; if (cops && cops->graft) { unsigned long cl = cops->find(parent, classid); @@ -1022,7 +1028,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, #endif err = -ENOENT; - if (ops == NULL) + if (!ops) goto err_out; sch = qdisc_alloc(dev_queue, ops); @@ -1062,54 +1068,60 @@ static struct Qdisc *qdisc_create(struct net_device *dev, netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); } - if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { - if (qdisc_is_percpu_stats(sch)) { - sch->cpu_bstats = - netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); - if (!sch->cpu_bstats) - goto err_out4; + if (ops->init) { + err = ops->init(sch, tca[TCA_OPTIONS]); + if (err != 0) + goto err_out5; + } - sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); - if (!sch->cpu_qstats) - goto err_out4; - } + if (qdisc_is_percpu_stats(sch)) { + sch->cpu_bstats = + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto err_out4; - if (tca[TCA_STAB]) { - stab = qdisc_get_stab(tca[TCA_STAB]); - if (IS_ERR(stab)) { - err = PTR_ERR(stab); - goto err_out4; - } - rcu_assign_pointer(sch->stab, stab); - } - if (tca[TCA_RATE]) { - seqcount_t *running; - - err = -EOPNOTSUPP; - if (sch->flags & TCQ_F_MQROOT) - goto err_out4; + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) + goto err_out4; + } - if ((sch->parent != TC_H_ROOT) && - !(sch->flags & TCQ_F_INGRESS) && - (!p || !(p->flags & TCQ_F_MQROOT))) - running = qdisc_root_sleeping_running(sch); - else - running = &sch->running; - - err = gen_new_estimator(&sch->bstats, - sch->cpu_bstats, - &sch->rate_est, - NULL, - running, - tca[TCA_RATE]); - if (err) - goto err_out4; + if (tca[TCA_STAB]) { + stab = qdisc_get_stab(tca[TCA_STAB]); + if (IS_ERR(stab)) { + err = PTR_ERR(stab); + goto err_out4; } + rcu_assign_pointer(sch->stab, stab); + } + if (tca[TCA_RATE]) { + seqcount_t *running; - qdisc_hash_add(sch, false); + err = -EOPNOTSUPP; + if (sch->flags & TCQ_F_MQROOT) + goto err_out4; - return sch; + if (sch->parent != TC_H_ROOT && + !(sch->flags & TCQ_F_INGRESS) && + (!p || !(p->flags & TCQ_F_MQROOT))) + running = qdisc_root_sleeping_running(sch); + else + running = &sch->running; + + err = gen_new_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + NULL, + running, + tca[TCA_RATE]); + if (err) + goto err_out4; } + + qdisc_hash_add(sch, false); + + return sch; + +err_out5: /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */ if (ops->destroy) ops->destroy(sch); @@ -1141,7 +1153,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) int err = 0; if (tca[TCA_OPTIONS]) { - if (sch->ops->change == NULL) + if (!sch->ops->change) return -EINVAL; err = sch->ops->change(sch, tca[TCA_OPTIONS]); if (err) @@ -1346,7 +1358,8 @@ replay: goto create_n_graft; if (n->nlmsg_flags & NLM_F_EXCL) return -EEXIST; - if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) + if (tca[TCA_KIND] && + nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; if (q == p || (p && check_loop(q, p, 0))) @@ -1391,7 +1404,7 @@ replay: } /* Change qdisc parameters */ - if (q == NULL) + if (!q) return -ENOENT; if (n->nlmsg_flags & NLM_F_EXCL) return -EEXIST; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cd1b200acae7..981c08fe810b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,6 +26,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/if_vlan.h> +#include <linux/skb_array.h> #include <linux/if_macvlan.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> @@ -47,9 +48,70 @@ EXPORT_SYMBOL(default_qdisc_ops); * - updates to tree and tree walking are only done under the rtnl mutex. */ -static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) +{ + const struct netdev_queue *txq = q->dev_queue; + spinlock_t *lock = NULL; + struct sk_buff *skb; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + skb = skb_peek(&q->skb_bad_txq); + if (skb) { + /* check the reason of requeuing without tx lock first */ + txq = skb_get_tx_queue(txq->dev, skb); + if (!netif_xmit_frozen_or_stopped(txq)) { + skb = __skb_dequeue(&q->skb_bad_txq); + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_dec(q, skb); + qdisc_qstats_cpu_qlen_dec(q); + } else { + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + } + } else { + skb = NULL; + } + } + + if (lock) + spin_unlock(lock); + + return skb; +} + +static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q) +{ + struct sk_buff *skb = skb_peek(&q->skb_bad_txq); + + if (unlikely(skb)) + skb = __skb_dequeue_bad_txq(q); + + return skb; +} + +static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, + struct sk_buff *skb) { - q->gso_skb = skb; + spinlock_t *lock = NULL; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + __skb_queue_tail(&q->skb_bad_txq, skb); + + if (lock) + spin_unlock(lock); +} + +static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +{ + __skb_queue_head(&q->gso_skb, skb); q->qstats.requeues++; qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; /* it's still part of the queue */ @@ -58,6 +120,30 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } +static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q) +{ + spinlock_t *lock = qdisc_lock(q); + + spin_lock(lock); + __skb_queue_tail(&q->gso_skb, skb); + spin_unlock(lock); + + qdisc_qstats_cpu_requeues_inc(q); + qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_qlen_inc(q); + __netif_schedule(q); + + return 0; +} + +static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +{ + if (q->flags & TCQ_F_NOLOCK) + return dev_requeue_skb_locked(skb, q); + else + return __dev_requeue_skb(skb, q); +} + static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, @@ -95,9 +181,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { - q->skb_bad_txq = nskb; - qdisc_qstats_backlog_inc(q, nskb); - q->q.qlen++; + qdisc_enqueue_skb_bad_txq(q, nskb); + + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_inc(q, nskb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + qdisc_qstats_backlog_inc(q, nskb); + q->q.qlen++; + } break; } skb->next = nskb; @@ -113,40 +205,60 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets) { - struct sk_buff *skb = q->gso_skb; const struct netdev_queue *txq = q->dev_queue; + struct sk_buff *skb = NULL; *packets = 1; - if (unlikely(skb)) { + if (unlikely(!skb_queue_empty(&q->gso_skb))) { + spinlock_t *lock = NULL; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + skb = skb_peek(&q->gso_skb); + + /* skb may be null if another cpu pulls gso_skb off in between + * empty check and lock. + */ + if (!skb) { + if (lock) + spin_unlock(lock); + goto validate; + } + /* skb in gso_skb were already validated */ *validate = false; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { - q->gso_skb = NULL; - qdisc_qstats_backlog_dec(q, skb); - q->q.qlen--; - } else + skb = __skb_dequeue(&q->gso_skb); + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_dec(q, skb); + qdisc_qstats_cpu_qlen_dec(q); + } else { + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + } + } else { skb = NULL; - goto trace; - } - *validate = true; - skb = q->skb_bad_txq; - if (unlikely(skb)) { - /* check the reason of requeuing without tx lock first */ - txq = skb_get_tx_queue(txq->dev, skb); - if (!netif_xmit_frozen_or_stopped(txq)) { - q->skb_bad_txq = NULL; - qdisc_qstats_backlog_dec(q, skb); - q->q.qlen--; - goto bulk; } - skb = NULL; + if (lock) + spin_unlock(lock); goto trace; } - if (!(q->flags & TCQ_F_ONETXQUEUE) || - !netif_xmit_frozen_or_stopped(txq)) - skb = q->dequeue(q); +validate: + *validate = true; + + if ((q->flags & TCQ_F_ONETXQUEUE) && + netif_xmit_frozen_or_stopped(txq)) + return skb; + + skb = qdisc_dequeue_skb_bad_txq(q); + if (unlikely(skb)) + goto bulk; + skb = q->dequeue(q); if (skb) { bulk: if (qdisc_may_bulk(q)) @@ -165,17 +277,18 @@ trace: * only one CPU can execute this function. * * Returns to the caller: - * 0 - queue is empty or throttled. - * >0 - queue is not empty. + * false - hardware queue frozen backoff + * true - feel free to send more pkts */ -int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, - struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock, bool validate) +bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; /* And release qdisc */ - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) @@ -188,27 +301,28 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_UNLOCK(dev, txq); } else { - spin_lock(root_lock); - return qdisc_qlen(q); + if (root_lock) + spin_lock(root_lock); + return true; } - spin_lock(root_lock); - if (dev_xmit_complete(ret)) { - /* Driver sent out skb successfully or skb was consumed */ - ret = qdisc_qlen(q); - } else { + if (root_lock) + spin_lock(root_lock); + + if (!dev_xmit_complete(ret)) { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) net_warn_ratelimited("BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); - ret = dev_requeue_skb(skb, q); + dev_requeue_skb(skb, q); + return false; } if (ret && netif_xmit_frozen_or_stopped(txq)) - ret = 0; + return false; - return ret; + return true; } /* @@ -230,20 +344,22 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, * >0 - queue is not empty. * */ -static inline int qdisc_restart(struct Qdisc *q, int *packets) +static inline bool qdisc_restart(struct Qdisc *q, int *packets) { + spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; - spinlock_t *root_lock; struct sk_buff *skb; bool validate; /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) - return 0; + return false; + + if (!(q->flags & TCQ_F_NOLOCK)) + root_lock = qdisc_lock(q); - root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); @@ -267,8 +383,6 @@ void __qdisc_run(struct Qdisc *q) break; } } - - qdisc_run_end(q); } unsigned long dev_trans_start(struct net_device *dev) @@ -468,93 +582,93 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { /* * Private data for a pfifo_fast scheduler containing: - * - queues for the three band - * - bitmap indicating which of the bands contain skbs + * - rings for priority bands */ struct pfifo_fast_priv { - u32 bitmap; - struct qdisc_skb_head q[PFIFO_FAST_BANDS]; + struct skb_array q[PFIFO_FAST_BANDS]; }; -/* - * Convert a bitmap to the first band number where an skb is queued, where: - * bitmap=0 means there are no skbs on any band. - * bitmap=1 means there is an skb on band 0. - * bitmap=7 means there are skbs on all 3 bands, etc. - */ -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; - -static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv, - int band) +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, + int band) { - return priv->q + band; + return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { - int band = prio2band[skb->priority & TC_PRIO_MAX]; - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct qdisc_skb_head *list = band2list(priv, band); - - priv->bitmap |= (1 << band); - qdisc->q.qlen++; - return __qdisc_enqueue_tail(skb, qdisc, list); - } + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct skb_array *q = band2list(priv, band); + int err; + + err = skb_array_produce(q, skb); - return qdisc_drop(skb, qdisc, to_free); + if (unlikely(err)) + return qdisc_drop_cpu(skb, qdisc, to_free); + + qdisc_qstats_cpu_qlen_inc(qdisc); + qdisc_qstats_cpu_backlog_inc(qdisc, skb); + return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; - - if (likely(band >= 0)) { - struct qdisc_skb_head *qh = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qh); + struct sk_buff *skb = NULL; + int band; - if (likely(skb != NULL)) { - qdisc_qstats_backlog_dec(qdisc, skb); - qdisc_bstats_update(qdisc, skb); - } + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - qdisc->q.qlen--; - if (qh->qlen == 0) - priv->bitmap &= ~(1 << band); + if (__skb_array_empty(q)) + continue; - return skb; + skb = skb_array_consume_bh(q); + } + if (likely(skb)) { + qdisc_qstats_cpu_backlog_dec(qdisc, skb); + qdisc_bstats_cpu_update(qdisc, skb); + qdisc_qstats_cpu_qlen_dec(qdisc); } - return NULL; + return skb; } static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; + struct sk_buff *skb = NULL; + int band; - if (band >= 0) { - struct qdisc_skb_head *qh = band2list(priv, band); + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - return qh->head; + skb = __skb_array_peek(q); } - return NULL; + return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { - int prio; + int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(band2list(priv, prio)); + for (band = 0; band < PFIFO_FAST_BANDS; band++) { + struct skb_array *q = band2list(priv, band); + struct sk_buff *skb; - priv->bitmap = 0; - qdisc->qstats.backlog = 0; - qdisc->q.qlen = 0; + while ((skb = skb_array_consume_bh(q)) != NULL) + kfree_skb(skb); + } + + for_each_possible_cpu(i) { + struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); + + q->backlog = 0; + q->qlen = 0; + } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) @@ -572,17 +686,48 @@ nla_put_failure: static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) { - int prio; + unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int prio; - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - qdisc_skb_head_init(band2list(priv, prio)); + /* guard against zero length rings */ + if (!qlen) + return -EINVAL; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + int err; + + err = skb_array_init(q, qlen, GFP_KERNEL); + if (err) + return -ENOMEM; + } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } +static void pfifo_fast_destroy(struct Qdisc *sch) +{ + struct pfifo_fast_priv *priv = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + + /* NULL ring is possible if destroy path is due to a failed + * skb_array_init() in pfifo_fast_init() case. + */ + if (!&q->ring.queue) + continue; + /* Destroy ring but no need to kfree_skb because a call to + * pfifo_fast_reset() has already done that work. + */ + ptr_ring_cleanup(&q->ring, NULL); + } +} + struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), @@ -590,9 +735,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dequeue = pfifo_fast_dequeue, .peek = pfifo_fast_peek, .init = pfifo_fast_init, + .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .owner = THIS_MODULE, + .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, }; EXPORT_SYMBOL(pfifo_fast_ops); @@ -630,9 +777,24 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } + __skb_queue_head_init(&sch->gso_skb); + __skb_queue_head_init(&sch->skb_bad_txq); qdisc_skb_head_init(&sch->q); spin_lock_init(&sch->q.lock); + if (ops->static_flags & TCQ_F_CPUSTATS) { + sch->cpu_bstats = + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto errout1; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) { + free_percpu(sch->cpu_bstats); + goto errout1; + } + } + spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); @@ -642,6 +804,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; + sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; @@ -649,6 +812,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, refcount_set(&sch->refcnt, 1); return sch; +errout1: + kfree(p); errout: return ERR_PTR(err); } @@ -682,17 +847,21 @@ EXPORT_SYMBOL(qdisc_create_dflt); void qdisc_reset(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; + struct sk_buff *skb, *tmp; if (ops->reset) ops->reset(qdisc); - kfree_skb(qdisc->skb_bad_txq); - qdisc->skb_bad_txq = NULL; + skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { + __skb_unlink(skb, &qdisc->gso_skb); + kfree_skb_list(skb); + } - if (qdisc->gso_skb) { - kfree_skb_list(qdisc->gso_skb); - qdisc->gso_skb = NULL; + skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { + __skb_unlink(skb, &qdisc->skb_bad_txq); + kfree_skb_list(skb); } + qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } @@ -711,6 +880,7 @@ static void qdisc_free(struct Qdisc *qdisc) void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; + struct sk_buff *skb, *tmp; if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) @@ -730,8 +900,16 @@ void qdisc_destroy(struct Qdisc *qdisc) module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - kfree_skb_list(qdisc->gso_skb); - kfree_skb(qdisc->skb_bad_txq); + skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { + __skb_unlink(skb, &qdisc->gso_skb); + kfree_skb_list(skb); + } + + skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { + __skb_unlink(skb, &qdisc->skb_bad_txq); + kfree_skb_list(skb); + } + qdisc_free(qdisc); } EXPORT_SYMBOL(qdisc_destroy); @@ -746,10 +924,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); - /* Prune old scheduler */ - if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1) - qdisc_reset(oqdisc); - /* ... and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; @@ -885,14 +1059,18 @@ static bool some_qdisc_is_busy(struct net_device *dev) dev_queue = netdev_get_tx_queue(dev, i); q = dev_queue->qdisc_sleeping; - root_lock = qdisc_lock(q); - spin_lock_bh(root_lock); + if (q->flags & TCQ_F_NOLOCK) { + val = test_bit(__QDISC_STATE_SCHED, &q->state); + } else { + root_lock = qdisc_lock(q); + spin_lock_bh(root_lock); - val = (qdisc_is_running(q) || - test_bit(__QDISC_STATE_SCHED, &q->state)); + val = (qdisc_is_running(q) || + test_bit(__QDISC_STATE_SCHED, &q->state)); - spin_unlock_bh(root_lock); + spin_unlock_bh(root_lock); + } if (val) return true; @@ -900,6 +1078,16 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } +static void dev_qdisc_reset(struct net_device *dev, + struct netdev_queue *dev_queue, + void *none) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + + if (qdisc) + qdisc_reset(qdisc); +} + /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -910,7 +1098,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) void dev_deactivate_many(struct list_head *head) { struct net_device *dev; - bool sync_needed = false; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, @@ -920,20 +1107,25 @@ void dev_deactivate_many(struct list_head *head) &noop_qdisc); dev_watchdog_down(dev); - sync_needed |= !dev->dismantle; } /* Wait for outstanding qdisc-less dev_queue_xmit calls. * This is avoided if all devices are in dismantle phase : * Caller will call synchronize_net() for us */ - if (sync_needed) - synchronize_net(); + synchronize_net(); /* Wait for outstanding qdisc_run calls. */ - list_for_each_entry(dev, head, close_list) + list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) yield(); + /* The new qdisc is assigned at this point so we can safely + * unwind stale skb lists and qdisc statistics + */ + netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL); + if (dev_ingress_queue(dev)) + dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL); + } } void dev_deactivate(struct net_device *dev) @@ -954,6 +1146,8 @@ static void dev_init_scheduler_queue(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; + __skb_queue_head_init(&qdisc->gso_skb); + __skb_queue_head_init(&qdisc->skb_bad_txq); } void dev_init_scheduler(struct net_device *dev) diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 213b586a06a0..8cbb5c829d59 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -17,6 +17,7 @@ #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/sch_generic.h> struct mq_sched { struct Qdisc **qdiscs; @@ -97,23 +98,42 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; + __u32 qlen = 0; sch->q.qlen = 0; memset(&sch->bstats, 0, sizeof(sch->bstats)); memset(&sch->qstats, 0, sizeof(sch->qstats)); + /* MQ supports lockless qdiscs. However, statistics accounting needs + * to account for all, none, or a mix of locked and unlocked child + * qdiscs. Percpu stats are added to counters in-band and locking + * qdisc totals are added at end. + */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; + + if (qdisc_is_percpu_stats(qdisc)) { + qlen = qdisc_qlen_sum(qdisc); + __gnet_stats_copy_basic(NULL, &sch->bstats, + qdisc->cpu_bstats, + &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + qdisc->cpu_qstats, + &qdisc->qstats, qlen); + } else { + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + } + spin_unlock_bh(qdisc_lock(qdisc)); } + return 0; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b85885a9d8a1..8622745f3cd9 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -388,22 +388,40 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; - unsigned int i; + unsigned int ntx, tc; sch->q.qlen = 0; memset(&sch->bstats, 0, sizeof(sch->bstats)); memset(&sch->qstats, 0, sizeof(sch->qstats)); - for (i = 0; i < dev->num_tx_queues; i++) { - qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); + /* MQ supports lockless qdiscs. However, statistics accounting needs + * to account for all, none, or a mix of locked and unlocked child + * qdiscs. Percpu stats are added to counters in-band and locking + * qdisc totals are added at end. + */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; + + if (qdisc_is_percpu_stats(qdisc)) { + __u32 qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, &sch->bstats, + qdisc->cpu_bstats, + &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + qdisc->cpu_qstats, + &qdisc->qstats, qlen); + } else { + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + } + spin_unlock_bh(qdisc_lock(qdisc)); } @@ -411,9 +429,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); opt.hw = priv->hw_offload; - for (i = 0; i < netdev_get_num_tc(dev); i++) { - opt.count[i] = dev->tc_to_txq[i].count; - opt.offset[i] = dev->tc_to_txq[i].offset; + for (tc = 0; tc < netdev_get_num_tc(dev); tc++) { + opt.count[tc] = dev->tc_to_txq[tc].count; + opt.offset[tc] = dev->tc_to_txq[tc].offset; } if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt)) @@ -495,7 +513,6 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (cl >= TC_H_MIN_PRIORITY) { int i; __u32 qlen = 0; - struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; struct net_device *dev = qdisc_dev(sch); @@ -511,18 +528,26 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); + struct Qdisc *qdisc = rtnl_dereference(q->qdisc); + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; - qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); - qlen += qdisc->q.qlen; - bstats.bytes += qdisc->bstats.bytes; - bstats.packets += qdisc->bstats.packets; - qstats.backlog += qdisc->qstats.backlog; - qstats.drops += qdisc->qstats.drops; - qstats.requeues += qdisc->qstats.requeues; - qstats.overlimits += qdisc->qstats.overlimits; + if (qdisc_is_percpu_stats(qdisc)) { + cpu_bstats = qdisc->cpu_bstats; + cpu_qstats = qdisc->cpu_qstats; + } + + qlen = qdisc_qlen_sum(qdisc); + __gnet_stats_copy_basic(NULL, &sch->bstats, + cpu_bstats, &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + cpu_qstats, + &qdisc->qstats, + qlen); spin_unlock_bh(qdisc_lock(qdisc)); } + /* Reclaim root sleeping lock before completing stats */ if (d->lock) spin_lock_bh(d->lock); diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 1ca84a288443..54bd9c1a8aa1 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -14,7 +14,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ offload.o stream_sched.o stream_sched_prio.o \ - stream_sched_rr.o + stream_sched_rr.o stream_interleave.o sctp_probe-y := probe.o diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 69394f4d6091..837806dd5799 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -861,7 +861,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, spc_state, error, GFP_ATOMIC); if (event) - sctp_ulpq_tail_event(&asoc->ulpq, event); + asoc->stream.si->enqueue_event(&asoc->ulpq, event); } /* Select new active and retran paths. */ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 7f8baa48e7c2..991a530c6b31 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -124,7 +124,7 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) - sctp_ulpq_tail_event(&asoc->ulpq, ev); + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_chunk_put(chunk); @@ -191,7 +191,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, */ max_data = asoc->pathmtu - sctp_sk(asoc->base.sk)->pf->af->net_header_len - - sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); + sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream); max_data = SCTP_TRUNC4(max_data); /* If the the peer requested that we authenticate DATA chunks @@ -264,8 +264,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, frag |= SCTP_DATA_SACK_IMM; } - chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, - 0, GFP_KERNEL); + chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag, + GFP_KERNEL); if (!chunk) { err = -ENOMEM; goto errout; diff --git a/net/sctp/output.c b/net/sctp/output.c index 4a865cd06d76..01a26ee051e3 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -313,6 +313,7 @@ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, /* We believe that this chunk is OK to add to the packet */ switch (chunk->chunk_hdr->type) { case SCTP_CID_DATA: + case SCTP_CID_I_DATA: /* Account for the data being in the packet */ sctp_packet_append_data(packet, chunk); /* Disallow SACK bundling after DATA. */ @@ -724,7 +725,7 @@ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, * or delay in hopes of bundling a full sized packet. */ if (chunk->skb->len + q->out_qlen > transport->pathmtu - - packet->overhead - sizeof(struct sctp_data_chunk) - 4) + packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4) /* Enough data queued to fill a packet */ return SCTP_XMIT_OK; @@ -759,7 +760,7 @@ static void sctp_packet_append_data(struct sctp_packet *packet, asoc->peer.rwnd = rwnd; sctp_chunk_assign_tsn(chunk); - sctp_chunk_assign_ssn(chunk); + asoc->stream.si->assign_number(chunk); } static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 7d67feeeffc1..af9b5ebcae50 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -67,8 +67,6 @@ static void sctp_mark_missing(struct sctp_outq *q, __u32 highest_new_tsn, int count_of_newacks); -static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); - static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ @@ -591,7 +589,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, * following the procedures outlined in C1 - C5. */ if (reason == SCTP_RTXR_T3_RTX) - sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point); + q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point); /* Flush the queues only on timeout, since fast_rtx is only * triggered during sack processing and the queue @@ -942,6 +940,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: + case SCTP_CID_I_FWD_TSN: case SCTP_CID_RECONF: status = sctp_packet_transmit_chunk(packet, chunk, one_packet, gfp); @@ -956,7 +955,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) * sender MUST assure that at least one T3-rtx * timer is running. */ - if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { + if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN || + chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) { sctp_transport_reset_t3_rtx(transport); transport->last_time_sent = jiffies; } @@ -1372,7 +1372,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) asoc->peer.rwnd = sack_a_rwnd; - sctp_generate_fwdtsn(q, sack_ctsn); + asoc->stream.si->generate_ftsn(q, sack_ctsn); pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn); pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, " @@ -1795,7 +1795,7 @@ static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist, } /* Create and add a fwdtsn chunk to the outq's control queue if needed. */ -static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) +void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 26b4be6b4172..4545bc2aff84 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -288,12 +288,8 @@ struct sctp_ht_iter { static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; - int err = sctp_transport_walk_start(&iter->hti); - if (err) { - iter->start_fail = 1; - return ERR_PTR(err); - } + sctp_transport_walk_start(&iter->hti); iter->start_fail = 0; return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 9bf575f2e8ed..b9b269cf615e 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -228,7 +228,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, struct sctp_inithdr init; union sctp_params addrs; struct sctp_sock *sp; - __u8 extensions[4]; + __u8 extensions[5]; size_t chunksize; __be16 types[2]; int num_ext = 0; @@ -278,6 +278,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, if (sp->adaptation_ind) chunksize += sizeof(aiparam); + if (sp->strm_interleave) { + extensions[num_ext] = SCTP_CID_I_DATA; + num_ext += 1; + } + chunksize += vparam_len; /* Account for AUTH related parameters */ @@ -392,7 +397,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, struct sctp_inithdr initack; union sctp_params addrs; struct sctp_sock *sp; - __u8 extensions[4]; + __u8 extensions[5]; size_t chunksize; int num_ext = 0; int cookie_len; @@ -442,6 +447,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, if (sp->adaptation_ind) chunksize += sizeof(aiparam); + if (asoc->intl_enable) { + extensions[num_ext] = SCTP_CID_I_DATA; + num_ext += 1; + } + if (asoc->peer.auth_capable) { auth_random = (struct sctp_paramhdr *)asoc->c.auth_random; chunksize += ntohs(auth_random->length); @@ -711,38 +721,31 @@ nodata: /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ -struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, +struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, - int data_len, __u8 flags, __u16 ssn, - gfp_t gfp) + int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; - int chunk_len; /* We assign the TSN as LATE as possible, not here when * creating the chunk. */ - dp.tsn = 0; + memset(&dp, 0, sizeof(dp)); + dp.ppid = sinfo->sinfo_ppid; dp.stream = htons(sinfo->sinfo_stream); - dp.ppid = sinfo->sinfo_ppid; /* Set the flags for an unordered send. */ - if (sinfo->sinfo_flags & SCTP_UNORDERED) { + if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; - dp.ssn = 0; - } else - dp.ssn = htons(ssn); - chunk_len = sizeof(dp) + data_len; - retval = sctp_make_data(asoc, flags, chunk_len, gfp); + retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp); if (!retval) - goto nodata; + return NULL; retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); -nodata: return retval; } @@ -1415,6 +1418,12 @@ static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } +struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, + __u8 flags, int paylen, gfp_t gfp) +{ + return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp); +} + static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) @@ -2032,6 +2041,10 @@ static void sctp_process_ext_param(struct sctp_association *asoc, if (net->sctp.addip_enable) asoc->peer.asconf_capable = 1; break; + case SCTP_CID_I_DATA: + if (sctp_sk(asoc->base.sk)->strm_interleave) + asoc->intl_enable = 1; + break; default: break; } @@ -3523,6 +3536,30 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, return retval; } +struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, + __u32 new_cum_tsn, size_t nstreams, + struct sctp_ifwdtsn_skip *skiplist) +{ + struct sctp_chunk *retval = NULL; + struct sctp_ifwdtsn_hdr ftsn_hdr; + size_t hint; + + hint = (nstreams + 1) * sizeof(__u32); + + retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint, + GFP_ATOMIC); + if (!retval) + return NULL; + + ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); + retval->subh.ifwdtsn_hdr = + sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); + + sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist); + + return retval; +} + /* RE-CONFIG 3.1 (RE-CONFIG chunk) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index df94d77401e7..16ddf2ca1438 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -632,7 +632,7 @@ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_chunk *abort; /* Cancel any partial delivery in progress. */ - sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC); if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT) event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, @@ -972,7 +972,7 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds, if (!ev) return; - sctp_ulpq_tail_event(&asoc->ulpq, ev); + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); switch (err_hdr->cause) { case SCTP_ERROR_UNKNOWN_CHUNK: @@ -1007,18 +1007,6 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds, } } -/* Process variable FWDTSN chunk information. */ -static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq, - struct sctp_chunk *chunk) -{ - struct sctp_fwdtsn_skip *skip; - - /* Walk through all the skipped SSNs */ - sctp_walk_fwdtsn(skip, chunk) { - sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); - } -} - /* Helper function to remove the association non-primary peer * transports. */ @@ -1058,7 +1046,7 @@ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands, asoc->c.sinit_max_instreams, NULL, GFP_ATOMIC); if (ev) - sctp_ulpq_tail_event(&asoc->ulpq, ev); + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } /* Helper function to generate an adaptation indication event */ @@ -1070,7 +1058,7 @@ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands, ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC); if (ev) - sctp_ulpq_tail_event(&asoc->ulpq, ev); + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } @@ -1368,18 +1356,12 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, break; case SCTP_CMD_REPORT_FWDTSN: - /* Move the Cumulattive TSN Ack ahead. */ - sctp_tsnmap_skip(&asoc->peer.tsn_map, cmd->obj.u32); - - /* purge the fragmentation queue */ - sctp_ulpq_reasm_flushtsn(&asoc->ulpq, cmd->obj.u32); - - /* Abort any in progress partial delivery. */ - sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->report_ftsn(&asoc->ulpq, cmd->obj.u32); break; case SCTP_CMD_PROCESS_FWDTSN: - sctp_cmd_process_fwdtsn(&asoc->ulpq, cmd->obj.chunk); + asoc->stream.si->handle_ftsn(&asoc->ulpq, + cmd->obj.chunk); break; case SCTP_CMD_GEN_SACK: @@ -1483,8 +1465,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n", __func__, cmd->obj.chunk, &asoc->ulpq); - sctp_ulpq_tail_data(&asoc->ulpq, cmd->obj.chunk, - GFP_ATOMIC); + asoc->stream.si->ulpevent_data(&asoc->ulpq, + cmd->obj.chunk, + GFP_ATOMIC); break; case SCTP_CMD_EVENT_ULP: @@ -1492,7 +1475,8 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n", __func__, cmd->obj.ulpevent, &asoc->ulpq); - sctp_ulpq_tail_event(&asoc->ulpq, cmd->obj.ulpevent); + asoc->stream.si->enqueue_event(&asoc->ulpq, + cmd->obj.ulpevent); break; case SCTP_CMD_REPLY: @@ -1729,12 +1713,13 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, break; case SCTP_CMD_PART_DELIVER: - sctp_ulpq_partial_delivery(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->start_pd(&asoc->ulpq, GFP_ATOMIC); break; case SCTP_CMD_RENEGE: - sctp_ulpq_renege(&asoc->ulpq, cmd->obj.chunk, - GFP_ATOMIC); + asoc->stream.si->renege_events(&asoc->ulpq, + cmd->obj.chunk, + GFP_ATOMIC); break; case SCTP_CMD_SETUP_T4: diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8f8ccded13e4..541f34735346 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3013,7 +3013,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk))) + if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -3034,7 +3034,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net, case SCTP_IERROR_PROTO_VIOLATION: return sctp_sf_abort_violation(net, ep, asoc, chunk, commands, (u8 *)chunk->subh.data_hdr, - sizeof(struct sctp_datahdr)); + sctp_datahdr_len(&asoc->stream)); default: BUG(); } @@ -3133,7 +3133,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4( return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk))) + if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -3150,7 +3150,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4( case SCTP_IERROR_PROTO_VIOLATION: return sctp_sf_abort_violation(net, ep, asoc, chunk, commands, (u8 *)chunk->subh.data_hdr, - sizeof(struct sctp_datahdr)); + sctp_datahdr_len(&asoc->stream)); default: BUG(); } @@ -3957,7 +3957,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net, { struct sctp_fwdtsn_hdr *fwdtsn_hdr; struct sctp_chunk *chunk = arg; - struct sctp_fwdtsn_skip *skip; __u16 len; __u32 tsn; @@ -3971,7 +3970,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net, return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands); /* Make sure that the FORWARD_TSN chunk has valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) + if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -3990,14 +3989,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net, if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0) goto discard_noforce; - /* Silently discard the chunk if stream-id is not valid */ - sctp_walk_fwdtsn(skip, chunk) { - if (ntohs(skip->stream) >= asoc->stream.incnt) - goto discard_noforce; - } + if (!asoc->stream.si->validate_ftsn(chunk)) + goto discard_noforce; sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn)); - if (len > sizeof(struct sctp_fwdtsn_hdr)) + if (len > sctp_ftsnhdr_len(&asoc->stream)) sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN, SCTP_CHUNK(chunk)); @@ -4028,7 +4024,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast( { struct sctp_fwdtsn_hdr *fwdtsn_hdr; struct sctp_chunk *chunk = arg; - struct sctp_fwdtsn_skip *skip; __u16 len; __u32 tsn; @@ -4042,7 +4037,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast( return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands); /* Make sure that the FORWARD_TSN chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) + if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -4061,14 +4056,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast( if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0) goto gen_shutdown; - /* Silently discard the chunk if stream-id is not valid */ - sctp_walk_fwdtsn(skip, chunk) { - if (ntohs(skip->stream) >= asoc->stream.incnt) - goto gen_shutdown; - } + if (!asoc->stream.si->validate_ftsn(chunk)) + goto gen_shutdown; sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn)); - if (len > sizeof(struct sctp_fwdtsn_hdr)) + if (len > sctp_ftsnhdr_len(&asoc->stream)) sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN, SCTP_CHUNK(chunk)); @@ -6244,14 +6236,12 @@ static int sctp_eat_data(const struct sctp_association *asoc, struct sctp_chunk *err; enum sctp_verb deliver; size_t datalen; - u8 ordered = 0; - u16 ssn, sid; __u32 tsn; int tmp; data_hdr = (struct sctp_datahdr *)chunk->skb->data; chunk->subh.data_hdr = data_hdr; - skb_pull(chunk->skb, sizeof(*data_hdr)); + skb_pull(chunk->skb, sctp_datahdr_len(&asoc->stream)); tsn = ntohl(data_hdr->tsn); pr_debug("%s: TSN 0x%x\n", __func__, tsn); @@ -6299,7 +6289,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * Actually, allow a little bit of overflow (up to a MTU). */ datalen = ntohs(chunk->chunk_hdr->length); - datalen -= sizeof(struct sctp_data_chunk); + datalen -= sctp_datachk_len(&asoc->stream); deliver = SCTP_CMD_CHUNK_ULP; @@ -6394,7 +6384,6 @@ static int sctp_eat_data(const struct sctp_association *asoc, SCTP_INC_STATS(net, SCTP_MIB_INORDERCHUNKS); if (chunk->asoc) chunk->asoc->stats.iodchunks++; - ordered = 1; } /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number @@ -6405,8 +6394,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * with cause set to "Invalid Stream Identifier" (See Section 3.3.10) * and discard the DATA chunk. */ - sid = ntohs(data_hdr->stream); - if (sid >= asoc->stream.incnt) { + if (ntohs(data_hdr->stream) >= asoc->stream.incnt) { /* Mark tsn as received even though we drop it */ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn)); @@ -6427,8 +6415,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * SSN is smaller then the next expected one. If it is, it wrapped * and is invalid. */ - ssn = ntohs(data_hdr->ssn); - if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->stream, in, sid))) + if (!asoc->stream.si->validate_data(chunk)) return SCTP_IERROR_PROTO_VIOLATION; /* Send the data up to the user. Note: Schedule the diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 79b6bee5b768..691d9dc620e3 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -985,11 +985,14 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( if (state > SCTP_STATE_MAX) return &bug; + if (cid == SCTP_CID_I_DATA) + cid = SCTP_CID_DATA; + if (cid <= SCTP_CID_BASE_MAX) return &chunk_event_table[cid][state]; if (net->sctp.prsctp_enable) { - if (cid == SCTP_CID_FWD_TSN) + if (cid == SCTP_CID_FWD_TSN || cid == SCTP_CID_I_FWD_TSN) return &prsctp_chunk_event_table[0][state]; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 3253f724a995..5e4100df7bae 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -201,6 +201,22 @@ static void sctp_for_each_tx_datachunk(struct sctp_association *asoc, cb(chunk); } +static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk, + void (*cb)(struct sk_buff *, struct sock *)) + +{ + struct sk_buff *skb, *tmp; + + sctp_skb_for_each(skb, &asoc->ulpq.lobby, tmp) + cb(skb, sk); + + sctp_skb_for_each(skb, &asoc->ulpq.reasm, tmp) + cb(skb, sk); + + sctp_skb_for_each(skb, &asoc->ulpq.reasm_uo, tmp) + cb(skb, sk); +} + /* Verify that this is a valid address. */ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, int len) @@ -1554,6 +1570,7 @@ static void sctp_close(struct sock *sk, long timeout) if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) || !skb_queue_empty(&asoc->ulpq.reasm) || + !skb_queue_empty(&asoc->ulpq.reasm_uo) || (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) { struct sctp_chunk *chunk; @@ -2002,7 +2019,20 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) if (err < 0) goto out_free; - wait_connect = true; + /* If stream interleave is enabled, wait_connect has to be + * done earlier than data enqueue, as it needs to make data + * or idata according to asoc->intl_enable which is set + * after connection is done. + */ + if (sctp_sk(asoc->base.sk)->strm_interleave) { + timeo = sock_sndtimeo(sk, 0); + err = sctp_wait_for_connect(asoc, &timeo); + if (err) + goto out_unlock; + } else { + wait_connect = true; + } + pr_debug("%s: we associated primitively\n", __func__); } @@ -2281,7 +2311,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (!event) return -ENOMEM; - sctp_ulpq_tail_event(&asoc->ulpq, event); + asoc->stream.si->enqueue_event(&asoc->ulpq, event); } } @@ -3180,7 +3210,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned if (val == 0) { val = asoc->pathmtu - sp->pf->af->net_header_len; val -= sizeof(struct sctphdr) + - sizeof(struct sctp_data_chunk); + sctp_datachk_len(&asoc->stream); } asoc->user_frag = val; asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); @@ -3350,7 +3380,10 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk, if (get_user(val, (int __user *)optval)) return -EFAULT; - sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1; + sctp_sk(sk)->frag_interleave = !!val; + + if (!sctp_sk(sk)->frag_interleave) + sctp_sk(sk)->strm_interleave = 0; return 0; } @@ -4023,6 +4056,40 @@ out: return retval; } +static int sctp_setsockopt_interleaving_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct net *net = sock_net(sk); + struct sctp_assoc_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_id) + goto out; + + if (!net->sctp.intl_enable || !sp->frag_interleave) { + retval = -EPERM; + goto out; + } + + sp->strm_interleave = !!params.assoc_value; + + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4210,6 +4277,10 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_STREAM_SCHEDULER_VALUE: retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); break; + case SCTP_INTERLEAVING_SUPPORTED: + retval = sctp_setsockopt_interleaving_supported(sk, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4680,20 +4751,11 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, EXPORT_SYMBOL_GPL(sctp_get_sctp_info); /* use callback to avoid exporting the core structure */ -int sctp_transport_walk_start(struct rhashtable_iter *iter) +void sctp_transport_walk_start(struct rhashtable_iter *iter) { - int err; - rhltable_walk_enter(&sctp_transport_hashtable, iter); - err = rhashtable_walk_start(iter); - if (err && err != -EAGAIN) { - rhashtable_walk_stop(iter); - rhashtable_walk_exit(iter); - return err; - } - - return 0; + rhashtable_walk_start(iter); } void sctp_transport_walk_stop(struct rhashtable_iter *iter) @@ -4784,12 +4846,10 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), struct net *net, int *pos, void *p) { struct rhashtable_iter hti; struct sctp_transport *tsp; - int ret; + int ret = 0; again: - ret = sctp_transport_walk_start(&hti); - if (ret) - return ret; + sctp_transport_walk_start(&hti); tsp = sctp_transport_get_idx(net, &hti, *pos + 1); for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { @@ -6984,6 +7044,47 @@ out: return retval; } +static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->intl_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->strm_interleave; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7174,6 +7275,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_scheduler_value(sk, len, optval, optlen); break; + case SCTP_INTERLEAVING_SUPPORTED: + retval = sctp_getsockopt_interleaving_supported(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -8411,11 +8516,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, } - sctp_skb_for_each(skb, &assoc->ulpq.reasm, tmp) - sctp_skb_set_owner_r_frag(skb, newsk); - - sctp_skb_for_each(skb, &assoc->ulpq.lobby, tmp) - sctp_skb_set_owner_r_frag(skb, newsk); + sctp_for_each_rx_skb(assoc, newsk, sctp_skb_set_owner_r_frag); /* Set the type of socket to indicate that it is peeled off from the * original UDP-style socket or created with the accept() call on a diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 76ea66be0bbe..06b644dd858c 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -167,6 +167,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, sched->init(stream); in: + sctp_stream_interleave_init(stream); if (!incnt) goto out; @@ -215,11 +216,13 @@ void sctp_stream_clear(struct sctp_stream *stream) { int i; - for (i = 0; i < stream->outcnt; i++) - stream->out[i].ssn = 0; + for (i = 0; i < stream->outcnt; i++) { + stream->out[i].mid = 0; + stream->out[i].mid_uo = 0; + } for (i = 0; i < stream->incnt; i++) - stream->in[i].ssn = 0; + stream->in[i].mid = 0; } void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) @@ -606,10 +609,10 @@ struct sctp_chunk *sctp_process_strreset_outreq( } for (i = 0; i < nums; i++) - stream->in[ntohs(str_p[i])].ssn = 0; + stream->in[ntohs(str_p[i])].mid = 0; } else { for (i = 0; i < stream->incnt; i++) - stream->in[i].ssn = 0; + stream->in[i].mid = 0; } result = SCTP_STRRESET_PERFORMED; @@ -753,8 +756,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( * performed. */ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); - sctp_ulpq_reasm_flushtsn(&asoc->ulpq, max_tsn_seen); - sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen); /* G1: Compute an appropriate value for the Receiver's Next TSN -- the * TSN that the peer should use to send the next DATA chunk. The @@ -783,10 +785,12 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( /* G5: The next expected and outgoing SSNs MUST be reset to 0 for all * incoming and outgoing streams. */ - for (i = 0; i < stream->outcnt; i++) - stream->out[i].ssn = 0; + for (i = 0; i < stream->outcnt; i++) { + stream->out[i].mid = 0; + stream->out[i].mid_uo = 0; + } for (i = 0; i < stream->incnt; i++) - stream->in[i].ssn = 0; + stream->in[i].mid = 0; result = SCTP_STRRESET_PERFORMED; @@ -976,11 +980,15 @@ struct sctp_chunk *sctp_process_strreset_resp( if (result == SCTP_STRRESET_PERFORMED) { if (nums) { - for (i = 0; i < nums; i++) - stream->out[ntohs(str_p[i])].ssn = 0; + for (i = 0; i < nums; i++) { + stream->out[ntohs(str_p[i])].mid = 0; + stream->out[ntohs(str_p[i])].mid_uo = 0; + } } else { - for (i = 0; i < stream->outcnt; i++) - stream->out[i].ssn = 0; + for (i = 0; i < stream->outcnt; i++) { + stream->out[i].mid = 0; + stream->out[i].mid_uo = 0; + } } flags = SCTP_STREAM_RESET_OUTGOING_SSN; @@ -1023,8 +1031,7 @@ struct sctp_chunk *sctp_process_strreset_resp( &asoc->peer.tsn_map); LIST_HEAD(temp); - sctp_ulpq_reasm_flushtsn(&asoc->ulpq, mtsn); - sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, @@ -1042,10 +1049,12 @@ struct sctp_chunk *sctp_process_strreset_resp( asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; - for (i = 0; i < stream->outcnt; i++) - stream->out[i].ssn = 0; + for (i = 0; i < stream->outcnt; i++) { + stream->out[i].mid = 0; + stream->out[i].mid_uo = 0; + } for (i = 0; i < stream->incnt; i++) - stream->in[i].ssn = 0; + stream->in[i].mid = 0; } for (i = 0; i < stream->outcnt; i++) diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c new file mode 100644 index 000000000000..8c7cf8f08711 --- /dev/null +++ b/net/sctp/stream_interleave.c @@ -0,0 +1,1334 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Xin Long <lucien.xin@gmail.com> + */ + +#include <net/busy_poll.h> +#include <net/sctp/sctp.h> +#include <net/sctp/sm.h> +#include <net/sctp/ulpevent.h> +#include <linux/sctp.h> + +static struct sctp_chunk *sctp_make_idatafrag_empty( + const struct sctp_association *asoc, + const struct sctp_sndrcvinfo *sinfo, + int len, __u8 flags, gfp_t gfp) +{ + struct sctp_chunk *retval; + struct sctp_idatahdr dp; + + memset(&dp, 0, sizeof(dp)); + dp.stream = htons(sinfo->sinfo_stream); + + if (sinfo->sinfo_flags & SCTP_UNORDERED) + flags |= SCTP_DATA_UNORDERED; + + retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp); + if (!retval) + return NULL; + + retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); + memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); + + return retval; +} + +static void sctp_chunk_assign_mid(struct sctp_chunk *chunk) +{ + struct sctp_stream *stream; + struct sctp_chunk *lchunk; + __u32 cfsn = 0; + __u16 sid; + + if (chunk->has_mid) + return; + + sid = sctp_chunk_stream_no(chunk); + stream = &chunk->asoc->stream; + + list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) { + struct sctp_idatahdr *hdr; + __u32 mid; + + lchunk->has_mid = 1; + + hdr = lchunk->subh.idata_hdr; + + if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) + hdr->ppid = lchunk->sinfo.sinfo_ppid; + else + hdr->fsn = htonl(cfsn++); + + if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { + mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? + sctp_mid_uo_next(stream, out, sid) : + sctp_mid_uo_peek(stream, out, sid); + } else { + mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? + sctp_mid_next(stream, out, sid) : + sctp_mid_peek(stream, out, sid); + } + hdr->mid = htonl(mid); + } +} + +static bool sctp_validate_data(struct sctp_chunk *chunk) +{ + const struct sctp_stream *stream; + __u16 sid, ssn; + + if (chunk->chunk_hdr->type != SCTP_CID_DATA) + return false; + + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + return true; + + stream = &chunk->asoc->stream; + sid = sctp_chunk_stream_no(chunk); + ssn = ntohs(chunk->subh.data_hdr->ssn); + + return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)); +} + +static bool sctp_validate_idata(struct sctp_chunk *chunk) +{ + struct sctp_stream *stream; + __u32 mid; + __u16 sid; + + if (chunk->chunk_hdr->type != SCTP_CID_I_DATA) + return false; + + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + return true; + + stream = &chunk->asoc->stream; + sid = sctp_chunk_stream_no(chunk); + mid = ntohl(chunk->subh.idata_hdr->mid); + + return !MID_lt(mid, sctp_mid_peek(stream, in, sid)); +} + +static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *cevent; + struct sk_buff *pos; + + pos = skb_peek_tail(&ulpq->reasm); + if (!pos) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + cevent = sctp_skb2event(pos); + + if (event->stream == cevent->stream && + event->mid == cevent->mid && + (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || + (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && + event->fsn > cevent->fsn))) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + if ((event->stream == cevent->stream && + MID_lt(cevent->mid, event->mid)) || + event->stream > cevent->stream) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + + if (event->stream < cevent->stream || + (event->stream == cevent->stream && + MID_lt(event->mid, cevent->mid))) + break; + + if (event->stream == cevent->stream && + event->mid == cevent->mid && + !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && + (event->msg_flags & SCTP_DATA_FIRST_FRAG || + event->fsn < cevent->fsn)) + break; + } + + __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event)); +} + +static struct sctp_ulpevent *sctp_intl_retrieve_partial( + struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *first_frag = NULL; + struct sk_buff *last_frag = NULL; + struct sctp_ulpevent *retval; + struct sctp_stream_in *sin; + struct sk_buff *pos; + __u32 next_fsn = 0; + int is_last = 0; + + sin = sctp_stream_in(ulpq->asoc, event->stream); + + skb_queue_walk(&ulpq->reasm, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + if (cevent->stream < event->stream) + continue; + + if (cevent->stream > event->stream || + cevent->mid != sin->mid) + break; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + goto out; + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) { + if (cevent->fsn == sin->fsn) { + first_frag = pos; + last_frag = pos; + next_fsn = cevent->fsn + 1; + } + } else if (cevent->fsn == next_fsn) { + last_frag = pos; + next_fsn++; + } else { + goto out; + } + break; + case SCTP_DATA_LAST_FRAG: + if (!first_frag) { + if (cevent->fsn == sin->fsn) { + first_frag = pos; + last_frag = pos; + next_fsn = 0; + is_last = 1; + } + } else if (cevent->fsn == next_fsn) { + last_frag = pos; + next_fsn = 0; + is_last = 1; + } + goto out; + default: + goto out; + } + } + +out: + if (!first_frag) + return NULL; + + retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + &ulpq->reasm, first_frag, + last_frag); + if (retval) { + sin->fsn = next_fsn; + if (is_last) { + retval->msg_flags |= MSG_EOR; + sin->pd_mode = 0; + } + } + + return retval; +} + +static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( + struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_association *asoc = ulpq->asoc; + struct sk_buff *pos, *first_frag = NULL; + struct sctp_ulpevent *retval = NULL; + struct sk_buff *pd_first = NULL; + struct sk_buff *pd_last = NULL; + struct sctp_stream_in *sin; + __u32 next_fsn = 0; + __u32 pd_point = 0; + __u32 pd_len = 0; + __u32 mid = 0; + + sin = sctp_stream_in(ulpq->asoc, event->stream); + + skb_queue_walk(&ulpq->reasm, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + if (cevent->stream < event->stream) + continue; + if (cevent->stream > event->stream) + break; + + if (MID_lt(cevent->mid, event->mid)) + continue; + if (MID_lt(event->mid, cevent->mid)) + break; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (cevent->mid == sin->mid) { + pd_first = pos; + pd_last = pos; + pd_len = pos->len; + } + + first_frag = pos; + next_fsn = 0; + mid = cevent->mid; + break; + + case SCTP_DATA_MIDDLE_FRAG: + if (first_frag && cevent->mid == mid && + cevent->fsn == next_fsn) { + next_fsn++; + if (pd_first) { + pd_last = pos; + pd_len += pos->len; + } + } else { + first_frag = NULL; + } + break; + + case SCTP_DATA_LAST_FRAG: + if (first_frag && cevent->mid == mid && + cevent->fsn == next_fsn) + goto found; + else + first_frag = NULL; + break; + } + } + + if (!pd_first) + goto out; + + pd_point = sctp_sk(asoc->base.sk)->pd_point; + if (pd_point && pd_point <= pd_len) { + retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + &ulpq->reasm, + pd_first, pd_last); + if (retval) { + sin->fsn = next_fsn; + sin->pd_mode = 1; + } + } + goto out; + +found: + retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + &ulpq->reasm, + first_frag, pos); + if (retval) + retval->msg_flags |= MSG_EOR; + +out: + return retval; +} + +static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *retval = NULL; + struct sctp_stream_in *sin; + + if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { + event->msg_flags |= MSG_EOR; + return event; + } + + sctp_intl_store_reasm(ulpq, event); + + sin = sctp_stream_in(ulpq->asoc, event->stream); + if (sin->pd_mode && event->mid == sin->mid && + event->fsn == sin->fsn) + retval = sctp_intl_retrieve_partial(ulpq, event); + + if (!retval) + retval = sctp_intl_retrieve_reassembled(ulpq, event); + + return retval; +} + +static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *cevent; + struct sk_buff *pos; + + pos = skb_peek_tail(&ulpq->lobby); + if (!pos) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + cevent = (struct sctp_ulpevent *)pos->cb; + if (event->stream == cevent->stream && + MID_lt(cevent->mid, event->mid)) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + if (event->stream > cevent->stream) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + skb_queue_walk(&ulpq->lobby, pos) { + cevent = (struct sctp_ulpevent *)pos->cb; + + if (cevent->stream > event->stream) + break; + + if (cevent->stream == event->stream && + MID_lt(event->mid, cevent->mid)) + break; + } + + __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event)); +} + +static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff_head *event_list; + struct sctp_stream *stream; + struct sk_buff *pos, *tmp; + __u16 sid = event->stream; + + stream = &ulpq->asoc->stream; + event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev; + + sctp_skb_for_each(pos, &ulpq->lobby, tmp) { + struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb; + + if (cevent->stream > sid) + break; + + if (cevent->stream < sid) + continue; + + if (cevent->mid != sctp_mid_peek(stream, in, sid)) + break; + + sctp_mid_next(stream, in, sid); + + __skb_unlink(pos, &ulpq->lobby); + + __skb_queue_tail(event_list, pos); + } +} + +static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_stream *stream; + __u16 sid; + + stream = &ulpq->asoc->stream; + sid = event->stream; + + if (event->mid != sctp_mid_peek(stream, in, sid)) { + sctp_intl_store_ordered(ulpq, event); + return NULL; + } + + sctp_mid_next(stream, in, sid); + + sctp_intl_retrieve_ordered(ulpq, event); + + return event; +} + +static int sctp_enqueue_event(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *skb = sctp_event2skb(event); + struct sock *sk = ulpq->asoc->base.sk; + struct sctp_sock *sp = sctp_sk(sk); + struct sk_buff_head *skb_list; + + skb_list = (struct sk_buff_head *)skb->prev; + + if (sk->sk_shutdown & RCV_SHUTDOWN && + (sk->sk_shutdown & SEND_SHUTDOWN || + !sctp_ulpevent_is_notification(event))) + goto out_free; + + if (!sctp_ulpevent_is_notification(event)) { + sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); + } + + if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) + goto out_free; + + if (skb_list) + skb_queue_splice_tail_init(skb_list, + &sk->sk_receive_queue); + else + __skb_queue_tail(&sk->sk_receive_queue, skb); + + if (!sp->data_ready_signalled) { + sp->data_ready_signalled = 1; + sk->sk_data_ready(sk); + } + + return 1; + +out_free: + if (skb_list) + sctp_queue_purge_ulpevents(skb_list); + else + sctp_ulpevent_free(event); + + return 0; +} + +static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *cevent; + struct sk_buff *pos; + + pos = skb_peek_tail(&ulpq->reasm_uo); + if (!pos) { + __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); + return; + } + + cevent = sctp_skb2event(pos); + + if (event->stream == cevent->stream && + event->mid == cevent->mid && + (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || + (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && + event->fsn > cevent->fsn))) { + __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); + return; + } + + if ((event->stream == cevent->stream && + MID_lt(cevent->mid, event->mid)) || + event->stream > cevent->stream) { + __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); + return; + } + + skb_queue_walk(&ulpq->reasm_uo, pos) { + cevent = sctp_skb2event(pos); + + if (event->stream < cevent->stream || + (event->stream == cevent->stream && + MID_lt(event->mid, cevent->mid))) + break; + + if (event->stream == cevent->stream && + event->mid == cevent->mid && + !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && + (event->msg_flags & SCTP_DATA_FIRST_FRAG || + event->fsn < cevent->fsn)) + break; + } + + __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event)); +} + +static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( + struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *first_frag = NULL; + struct sk_buff *last_frag = NULL; + struct sctp_ulpevent *retval; + struct sctp_stream_in *sin; + struct sk_buff *pos; + __u32 next_fsn = 0; + int is_last = 0; + + sin = sctp_stream_in(ulpq->asoc, event->stream); + + skb_queue_walk(&ulpq->reasm_uo, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + if (cevent->stream < event->stream) + continue; + if (cevent->stream > event->stream) + break; + + if (MID_lt(cevent->mid, sin->mid_uo)) + continue; + if (MID_lt(sin->mid_uo, cevent->mid)) + break; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + goto out; + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) { + if (cevent->fsn == sin->fsn_uo) { + first_frag = pos; + last_frag = pos; + next_fsn = cevent->fsn + 1; + } + } else if (cevent->fsn == next_fsn) { + last_frag = pos; + next_fsn++; + } else { + goto out; + } + break; + case SCTP_DATA_LAST_FRAG: + if (!first_frag) { + if (cevent->fsn == sin->fsn_uo) { + first_frag = pos; + last_frag = pos; + next_fsn = 0; + is_last = 1; + } + } else if (cevent->fsn == next_fsn) { + last_frag = pos; + next_fsn = 0; + is_last = 1; + } + goto out; + default: + goto out; + } + } + +out: + if (!first_frag) + return NULL; + + retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + &ulpq->reasm_uo, first_frag, + last_frag); + if (retval) { + sin->fsn_uo = next_fsn; + if (is_last) { + retval->msg_flags |= MSG_EOR; + sin->pd_mode_uo = 0; + } + } + + return retval; +} + +static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( + struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_association *asoc = ulpq->asoc; + struct sk_buff *pos, *first_frag = NULL; + struct sctp_ulpevent *retval = NULL; + struct sk_buff *pd_first = NULL; + struct sk_buff *pd_last = NULL; + struct sctp_stream_in *sin; + __u32 next_fsn = 0; + __u32 pd_point = 0; + __u32 pd_len = 0; + __u32 mid = 0; + + sin = sctp_stream_in(ulpq->asoc, event->stream); + + skb_queue_walk(&ulpq->reasm_uo, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + if (cevent->stream < event->stream) + continue; + if (cevent->stream > event->stream) + break; + + if (MID_lt(cevent->mid, event->mid)) + continue; + if (MID_lt(event->mid, cevent->mid)) + break; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (!sin->pd_mode_uo) { + sin->mid_uo = cevent->mid; + pd_first = pos; + pd_last = pos; + pd_len = pos->len; + } + + first_frag = pos; + next_fsn = 0; + mid = cevent->mid; + break; + + case SCTP_DATA_MIDDLE_FRAG: + if (first_frag && cevent->mid == mid && + cevent->fsn == next_fsn) { + next_fsn++; + if (pd_first) { + pd_last = pos; + pd_len += pos->len; + } + } else { + first_frag = NULL; + } + break; + + case SCTP_DATA_LAST_FRAG: + if (first_frag && cevent->mid == mid && + cevent->fsn == next_fsn) + goto found; + else + first_frag = NULL; + break; + } + } + + if (!pd_first) + goto out; + + pd_point = sctp_sk(asoc->base.sk)->pd_point; + if (pd_point && pd_point <= pd_len) { + retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + &ulpq->reasm_uo, + pd_first, pd_last); + if (retval) { + sin->fsn_uo = next_fsn; + sin->pd_mode_uo = 1; + } + } + goto out; + +found: + retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + &ulpq->reasm_uo, + first_frag, pos); + if (retval) + retval->msg_flags |= MSG_EOR; + +out: + return retval; +} + +static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *retval = NULL; + struct sctp_stream_in *sin; + + if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { + event->msg_flags |= MSG_EOR; + return event; + } + + sctp_intl_store_reasm_uo(ulpq, event); + + sin = sctp_stream_in(ulpq->asoc, event->stream); + if (sin->pd_mode_uo && event->mid == sin->mid_uo && + event->fsn == sin->fsn_uo) + retval = sctp_intl_retrieve_partial_uo(ulpq, event); + + if (!retval) + retval = sctp_intl_retrieve_reassembled_uo(ulpq, event); + + return retval; +} + +static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) +{ + struct sctp_stream_in *csin, *sin = NULL; + struct sk_buff *first_frag = NULL; + struct sk_buff *last_frag = NULL; + struct sctp_ulpevent *retval; + struct sk_buff *pos; + __u32 next_fsn = 0; + __u16 sid = 0; + + skb_queue_walk(&ulpq->reasm_uo, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + csin = sctp_stream_in(ulpq->asoc, cevent->stream); + if (csin->pd_mode_uo) + continue; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (first_frag) + goto out; + first_frag = pos; + last_frag = pos; + next_fsn = 0; + sin = csin; + sid = cevent->stream; + sin->mid_uo = cevent->mid; + break; + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) + break; + if (cevent->stream == sid && + cevent->mid == sin->mid_uo && + cevent->fsn == next_fsn) { + next_fsn++; + last_frag = pos; + } else { + goto out; + } + break; + case SCTP_DATA_LAST_FRAG: + if (first_frag) + goto out; + break; + default: + break; + } + } + + if (!first_frag) + return NULL; + +out: + retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + &ulpq->reasm_uo, first_frag, + last_frag); + if (retval) { + sin->fsn_uo = next_fsn; + sin->pd_mode_uo = 1; + } + + return retval; +} + +static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, + struct sctp_chunk *chunk, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sk_buff_head temp; + int event_eor = 0; + + event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); + if (!event) + return -ENOMEM; + + event->mid = ntohl(chunk->subh.idata_hdr->mid); + if (event->msg_flags & SCTP_DATA_FIRST_FRAG) + event->ppid = chunk->subh.idata_hdr->ppid; + else + event->fsn = ntohl(chunk->subh.idata_hdr->fsn); + + if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { + event = sctp_intl_reasm(ulpq, event); + if (event && event->msg_flags & MSG_EOR) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + + event = sctp_intl_order(ulpq, event); + } + } else { + event = sctp_intl_reasm_uo(ulpq, event); + } + + if (event) { + event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; + sctp_enqueue_event(ulpq, event); + } + + return event_eor; +} + +static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) +{ + struct sctp_stream_in *csin, *sin = NULL; + struct sk_buff *first_frag = NULL; + struct sk_buff *last_frag = NULL; + struct sctp_ulpevent *retval; + struct sk_buff *pos; + __u32 next_fsn = 0; + __u16 sid = 0; + + skb_queue_walk(&ulpq->reasm, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + csin = sctp_stream_in(ulpq->asoc, cevent->stream); + if (csin->pd_mode) + continue; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (first_frag) + goto out; + if (cevent->mid == csin->mid) { + first_frag = pos; + last_frag = pos; + next_fsn = 0; + sin = csin; + sid = cevent->stream; + } + break; + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) + break; + if (cevent->stream == sid && + cevent->mid == sin->mid && + cevent->fsn == next_fsn) { + next_fsn++; + last_frag = pos; + } else { + goto out; + } + break; + case SCTP_DATA_LAST_FRAG: + if (first_frag) + goto out; + break; + default: + break; + } + } + + if (!first_frag) + return NULL; + +out: + retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + &ulpq->reasm, first_frag, + last_frag); + if (retval) { + sin->fsn = next_fsn; + sin->pd_mode = 1; + } + + return retval; +} + +static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) +{ + struct sctp_ulpevent *event; + + if (!skb_queue_empty(&ulpq->reasm)) { + do { + event = sctp_intl_retrieve_first(ulpq); + if (event) + sctp_enqueue_event(ulpq, event); + } while (event); + } + + if (!skb_queue_empty(&ulpq->reasm_uo)) { + do { + event = sctp_intl_retrieve_first_uo(ulpq); + if (event) + sctp_enqueue_event(ulpq, event); + } while (event); + } +} + +static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, + gfp_t gfp) +{ + struct sctp_association *asoc = ulpq->asoc; + __u32 freed = 0; + __u16 needed; + + if (chunk) { + needed = ntohs(chunk->chunk_hdr->length); + needed -= sizeof(struct sctp_idata_chunk); + } else { + needed = SCTP_DEFAULT_MAXWINDOW; + } + + if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { + freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); + if (freed < needed) + freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm, + needed); + if (freed < needed) + freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo, + needed); + } + + if (chunk && freed >= needed) + if (sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) + sctp_intl_start_pd(ulpq, gfp); + + sk_mem_reclaim(asoc->base.sk); +} + +static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, + __u32 mid, __u16 flags, gfp_t gfp) +{ + struct sock *sk = ulpq->asoc->base.sk; + struct sctp_ulpevent *ev = NULL; + + if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, + &sctp_sk(sk)->subscribe)) + return; + + ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, + sid, mid, flags, gfp); + if (ev) { + __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); + + if (!sctp_sk(sk)->data_ready_signalled) { + sctp_sk(sk)->data_ready_signalled = 1; + sk->sk_data_ready(sk); + } + } +} + +static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) +{ + struct sctp_stream *stream = &ulpq->asoc->stream; + struct sctp_ulpevent *cevent, *event = NULL; + struct sk_buff_head *lobby = &ulpq->lobby; + struct sk_buff *pos, *tmp; + struct sk_buff_head temp; + __u16 csid; + __u32 cmid; + + skb_queue_head_init(&temp); + sctp_skb_for_each(pos, lobby, tmp) { + cevent = (struct sctp_ulpevent *)pos->cb; + csid = cevent->stream; + cmid = cevent->mid; + + if (csid > sid) + break; + + if (csid < sid) + continue; + + if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) + break; + + __skb_unlink(pos, lobby); + if (!event) + event = sctp_skb2event(pos); + + __skb_queue_tail(&temp, pos); + } + + if (!event && pos != (struct sk_buff *)lobby) { + cevent = (struct sctp_ulpevent *)pos->cb; + csid = cevent->stream; + cmid = cevent->mid; + + if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { + sctp_mid_next(stream, in, csid); + __skb_unlink(pos, lobby); + __skb_queue_tail(&temp, pos); + event = sctp_skb2event(pos); + } + } + + if (event) { + sctp_intl_retrieve_ordered(ulpq, event); + sctp_enqueue_event(ulpq, event); + } +} + +static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) +{ + struct sctp_stream *stream = &ulpq->asoc->stream; + __u16 sid; + + for (sid = 0; sid < stream->incnt; sid++) { + struct sctp_stream_in *sin = &stream->in[sid]; + __u32 mid; + + if (sin->pd_mode_uo) { + sin->pd_mode_uo = 0; + + mid = sin->mid_uo; + sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp); + } + + if (sin->pd_mode) { + sin->pd_mode = 0; + + mid = sin->mid; + sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); + sctp_mid_skip(stream, in, sid, mid); + + sctp_intl_reap_ordered(ulpq, sid); + } + } + + /* intl abort pd happens only when all data needs to be cleaned */ + sctp_ulpq_flush(ulpq); +} + +static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist, + int nskips, __be16 stream, __u8 flags) +{ + int i; + + for (i = 0; i < nskips; i++) + if (skiplist[i].stream == stream && + skiplist[i].flags == flags) + return i; + + return i; +} + +#define SCTP_FTSN_U_BIT 0x1 +static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) +{ + struct sctp_ifwdtsn_skip ftsn_skip_arr[10]; + struct sctp_association *asoc = q->asoc; + struct sctp_chunk *ftsn_chunk = NULL; + struct list_head *lchunk, *temp; + int nskips = 0, skip_pos; + struct sctp_chunk *chunk; + __u32 tsn; + + if (!asoc->peer.prsctp_capable) + return; + + if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) + asoc->adv_peer_ack_point = ctsn; + + list_for_each_safe(lchunk, temp, &q->abandoned) { + chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); + tsn = ntohl(chunk->subh.data_hdr->tsn); + + if (TSN_lte(tsn, ctsn)) { + list_del_init(lchunk); + sctp_chunk_free(chunk); + } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) { + __be16 sid = chunk->subh.idata_hdr->stream; + __be32 mid = chunk->subh.idata_hdr->mid; + __u8 flags = 0; + + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + flags |= SCTP_FTSN_U_BIT; + + asoc->adv_peer_ack_point = tsn; + skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, + sid, flags); + ftsn_skip_arr[skip_pos].stream = sid; + ftsn_skip_arr[skip_pos].reserved = 0; + ftsn_skip_arr[skip_pos].flags = flags; + ftsn_skip_arr[skip_pos].mid = mid; + if (skip_pos == nskips) + nskips++; + if (nskips == 10) + break; + } else { + break; + } + } + + if (asoc->adv_peer_ack_point > ctsn) + ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point, + nskips, &ftsn_skip_arr[0]); + + if (ftsn_chunk) { + list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); + SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS); + } +} + +#define _sctp_walk_ifwdtsn(pos, chunk, end) \ + for (pos = chunk->subh.ifwdtsn_hdr->skip; \ + (void *)pos < (void *)chunk->subh.ifwdtsn_hdr->skip + (end); pos++) + +#define sctp_walk_ifwdtsn(pos, ch) \ + _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ + sizeof(struct sctp_ifwdtsn_chunk)) + +static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk) +{ + struct sctp_fwdtsn_skip *skip; + __u16 incnt; + + if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN) + return false; + + incnt = chunk->asoc->stream.incnt; + sctp_walk_fwdtsn(skip, chunk) + if (ntohs(skip->stream) >= incnt) + return false; + + return true; +} + +static bool sctp_validate_iftsn(struct sctp_chunk *chunk) +{ + struct sctp_ifwdtsn_skip *skip; + __u16 incnt; + + if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN) + return false; + + incnt = chunk->asoc->stream.incnt; + sctp_walk_ifwdtsn(skip, chunk) + if (ntohs(skip->stream) >= incnt) + return false; + + return true; +} + +static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn) +{ + /* Move the Cumulattive TSN Ack ahead. */ + sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); + /* purge the fragmentation queue */ + sctp_ulpq_reasm_flushtsn(ulpq, ftsn); + /* Abort any in progress partial delivery. */ + sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC); +} + +static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn) +{ + struct sk_buff *pos, *tmp; + + skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { + struct sctp_ulpevent *event = sctp_skb2event(pos); + __u32 tsn = event->tsn; + + if (TSN_lte(tsn, ftsn)) { + __skb_unlink(pos, &ulpq->reasm); + sctp_ulpevent_free(event); + } + } + + skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) { + struct sctp_ulpevent *event = sctp_skb2event(pos); + __u32 tsn = event->tsn; + + if (TSN_lte(tsn, ftsn)) { + __skb_unlink(pos, &ulpq->reasm_uo); + sctp_ulpevent_free(event); + } + } +} + +static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn) +{ + /* Move the Cumulattive TSN Ack ahead. */ + sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); + /* purge the fragmentation queue */ + sctp_intl_reasm_flushtsn(ulpq, ftsn); + /* abort only when it's for all data */ + if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map)) + sctp_intl_abort_pd(ulpq, GFP_ATOMIC); +} + +static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) +{ + struct sctp_fwdtsn_skip *skip; + + /* Walk through all the skipped SSNs */ + sctp_walk_fwdtsn(skip, chunk) + sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); +} + +static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, + __u8 flags) +{ + struct sctp_stream_in *sin = sctp_stream_in(ulpq->asoc, sid); + struct sctp_stream *stream = &ulpq->asoc->stream; + + if (flags & SCTP_FTSN_U_BIT) { + if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) { + sin->pd_mode_uo = 0; + sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, + GFP_ATOMIC); + } + return; + } + + if (MID_lt(mid, sctp_mid_peek(stream, in, sid))) + return; + + if (sin->pd_mode) { + sin->pd_mode = 0; + sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC); + } + + sctp_mid_skip(stream, in, sid, mid); + + sctp_intl_reap_ordered(ulpq, sid); +} + +static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) +{ + struct sctp_ifwdtsn_skip *skip; + + /* Walk through all the skipped MIDs and abort stream pd if possible */ + sctp_walk_ifwdtsn(skip, chunk) + sctp_intl_skip(ulpq, ntohs(skip->stream), + ntohl(skip->mid), skip->flags); +} + +static struct sctp_stream_interleave sctp_stream_interleave_0 = { + .data_chunk_len = sizeof(struct sctp_data_chunk), + .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), + /* DATA process functions */ + .make_datafrag = sctp_make_datafrag_empty, + .assign_number = sctp_chunk_assign_ssn, + .validate_data = sctp_validate_data, + .ulpevent_data = sctp_ulpq_tail_data, + .enqueue_event = sctp_ulpq_tail_event, + .renege_events = sctp_ulpq_renege, + .start_pd = sctp_ulpq_partial_delivery, + .abort_pd = sctp_ulpq_abort_pd, + /* FORWARD-TSN process functions */ + .generate_ftsn = sctp_generate_fwdtsn, + .validate_ftsn = sctp_validate_fwdtsn, + .report_ftsn = sctp_report_fwdtsn, + .handle_ftsn = sctp_handle_fwdtsn, +}; + +static struct sctp_stream_interleave sctp_stream_interleave_1 = { + .data_chunk_len = sizeof(struct sctp_idata_chunk), + .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), + /* I-DATA process functions */ + .make_datafrag = sctp_make_idatafrag_empty, + .assign_number = sctp_chunk_assign_mid, + .validate_data = sctp_validate_idata, + .ulpevent_data = sctp_ulpevent_idata, + .enqueue_event = sctp_enqueue_event, + .renege_events = sctp_renege_events, + .start_pd = sctp_intl_start_pd, + .abort_pd = sctp_intl_abort_pd, + /* I-FORWARD-TSN process functions */ + .generate_ftsn = sctp_generate_iftsn, + .validate_ftsn = sctp_validate_iftsn, + .report_ftsn = sctp_report_iftsn, + .handle_ftsn = sctp_handle_iftsn, +}; + +void sctp_stream_interleave_init(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + + asoc = container_of(stream, struct sctp_association, stream); + stream->si = asoc->intl_enable ? &sctp_stream_interleave_1 + : &sctp_stream_interleave_0; +} diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index d8c162a4089c..f5fcd425232a 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -242,7 +242,8 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { - if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) { + if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && + !q->asoc->intl_enable) { struct sctp_stream_out *sout; __u16 sid; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index ef7ca44d6e6a..33ca5b73cdb3 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -289,6 +289,13 @@ static struct ctl_table sctp_net_table[] = { .proc_handler = proc_sctp_do_auth, }, { + .procname = "intl_enable", + .data = &init_net.sctp.intl_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "addr_scope_policy", .data = &init_net.sctp.scope_policy, .maxlen = sizeof(int), diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 5447228bf1a0..84207ad33e8e 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -443,8 +443,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_send_failed( goto fail; /* Pull off the common chunk header and DATA header. */ - skb_pull(skb, sizeof(struct sctp_data_chunk)); - len -= sizeof(struct sctp_data_chunk); + skb_pull(skb, sctp_datachk_len(&asoc->stream)); + len -= sctp_datachk_len(&asoc->stream); /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); @@ -705,8 +705,6 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, sctp_ulpevent_receive_data(event, asoc); event->stream = ntohs(chunk->subh.data_hdr->stream); - event->ssn = ntohs(chunk->subh.data_hdr->ssn); - event->ppid = chunk->subh.data_hdr->ppid; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { event->flags |= SCTP_UNORDERED; event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); @@ -732,8 +730,9 @@ fail: * various events. */ struct sctp_ulpevent *sctp_ulpevent_make_pdapi( - const struct sctp_association *asoc, __u32 indication, - gfp_t gfp) + const struct sctp_association *asoc, + __u32 indication, __u32 sid, __u32 seq, + __u32 flags, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_pdapi_event *pd; @@ -754,7 +753,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_pdapi( * Currently unused. */ pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT; - pd->pdapi_flags = 0; + pd->pdapi_flags = flags; + pd->pdapi_stream = sid; + pd->pdapi_seq = seq; /* pdapi_length: 32 bits (unsigned integer) * diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index a71be33f3afe..97fae53310e0 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -60,6 +60,7 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, ulpq->asoc = asoc; skb_queue_head_init(&ulpq->reasm); + skb_queue_head_init(&ulpq->reasm_uo); skb_queue_head_init(&ulpq->lobby); ulpq->pd_mode = 0; @@ -83,6 +84,10 @@ void sctp_ulpq_flush(struct sctp_ulpq *ulpq) sctp_ulpevent_free(event); } + while ((skb = __skb_dequeue(&ulpq->reasm_uo)) != NULL) { + event = sctp_skb2event(skb); + sctp_ulpevent_free(event); + } } /* Dispose of a ulpqueue. */ @@ -104,6 +109,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, if (!event) return -ENOMEM; + event->ssn = ntohs(chunk->subh.data_hdr->ssn); + event->ppid = chunk->subh.data_hdr->ppid; + /* Do reassembly if needed. */ event = sctp_ulpq_reasm(ulpq, event); @@ -328,9 +336,10 @@ static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, * payload was fragmented on the way and ip had to reassemble them. * We add the rest of skb's to the first skb's fraglist. */ -static struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net, - struct sk_buff_head *queue, struct sk_buff *f_frag, - struct sk_buff *l_frag) +struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net, + struct sk_buff_head *queue, + struct sk_buff *f_frag, + struct sk_buff *l_frag) { struct sk_buff *pos; struct sk_buff *new = NULL; @@ -853,7 +862,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, struct sctp_stream *stream; /* Check if this message needs ordering. */ - if (SCTP_DATA_UNORDERED & event->msg_flags) + if (event->msg_flags & SCTP_DATA_UNORDERED) return event; /* Note: The stream ID must be verified before this routine. */ @@ -974,8 +983,8 @@ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) sctp_ulpq_reap_ordered(ulpq, sid); } -static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, - struct sk_buff_head *list, __u16 needed) +__u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list, + __u16 needed) { __u16 freed = 0; __u32 tsn, last_tsn; @@ -1140,7 +1149,7 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) &sctp_sk(sk)->subscribe)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, - gfp); + 0, 0, 0, gfp); if (ev) __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6451c5013e06..daf8075f5a4c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -520,7 +520,7 @@ decline_rdma: smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { rc = smc_clc_send_decline(smc, reason_code); - if (rc < sizeof(struct smc_clc_msg_decline)) + if (rc < 0) goto out_err; } goto out_connected; @@ -751,14 +751,16 @@ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); + struct smc_clc_msg_proposal_prefix *pclc_prfx; struct socket *newclcsock = new_smc->clcsock; struct smc_sock *lsmc = new_smc->listen_smc; struct smc_clc_msg_accept_confirm cclc; int local_contact = SMC_REUSE_CONTACT; struct sock *newsmcsk = &new_smc->sk; - struct smc_clc_msg_proposal pclc; + struct smc_clc_msg_proposal *pclc; struct smc_ib_device *smcibdev; struct sockaddr_in peeraddr; + u8 buf[SMC_CLC_MAX_LEN]; struct smc_link *link; int reason_code = 0; int rc = 0, len; @@ -775,7 +777,7 @@ static void smc_listen_work(struct work_struct *work) /* do inband token exchange - *wait for and receive SMC Proposal CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc), + reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf), SMC_CLC_PROPOSAL); if (reason_code < 0) goto out_err; @@ -804,8 +806,11 @@ static void smc_listen_work(struct work_struct *work) reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; } - if ((pclc.outgoing_subnet != subnet) || - (pclc.prefix_len != prefix_len)) { + + pclc = (struct smc_clc_msg_proposal *)&buf; + pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (pclc_prfx->outgoing_subnet != subnet || + pclc_prfx->prefix_len != prefix_len) { reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; } @@ -816,7 +821,7 @@ static void smc_listen_work(struct work_struct *work) /* allocate connection / link group */ mutex_lock(&smc_create_lgr_pending); local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr, - smcibdev, ibport, &pclc.lcl, 0); + smcibdev, ibport, &pclc->lcl, 0); if (local_contact < 0) { rc = local_contact; if (rc == -ENOMEM) @@ -879,11 +884,9 @@ static void smc_listen_work(struct work_struct *work) } /* QP confirmation over RoCE fabric */ reason_code = smc_serv_conf_first_link(new_smc); - if (reason_code < 0) { + if (reason_code < 0) /* peer is not aware of a problem */ - rc = reason_code; goto out_err_unlock; - } if (reason_code > 0) goto decline_rdma_unlock; } @@ -916,8 +919,7 @@ decline_rdma: smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(new_smc, reason_code); - if (rc < sizeof(struct smc_clc_msg_decline)) + if (smc_clc_send_decline(new_smc, reason_code) < 0) goto out_err; } goto out_connected; diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 87f7bede6eab..d4155ff6acde 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -213,6 +213,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ smp_mb__after_atomic(); smc->sk.sk_data_ready(&smc->sk); + } else if ((conn->local_rx_ctrl.prod_flags.write_blocked) || + (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) { + smc->sk.sk_data_ready(&smc->sk); } if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { @@ -234,15 +237,6 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* trigger socket release if connection closed */ smc_close_wake_tx_prepared(smc); } - - /* socket connected but not accepted */ - if (!smc->sk.sk_socket) - return; - - /* data available */ - if ((conn->local_rx_ctrl.prod_flags.write_blocked) || - (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) - smc_tx_consumer_update(conn); } /* called under tasklet context */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 1800e16b2a02..abf7ceb6690b 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -22,6 +22,54 @@ #include "smc_clc.h" #include "smc_ib.h" +/* check if received message has a correct header length and contains valid + * heading and trailing eyecatchers + */ +static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) +{ + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct smc_clc_msg_accept_confirm *clc; + struct smc_clc_msg_proposal *pclc; + struct smc_clc_msg_decline *dclc; + struct smc_clc_msg_trail *trl; + + if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + return false; + switch (clcm->type) { + case SMC_CLC_PROPOSAL: + pclc = (struct smc_clc_msg_proposal *)clcm; + pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (ntohs(pclc->hdr.length) != + sizeof(*pclc) + ntohs(pclc->iparea_offset) + + sizeof(*pclc_prfx) + + pclc_prfx->ipv6_prefixes_cnt * + sizeof(struct smc_clc_ipv6_prefix) + + sizeof(*trl)) + return false; + trl = (struct smc_clc_msg_trail *) + ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl)); + break; + case SMC_CLC_ACCEPT: + case SMC_CLC_CONFIRM: + clc = (struct smc_clc_msg_accept_confirm *)clcm; + if (ntohs(clc->hdr.length) != sizeof(*clc)) + return false; + trl = &clc->trl; + break; + case SMC_CLC_DECLINE: + dclc = (struct smc_clc_msg_decline *)clcm; + if (ntohs(dclc->hdr.length) != sizeof(*dclc)) + return false; + trl = &dclc->trl; + break; + default: + return false; + } + if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + return false; + return true; +} + /* Wait for data on the tcp-socket, analyze received data * Returns: * 0 if success and it was not a decline that we received. @@ -72,9 +120,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || - (datlen < sizeof(struct smc_clc_msg_decline)) || - (datlen > sizeof(struct smc_clc_msg_accept_confirm)) || - memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) || + (datlen > buflen) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; @@ -89,7 +135,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, krflags = MSG_WAITALL; smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags); - if (len < datlen) { + if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; @@ -133,7 +179,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) smc->sk.sk_err = EPROTO; if (len < 0) smc->sk.sk_err = -len; - return len; + return sock_error(&smc->sk); } /* send CLC PROPOSAL message across internal TCP socket */ @@ -141,33 +187,43 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport) { + struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_msg_proposal pclc; + struct smc_clc_msg_trail trl; int reason_code = 0; + struct kvec vec[3]; struct msghdr msg; - struct kvec vec; - int len, rc; + int len, plen, rc; /* send SMC Proposal CLC message */ + plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl); memset(&pclc, 0, sizeof(pclc)); memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc.hdr.type = SMC_CLC_PROPOSAL; - pclc.hdr.length = htons(sizeof(pclc)); + pclc.hdr.length = htons(plen); pclc.hdr.version = SMC_CLC_V1; /* SMC version */ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); + pclc.iparea_offset = htons(0); + memset(&pclc_prfx, 0, sizeof(pclc_prfx)); /* determine subnet and mask from internal TCP socket */ - rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet, - &pclc.prefix_len); + rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet, + &pclc_prfx.prefix_len); if (rc) return SMC_CLC_DECL_CNFERR; /* configuration error */ - memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + pclc_prfx.ipv6_prefixes_cnt = 0; + memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); - vec.iov_base = &pclc; - vec.iov_len = sizeof(pclc); + vec[0].iov_base = &pclc; + vec[0].iov_len = sizeof(pclc); + vec[1].iov_base = &pclc_prfx; + vec[1].iov_len = sizeof(pclc_prfx); + vec[2].iov_base = &trl; + vec[2].iov_len = sizeof(trl); /* due to the few bytes needed for clc-handshake this cannot block */ - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc)); + len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen); if (len < sizeof(pclc)) { if (len >= 0) { reason_code = -ENETUNREACH; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 12a9af1539a2..c145a0f36a68 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -44,7 +44,7 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */ #if defined(__BIG_ENDIAN_BITFIELD) u8 version : 4, flag : 1, - rsvd : 3; + rsvd : 3; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 rsvd : 3, flag : 1, @@ -62,17 +62,31 @@ struct smc_clc_msg_local { /* header2 of clc messages */ u8 mac[6]; /* mac of ib_device port */ }; -struct smc_clc_msg_proposal { /* clc proposal message */ - struct smc_clc_msg_hdr hdr; - struct smc_clc_msg_local lcl; - __be16 iparea_offset; /* offset to IP address information area */ +struct smc_clc_ipv6_prefix { + u8 prefix[4]; + u8 prefix_len; +} __packed; + +struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ __be32 outgoing_subnet; /* subnet mask */ u8 prefix_len; /* number of significant bits in mask */ u8 reserved[2]; u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ - struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ } __aligned(4); +struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ + struct smc_clc_msg_hdr hdr; + struct smc_clc_msg_local lcl; + __be16 iparea_offset; /* offset to IP address information area */ +} __aligned(4); + +#define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28 +#define SMC_CLC_PROPOSAL_MAX_PREFIX (8 * sizeof(struct smc_clc_ipv6_prefix)) +#define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \ + SMC_CLC_PROPOSAL_MAX_OFFSET + \ + SMC_CLC_PROPOSAL_MAX_PREFIX + \ + sizeof(struct smc_clc_msg_trail)) + struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; struct smc_clc_msg_local lcl; @@ -102,6 +116,14 @@ struct smc_clc_msg_decline { /* clc decline message */ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ } __aligned(4); +/* determine start of the prefix area within the proposal message */ +static inline struct smc_clc_msg_proposal_prefix * +smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) +{ + return (struct smc_clc_msg_proposal_prefix *) + ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); +} + struct smc_sock; struct smc_ib_device; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 48615d2ac4aa..e194c6cc308a 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -113,7 +113,7 @@ static int smc_close_abort(struct smc_connection *conn) /* terminate smc socket abnormally - active abort * RDMA communication no longer possible */ -void smc_close_active_abort(struct smc_sock *smc) +static void smc_close_active_abort(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index ed82506b1b0a..8c498885d758 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -20,7 +20,6 @@ #define SMC_CLOSE_SOCK_PUT_DELAY HZ void smc_close_wake_tx_prepared(struct smc_sock *smc); -void smc_close_active_abort(struct smc_sock *smc); int smc_close_active(struct smc_sock *smc); void smc_close_sock_put_work(struct work_struct *work); int smc_close_shutdown_write(struct smc_sock *smc); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index cbf58637ee14..9dc392ca06bf 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -65,7 +65,6 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) rc = sk_wait_event(sk, timeo, sk->sk_err || sk->sk_shutdown & RCV_SHUTDOWN || - sock_flag(sk, SOCK_DONE) || atomic_read(&conn->bytes_to_rcv) || smc_cdc_rxed_any_close_or_senddone(conn), &wait); @@ -116,7 +115,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, if (read_done) { if (sk->sk_err || sk->sk_state == SMC_CLOSED || - (sk->sk_shutdown & RCV_SHUTDOWN) || + sk->sk_shutdown & RCV_SHUTDOWN || !timeo || signal_pending(current) || smc_cdc_rxed_any_close_or_senddone(conn) || @@ -124,8 +123,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, peer_conn_abort) break; } else { - if (sock_flag(sk, SOCK_DONE)) - break; if (sk->sk_err) { read_done = sock_error(sk); break; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index c48dc2d5fd3a..2e50fddf8ce9 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -104,14 +104,12 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags) if (atomic_read(&conn->sndbuf_space)) break; /* at least 1 byte of free space available */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - sk->sk_write_pending++; sk_wait_event(sk, &timeo, sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || smc_cdc_rxed_any_close_or_senddone(conn) || atomic_read(&conn->sndbuf_space), &wait); - sk->sk_write_pending--; } remove_wait_queue(sk_sleep(sk), &wait); return rc; @@ -450,9 +448,7 @@ static void smc_tx_work(struct work_struct *work) void smc_tx_consumer_update(struct smc_connection *conn) { union smc_host_cursor cfed, cons; - struct smc_cdc_tx_pend *pend; - struct smc_wr_buf *wr_buf; - int to_confirm, rc; + int to_confirm; smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), @@ -466,10 +462,7 @@ void smc_tx_consumer_update(struct smc_connection *conn) ((to_confirm > conn->rmbe_update_limit) && ((to_confirm > (conn->rmbe_size / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { - rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); - if (!rc) - rc = smc_cdc_msg_send(conn, wr_buf, pend); - if (rc < 0) { + if (smc_cdc_get_slot_and_msg_send(conn) < 0) { schedule_delayed_work(&conn->tx_work, SMC_TX_WORK_DELAY); return; diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 329325bd553e..37892b3909af 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -1,7 +1,7 @@ /* * net/tipc/bcast.c: TIPC broadcast code * - * Copyright (c) 2004-2006, 2014-2016, Ericsson AB + * Copyright (c) 2004-2006, 2014-2017, Ericsson AB * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. @@ -42,8 +42,8 @@ #include "link.h" #include "name_table.h" -#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ -#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ +#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ +#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ const char tipc_bclink_name[] = "broadcast-link"; @@ -74,6 +74,10 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net) return tipc_net(net)->bcbase; } +/* tipc_bcast_get_mtu(): -get the MTU currently used by broadcast link + * Note: the MTU is decremented to give room for a tunnel header, in + * case the message needs to be sent as replicast + */ int tipc_bcast_get_mtu(struct net *net) { return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE; @@ -515,7 +519,7 @@ int tipc_bcast_init(struct net *net) spin_lock_init(&tipc_net(net)->bclock); if (!tipc_link_bc_create(net, 0, 0, - U16_MAX, + FB_MTU, BCLINK_WIN_DEFAULT, 0, &bb->inputq, diff --git a/net/tipc/link.c b/net/tipc/link.c index 6bce0b1117bd..2d6b2aed30e0 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -483,7 +483,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, /** * tipc_link_bc_create - create new link to be used for broadcast * @n: pointer to associated node - * @mtu: mtu to be used + * @mtu: mtu to be used initially if no peers * @window: send window to be used * @inputq: queue to put messages ready for delivery * @namedq: queue to put binding table update messages ready for delivery diff --git a/net/tipc/msg.c b/net/tipc/msg.c index b0d07b35909d..55d8ba92291d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -251,20 +251,23 @@ bool tipc_msg_validate(struct sk_buff **_skb) * @pktmax: Max packet size that can be used * @list: Buffer or chain of buffers to be returned to caller * + * Note that the recursive call we are making here is safe, since it can + * logically go only one further level down. + * * Returns message data size or errno: -ENOMEM, -EFAULT */ -int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, - int offset, int dsz, int pktmax, struct sk_buff_head *list) +int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, + int dsz, int pktmax, struct sk_buff_head *list) { int mhsz = msg_hdr_sz(mhdr); + struct tipc_msg pkthdr; int msz = mhsz + dsz; - int pktno = 1; - int pktsz; int pktrem = pktmax; - int drem = dsz; - struct tipc_msg pkthdr; struct sk_buff *skb; + int drem = dsz; + int pktno = 1; char *pktpos; + int pktsz; int rc; msg_set_size(mhdr, msz); @@ -272,8 +275,18 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, /* No fragmentation needed? */ if (likely(msz <= pktmax)) { skb = tipc_buf_acquire(msz, GFP_KERNEL); - if (unlikely(!skb)) + + /* Fall back to smaller MTU if node local message */ + if (unlikely(!skb)) { + if (pktmax != MAX_MSG_SIZE) + return -ENOMEM; + rc = tipc_msg_build(mhdr, m, offset, dsz, FB_MTU, list); + if (rc != dsz) + return rc; + if (tipc_msg_assemble(list)) + return dsz; return -ENOMEM; + } skb_orphan(skb); __skb_queue_tail(list, skb); skb_copy_to_linear_data(skb, mhdr, mhsz); @@ -589,6 +602,30 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return true; } +/* tipc_msg_assemble() - assemble chain of fragments into one message + */ +bool tipc_msg_assemble(struct sk_buff_head *list) +{ + struct sk_buff *skb, *tmp = NULL; + + if (skb_queue_len(list) == 1) + return true; + + while ((skb = __skb_dequeue(list))) { + skb->next = NULL; + if (tipc_buf_append(&tmp, &skb)) { + __skb_queue_tail(list, skb); + return true; + } + if (!tmp) + break; + } + __skb_queue_purge(list); + __skb_queue_head_init(list); + pr_warn("Failed do assemble buffer\n"); + return false; +} + /* tipc_msg_reassemble() - clone a buffer chain of fragments and * reassemble the clones into one message */ diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 3e4384c222f7..b4ba1b4f9ae7 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -98,7 +98,7 @@ struct plist; #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) - +#define FB_MTU 3744 #define TIPC_MEDIA_INFO_OFFSET 5 struct tipc_skb_cb { @@ -943,6 +943,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); +bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 41127d0b925e..0cdf5c2ad881 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2640,9 +2640,7 @@ void tipc_sk_reinit(struct net *net) rhashtable_walk_enter(&tn->sk_rht, &iter); do { - tsk = ERR_PTR(rhashtable_walk_start(&iter)); - if (IS_ERR(tsk)) - goto walk_stop; + rhashtable_walk_start(&iter); while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { spin_lock_bh(&tsk->sk.sk_lock.slock); @@ -2651,7 +2649,7 @@ void tipc_sk_reinit(struct net *net) msg_set_orignode(msg, tn->own_addr); spin_unlock_bh(&tsk->sk.sk_lock.slock); } -walk_stop: + rhashtable_walk_stop(&iter); } while (tsk == ERR_PTR(-EAGAIN)); } diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 30e5746085b8..00641b611aed 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -67,7 +67,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, /* We don't yet support UDP encapsulation, TFC padding and ESN. */ if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) - return 0; + return -EINVAL; dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { @@ -120,8 +120,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) if (!x->type_offload || x->encap) return false; - if ((x->xso.offload_handle && (dev == dst->path->dev)) && - !dst->child->xfrm && x->type->get_mtu) { + if ((x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev)) && + !xdst->child->xfrm && x->type->get_mtu) { mtu = x->type->get_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 347ab31574d5..ac277b97e0d7 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -231,7 +231,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) if (xo && (xo->flags & CRYPTO_DONE)) { crypto_done = true; - x = xfrm_input_state(skb); family = XFRM_SPI_SKB_CB(skb)->family; if (!(xo->status & CRYPTO_SUCCESS)) { diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 73ad8c8ef344..23468672a767 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -44,7 +44,7 @@ static int xfrm_skb_check_space(struct sk_buff *skb) static struct dst_entry *skb_dst_pop(struct sk_buff *skb) { - struct dst_entry *child = dst_clone(skb_dst(skb)->child); + struct dst_entry *child = dst_clone(xfrm_dst_child(skb_dst(skb))); skb_dst_drop(skb); return child; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9542975eb2f9..e3a5aca9cdda 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -54,7 +54,7 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] static struct kmem_cache *xfrm_dst_cache __read_mostly; static __read_mostly seqcount_t xfrm_policy_hash_generation; -static void xfrm_init_pmtu(struct dst_entry *dst); +static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(struct timer_list *t); @@ -1251,7 +1251,7 @@ EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { - struct net *net = xp_net(pol); + struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY @@ -1538,7 +1538,9 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, */ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, - struct xfrm_state **xfrm, int nx, + struct xfrm_state **xfrm, + struct xfrm_dst **bundle, + int nx, const struct flowi *fl, struct dst_entry *dst) { @@ -1546,8 +1548,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, unsigned long now = jiffies; struct net_device *dev; struct xfrm_mode *inner_mode; - struct dst_entry *dst_prev = NULL; - struct dst_entry *dst0 = NULL; + struct xfrm_dst *xdst_prev = NULL; + struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; @@ -1573,13 +1575,14 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto put_states; } - if (!dst_prev) - dst0 = dst1; + bundle[i] = xdst; + if (!xdst_prev) + xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ - dst_prev->child = dst1; + xfrm_dst_set_child(xdst_prev, &xdst->u.dst); if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], @@ -1616,8 +1619,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->input = dst_discard; dst1->output = inner_mode->afinfo->output; - dst1->next = dst_prev; - dst_prev = dst1; + xdst_prev = xdst; header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) @@ -1625,40 +1627,39 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, trailer_len += xfrm[i]->props.trailer_len; } - dst_prev->child = dst; - dst0->path = dst; + xfrm_dst_set_child(xdst_prev, dst); + xdst0->path = dst; err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; - xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len); - xfrm_init_pmtu(dst_prev); - - for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev; + xfrm_init_path(xdst0, dst, nfheader_len); + xfrm_init_pmtu(bundle, nx); - err = xfrm_fill_dst(xdst, dev, fl); + for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; + xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { + err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst; - dst_prev->header_len = header_len; - dst_prev->trailer_len = trailer_len; - header_len -= xdst->u.dst.xfrm->props.header_len; - trailer_len -= xdst->u.dst.xfrm->props.trailer_len; + xdst_prev->u.dst.header_len = header_len; + xdst_prev->u.dst.trailer_len = trailer_len; + header_len -= xdst_prev->u.dst.xfrm->props.header_len; + trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } out: - return dst0; + return &xdst0->u.dst; put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: - if (dst0) - dst_release_immediate(dst0); - dst0 = ERR_PTR(err); + if (xdst0) + dst_release_immediate(&xdst0->u.dst); + xdst0 = ERR_PTR(err); goto out; } @@ -1800,7 +1801,7 @@ static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst, for (i = 0; i < num; i++) { if (!dst || dst->xfrm != xfrm[i]) return false; - dst = dst->child; + dst = xfrm_dst_child(dst); } return xfrm_bundle_ok(xdst); @@ -1813,6 +1814,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; + struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct xfrm_dst *xdst, *old; struct dst_entry *dst; int err; @@ -1840,7 +1842,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, old = xdst; - dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig); + dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); return ERR_CAST(dst); @@ -1880,8 +1882,8 @@ static void xfrm_policy_queue_process(struct timer_list *t) xfrm_decode_session(skb, &fl, dst->ops->family); spin_unlock(&pq->hold_queue.lock); - dst_hold(dst->path); - dst = xfrm_lookup(net, dst->path, &fl, sk, 0); + dst_hold(xfrm_dst_path(dst)); + dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, 0); if (IS_ERR(dst)) goto purge_queue; @@ -1910,8 +1912,8 @@ static void xfrm_policy_queue_process(struct timer_list *t) skb = __skb_dequeue(&list); xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); - dst_hold(skb_dst(skb)->path); - dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0); + dst_hold(xfrm_dst_path(skb_dst(skb))); + dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; @@ -2012,8 +2014,8 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, dst1->output = xdst_queue_output; dst_hold(dst); - dst1->child = dst; - dst1->path = dst; + xfrm_dst_set_child(xdst, dst); + xdst->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); @@ -2576,7 +2578,7 @@ static int stale_bundle(struct dst_entry *dst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { - while ((dst = dst->child) && dst->xfrm && dst->dev == dev) { + while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = dev_net(dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); @@ -2600,13 +2602,15 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) return dst; } -static void xfrm_init_pmtu(struct dst_entry *dst) +static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { - do { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + while (nr--) { + struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; + struct dst_entry *dst; - pmtu = dst_mtu(dst->child); + dst = &xdst->u.dst; + pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); @@ -2618,7 +2622,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst) pmtu = route_mtu_cached; dst_metric_set(dst, RTAX_MTU, pmtu); - } while ((dst = dst->next)); + } } /* Check that the bundle accepts the flow and its components are @@ -2627,19 +2631,20 @@ static void xfrm_init_pmtu(struct dst_entry *dst) static int xfrm_bundle_ok(struct xfrm_dst *first) { + struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; - struct xfrm_dst *last; + struct xfrm_dst *xdst; + int start_from, nr; u32 mtu; - if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || + if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; if (dst->flags & DST_XFRM_QUEUE) return 1; - last = NULL; - + start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; @@ -2651,9 +2656,11 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; - mtu = dst_mtu(dst->child); + bundle[nr++] = xdst; + + mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { - last = xdst; + start_from = nr; xdst->child_mtu_cached = mtu; } @@ -2661,30 +2668,30 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { - last = xdst; + start_from = nr; xdst->route_mtu_cached = mtu; } - dst = dst->child; + dst = xfrm_dst_child(dst); } while (dst->xfrm); - if (likely(!last)) + if (likely(!start_from)) return 1; - mtu = last->child_mtu_cached; - for (;;) { - dst = &last->u.dst; + xdst = bundle[start_from - 1]; + mtu = xdst->child_mtu_cached; + while (start_from--) { + dst = &xdst->u.dst; mtu = xfrm_state_mtu(dst->xfrm, mtu); - if (mtu > last->route_mtu_cached) - mtu = last->route_mtu_cached; + if (mtu > xdst->route_mtu_cached) + mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); - - if (last == first) + if (!start_from) break; - last = (struct xfrm_dst *)last->u.dst.next; - last->child_mtu_cached = mtu; + xdst = bundle[start_from - 1]; + xdst->child_mtu_cached = mtu; } return 1; @@ -2692,22 +2699,20 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { - return dst_metric_advmss(dst->path); + return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - return mtu ? : dst_mtu(dst->path); + return mtu ? : dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { - const struct dst_entry *path = dst->path; - - for (; dst != path; dst = dst->child) { + while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; if (xfrm->props.mode == XFRM_MODE_TRANSPORT) @@ -2716,6 +2721,8 @@ static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; + + dst = xfrm_dst_child(dst); } return daddr; } @@ -2724,7 +2731,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { - const struct dst_entry *path = dst->path; + const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); @@ -2733,7 +2740,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { - const struct dst_entry *path = dst->path; + const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 8b23c5bcf8e8..02501817227b 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -666,7 +666,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff if (unlikely(oseq < replay_esn->oseq)) { XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; xo->seq.hi = oseq_hi; - + replay_esn->oseq_hi = oseq_hi; if (replay_esn->oseq_hi == 0) { replay_esn->oseq--; replay_esn->oseq_hi--; @@ -678,7 +678,6 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff } replay_esn->oseq = oseq; - replay_esn->oseq_hi = oseq_hi; if (xfrm_aevent_is_on(net)) x->repl->notify(x, XFRM_REPLAY_UPDATE); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 065d89606888..1b7856be3eeb 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2048,6 +2048,13 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; + if (!optval && !optlen) { + xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); + xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); + __sk_dst_reset(sk); + return 0; + } + if (optlen <= 0 || optlen > PAGE_SIZE) return -EMSGSIZE; |