From 6cafaf4764a32597c2195aa5411b87728e1fde8a Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 20 Jun 2016 21:11:45 +0800 Subject: netfilter: nf_tables: fix memory leak if expr init fails If expr init fails then we need to free it. So when the user adds an nft rule as follows: # nft add rule filter input tcp dport 22 flow table ssh \ { ip saddr limit rate 0/second } a memory leak will happen. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net')
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2c881871db38..cf7c74599cbe 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1724,9 +1724,11 @@ struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, err = nf_tables_newexpr(ctx, &info, expr); if (err < 0) - goto err2; + goto err3; return expr; +err3: + kfree(expr); err2: module_put(info.ops->type->owner); err1: -- cgit v1.2.3
From 62131e5d735226074cba53095545d76b491e5003 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 8 Jun 2016 20:20:10 +0800 Subject: netfilter: nft_meta: set skb->nf_trace appropriately When a user adds an nft rule to set nftrace to zero, for example: # nft add rule ip filter input nftrace set 0 we should set nf_trace to zero as well. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net')
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 16c50b0dd426..f4bad9dc15c4 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -227,7 +227,7 @@ void nft_meta_set_eval(const struct nft_expr *expr, skb->pkt_type = value; break; case NFT_META_NFTRACE: - skb->nf_trace = 1; + skb->nf_trace = !!value; break; default: WARN_ON(1); -- cgit v1.2.3
From 9cc1c73ad66610bffc80b691136ffc1e9a3b1a58 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 24 Apr 2016 01:18:21 +0200 Subject: netfilter: conntrack: avoid integer overflow when resizing The size computation can overflow, so we might allocate a very small table when the bucket count is high on a 32-bit platform. Note: resize is only possible from init_netns. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net')
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f204274a9b6b..62c42e970c89 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1601,8 +1601,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) unsigned int nr_slots, i; size_t sz; + if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + + if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + sz = nr_slots * sizeof(struct hlist_nulls_head); hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); -- cgit v1.2.3
From 10c78f5854d361ded4736c1831948e0a5f67b932 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 2 Jul 2016 09:52:13 +0200 Subject: batman-adv: Avoid nullptr dereference in bla after vlan_insert_tag vlan_insert_tag can return NULL on errors.
The bridge loop avoidance code therefore has to check the return value of vlan_insert_tag for NULL before it can safely operate on this pointer. Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 748a9ead7ce5..712978024c5d 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -418,9 +418,12 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, break; } - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb) + goto out; + } skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, soft_iface); -- cgit v1.2.3 From 60154a1e0495ffb8343a95cefe1e874634572fa8 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 2 Jul 2016 09:52:14 +0200 Subject: batman-adv: Avoid nullptr dereference in dat after vlan_insert_tag vlan_insert_tag can return NULL on errors. The distributed arp table code therefore has to check the return value of vlan_insert_tag for NULL before it can safely operate on this pointer. Fixes: be1db4f6615b ("batman-adv: make the Distributed ARP Table vlan aware") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/distributed-arp-table.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 278800a99c69..aee3b3991471 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1009,9 +1009,12 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, if (!skb_new) goto out; - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb_new) + goto out; + } skb_reset_mac_header(skb_new); skb_new->protocol = eth_type_trans(skb_new, @@ -1089,9 +1092,12 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, */ skb_reset_mac_header(skb_new); - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb_new) + goto out; + } /* To preserve backwards compatibility, the node has choose the outgoing * format based on the incoming request packet type. The assumption is -- cgit v1.2.3 From 33fbb1f3db87ce53da925b3e034b4dd446d483f8 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 30 Jun 2016 20:10:46 +0200 Subject: batman-adv: Fix orig_node_vlan leak on orig_node_release batadv_orig_node_new uses batadv_orig_node_vlan_new to allocate a new batadv_orig_node_vlan and add it to batadv_orig_node::vlan_list. References to this list have also to be cleaned when the batadv_orig_node is removed. 
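To make the reference rules concrete, a simplified sketch (not part of the patch): per the message above, each entry on batadv_orig_node::vlan_list pins its batadv_orig_node_vlan with a reference, so the release path has to unlink every entry and drop exactly that reference:

	spin_lock_bh(&orig_node->vlan_list_lock);
	hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) {
		hlist_del_rcu(&vlan->list);		/* unlink from vlan_list */
		batadv_orig_node_vlan_put(vlan);	/* drop the list's reference */
	}
	spin_unlock_bh(&orig_node->vlan_list_lock);

This is exactly the loop the hunk below adds to batadv_orig_node_release().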
Fixes: 7ea7b4a14275 ("batman-adv: make the TT CRC logic VLAN specific") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/originator.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net')
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 7f51bc2c06eb..fe2fcda4a984 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -765,6 +765,7 @@ static void batadv_orig_node_release(struct kref *ref) struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; + struct batadv_orig_node_vlan *vlan; orig_node = container_of(ref, struct batadv_orig_node, refcount); @@ -784,6 +785,13 @@ static void batadv_orig_node_release(struct kref *ref) } spin_unlock_bh(&orig_node->neigh_list_lock); + spin_lock_bh(&orig_node->vlan_list_lock); + hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { + hlist_del_rcu(&vlan->list); + batadv_orig_node_vlan_put(vlan); + } + spin_unlock_bh(&orig_node->vlan_list_lock); + /* Free nc_nodes */ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); -- cgit v1.2.3
From 3db0decf1185357d6ab2256d0dede1ca9efda03d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 1 Jul 2016 15:49:43 +0200 Subject: batman-adv: Fix non-atomic bla_claim::backbone_gw access The pointer batadv_bla_claim::backbone_gw can be changed at any time. Therefore, access to it must be protected to ensure that two functions accessing the same backbone_gw are actually accessing the same one. This is especially important when the crc_lock is used or when the backbone_gw of a claim is exchanged. Not doing so leads to invalid memory access and/or reference leaks. Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Fixes: 5a1dd8a4773d ("batman-adv: lock crc access in bridge loop avoidance") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 111 ++++++++++++++++++++++++++------- net/batman-adv/types.h | 2 + 2 files changed, 90 insertions(+), 23 deletions(-) (limited to 'net')
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 712978024c5d..825a5cdf4382 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -177,10 +177,21 @@ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) static void batadv_claim_release(struct kref *ref) { struct batadv_bla_claim *claim; + struct batadv_bla_backbone_gw *old_backbone_gw; claim = container_of(ref, struct batadv_bla_claim, refcount); - batadv_backbone_gw_put(claim->backbone_gw); + spin_lock_bh(&claim->backbone_lock); + old_backbone_gw = claim->backbone_gw; + claim->backbone_gw = NULL; + spin_unlock_bh(&claim->backbone_lock); + + spin_lock_bh(&old_backbone_gw->crc_lock); + old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&old_backbone_gw->crc_lock); + + batadv_backbone_gw_put(old_backbone_gw); + kfree_rcu(claim, rcu); } @@ -677,8 +688,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { + struct batadv_bla_backbone_gw *old_backbone_gw; struct batadv_bla_claim *claim; struct batadv_bla_claim search_claim; + bool remove_crc = false; int hash_added; ether_addr_copy(search_claim.addr, mac); @@ -692,8 +705,10 @@ static void batadv_bla_add_claim(struct
batadv_priv *bat_priv, return; ether_addr_copy(claim->addr, mac); + spin_lock_init(&claim->backbone_lock); claim->vid = vid; claim->lasttime = jiffies; + kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; kref_init(&claim->refcount); @@ -721,15 +736,26 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, "bla_add_claim(): changing ownership for %pM, vid %d\n", mac, BATADV_PRINT_VID(vid)); - spin_lock_bh(&claim->backbone_gw->crc_lock); - claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); - spin_unlock_bh(&claim->backbone_gw->crc_lock); - batadv_backbone_gw_put(claim->backbone_gw); + remove_crc = true; } - /* set (new) backbone gw */ + + /* replace backbone_gw atomically and adjust reference counters */ + spin_lock_bh(&claim->backbone_lock); + old_backbone_gw = claim->backbone_gw; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; + spin_unlock_bh(&claim->backbone_lock); + + if (remove_crc) { + /* remove claim address from old backbone_gw */ + spin_lock_bh(&old_backbone_gw->crc_lock); + old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&old_backbone_gw->crc_lock); + } + batadv_backbone_gw_put(old_backbone_gw); + + /* add claim address to new backbone_gw */ spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&backbone_gw->crc_lock); @@ -739,6 +765,26 @@ claim_free_ref: batadv_claim_put(claim); } +/** + * batadv_bla_claim_get_backbone_gw - Get valid reference for backbone_gw of + * claim + * @claim: claim whose backbone_gw should be returned + * + * Return: valid reference to claim::backbone_gw + */ +static struct batadv_bla_backbone_gw * +batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim) +{ + struct batadv_bla_backbone_gw *backbone_gw; + + spin_lock_bh(&claim->backbone_lock); + backbone_gw = claim->backbone_gw; + kref_get(&backbone_gw->refcount); + spin_unlock_bh(&claim->backbone_lock); + + return backbone_gw; +} + /** * batadv_bla_del_claim - delete a claim from the claim hash * @bat_priv: the bat priv with all the soft interface information @@ -763,10 +809,6 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, batadv_choose_claim, claim); batadv_claim_put(claim); /* reference from the hash is gone */ - spin_lock_bh(&claim->backbone_gw->crc_lock); - claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); - spin_unlock_bh(&claim->backbone_gw->crc_lock); - /* don't need the reference from hash_find() anymore */ batadv_claim_put(claim); } @@ -1219,6 +1261,7 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, int now) { + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct hlist_head *head; struct batadv_hashtable *hash; @@ -1233,14 +1276,17 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; - if (!batadv_compare_eth(claim->backbone_gw->orig, + + if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) - continue; + goto skip; + if (!batadv_has_timed_out(claim->lasttime, BATADV_BLA_CLAIM_TIMEOUT)) - continue; + goto skip; batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_purge_claims(): %pM, vid %d, time out\n", @@ -1248,8 +1294,10 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, purge_now: batadv_handle_unclaim(bat_priv, primary_if, - 
claim->backbone_gw->orig, + backbone_gw->orig, claim->addr, claim->vid); +skip: + batadv_backbone_gw_put(backbone_gw); } rcu_read_unlock(); } @@ -1760,9 +1808,11 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, bool is_bcast) { + struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; + bool own_claim; bool ret; ethhdr = eth_hdr(skb); @@ -1797,8 +1847,12 @@ bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, } /* if it is our own claim ... */ - if (batadv_compare_eth(claim->backbone_gw->orig, - primary_if->net_dev->dev_addr)) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + own_claim = batadv_compare_eth(backbone_gw->orig, + primary_if->net_dev->dev_addr); + batadv_backbone_gw_put(backbone_gw); + + if (own_claim) { /* ... allow it in any case */ claim->lasttime = jiffies; goto allow; @@ -1862,7 +1916,9 @@ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hard_iface *primary_if; + bool client_roamed; bool ret = false; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -1892,8 +1948,12 @@ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, goto allow; /* check if we are responsible. */ - if (batadv_compare_eth(claim->backbone_gw->orig, - primary_if->net_dev->dev_addr)) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + client_roamed = batadv_compare_eth(backbone_gw->orig, + primary_if->net_dev->dev_addr); + batadv_backbone_gw_put(backbone_gw); + + if (client_roamed) { /* if yes, the client has roamed and we have * to unclaim it. */ @@ -1941,6 +2001,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) struct net_device *net_dev = (struct net_device *)seq->private; struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hashtable *hash = bat_priv->bla.claim_hash; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct batadv_hard_iface *primary_if; struct hlist_head *head; @@ -1965,17 +2026,21 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { - is_own = batadv_compare_eth(claim->backbone_gw->orig, + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); - spin_lock_bh(&claim->backbone_gw->crc_lock); - backbone_crc = claim->backbone_gw->crc; - spin_unlock_bh(&claim->backbone_gw->crc_lock); + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n", claim->addr, BATADV_PRINT_VID(claim->vid), - claim->backbone_gw->orig, + backbone_gw->orig, (is_own ? 
'x' : ' '), backbone_crc); + + batadv_backbone_gw_put(backbone_gw); } rcu_read_unlock(); }
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ba846b078af8..005122234b90 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1042,6 +1042,7 @@ struct batadv_bla_backbone_gw { * @addr: mac address of claimed non-mesh client * @vid: vlan id this client was detected on * @backbone_gw: pointer to backbone gw claiming this client + * @backbone_lock: lock protecting backbone_gw pointer * @lasttime: last time we heard of claim (locals only) * @hash_entry: hlist node for batadv_priv_bla::claim_hash * @refcount: number of contexts the object is used @@ -1051,6 +1052,7 @@ struct batadv_bla_claim { u8 addr[ETH_ALEN]; unsigned short vid; struct batadv_bla_backbone_gw *backbone_gw; + spinlock_t backbone_lock; /* protects backbone_gw */ unsigned long lasttime; struct hlist_node hash_entry; struct rcu_head rcu; -- cgit v1.2.3
From 15c2ed753cd9e3e746472deab8151337a5b6da56 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 30 Jun 2016 20:11:34 +0200 Subject: batman-adv: Fix reference leak in batadv_find_router The replacement of last_bonding_candidate in batadv_orig_node has to be an atomic operation. Otherwise it is possible that the reference counter of a batadv_orig_ifinfo is reduced which was no longer the last_bonding_candidate when the new candidate is added. This can either lead to an invalid memory access or to reference leaks which make it impossible to remove an interface which was added to batman-adv. Fixes: f3b3d9018975 ("batman-adv: add bonding again") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/routing.c | 52 ++++++++++++++++++++++++++++++++++++------------ net/batman-adv/types.h | 4 +++- 2 files changed, 42 insertions(+), 14 deletions(-) (limited to 'net')
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 6c2901a86230..bfac086b4d01 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -455,6 +455,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, return 0; } +/** + * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node + * @orig_node: originator node whose bonding candidates should be replaced + * @new_candidate: new bonding candidate or NULL + */ +static void +batadv_last_bonding_replace(struct batadv_orig_node *orig_node, + struct batadv_orig_ifinfo *new_candidate) +{ + struct batadv_orig_ifinfo *old_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + old_candidate = orig_node->last_bonding_candidate; + + if (new_candidate) + kref_get(&new_candidate->refcount); + orig_node->last_bonding_candidate = new_candidate; + spin_unlock_bh(&orig_node->neigh_list_lock); + + if (old_candidate) + batadv_orig_ifinfo_put(old_candidate); +} + /** * batadv_find_router - find a suitable router for this originator * @bat_priv: the bat priv with all the soft interface information @@ -562,10 +585,6 @@ next: } rcu_read_unlock(); - /* last_bonding_candidate is reset below, remove the old reference. */ - if (orig_node->last_bonding_candidate) - batadv_orig_ifinfo_put(orig_node->last_bonding_candidate); - /* After finding candidates, handle the three cases: * 1) there is a next candidate, use that * 2) there is no next candidate, use the first of the list * 3) there is no candidate at all, keep the default router */ if (next_candidate) { batadv_neigh_node_put(router); - /* remove references to first candidate, we don't need it.
*/ - if (first_candidate) { - batadv_neigh_node_put(first_candidate_router); - batadv_orig_ifinfo_put(first_candidate); - } + kref_get(&next_candidate_router->refcount); router = next_candidate_router; - orig_node->last_bonding_candidate = next_candidate; + batadv_last_bonding_replace(orig_node, next_candidate); } else if (first_candidate) { batadv_neigh_node_put(router); - /* refcounting has already been done in the loop above. */ + kref_get(&first_candidate_router->refcount); router = first_candidate_router; - orig_node->last_bonding_candidate = first_candidate; + batadv_last_bonding_replace(orig_node, first_candidate); } else { - orig_node->last_bonding_candidate = NULL; + batadv_last_bonding_replace(orig_node, NULL); + } + + /* cleanup of candidates */ + if (first_candidate) { + batadv_neigh_node_put(first_candidate_router); + batadv_orig_ifinfo_put(first_candidate); + } + + if (next_candidate) { + batadv_neigh_node_put(next_candidate_router); + batadv_orig_ifinfo_put(next_candidate); } return router;
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 005122234b90..74d865a4df46 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -330,7 +330,9 @@ struct batadv_orig_node { DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); u32 last_bcast_seqno; struct hlist_head neigh_list; - /* neigh_list_lock protects: neigh_list and router */ + /* neigh_list_lock protects: neigh_list, ifinfo_list, + * last_bonding_candidate and router + */ spinlock_t neigh_list_lock; struct hlist_node hash_entry; struct batadv_priv *bat_priv; -- cgit v1.2.3
From cbef1e102003edb236c6b2319ab269ccef963731 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 30 Jun 2016 21:41:13 +0200 Subject: batman-adv: Free last_bonding_candidate on release of orig_node The orig_ifinfo reference counter for last_bonding_candidate in batadv_orig_node has to be reduced when an originator node is released. Otherwise the orig_ifinfo is leaked and the reference counter of the netdevice is not reduced correctly.
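The fix follows the usual detach-then-put pattern; a condensed sketch of the hunk below, where the put deliberately happens outside the spinlock so a possible final release does not run under neigh_list_lock:

	spin_lock_bh(&orig_node->neigh_list_lock);
	last_candidate = orig_node->last_bonding_candidate;
	orig_node->last_bonding_candidate = NULL;	/* detach under the lock */
	spin_unlock_bh(&orig_node->neigh_list_lock);

	if (last_candidate)
		batadv_orig_ifinfo_put(last_candidate);	/* release outside it */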
Fixes: f3b3d9018975 ("batman-adv: add bonding again") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/originator.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index fe2fcda4a984..ab8c4f9738fe 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -766,6 +766,7 @@ static void batadv_orig_node_release(struct kref *ref) struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_orig_node_vlan *vlan; + struct batadv_orig_ifinfo *last_candidate; orig_node = container_of(ref, struct batadv_orig_node, refcount); @@ -783,8 +784,14 @@ static void batadv_orig_node_release(struct kref *ref) hlist_del_rcu(&orig_ifinfo->list); batadv_orig_ifinfo_put(orig_ifinfo); } + + last_candidate = orig_node->last_bonding_candidate; + orig_node->last_bonding_candidate = NULL; spin_unlock_bh(&orig_node->neigh_list_lock); + if (last_candidate) + batadv_orig_ifinfo_put(last_candidate); + spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { hlist_del_rcu(&vlan->list); -- cgit v1.2.3 From 6e8ef842223b90a33efd570128bb566a9ae6f5ad Mon Sep 17 00:00:00 2001 From: Purushottam Kushwaha Date: Tue, 5 Jul 2016 13:44:51 +0530 Subject: nl80211: Move ACL parsing later to avoid a possible memory leak No support for pbss results in a memory leak for the acl_data (if parse_acl_data succeeds). Fix this by moving the ACL parsing later. Cc: stable@vger.kernel.org Fixes: 34d505193bd10 ("cfg80211: basic support for PBSS network type") Signed-off-by: Purushottam Kushwaha Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d7599014055d..7d72283901a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3487,16 +3487,16 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) params.smps_mode = NL80211_SMPS_OFF; } + params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); + if (params.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) + return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_ACL_POLICY]) { params.acl = parse_acl_data(&rdev->wiphy, info); if (IS_ERR(params.acl)) return PTR_ERR(params.acl); } - params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); - if (params.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) - return -EOPNOTSUPP; - wdev_lock(wdev); err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { -- cgit v1.2.3 From 16a910a6722b7a8680409e634c7c0dac073c01e4 Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Tue, 5 Jul 2016 15:23:10 +0300 Subject: cfg80211: handle failed skb allocation Handle the case when dev_alloc_skb returns NULL. Cc: stable@vger.kernel.org Fixes: 2b67f944f88c2 ("cfg80211: reuse existing page fragments in A-MSDU rx") Signed-off-by: Gregory Greenman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 2443ee30ba5b..b7d1592bd5b8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -721,6 +721,8 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, * alignment since sizeof(struct ethhdr) is 14. 
*/ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); + if (!frame) + return NULL; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); -- cgit v1.2.3
From d1fe176ca51fa3cb35f70c1d876d9a090e9befce Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Jun 2016 10:43:19 +0200 Subject: batman-adv: Fix speedy join in gateway client mode Speedy join only works when the received packet is either a broadcast or a 4addr unicast packet. Thus packets converted from broadcast to unicast via the gateway handling code have to be converted to 4addr packets to allow the receiving gateway server to add the sender address as a temporary entry to the translation table. Not doing so will make the batman-adv gateway server drop the DHCP response in many situations because it doesn't yet have the TT entry for the destination of the DHCP response. Fixes: 371351731e9c ("batman-adv: change interface_rx to get orig node") Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net')
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index f2f125684ed9..010397650fa5 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -424,8 +424,8 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node *orig_node; orig_node = batadv_gw_get_selected_orig(bat_priv); - return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, - orig_node, vid); + return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, + BATADV_P_DATA, orig_node, vid); } void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface) -- cgit v1.2.3
From 3777ed688fba82d0bd43f9fc1ebbc6abe788576d Mon Sep 17 00:00:00 2001 From: Quentin Armitage Date: Thu, 16 Jun 2016 08:00:14 +0100 Subject: ipvs: fix bind to link-local mcast IPv6 address in backup When using HEAD from https://git.kernel.org/cgit/utils/kernel/ipvsadm/ipvsadm.git/, the command: ipvsadm --start-daemon backup --mcast-interface eth0.60 \ --mcast-group ff02::1:81 fails with the error message: Argument list too long whereas both: ipvsadm --start-daemon master --mcast-interface eth0.60 \ --mcast-group ff02::1:81 and: ipvsadm --start-daemon backup --mcast-interface eth0.60 \ --mcast-group 224.0.0.81 are successful. The error message "Argument list too long" isn't helpful. The error occurs because an IPv6 address is given in backup mode. The error is in make_receive_sock() in net/netfilter/ipvs/ip_vs_sync.c, since it fails to set the interface on the address or the socket before calling inet6_bind() (via sock->ops->bind), where the test 'if (!sk->sk_bound_dev_if)' fails. Setting sock->sk->sk_bound_dev_if on the socket before calling inet6_bind() resolves the issue.
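For comparison, a userspace bind to the same group shows why a scope is mandatory for link-local (ff02::/16) addresses. A hypothetical fragment, assuming an already-created AF_INET6 UDP socket fd; it is not part of the patch:

	struct sockaddr_in6 sa = { .sin6_family = AF_INET6 };

	inet_pton(AF_INET6, "ff02::1:81", &sa.sin6_addr);
	sa.sin6_scope_id = if_nametoindex("eth0.60");	/* the scope */
	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		perror("bind");	/* EINVAL when no scope/device is set */

The fix achieves the same effect in-kernel by setting sk_bound_dev_if before the bind.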
Fixes: d33288172e72 ("ipvs: add more mcast parameters for the sync daemon") Signed-off-by: Quentin Armitage Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sync.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 803001a45aa1..1b07578bedf3 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1545,7 +1545,8 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) +static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, + int ifindex) { /* multicast addr */ union ipvs_sockaddr mcast_addr; @@ -1566,6 +1567,7 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) set_sock_size(sock->sk, 0, result); get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + sock->sk->sk_bound_dev_if = ifindex; result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); @@ -1868,7 +1870,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, if (state == IP_VS_STATE_MASTER) sock = make_send_sock(ipvs, id); else - sock = make_receive_sock(ipvs, id); + sock = make_receive_sock(ipvs, id, dev->ifindex); if (IS_ERR(sock)) { result = PTR_ERR(sock); goto outtinfo; -- cgit v1.2.3 From c8607e020014cf11a61601a0005270bad81cabdf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Jul 2016 14:53:06 +0200 Subject: netfilter: nft_ct: fix expiration getter We need to compute timeout.expires - jiffies, not the other way around. Add a helper, another patch can then later change more places in conntrack code where we currently open-code this. Will allow us to only change one place later when we remove per-ct timer. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 8 ++++++++ net/netfilter/nft_ct.c | 6 +----- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index dd78bea227c8..b6083c34ef0d 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -284,6 +284,14 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } +/* jiffies until ct expires, 0 if already expired */ +static inline unsigned long nf_ct_expires(const struct nf_conn *ct) +{ + long timeout = (long)ct->timeout.expires - (long)jiffies; + + return timeout > 0 ? 
timeout : 0; +} + struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 137e308d5b24..81fbb450783e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -54,7 +54,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, const struct nf_conn_help *help; const struct nf_conntrack_tuple *tuple; const struct nf_conntrack_helper *helper; - long diff; unsigned int state; ct = nf_ct_get(pkt->skb, &ctinfo); @@ -94,10 +93,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; #endif case NFT_CT_EXPIRATION: - diff = (long)jiffies - (long)ct->timeout.expires; - if (diff < 0) - diff = 0; - *dest = jiffies_to_msecs(diff); + *dest = jiffies_to_msecs(nf_ct_expires(ct)); return; case NFT_CT_HELPER: if (ct->master == NULL) -- cgit v1.2.3
From 95556a883834122c616bbeb942654d745ceb9712 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jul 2016 11:03:57 +0200 Subject: dccp: avoid deadlock in dccp_v4_ctl_send_reset In the prep work I did before enabling BH while handling socket backlog, I missed two points in DCCP: 1) dccp_v4_ctl_send_reset() uses bh_lock_sock(), assuming BH were blocked. That is no longer always true. 2) dccp_v4_route_skb() was using __IP_INC_STATS() instead of IP_INC_STATS() A similar fix was done for TCP, in commit 47dcc20a39d0 ("ipv4: tcp: ip_send_unicast_reply() is not BH safe") Fixes: 7309f8821fd6 ("dccp: do not assume DCCP code is non preemptible") Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net')
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 5c7e413a3ae4..25dd25b47d41 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -462,7 +462,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { - __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } @@ -527,17 +527,19 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) rxiph->daddr); skb_dst_set(skb, dst_clone(dst)); + local_bh_disable(); bh_lock_sock(ctl_sk); err = ip_build_and_send_pkt(skb, ctl_sk, rxiph->daddr, rxiph->saddr, NULL); bh_unlock_sock(ctl_sk); if (net_xmit_eval(err) == 0) { - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS(DCCP_MIB_OUTRSTS); + __DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + __DCCP_INC_STATS(DCCP_MIB_OUTRSTS); } + local_bh_enable(); out: - dst_release(dst); + dst_release(dst); } static void dccp_v4_reqsk_destructor(struct request_sock *req) -- cgit v1.2.3
From a612769774a30e4fc143c4cb6395c12573415660 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Fri, 8 Jul 2016 17:52:33 +0200 Subject: udp: prevent bugcheck if filter truncates packet too much If a socket filter truncates a UDP packet below the length of the UDP header in udpv6_queue_rcv_skb() or udp_queue_rcv_skb(), it will trigger a BUG_ON in skb_pull_rcsum(). This BUG_ON (and therefore a system crash if the kernel is configured that way) can easily be triggered by an unprivileged user; this was reported as CVE-2016-6162.
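A hedged sketch of such a filter (illustrative fragment only, not the published reproducer): a classic BPF program's return value is the number of bytes to keep, so returning anything below sizeof(struct udphdr), i.e. 8, set up the crash:

	struct sock_filter code[] = {
		BPF_STMT(BPF_RET | BPF_K, 4),	/* accept, keep only 4 bytes */
	};
	struct sock_fprog prog = { .len = 1, .filter = code };

	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));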
For a reproducer, see http://seclists.org/oss-sec/2016/q3/8 Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing") Reported-by: Marco Grassi Signed-off-by: Michal Kubecek Acked-by: Eric Dumazet Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 ++ net/ipv6/udp.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ca5e8ea29538..4aed8fc23d32 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1583,6 +1583,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (sk_filter(sk, skb)) goto drop; + if (unlikely(skb->len < sizeof(struct udphdr))) + goto drop; udp_csum_pull_header(skb); if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 005dc82c2138..acc09705618b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -620,6 +620,8 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (sk_filter(sk, skb)) goto drop; + if (unlikely(skb->len < sizeof(struct udphdr))) + goto drop; udp_csum_pull_header(skb); if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { -- cgit v1.2.3 From 75ff39ccc1bd5d3c455b6822ab09e533c551f758 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Jul 2016 10:04:02 +0200 Subject: tcp: make challenge acks less predictable Yue Cao claims that current host rate limiting of challenge ACKS (RFC 5961) could leak enough information to allow a patient attacker to hijack TCP sessions. He will soon provide details in an academic paper. This patch increases the default limit from 100 to 1000, and adds some randomization so that the attacker can no longer hijack sessions without spending a considerable amount of probes. Based on initial analysis and patch from Linus. Note that we also have per socket rate limiting, so it is tempting to remove the host limit in the future. v2: randomize the count of challenge acks per second, not the period. Fixes: 282f23c6ee34 ("tcp: implement RFC 5961 3.2") Reported-by: Yue Cao Signed-off-by: Eric Dumazet Suggested-by: Linus Torvalds Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6c8f4cd0800..91868bb17818 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -87,7 +87,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 100; +int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -3458,7 +3458,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); - u32 now; + u32 count, now; /* First check our per-socket dupack rate limit. */ if (tcp_oow_rate_limited(sock_net(sk), skb, @@ -3466,13 +3466,18 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) &tp->last_oow_ack_time)) return; - /* Then check the check host-wide RFC 5961 rate limit. */ + /* Then check host-wide RFC 5961 rate limit. 
*/ now = jiffies / HZ; if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + challenge_timestamp = now; - challenge_count = 0; + WRITE_ONCE(challenge_count, half + + prandom_u32_max(sysctl_tcp_challenge_ack_limit)); } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + count = READ_ONCE(challenge_count); + if (count > 0) { + WRITE_ONCE(challenge_count, count - 1); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } -- cgit v1.2.3
From 80610229ef7b26615dbb6cb6e873709a60bacc9f Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 10 Jul 2016 21:11:55 +0300 Subject: ipv4: reject RTNH_F_DEAD and RTNH_F_LINKDOWN from user space Vegard Nossum is reporting a crash in fib_dump_info when nh_dev = NULL and fib_nhs == 1: Pid: 50, comm: netlink.exe Not tainted 4.7.0-rc5+ RIP: 0033:[<00000000602b3d18>] RSP: 0000000062623890 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 000000006261b800 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000024 RDI: 000000006245ba00 RBP: 00000000626238f0 R08: 000000000000029c R09: 0000000000000000 R10: 0000000062468038 R11: 000000006245ba00 R12: 000000006245ba00 R13: 00000000625f96c0 R14: 00000000601e16f0 R15: 0000000000000000 Kernel panic - not syncing: Kernel mode fault at addr 0x2e0, ip 0x602b3d18 CPU: 0 PID: 50 Comm: netlink.exe Not tainted 4.7.0-rc5+ #581 Stack: 626238f0 960226a02 00000400 000000fe 62623910 600afca7 62623970 62623a48 62468038 00000018 00000000 00000000 Call Trace: [<602b3e93>] rtmsg_fib+0xd3/0x190 [<602b6680>] fib_table_insert+0x260/0x500 [<602b0e5d>] inet_rtm_newroute+0x4d/0x60 [<60250def>] rtnetlink_rcv_msg+0x8f/0x270 [<60267079>] netlink_rcv_skb+0xc9/0xe0 [<60250d4b>] rtnetlink_rcv+0x3b/0x50 [<60265400>] netlink_unicast+0x1a0/0x2c0 [<60265e47>] netlink_sendmsg+0x3f7/0x470 [<6021dc9a>] sock_sendmsg+0x3a/0x90 [<6021e0d0>] ___sys_sendmsg+0x300/0x360 [<6021fa64>] __sys_sendmsg+0x54/0xa0 [<6021fac0>] SyS_sendmsg+0x10/0x20 [<6001ea68>] handle_syscall+0x88/0x90 [<600295fd>] userspace+0x3fd/0x500 [<6001ac55>] fork_handler+0x85/0x90 $ addr2line -e vmlinux -i 0x602b3d18 include/linux/inetdevice.h:222 net/ipv4/fib_semantics.c:1264 The problem happens when RTNH_F_LINKDOWN is provided from user space when creating routes that do not use the flag, caught with a netlink fuzzer. Currently, the kernel allows user space to set both flags in nh_flags and fib_flags, but this is not intentional; the assumption was that they are not set. Fix this by rejecting both flags with EINVAL. Reported-by: Vegard Nossum Fixes: 0eeb075fad73 ("net: ipv4 sysctl option to ignore routes when nexthop link is down") Signed-off-by: Julian Anastasov Cc: Andy Gospodarek Cc: Dinesh Dutt Cc: Scott Feldman Reviewed-by: Andy Gospodarek Signed-off-by: David S. Miller
--- net/ipv4/fib_semantics.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net')
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d09173bf9500..539fa264e67d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -479,6 +479,9 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; + if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + return -EINVAL; + nexthop_nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; nexthop_nh->nh_oif = rtnh->rtnh_ifindex; @@ -1003,6 +1006,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (fib_props[cfg->fc_type].scope > cfg->fc_scope) goto err_inval; + if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + goto err_inval; + #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); -- cgit v1.2.3
From 779f1edec664a7b32b71f7b4702e085a08d60592 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 11 Jul 2016 16:51:26 -0400 Subject: sock: ignore SCM_RIGHTS and SCM_CREDENTIALS in __sock_cmsg_send Sergei Trofimovich reported that pulse audio sends SCM_CREDENTIALS as a control message to TCP. Since __sock_cmsg_send does not support SCM_RIGHTS and SCM_CREDENTIALS, it returns an error and hence breaks pulse audio over TCP. SCM_RIGHTS and SCM_CREDENTIALS are sent on the SOL_SOCKET layer but they semantically belong to SOL_UNIX. Since all cmsg-processing functions including sock_cmsg_send ignore control messages of other layers, it is best to ignore SCM_RIGHTS and SCM_CREDENTIALS for consistency (and also for fixing pulse audio over TCP). Fixes: c14ac9451c34 ("sock: enable timestamping using control messages") Signed-off-by: Soheil Hassas Yeganeh Reported-by: Sergei Trofimovich Tested-by: Sergei Trofimovich Cc: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net')
diff --git a/net/core/sock.c b/net/core/sock.c index 08bf97eceeb3..b7f12639c26a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1938,6 +1938,10 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ + case SCM_RIGHTS: + case SCM_CREDENTIALS: + break; default: return -EINVAL; } -- cgit v1.2.3
From 2d18ac4ba7454a4260473e68be7e485ae71e7948 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 11 Jul 2016 16:08:35 -0400 Subject: tipc: extend broadcast link initialization criteria At first contact between two nodes, an endpoint might sometimes have time to send out a LINK_PROTOCOL/STATE packet before it has received the broadcast initialization packet from the peer, i.e., before it has received a valid broadcast packet number to add to the 'bc_ack' field of the protocol message. This means that the peer endpoint will receive a protocol packet with an invalid broadcast acknowledge value of 0. Under unlucky circumstances this may lead to the original, already received acknowledge value being overwritten, so that the whole broadcast link goes stale after a while. We fix this by delaying the setting of the link field 'bc_peer_is_up' until we know that the peer really has received our own broadcast initialization message. The latter is always sent out as the first unicast message on a link, and always with sequence number 1.
Because of this, we only need to look for a non-zero unicast acknowledge value in the arriving STATE messages, and once that is confirmed we know we are safe and can set the mentioned field. Before this moment, we must ignore all broadcast acknowledges from the peer. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net')
diff --git a/net/tipc/link.c b/net/tipc/link.c index 67b6ab9f4c8d..6483dc4333fb 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1559,7 +1559,12 @@ void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (!msg_peer_node_is_up(hdr)) return; - l->bc_peer_is_up = true; + /* Open when peer acknowledges our bcast init msg (pkt #1) */ + if (msg_ack(hdr)) + l->bc_peer_is_up = true; + + if (!l->bc_peer_is_up) + return; /* Ignore if peers_snd_nxt goes beyond receive window */ if (more(peers_snd_nxt, l->rcv_nxt + l->window)) -- cgit v1.2.3
From a71eb720355c28eaeb2de0c4d960247c69bb2c6f Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 11 Jul 2016 16:08:36 -0400 Subject: tipc: ensure correct broadcast send buffer release when peer is lost After a new receiver peer has been added to the broadcast transmission link, we allow immediate transmission of new broadcast packets, trusting that the new peer will not accept the packets until it has received the previously sent unicast broadcast initialization message. In the same way, the sender must not accept any acknowledges until it has itself received the broadcast initialization from the peer, as well as confirmation of the reception of its own initialization message. Furthermore, when a receiver peer goes down, the sender has to produce the missing acknowledges from the lost peer locally, in order to ensure correct release of the buffers that were expected to be acknowledged by the said peer. In a highly stressed system we have observed that contact with a peer may come up and be lost before the above mentioned broadcast initialization and confirmation have been received. This leads to the locally produced acknowledges being rejected, and to the non-acknowledged buffers lingering in the broadcast link transmission queue until it fills up and the link goes into permanent congestion. In this commit, we remedy this by temporarily setting the corresponding broadcast receive link state to ESTABLISHED and the 'bc_peer_is_up' state to true before we issue the local acknowledges. This ensures that those acknowledges will always be accepted. The mentioned state values are restored immediately afterwards when the link is reset. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S.
Miller --- net/tipc/link.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 6483dc4333fb..7d89f8713d49 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -349,6 +349,8 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, u16 ack = snd_l->snd_nxt - 1; snd_l->ackers--; + rcv_l->bc_peer_is_up = true; + rcv_l->state = LINK_ESTABLISHED; tipc_link_bc_ack_rcv(rcv_l, ack, xmitq); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; -- cgit v1.2.3 From 1fc07f3e1541cc49cc159beb3fdefc5013570eda Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 11 Jul 2016 16:08:37 -0400 Subject: tipc: reset all unicast links when broadcast send link fails In test situations with many nodes and a heavily stressed system we have observed that the transmission broadcast link may fail due to an excessive number of retransmissions of the same packet. In such situations we need to reset all unicast links to all peers, in order to reset and re-synchronize the broadcast link. In this commit, we add a new function tipc_bearer_reset_all() to be used in such situations. The function scans across all bearers and resets all their pertaining links. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 15 +++++++++++++++ net/tipc/bearer.h | 1 + net/tipc/node.c | 15 +++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index bf8f05c3eb82..a597708ae381 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -330,6 +330,21 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) return 0; } +/* tipc_bearer_reset_all - reset all links on all bearers + */ +void tipc_bearer_reset_all(struct net *net) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_bearer *b; + int i; + + for (i = 0; i < MAX_BEARERS; i++) { + b = rcu_dereference_rtnl(tn->bearer_list[i]); + if (b) + tipc_reset_bearer(net, b); + } +} + /** * bearer_disable * diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index f686e41b5abb..60e49c3be19c 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -198,6 +198,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest); void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest); struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name); struct tipc_media *tipc_media_find(const char *name); +void tipc_bearer_reset_all(struct net *net); int tipc_bearer_setup(void); void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); diff --git a/net/tipc/node.c b/net/tipc/node.c index e01e2c71b5a1..23d4761842a0 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1297,10 +1297,6 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id rc = tipc_bcast_rcv(net, be->link, skb); - /* Broadcast link reset may happen at reassembly failure */ - if (rc & TIPC_LINK_DOWN_EVT) - tipc_node_reset_links(n); - /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_BC_ACK) { tipc_node_read_lock(n); @@ -1320,6 +1316,17 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2); } + + if (rc & TIPC_LINK_DOWN_EVT) { + /* Reception reassembly failure => reset all links to peer */ + if (!tipc_link_is_up(be->link)) + tipc_node_reset_links(n); + + /* Retransmission failure => reset all links to all peers */ + if 
(!tipc_link_is_up(tipc_bc_sndlink(net))) + tipc_bearer_reset_all(net); + } + tipc_node_put(n); } -- cgit v1.2.3 From 590b52e10d410e1439ae86be9fe19d75fdab628b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 11 Jul 2016 17:28:54 +0200 Subject: netfilter: conntrack: skip clash resolution if nat is in place The clash resolution is not easy to apply if the NAT table is registered. Even if no NAT rules are installed, the nul-binding ensures that a unique tuple is used, thus, the packet that loses race gets a different source port number, as described by: http://marc.info/?l=netfilter-devel&m=146818011604484&w=2 Clash resolution with NAT is also problematic if addresses/port range ports are used since the conntrack that wins race may describe a different mangling that we may have earlier applied to the packet via nf_nat_setup_info(). Fixes: 71d8c47fc653 ("netfilter: conntrack: introduce clash resolution on insertion race") Signed-off-by: Pablo Neira Ayuso Tested-by: Marc Dionne --- net/netfilter/nf_conntrack_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 62c42e970c89..9f530adad10d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -646,6 +646,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto->allow_clash && + !nfct_nat(ct) && !nf_ct_is_dying(ct) && atomic_inc_not_zero(&ct->ct_general.use)) { nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct); -- cgit v1.2.3 From f4979fcea7fd36d8e2f556abef86f80e0d5af1ba Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 Jul 2016 18:18:56 -0400 Subject: rose: limit sk_filter trim to payload Sockets can have a filter program attached that drops or trims incoming packets based on the filter program return value. Rose requires data packets to have at least ROSE_MIN_LEN bytes. It verifies this on arrival in rose_route_frame and unconditionally pulls the bytes in rose_recvmsg. The filter can trim packets to below this value in-between, causing pull to fail, leaving the partial header at the time of skb_copy_datagram_msg. Place a lower bound on the size to which sk_filter may trim packets by introducing sk_filter_trim_cap and call this for rose packets. Signed-off-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/filter.h | 6 +++++- net/core/filter.c | 10 +++++----- net/rose/rose_in.c | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6fc31ef1da2d..8f74f3d61894 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -467,7 +467,11 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) } #endif /* CONFIG_DEBUG_SET_MODULE_RONX */ -int sk_filter(struct sock *sk, struct sk_buff *skb); +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); +static inline int sk_filter(struct sock *sk, struct sk_buff *skb) +{ + return sk_filter_trim_cap(sk, skb, 1); +} struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); diff --git a/net/core/filter.c b/net/core/filter.c index c4b330c85c02..e759d90e8cef 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -53,9 +53,10 @@ #include /** - * sk_filter - run a packet through a socket filter + * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter + * @cap: limit on how short the eBPF program may trim the packet * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller @@ -64,7 +65,7 @@ * be accepted or -EPERM if the packet should be tossed. * */ -int sk_filter(struct sock *sk, struct sk_buff *skb) +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) { int err; struct sk_filter *filter; @@ -85,14 +86,13 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) filter = rcu_dereference(sk->sk_filter); if (filter) { unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); - - err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; + err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); return err; } -EXPORT_SYMBOL(sk_filter); +EXPORT_SYMBOL(sk_filter_trim_cap); static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 79c4abcfa6b4..0a6394754e81 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -164,7 +164,8 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_frames_acked(sk, nr); if (ns == rose->vr) { rose_start_idletimer(sk); - if (sock_queue_rcv_skb(sk, skb) == 0) { + if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 && + __sock_queue_rcv_skb(sk, skb) == 0) { rose->vr = (rose->vr + 1) % ROSE_MODULUS; queued = 1; } else { -- cgit v1.2.3 From 4f0c40d94461cfd23893a17335b2ab78ecb333c8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 Jul 2016 18:18:57 -0400 Subject: dccp: limit sk_filter trim to payload Dccp verifies packet integrity, including length, at initial rcv in dccp_invalid_packet, later pulls headers in dccp_enqueue_skb. A call to sk_filter in-between can cause __skb_pull to wrap skb->len. skb_copy_datagram_msg interprets this as a negative value, so (correctly) fails with EFAULT. The negative length is reported in ioctl SIOCINQ or possibly in a DCCP_WARN in dccp_close. Introduce an sk_receive_skb variant that caps how small a filter program can trim packets, and call this in dccp with the header length. Excessively trimmed packets are now processed normally and queued for reception as 0B payloads. 
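Concretely, the new helper floors the trim length at the cap. A worked example using the values from the hunks below (scenario hypothetical): a filter returning pkt_len = 2 against a DCCP packet whose header length is dh->dccph_doff * 4 = 16 now ends up in

	pskb_trim(skb, max(cap, pkt_len));	/* max(16, 2) == 16 bytes kept */

so the subsequent header pull in dccp_enqueue_skb() can no longer wrap skb->len.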
Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Signed-off-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/sock.h | 8 +++++++- net/core/sock.c | 7 ++++--- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) (limited to 'net')
diff --git a/include/net/sock.h b/include/net/sock.h index 649d2a8c17fc..ff5be7e8ddea 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1576,7 +1576,13 @@ static inline void sock_put(struct sock *sk) */ void sock_gen_put(struct sock *sk); -int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested); +int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, + unsigned int trim_cap); +static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, + const int nested) +{ + return __sk_receive_skb(sk, skb, nested, 1); +} static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { diff --git a/net/core/sock.c b/net/core/sock.c index b7f12639c26a..25dab8b60223 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -452,11 +452,12 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_rcv_skb); -int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) +int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, + const int nested, unsigned int trim_cap) { int rc = NET_RX_SUCCESS; - if (sk_filter(sk, skb)) + if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; @@ -492,7 +493,7 @@ discard_and_relse: kfree_skb(skb); goto out; } -EXPORT_SYMBOL(sk_receive_skb); +EXPORT_SYMBOL(__sk_receive_skb); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 25dd25b47d41..345a3aeb8c7e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -868,7 +868,7 @@ lookup: goto discard_and_relse; nf_reset(skb); - return sk_receive_skb(sk, skb, 1); + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d176f4e66369..3ff137d9471d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -732,7 +732,7 @@ lookup: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - return sk_receive_skb(sk, skb, 1) ? -1 : 0; + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4) ? -1 : 0; no_dccp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) -- cgit v1.2.3
From 083ae308280d13d187512b9babe3454342a7987e Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 14 Jul 2016 11:38:40 -0400 Subject: tcp: enable per-socket rate limiting of all 'challenge acks' The per-socket rate limit for 'challenge acks' was introduced in the context of limiting ack loops: commit f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock") And I think it can be extended to rate limit all 'challenge acks' on a per-socket basis. Since we have the global tcp_challenge_ack_limit, this patch allows for tcp_challenge_ack_limit to be set to a large value and effectively rely on the per-socket limit, or set tcp_challenge_ack_limit to a lower value and still prevent a single connection from consuming the entire challenge ack quota. It further moves in the direction of eliminating the global limit at some point, as Eric Dumazet has suggested. This is a follow-up to: Subject: tcp: make challenge acks less predictable Cc: Eric Dumazet Cc: David S. Miller
Cc: Eric Dumazet
Cc: David S. Miller
Cc: Neal Cardwell
Cc: Yuchung Cheng
Cc: Yue Cao
Signed-off-by: Jason Baron
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 91868bb17818..42bf89aaf6a5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3421,6 +3421,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 	return flag;
 }
 
+static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
+				   u32 *last_oow_ack_time)
+{
+	if (*last_oow_ack_time) {
+		s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+		if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+			NET_INC_STATS(net, mib_idx);
+			return true;	/* rate-limited: don't send yet! */
+		}
+	}
+
+	*last_oow_ack_time = tcp_time_stamp;
+
+	return false;	/* not rate-limited: go ahead, send dupack now! */
+}
+
 /* Return true if we're currently rate-limiting out-of-window ACKs and
 * thus shouldn't send a dupack right now. We rate-limit dupacks in
 * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
@@ -3434,21 +3451,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
 	/* Data packets without SYNs are not likely part of an ACK loop. */
 	if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
 	    !tcp_hdr(skb)->syn)
-		goto not_rate_limited;
-
-	if (*last_oow_ack_time) {
-		s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
-
-		if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
-			NET_INC_STATS(net, mib_idx);
-			return true;	/* rate-limited: don't send yet! */
-		}
-	}
-
-	*last_oow_ack_time = tcp_time_stamp;
+		return false;
 
-not_rate_limited:
-	return false;	/* not rate-limited: go ahead, send dupack now! */
+	return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
 }
 
 /* RFC 5961 7 [ACK Throttling] */
@@ -3461,9 +3466,9 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 	u32 count, now;
 
 	/* First check our per-socket dupack rate limit. */
-	if (tcp_oow_rate_limited(sock_net(sk), skb,
-				 LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
-				 &tp->last_oow_ack_time))
+	if (__tcp_oow_rate_limited(sock_net(sk),
+				   LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+				   &tp->last_oow_ack_time))
 		return;
 
 	/* Then check host-wide RFC 5961 rate limit. */
-- cgit v1.2.3

From 18d3df3eab23796d7f852f9c6bb60962b8372ced Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Thu, 14 Jul 2016 18:00:10 +0200
Subject: vlan: use a valid default mtu value for vlan over macsec

macsec can't cope with MTU-sized frames which need vlan tag insertion,
and a vlan device sets its default MTU equal to the underlying
device's. As a result, vlan over macsec devices get an invalid default
MTU and drop all large packets.

This patch adds a netif helper to check if an upper vlan device needs
MTU reduction. The helper is used during vlan device initialization to
set a valid default, and during MTU updates to forbid invalid (too
big) MTU values.

The helper currently only checks if the lower dev is a macsec device;
if we get more users, we need to update only the helper (possibly
reserving an additional IFF bit).
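A standalone sketch of the MTU arithmetic (hypothetical names, not
from the patch itself; the kernel helper consults dev->priv_flags
rather than taking a boolean):

    #include <stdbool.h>
    #include <stdio.h>

    #define VLAN_HLEN 4  /* size of the 802.1Q tag */

    /* reduces_mtu models netif_reduces_vlan_mtu(): true when the lower
     * device (macsec today) can't carry frames that grow past its MTU
     * once the vlan tag is inserted. */
    static int vlan_max_mtu(int real_dev_mtu, bool reduces_mtu)
    {
        return reduces_mtu ? real_dev_mtu - VLAN_HLEN : real_dev_mtu;
    }

    int main(void)
    {
        printf("%d\n", vlan_max_mtu(1500, false)); /* plain ethernet: 1500 */
        printf("%d\n", vlan_max_mtu(1500, true));  /* over macsec:    1496 */
        return 0;
    }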
Signed-off-by: Paolo Abeni
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h | 7 +++++++
 net/8021q/vlan_dev.c      | 10 ++++++----
 net/8021q/vlan_netlink.c  | 7 +++++--
 3 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f45929ce8157..da4b33bea982 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4145,6 +4145,13 @@ static inline void netif_keep_dst(struct net_device *dev)
 	dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
 }
 
+/* return true if dev can't cope with mtu frames that need vlan tag insertion */
+static inline bool netif_reduces_vlan_mtu(struct net_device *dev)
+{
+	/* TODO: reserve and use an additional IFF bit, if we get more users */
+	return dev->priv_flags & IFF_MACSEC;
+}
+
 extern struct pernet_operations __net_initdata loopback_net_ops;
 
 /* Logging, debugging and troubleshooting/diagnostic helpers. */

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 86ae75b77390..516b0e73263c 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -146,10 +146,12 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 
 static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
 {
-	/* TODO: gotta make sure the underlying layer can handle it,
-	 * maybe an IFF_VLAN_CAPABLE flag for devices?
-	 */
-	if (vlan_dev_priv(dev)->real_dev->mtu < new_mtu)
+	struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+	unsigned int max_mtu = real_dev->mtu;
+
+	if (netif_reduces_vlan_mtu(real_dev))
+		max_mtu -= VLAN_HLEN;
+	if (max_mtu < new_mtu)
 		return -ERANGE;
 
 	dev->mtu = new_mtu;

diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index c92b52f37d38..1270207f3d7c 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -118,6 +118,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
 {
 	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
 	struct net_device *real_dev;
+	unsigned int max_mtu;
 	__be16 proto;
 	int err;
@@ -144,9 +145,11 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
 	if (err < 0)
 		return err;
 
+	max_mtu = netif_reduces_vlan_mtu(real_dev) ? real_dev->mtu - VLAN_HLEN :
+						     real_dev->mtu;
 	if (!tb[IFLA_MTU])
-		dev->mtu = real_dev->mtu;
-	else if (dev->mtu > real_dev->mtu)
+		dev->mtu = max_mtu;
+	else if (dev->mtu > max_mtu)
 		return -EINVAL;
 
 	err = vlan_changelink(dev, tb, data);
-- cgit v1.2.3

From 0564bf0afae443deeb16f36e2c39fefff89d05f2 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Sat, 16 Jul 2016 17:08:56 +0300
Subject: net/sched/sch_htb: clamp xstats tokens to fit into 32-bit int

In the kernel, HTB keeps tokens as signed 64-bit values in
nanoseconds. In the netlink protocol these values are converted into
psched ticks (64ns for now) and truncated to 32-bit. In struct
tc_htb_xstats the fields "tokens" and "ctokens" are declared as
unsigned 32-bit, but they can be negative, so the 'tc' tool prints
them as signed. Big values lose their higher bits and/or become
negative.

This patch clamps tokens in xstats into the range from INT_MIN to
INT_MAX. This way it's easier to understand what's going on here.
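A standalone sketch of why plain truncation misleads and what the
clamp does (hypothetical values, not from the patch itself; assumes
the usual 32-bit int):

    #include <stdio.h>
    #include <limits.h>

    /* clamp_t(s64, v, INT_MIN, INT_MAX) modeled in plain C */
    static int clamp_to_int(long long v)
    {
        if (v > INT_MAX)
            return INT_MAX;
        if (v < INT_MIN)
            return INT_MIN;
        return (int)v;
    }

    int main(void)
    {
        long long tokens = 5000000000LL; /* a large positive tick count */

        /* plain truncation keeps only the low 32 bits (on common ABIs),
         * producing an unrelated, possibly negative, number */
        printf("truncated: %d\n", (int)tokens);          /* 705032704 */
        printf("clamped:   %d\n", clamp_to_int(tokens)); /* 2147483647 */
        return 0;
    }

A clamped value still saturates, but it at least keeps its sign and
order of magnitude, which is what a human reading 'tc -s' output needs.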
Signed-off-by: Konstantin Khlebnikov
Signed-off-by: David S. Miller
---
 net/sched/sch_htb.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 62f9d8100c6e..052f84d6cc23 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1140,8 +1140,10 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 
 	if (!cl->level && cl->un.leaf.q)
 		qlen = cl->un.leaf.q->q.qlen;
-	cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
-	cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
+	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
+				    INT_MIN, INT_MAX);
+	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
+				     INT_MIN, INT_MAX);
 
 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
-- cgit v1.2.3

From c74bfbdba0e8d056e4ba579a666b5cdb8ec3cd35 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Sat, 16 Jul 2016 17:33:15 -0400
Subject: sctp: load transport header after sk_filter

Do not cache pointers into the skb linear segment across sk_filter.
The function call can trigger pskb_expand_head.

Signed-off-by: Willem de Bruijn
Acked-by: Daniel Borkmann
Acked-by: Marcelo Ricardo Leitner
Signed-off-by: David S. Miller
---
 net/sctp/input.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sctp/input.c b/net/sctp/input.c
index a701527a9480..47cf4604d19c 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -112,7 +112,6 @@ int sctp_rcv(struct sk_buff *skb)
 	struct sctp_ep_common *rcvr;
 	struct sctp_transport *transport = NULL;
 	struct sctp_chunk *chunk;
-	struct sctphdr *sh;
 	union sctp_addr src;
 	union sctp_addr dest;
 	int family;
@@ -127,8 +126,6 @@ int sctp_rcv(struct sk_buff *skb)
 	if (skb_linearize(skb))
 		goto discard_it;
 
-	sh = sctp_hdr(skb);
-
 	/* Pull up the IP and SCTP headers. */
 	__skb_pull(skb, skb_transport_offset(skb));
 	if (skb->len < sizeof(struct sctphdr))
@@ -230,7 +227,7 @@ int sctp_rcv(struct sk_buff *skb)
 	chunk->rcvr = rcvr;
 
 	/* Remember the SCTP header. */
-	chunk->sctp_hdr = sh;
+	chunk->sctp_hdr = sctp_hdr(skb);
 
 	/* Set the source and destination addresses of the incoming chunk. */
 	sctp_init_addrs(chunk, &src, &dest);
-- cgit v1.2.3

From edbe77462302ec0b11a90244de13f9012118c538 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda
Date: Tue, 19 Jul 2016 14:40:51 +0900
Subject: packet: fix second argument of sock_tx_timestamp()

This patch fixes an issue where a syscall (e.g. the sendto syscall)
does not work correctly. Since the sendto syscall doesn't carry a
msg_control buffer, the sock_tx_timestamp() in packet_snd() cannot
work correctly because sockc.tsflags is set to 0. So, this patch sets
sockc.tsflags to sk->sk_tsflags as the default.
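The shape of the fix, as a standalone sketch (hypothetical struct and
function names, not from the patch itself):

    /* Control data starts from the socket's defaults; parsing cmsgs
     * (when present) may then override it. With no cmsg at all,
     * socket-level timestamping flags still take effect. */
    struct sockcm_model {
        unsigned int tsflags;
        unsigned int mark;
    };

    static void sockcm_defaults(struct sockcm_model *sockc,
                                unsigned int sk_tsflags,
                                unsigned int sk_mark)
    {
        sockc->tsflags = sk_tsflags; /* was hard-coded 0 before the fix */
        sockc->mark = sk_mark;
    }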
Fixes: c14ac9451c34 ("sock: enable timestamping using control messages")
Reported-by: Kazuya Mizuguchi
Reported-by: Keita Kobayashi
Signed-off-by: Yoshihiro Shimoda
Acked-by: Soheil Hassas Yeganeh
Acked-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/packet/af_packet.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 9f0983fa4d52..53e87ceb26e7 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1927,7 +1927,7 @@ retry:
 		goto out_unlock;
 	}
 
-	sockc.tsflags = 0;
+	sockc.tsflags = sk->sk_tsflags;
 	if (msg->msg_controllen) {
 		err = sock_cmsg_send(sk, msg, &sockc);
 		if (unlikely(err)) {
@@ -2678,7 +2678,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
 	}
 
-	sockc.tsflags = 0;
+	sockc.tsflags = po->sk.sk_tsflags;
 	if (msg->msg_controllen) {
 		err = sock_cmsg_send(&po->sk, msg, &sockc);
 		if (unlikely(err))
@@ -2881,7 +2881,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	if (unlikely(!(dev->flags & IFF_UP)))
 		goto out_unlock;
 
-	sockc.tsflags = 0;
+	sockc.tsflags = sk->sk_tsflags;
 	sockc.mark = sk->sk_mark;
 	if (msg->msg_controllen) {
 		err = sock_cmsg_send(sk, msg, &sockc);
-- cgit v1.2.3

From f8e7718cc0445587fe8530fc2d240d9aac2c9072 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh
Date: Wed, 20 Jul 2016 18:01:18 -0400
Subject: packet: propagate sock_cmsg_send() error

sock_cmsg_send() can return error codes other than -EINVAL, and we
should properly propagate them instead of overwriting them with
-EINVAL.

Fixes: c14ac9451c34 ("sock: enable timestamping using control messages")
Signed-off-by: Soheil Hassas Yeganeh
Cc: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/packet/af_packet.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 53e87ceb26e7..b43c4015b2f7 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1930,10 +1930,8 @@ retry:
 	sockc.tsflags = sk->sk_tsflags;
 	if (msg->msg_controllen) {
 		err = sock_cmsg_send(sk, msg, &sockc);
-		if (unlikely(err)) {
-			err = -EINVAL;
+		if (unlikely(err))
 			goto out_unlock;
-		}
 	}
 
 	skb->protocol = proto;
-- cgit v1.2.3
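The general rule this last fix applies, as a standalone sketch
(hypothetical names and error codes, not from the patch itself):

    #include <errno.h>

    /* Hypothetical stand-in for sock_cmsg_send(): it may fail with any
     * of several negative errnos, not just -EINVAL. */
    static int parse_cmsgs(int bad_level, int bad_type)
    {
        if (bad_level)
            return -EINVAL;
        if (bad_type)
            return -EOPNOTSUPP;
        return 0;
    }

    static int send_one(int bad_level, int bad_type)
    {
        int err = parse_cmsgs(bad_level, bad_type);

        if (err)
            return err; /* propagate as-is; previously squashed to -EINVAL */
        return 0;
    }

Collapsing a helper's distinct error codes into one value hides the
real cause from userspace; returning the helper's code unchanged keeps
the diagnostics intact.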