summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/net/vrf.c6
-rw-r--r--include/net/dst.h43
-rw-r--r--include/net/ip6_fib.h2
-rw-r--r--include/net/ip6_route.h1
-rw-r--r--include/net/route.h4
-rw-r--r--net/core/dev.c1
-rw-r--r--net/core/dst.c276
-rw-r--r--net/decnet/dn_route.c34
-rw-r--r--net/ipv4/fib_semantics.c9
-rw-r--r--net/ipv4/route.c62
-rw-r--r--net/ipv4/udp.c19
-rw-r--r--net/ipv6/addrconf.c4
-rw-r--r--net/ipv6/ip6_fib.c32
-rw-r--r--net/ipv6/ip6_output.c4
-rw-r--r--net/ipv6/route.c127
-rw-r--r--net/ipv6/udp.c11
-rw-r--r--net/xfrm/xfrm_policy.c49
17 files changed, 218 insertions, 466 deletions
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index c6c0595d267b..997ef25189fd 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -563,7 +563,7 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
static int vrf_rt6_create(struct net_device *dev)
{
- int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE;
+ int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM;
struct net_vrf *vrf = netdev_priv(dev);
struct net *net = dev_net(dev);
struct fib6_table *rt6i_table;
@@ -583,8 +583,6 @@ static int vrf_rt6_create(struct net_device *dev)
if (!rt6)
goto out;
- dst_hold(&rt6->dst);
-
rt6->rt6i_table = rt6i_table;
rt6->dst.output = vrf_output6;
@@ -597,8 +595,6 @@ static int vrf_rt6_create(struct net_device *dev)
goto out;
}
- dst_hold(&rt6_local->dst);
-
rt6_local->rt6i_idev = in6_dev_get(dev);
rt6_local->rt6i_flags = RTF_UP | RTF_NONEXTHOP | RTF_LOCAL;
rt6_local->rt6i_table = rt6i_table;
diff --git a/include/net/dst.h b/include/net/dst.h
index 1969008783d8..f73611ec4017 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -51,13 +51,11 @@ struct dst_entry {
#define DST_HOST 0x0001
#define DST_NOXFRM 0x0002
#define DST_NOPOLICY 0x0004
-#define DST_NOHASH 0x0008
-#define DST_NOCACHE 0x0010
-#define DST_NOCOUNT 0x0020
-#define DST_FAKE_RTABLE 0x0040
-#define DST_XFRM_TUNNEL 0x0080
-#define DST_XFRM_QUEUE 0x0100
-#define DST_METADATA 0x0200
+#define DST_NOCOUNT 0x0008
+#define DST_FAKE_RTABLE 0x0010
+#define DST_XFRM_TUNNEL 0x0020
+#define DST_XFRM_QUEUE 0x0040
+#define DST_METADATA 0x0080
short error;
@@ -253,7 +251,7 @@ static inline void dst_hold(struct dst_entry *dst)
* __pad_to_align_refcnt declaration in struct dst_entry
*/
BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
- atomic_inc(&dst->__refcnt);
+ WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0);
}
static inline void dst_use(struct dst_entry *dst, unsigned long time)
@@ -278,6 +276,8 @@ static inline struct dst_entry *dst_clone(struct dst_entry *dst)
void dst_release(struct dst_entry *dst);
+void dst_release_immediate(struct dst_entry *dst);
+
static inline void refdst_drop(unsigned long refdst)
{
if (!(refdst & SKB_DST_NOREF))
@@ -334,10 +334,7 @@ static inline void skb_dst_force(struct sk_buff *skb)
*/
static inline bool dst_hold_safe(struct dst_entry *dst)
{
- if (dst->flags & DST_NOCACHE)
- return atomic_inc_not_zero(&dst->__refcnt);
- dst_hold(dst);
- return true;
+ return atomic_inc_not_zero(&dst->__refcnt);
}
/**
@@ -423,26 +420,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref,
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
struct net_device *dev, int initial_ref, int initial_obsolete,
unsigned short flags);
-void __dst_free(struct dst_entry *dst);
struct dst_entry *dst_destroy(struct dst_entry *dst);
-
-static inline void dst_free(struct dst_entry *dst)
-{
- if (dst->obsolete > 0)
- return;
- if (!atomic_read(&dst->__refcnt)) {
- dst = dst_destroy(dst);
- if (!dst)
- return;
- }
- __dst_free(dst);
-}
-
-static inline void dst_rcu_free(struct rcu_head *head)
-{
- struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
- dst_free(dst);
-}
+void dst_dev_put(struct dst_entry *dst);
static inline void dst_confirm(struct dst_entry *dst)
{
@@ -505,8 +484,6 @@ static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
return dst;
}
-void dst_subsys_init(void);
-
/* Flags for xfrm_lookup flags argument. */
enum {
XFRM_LOOKUP_ICMP = 1 << 0,
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index aa50e2e6fa2a..1a88008cc6f5 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -170,7 +170,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
static inline u32 rt6_get_cookie(const struct rt6_info *rt)
{
if (rt->rt6i_flags & RTF_PCPU ||
- (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from))
+ (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
rt = (struct rt6_info *)(rt->dst.from);
return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f3da9dd2a8db..0fbf73dd531a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -116,7 +116,6 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
const struct in6_addr *saddr, int oif, int flags);
struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
-int icmp6_dst_gc(void);
void fib6_force_start_gc(struct net *net);
diff --git a/include/net/route.h b/include/net/route.h
index 08e689f23365..cb0a76d9dde1 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -190,7 +190,9 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
rcu_read_lock();
err = ip_route_input_noref(skb, dst, src, tos, devin);
if (!err)
- skb_dst_force(skb);
+ skb_dst_force_safe(skb);
+ if (!skb_dst(skb))
+ err = -EINVAL;
rcu_read_unlock();
return err;
diff --git a/net/core/dev.c b/net/core/dev.c
index b8d6dd9e8b5c..5d1830b8d2cf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8681,7 +8681,6 @@ static int __init net_dev_init(void)
rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
NULL, dev_cpu_dead);
WARN_ON(rc < 0);
- dst_subsys_init();
rc = 0;
out:
return rc;
diff --git a/net/core/dst.c b/net/core/dst.c
index 13ba4a090c41..f851adb9ec9b 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -42,108 +42,6 @@
* to dirty as few cache lines as possible in __dst_free().
* As this is not a very strong hint, we dont force an alignment on SMP.
*/
-static struct {
- spinlock_t lock;
- struct dst_entry *list;
- unsigned long timer_inc;
- unsigned long timer_expires;
-} dst_garbage = {
- .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
- .timer_inc = DST_GC_MAX,
-};
-static void dst_gc_task(struct work_struct *work);
-static void ___dst_free(struct dst_entry *dst);
-
-static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
-
-static DEFINE_MUTEX(dst_gc_mutex);
-/*
- * long lived entries are maintained in this list, guarded by dst_gc_mutex
- */
-static struct dst_entry *dst_busy_list;
-
-static void dst_gc_task(struct work_struct *work)
-{
- int delayed = 0;
- int work_performed = 0;
- unsigned long expires = ~0L;
- struct dst_entry *dst, *next, head;
- struct dst_entry *last = &head;
-
- mutex_lock(&dst_gc_mutex);
- next = dst_busy_list;
-
-loop:
- while ((dst = next) != NULL) {
- next = dst->next;
- prefetch(&next->next);
- cond_resched();
- if (likely(atomic_read(&dst->__refcnt))) {
- last->next = dst;
- last = dst;
- delayed++;
- continue;
- }
- work_performed++;
-
- dst = dst_destroy(dst);
- if (dst) {
- /* NOHASH and still referenced. Unless it is already
- * on gc list, invalidate it and add to gc list.
- *
- * Note: this is temporary. Actually, NOHASH dst's
- * must be obsoleted when parent is obsoleted.
- * But we do not have state "obsoleted, but
- * referenced by parent", so it is right.
- */
- if (dst->obsolete > 0)
- continue;
-
- ___dst_free(dst);
- dst->next = next;
- next = dst;
- }
- }
-
- spin_lock_bh(&dst_garbage.lock);
- next = dst_garbage.list;
- if (next) {
- dst_garbage.list = NULL;
- spin_unlock_bh(&dst_garbage.lock);
- goto loop;
- }
- last->next = NULL;
- dst_busy_list = head.next;
- if (!dst_busy_list)
- dst_garbage.timer_inc = DST_GC_MAX;
- else {
- /*
- * if we freed less than 1/10 of delayed entries,
- * we can sleep longer.
- */
- if (work_performed <= delayed/10) {
- dst_garbage.timer_expires += dst_garbage.timer_inc;
- if (dst_garbage.timer_expires > DST_GC_MAX)
- dst_garbage.timer_expires = DST_GC_MAX;
- dst_garbage.timer_inc += DST_GC_INC;
- } else {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- }
- expires = dst_garbage.timer_expires;
- /*
- * if the next desired timer is more than 4 seconds in the
- * future then round the timer to whole seconds
- */
- if (expires > 4*HZ)
- expires = round_jiffies_relative(expires);
- schedule_delayed_work(&dst_gc_work, expires);
- }
-
- spin_unlock_bh(&dst_garbage.lock);
- mutex_unlock(&dst_gc_mutex);
-}
-
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
@@ -216,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
}
EXPORT_SYMBOL(dst_alloc);
-static void ___dst_free(struct dst_entry *dst)
-{
- /* The first case (dev==NULL) is required, when
- protocol module is unloaded.
- */
- if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
- dst->input = dst_discard;
- dst->output = dst_discard_out;
- }
- dst->obsolete = DST_OBSOLETE_DEAD;
-}
-
-void __dst_free(struct dst_entry *dst)
-{
- spin_lock_bh(&dst_garbage.lock);
- ___dst_free(dst);
- dst->next = dst_garbage.list;
- dst_garbage.list = dst;
- if (dst_garbage.timer_inc > DST_GC_INC) {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- mod_delayed_work(system_wq, &dst_gc_work,
- dst_garbage.timer_expires);
- }
- spin_unlock_bh(&dst_garbage.lock);
-}
-EXPORT_SYMBOL(__dst_free);
-
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct dst_entry *child;
smp_rmb();
-again:
child = dst->child;
if (!(dst->flags & DST_NOCOUNT))
@@ -269,20 +138,8 @@ again:
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
- if (dst) {
- int nohash = dst->flags & DST_NOHASH;
-
- if (atomic_dec_and_test(&dst->__refcnt)) {
- /* We were real parent of this dst, so kill child. */
- if (nohash)
- goto again;
- } else {
- /* Child is still referenced, return it for freeing. */
- if (nohash)
- return dst;
- /* Child is still in his hash table */
- }
- }
+ if (dst)
+ dst_release_immediate(dst);
return NULL;
}
EXPORT_SYMBOL(dst_destroy);
@@ -292,26 +149,62 @@ static void dst_destroy_rcu(struct rcu_head *head)
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
dst = dst_destroy(dst);
- if (dst)
- __dst_free(dst);
}
+/* Operations to mark dst as DEAD and clean up the net device referenced
+ * by dst:
+ * 1. put the dst under loopback interface and discard all tx/rx packets
+ * on this route.
+ * 2. release the net_device
+ * This function should be called when removing routes from the fib tree
+ * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
+ * make the next dst_ops->check() fail.
+ */
+void dst_dev_put(struct dst_entry *dst)
+{
+ struct net_device *dev = dst->dev;
+
+ dst->obsolete = DST_OBSOLETE_DEAD;
+ if (dst->ops->ifdown)
+ dst->ops->ifdown(dst, dev, true);
+ dst->input = dst_discard;
+ dst->output = dst_discard_out;
+ dst->dev = dev_net(dst->dev)->loopback_dev;
+ dev_hold(dst->dev);
+ dev_put(dev);
+}
+EXPORT_SYMBOL(dst_dev_put);
+
void dst_release(struct dst_entry *dst)
{
if (dst) {
int newrefcnt;
- unsigned short nocache = dst->flags & DST_NOCACHE;
newrefcnt = atomic_dec_return(&dst->__refcnt);
if (unlikely(newrefcnt < 0))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
- if (!newrefcnt && unlikely(nocache))
+ if (!newrefcnt)
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
+void dst_release_immediate(struct dst_entry *dst)
+{
+ if (dst) {
+ int newrefcnt;
+
+ newrefcnt = atomic_dec_return(&dst->__refcnt);
+ if (unlikely(newrefcnt < 0))
+ net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
+ __func__, dst, newrefcnt);
+ if (!newrefcnt)
+ dst_destroy(dst);
+ }
+}
+EXPORT_SYMBOL(dst_release_immediate);
+
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);
@@ -377,7 +270,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
dst = &md_dst->dst;
dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
- DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
+ DST_METADATA | DST_NOCOUNT);
dst->input = dst_md_discard;
dst->output = dst_md_discard_out;
@@ -423,86 +316,3 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
-
-/* Dirty hack. We did it in 2.2 (in __dst_free),
- * we have _very_ good reasons not to repeat
- * this mistake in 2.3, but we have no choice
- * now. _It_ _is_ _explicit_ _deliberate_
- * _race_ _condition_.
- *
- * Commented and originally written by Alexey.
- */
-static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
-{
- if (dst->ops->ifdown)
- dst->ops->ifdown(dst, dev, unregister);
-
- if (dev != dst->dev)
- return;
-
- if (!unregister) {
- dst->input = dst_discard;
- dst->output = dst_discard_out;
- } else {
- dst->dev = dev_net(dst->dev)->loopback_dev;
- dev_hold(dst->dev);
- dev_put(dev);
- }
-}
-
-static int dst_dev_event(struct notifier_block *this, unsigned long event,
- void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct dst_entry *dst, *last = NULL;
-
- switch (event) {
- case NETDEV_UNREGISTER_FINAL:
- case NETDEV_DOWN:
- mutex_lock(&dst_gc_mutex);
- for (dst = dst_busy_list; dst; dst = dst->next) {
- last = dst;
- dst_ifdown(dst, dev, event != NETDEV_DOWN);
- }
-
- spin_lock_bh(&dst_garbage.lock);
- dst = dst_garbage.list;
- dst_garbage.list = NULL;
- /* The code in dst_ifdown places a hold on the loopback device.
- * If the gc entry processing is set to expire after a lengthy
- * interval, this hold can cause netdev_wait_allrefs() to hang
- * out and wait for a long time -- until the the loopback
- * interface is released. If we're really unlucky, it'll emit
- * pr_emerg messages to console too. Reset the interval here,
- * so dst cleanups occur in a more timely fashion.
- */
- if (dst_garbage.timer_inc > DST_GC_INC) {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- mod_delayed_work(system_wq, &dst_gc_work,
- dst_garbage.timer_expires);
- }
- spin_unlock_bh(&dst_garbage.lock);
-
- if (last)
- last->next = dst;
- else
- dst_busy_list = dst;
- for (; dst; dst = dst->next)
- dst_ifdown(dst, dev, event != NETDEV_DOWN);
- mutex_unlock(&dst_gc_mutex);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block dst_dev_notifier = {
- .notifier_call = dst_dev_event,
- .priority = -10, /* must be called after other network notifiers */
-};
-
-void __init dst_subsys_init(void)
-{
- register_netdevice_notifier(&dst_dev_notifier);
-}
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 6f95612b4d32..5d17d843ac86 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -183,11 +183,6 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst)
return dn_rt_hash_mask & (unsigned int)tmp;
}
-static inline void dnrt_free(struct dn_route *rt)
-{
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static void dn_dst_check_expire(unsigned long dummy)
{
int i;
@@ -202,14 +197,15 @@ static void dn_dst_check_expire(unsigned long dummy)
spin_lock(&dn_rt_hash_table[i].lock);
while ((rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
- if (atomic_read(&rt->dst.__refcnt) ||
- (now - rt->dst.lastuse) < expire) {
+ if (atomic_read(&rt->dst.__refcnt) > 1 ||
+ (now - rt->dst.lastuse) < expire) {
rtp = &rt->dst.dn_next;
continue;
}
*rtp = rt->dst.dn_next;
rt->dst.dn_next = NULL;
- dnrt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
spin_unlock(&dn_rt_hash_table[i].lock);
@@ -235,14 +231,15 @@ static int dn_dst_gc(struct dst_ops *ops)
while ((rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
- if (atomic_read(&rt->dst.__refcnt) ||
- (now - rt->dst.lastuse) < expire) {
+ if (atomic_read(&rt->dst.__refcnt) > 1 ||
+ (now - rt->dst.lastuse) < expire) {
rtp = &rt->dst.dn_next;
continue;
}
*rtp = rt->dst.dn_next;
rt->dst.dn_next = NULL;
- dnrt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
break;
}
spin_unlock_bh(&dn_rt_hash_table[i].lock);
@@ -344,7 +341,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
dst_use(&rth->dst, now);
spin_unlock_bh(&dn_rt_hash_table[hash].lock);
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
*rp = rth;
return 0;
}
@@ -374,7 +371,8 @@ static void dn_run_flush(unsigned long dummy)
for(; rt; rt = next) {
next = rcu_dereference_raw(rt->dst.dn_next);
RCU_INIT_POINTER(rt->dst.dn_next, NULL);
- dnrt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
nothing_to_declare:
@@ -1181,7 +1179,7 @@ make_route:
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
@@ -1215,6 +1213,7 @@ make_route:
goto e_neighbour;
hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
+ /* dn_insert_route() increments dst->__refcnt */
dn_insert_route(rt, hash, (struct dn_route **)pprt);
done:
@@ -1237,7 +1236,7 @@ e_nobufs:
err = -ENOBUFS;
goto done;
e_neighbour:
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
goto e_nobufs;
}
@@ -1445,7 +1444,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
}
make_route:
- rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
@@ -1491,6 +1490,7 @@ make_route:
goto e_neighbour;
hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
+ /* dn_insert_route() increments dst->__refcnt */
dn_insert_route(rt, hash, &rt);
skb_dst_set(skb, &rt->dst);
@@ -1514,7 +1514,7 @@ e_nobufs:
goto done;
e_neighbour:
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
goto done;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2157dc08c407..ff47ea1408fe 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -152,7 +152,8 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp)
* free_fib_info_rcu()
*/
- dst_free(&rt->dst);
+ dst_dev_put(&rt->dst);
+ dst_release_immediate(&rt->dst);
}
static void free_nh_exceptions(struct fib_nh *nh)
@@ -194,8 +195,10 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
struct rtable *rt;
rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
- if (rt)
- dst_free(&rt->dst);
+ if (rt) {
+ dst_dev_put(&rt->dst);
+ dst_release_immediate(&rt->dst);
+ }
}
free_percpu(rtp);
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9b38cf18144e..c816cd53f7fc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -589,11 +589,6 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
build_sk_flow_key(fl4, sk);
}
-static inline void rt_free(struct rtable *rt)
-{
- call_rcu(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static DEFINE_SPINLOCK(fnhe_lock);
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
@@ -603,12 +598,14 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
- rt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
rt = rcu_dereference(fnhe->fnhe_rth_output);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
- rt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
}
@@ -1302,7 +1299,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
- __be32 daddr)
+ __be32 daddr, const bool do_cache)
{
bool ret = false;
@@ -1331,10 +1328,13 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
- if (!(rt->dst.flags & DST_NOCACHE)) {
+ if (do_cache) {
+ dst_hold(&rt->dst);
rcu_assign_pointer(*porig, rt);
- if (orig)
- rt_free(orig);
+ if (orig) {
+ dst_dev_put(&orig->dst);
+ dst_release(&orig->dst);
+ }
ret = true;
}
@@ -1357,12 +1357,20 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
}
orig = *p;
+ /* hold dst before doing cmpxchg() to avoid race condition
+ * on this dst
+ */
+ dst_hold(&rt->dst);
prev = cmpxchg(p, orig, rt);
if (prev == orig) {
- if (orig)
- rt_free(orig);
- } else
+ if (orig) {
+ dst_dev_put(&orig->dst);
+ dst_release(&orig->dst);
+ }
+ } else {
+ dst_release(&rt->dst);
ret = false;
+ }
return ret;
}
@@ -1433,7 +1441,8 @@ static bool rt_cache_valid(const struct rtable *rt)
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
const struct fib_result *res,
struct fib_nh_exception *fnhe,
- struct fib_info *fi, u16 type, u32 itag)
+ struct fib_info *fi, u16 type, u32 itag,
+ const bool do_cache)
{
bool cached = false;
@@ -1454,8 +1463,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
#endif
rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
if (unlikely(fnhe))
- cached = rt_bind_exception(rt, fnhe, daddr);
- else if (!(rt->dst.flags & DST_NOCACHE))
+ cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
+ else if (do_cache)
cached = rt_cache_route(nh, rt);
if (unlikely(!cached)) {
/* Routes we intend to cache in nexthop exception or
@@ -1463,7 +1472,6 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
* However, if we are unsuccessful at storing this
* route into the cache we really need to set it.
*/
- rt->dst.flags |= DST_NOCACHE;
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
rt_add_uncached_list(rt);
@@ -1486,7 +1494,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
struct rtable *rt;
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (will_cache ? 0 : DST_HOST) |
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
@@ -1730,7 +1738,8 @@ rt_cache:
rth->dst.input = ip_forward;
- rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
+ do_cache);
set_lwt_redirect(rth);
skb_dst_set(skb, &rth->dst);
out:
@@ -2018,10 +2027,8 @@ local_input:
rth->dst.input = lwtunnel_input;
}
- if (unlikely(!rt_cache_route(nh, rth))) {
- rth->dst.flags |= DST_NOCACHE;
+ if (unlikely(!rt_cache_route(nh, rth)))
rt_add_uncached_list(rth);
- }
}
skb_dst_set(skb, &rth->dst);
err = 0;
@@ -2217,10 +2224,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rth = rcu_dereference(*prth);
rt_cache:
- if (rt_cache_valid(rth)) {
- dst_hold(&rth->dst);
+ if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
return rth;
- }
}
add:
@@ -2254,7 +2259,7 @@ add:
#endif
}
- rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+ rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
set_lwt_redirect(rth);
return rth;
@@ -2504,7 +2509,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->input = dst_discard;
new->output = dst_discard_out;
- new->dev = ort->dst.dev;
+ new->dev = net->loopback_dev;
if (new->dev)
dev_hold(new->dev);
@@ -2519,7 +2524,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_uses_gateway = ort->rt_uses_gateway;
INIT_LIST_HEAD(&rt->rt_uncached);
- dst_free(new);
}
dst_release(dst_orig);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2bc638c48b86..f3450f092d71 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1977,9 +1977,10 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{
struct dst_entry *old;
- dst_hold(dst);
- old = xchg(&sk->sk_rx_dst, dst);
- dst_release(old);
+ if (dst_hold_safe(dst)) {
+ old = xchg(&sk->sk_rx_dst, dst);
+ dst_release(old);
+ }
}
/*
@@ -2303,13 +2304,11 @@ void udp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst) {
- /* DST_NOCACHE can not be used without taking a reference */
- if (dst->flags & DST_NOCACHE) {
- if (likely(atomic_inc_not_zero(&dst->__refcnt)))
- skb_dst_set(skb, dst);
- } else {
- skb_dst_set_noref(skb, dst);
- }
+ /* set noref for now.
+ * any place which wants to hold dst has to call
+ * dst_hold_safe()
+ */
+ skb_dst_set_noref(skb, dst);
}
}
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0aa36b093013..2a6397714d70 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5576,8 +5576,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
ip6_del_rt(rt);
}
if (ifp->rt) {
- dst_hold(&ifp->rt->dst);
- ip6_del_rt(ifp->rt);
+ if (dst_hold_safe(&ifp->rt->dst))
+ ip6_del_rt(ifp->rt);
}
rt_genid_bump_ipv6(net);
break;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index deea901746c8..4f3e4657f2d6 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -153,11 +153,6 @@ static void node_free(struct fib6_node *fn)
kmem_cache_free(fib6_node_kmem, fn);
}
-static void rt6_rcu_free(struct rt6_info *rt)
-{
- call_rcu(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
{
int cpu;
@@ -172,7 +167,8 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
pcpu_rt = *ppcpu_rt;
if (pcpu_rt) {
- rt6_rcu_free(pcpu_rt);
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
*ppcpu_rt = NULL;
}
}
@@ -185,7 +181,8 @@ static void rt6_release(struct rt6_info *rt)
{
if (atomic_dec_and_test(&rt->rt6i_ref)) {
rt6_free_pcpu(rt);
- rt6_rcu_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
}
@@ -978,8 +975,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
int replace_required = 0;
int sernum = fib6_new_sernum(info->nl_net);
- if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) &&
- !atomic_read(&rt->dst.__refcnt)))
+ if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
return -EINVAL;
if (info->nlh) {
@@ -1076,7 +1072,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
fib6_start_gc(info->nl_net, rt);
if (!(rt->rt6i_flags & RTF_CACHE))
fib6_prune_clones(info->nl_net, pn);
- rt->dst.flags &= ~DST_NOCACHE;
}
out:
@@ -1101,8 +1096,10 @@ out:
atomic_inc(&pn->leaf->rt6i_ref);
}
#endif
- if (!(rt->dst.flags & DST_NOCACHE))
- dst_free(&rt->dst);
+ /* Always release dst as dst->__refcnt is guaranteed
+ * to be taken before entering this function
+ */
+ dst_release_immediate(&rt->dst);
}
return err;
@@ -1113,8 +1110,10 @@ out:
st_failure:
if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
fib6_repair_tree(info->nl_net, fn);
- if (!(rt->dst.flags & DST_NOCACHE))
- dst_free(&rt->dst);
+ /* Always release dst as dst->__refcnt is guaranteed
+ * to be taken before entering this function
+ */
+ dst_release_immediate(&rt->dst);
return err;
#endif
}
@@ -1783,7 +1782,7 @@ static int fib6_age(struct rt6_info *rt, void *arg)
}
gc_args->more++;
} else if (rt->rt6i_flags & RTF_CACHE) {
- if (atomic_read(&rt->dst.__refcnt) == 0 &&
+ if (atomic_read(&rt->dst.__refcnt) == 1 &&
time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
RT6_TRACE("aging clone %p\n", rt);
return -1;
@@ -1821,8 +1820,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
}
gc_args.timeout = expires ? (int)expires :
net->ipv6.sysctl.ip6_rt_gc_interval;
-
- gc_args.more = icmp6_dst_gc();
+ gc_args.more = 0;
fib6_clean_all(net, fib6_age, &gc_args);
now = jiffies;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8b8efb0e55bf..5baa6fab4b97 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -698,8 +698,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
ipv6_hdr(skb)->payload_len = htons(first_len -
sizeof(struct ipv6hdr));
- dst_hold(&rt->dst);
-
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
@@ -742,7 +740,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
if (err == 0) {
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGOKS);
- ip6_rt_put(rt);
return 0;
}
@@ -750,7 +747,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGFAILS);
- ip6_rt_put(rt);
return err;
slow_path_clean:
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 18fe6e2b88d5..2e4490076061 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -128,7 +128,6 @@ static void rt6_uncached_list_add(struct rt6_info *rt)
{
struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
- rt->dst.flags |= DST_NOCACHE;
rt->rt6i_uncached_list = ul;
spin_lock_bh(&ul->lock);
@@ -354,7 +353,7 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
int flags)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
- 0, DST_OBSOLETE_FORCE_CHK, flags);
+ 1, DST_OBSOLETE_FORCE_CHK, flags);
if (rt)
rt6_info_init(rt);
@@ -381,7 +380,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net,
*p = NULL;
}
} else {
- dst_destroy((struct dst_entry *)rt);
+ dst_release_immediate(&rt->dst);
return NULL;
}
}
@@ -932,9 +931,9 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
- It takes new route entry, the addition fails by any reason the
- route is freed. In any case, if caller does not hold it, it may
- be destroyed.
+ * It takes new route entry, the addition fails by any reason the
+ * route is released.
+ * Caller must hold dst before calling it.
*/
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
@@ -957,6 +956,8 @@ int ip6_ins_rt(struct rt6_info *rt)
struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
struct mx6_config mxc = { .mx = NULL, };
+ /* Hold dst to account for the reference from the fib6 tree */
+ dst_hold(&rt->dst);
return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
@@ -1049,7 +1050,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
prev = cmpxchg(p, NULL, pcpu_rt);
if (prev) {
/* If someone did it before us, return prev instead */
- dst_destroy(&pcpu_rt->dst);
+ dst_release_immediate(&pcpu_rt->dst);
pcpu_rt = prev;
}
} else {
@@ -1059,7 +1060,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
* since rt is going away anyway. The next
* dst_check() will trigger a re-lookup.
*/
- dst_destroy(&pcpu_rt->dst);
+ dst_release_immediate(&pcpu_rt->dst);
pcpu_rt = rt;
}
dst_hold(&pcpu_rt->dst);
@@ -1129,12 +1130,15 @@ redo_rt6_select:
uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
dst_release(&rt->dst);
- if (uncached_rt)
+ if (uncached_rt) {
+ /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
+ * No need for another dst_hold()
+ */
rt6_uncached_list_add(uncached_rt);
- else
+ } else {
uncached_rt = net->ipv6.ip6_null_entry;
-
- dst_hold(&uncached_rt->dst);
+ dst_hold(&uncached_rt->dst);
+ }
trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
return uncached_rt;
@@ -1245,9 +1249,11 @@ EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
+ struct net_device *loopback_dev = net->loopback_dev;
struct dst_entry *new = NULL;
- rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
+ rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
+ DST_OBSOLETE_NONE, 0);
if (rt) {
rt6_info_init(rt);
@@ -1257,10 +1263,8 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
new->output = dst_discard_out;
dst_copy_metrics(new, &ort->dst);
- rt->rt6i_idev = ort->rt6i_idev;
- if (rt->rt6i_idev)
- in6_dev_hold(rt->rt6i_idev);
+ rt->rt6i_idev = in6_dev_get(loopback_dev);
rt->rt6i_gateway = ort->rt6i_gateway;
rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
rt->rt6i_metric = 0;
@@ -1269,8 +1273,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
#ifdef CONFIG_IPV6_SUBTREES
memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
-
- dst_free(new);
}
dst_release(dst_orig);
@@ -1323,7 +1325,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
rt6_dst_from_metrics_check(rt);
if (rt->rt6i_flags & RTF_PCPU ||
- (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
+ (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
return rt6_dst_from_check(rt, cookie);
else
return rt6_check(rt, cookie);
@@ -1356,8 +1358,8 @@ static void ip6_link_failure(struct sk_buff *skb)
rt = (struct rt6_info *) skb_dst(skb);
if (rt) {
if (rt->rt6i_flags & RTF_CACHE) {
- dst_hold(&rt->dst);
- ip6_del_rt(rt);
+ if (dst_hold_safe(&rt->dst))
+ ip6_del_rt(rt);
} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
rt->rt6i_node->fn_sernum = -1;
}
@@ -1421,6 +1423,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
* invalidate the sk->sk_dst_cache.
*/
ip6_ins_rt(nrt6);
+ /* Release the reference taken in
+ * ip6_rt_cache_alloc()
+ */
+ dst_release(&nrt6->dst);
}
}
}
@@ -1649,9 +1655,6 @@ out:
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
-static struct dst_entry *icmp6_dst_gc_list;
-static DEFINE_SPINLOCK(icmp6_dst_lock);
-
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
struct flowi6 *fl6)
{
@@ -1672,19 +1675,16 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
rt->dst.flags |= DST_HOST;
rt->dst.output = ip6_output;
- atomic_set(&rt->dst.__refcnt, 1);
rt->rt6i_gateway = fl6->daddr;
rt->rt6i_dst.addr = fl6->daddr;
rt->rt6i_dst.plen = 128;
rt->rt6i_idev = idev;
dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
- spin_lock_bh(&icmp6_dst_lock);
- rt->dst.next = icmp6_dst_gc_list;
- icmp6_dst_gc_list = &rt->dst;
- spin_unlock_bh(&icmp6_dst_lock);
-
- fib6_force_start_gc(net);
+ /* Add this dst into uncached_list so that rt6_ifdown() can
+ * do proper release of the net_device
+ */
+ rt6_uncached_list_add(rt);
dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
@@ -1692,48 +1692,6 @@ out:
return dst;
}
-int icmp6_dst_gc(void)
-{
- struct dst_entry *dst, **pprev;
- int more = 0;
-
- spin_lock_bh(&icmp6_dst_lock);
- pprev = &icmp6_dst_gc_list;
-
- while ((dst = *pprev) != NULL) {
- if (!atomic_read(&dst->__refcnt)) {
- *pprev = dst->next;
- dst_free(dst);
- } else {
- pprev = &dst->next;
- ++more;
- }
- }
-
- spin_unlock_bh(&icmp6_dst_lock);
-
- return more;
-}
-
-static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
- void *arg)
-{
- struct dst_entry *dst, **pprev;
-
- spin_lock_bh(&icmp6_dst_lock);
- pprev = &icmp6_dst_gc_list;
- while ((dst = *pprev) != NULL) {
- struct rt6_info *rt = (struct rt6_info *) dst;
- if (func(rt, arg)) {
- *pprev = dst->next;
- dst_free(dst);
- } else {
- pprev = &dst->next;
- }
- }
- spin_unlock_bh(&icmp6_dst_lock);
-}
-
static int ip6_dst_gc(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
@@ -2130,7 +2088,7 @@ out:
if (idev)
in6_dev_put(idev);
if (rt)
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
return ERR_PTR(err);
}
@@ -2160,7 +2118,7 @@ int ip6_route_add(struct fib6_config *cfg,
return err;
out:
if (rt)
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
return err;
}
@@ -2171,8 +2129,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
struct fib6_table *table;
struct net *net = dev_net(rt->dst.dev);
- if (rt == net->ipv6.ip6_null_entry ||
- rt->dst.flags & DST_NOCACHE) {
+ if (rt == net->ipv6.ip6_null_entry) {
err = -ENOENT;
goto out;
}
@@ -2397,7 +2354,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
if (ip6_ins_rt(nrt))
- goto out;
+ goto out_release;
netevent.old = &rt->dst;
netevent.new = &nrt->dst;
@@ -2410,6 +2367,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
ip6_del_rt(rt);
}
+out_release:
+ /* Release the reference taken in
+ * ip6_rt_cache_alloc()
+ */
+ dst_release(&nrt->dst);
+
out:
neigh_release(neigh);
}
@@ -2757,9 +2720,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
rt->rt6i_dst.plen = 128;
tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
rt->rt6i_table = fib6_get_table(net, tb_id);
- rt->dst.flags |= DST_NOCACHE;
-
- atomic_set(&rt->dst.__refcnt, 1);
return rt;
}
@@ -2847,7 +2807,6 @@ void rt6_ifdown(struct net *net, struct net_device *dev)
};
fib6_clean_all(net, fib6_ifdown, &adn);
- icmp6_clean_all(fib6_ifdown, &adn);
if (dev)
rt6_uncached_list_flush_dev(net, dev);
}
@@ -3185,7 +3144,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
if (err) {
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
goto cleanup;
}
@@ -3249,7 +3208,7 @@ add_errout:
cleanup:
list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
if (nh->rt6_info)
- dst_free(&nh->rt6_info->dst);
+ dst_release_immediate(&nh->rt6_info->dst);
kfree(nh->mxc.mx);
list_del(&nh->next);
kfree(nh);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2e9b52bded2d..2b33847bf931 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -920,12 +920,11 @@ static void udp_v6_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
if (dst) {
- if (dst->flags & DST_NOCACHE) {
- if (likely(atomic_inc_not_zero(&dst->__refcnt)))
- skb_dst_set(skb, dst);
- } else {
- skb_dst_set_noref(skb, dst);
- }
+ /* set noref for now.
+ * any place which wants to hold dst has to call
+ * dst_hold_safe()
+ */
+ skb_dst_set_noref(skb, dst);
}
}
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ed4e52d95172..af8e38f47b5b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1590,7 +1590,9 @@ static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
struct dst_entry *dst = &xdst->u.dst;
- dst_free(dst);
+ /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
+ dst->obsolete = DST_OBSOLETE_DEAD;
+ dst_release_immediate(dst);
}
static const struct flow_cache_ops xfrm_bundle_fc_ops = {
@@ -1620,7 +1622,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
default:
BUG();
}
- xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0);
+ xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
if (likely(xdst)) {
struct dst_entry *dst = &xdst->u.dst;
@@ -1723,10 +1725,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
if (!dst_prev)
dst0 = dst1;
- else {
- dst_prev->child = dst_clone(dst1);
- dst1->flags |= DST_NOHASH;
- }
+ else
+ /* Ref count is taken during xfrm_alloc_dst()
+ * No need to do dst_clone() on dst1
+ */
+ dst_prev->child = dst1;
xdst->route = dst;
dst_copy_metrics(dst1, dst);
@@ -1792,7 +1795,7 @@ put_states:
xfrm_state_put(xfrm[i]);
free_dst:
if (dst0)
- dst_free(dst0);
+ dst_release_immediate(dst0);
dst0 = ERR_PTR(err);
goto out;
}
@@ -2073,7 +2076,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
pol_dead |= pols[i]->walk.dead;
}
if (pol_dead) {
- dst_free(&xdst->u.dst);
+ /* Mark DST_OBSOLETE_DEAD to fail the next
+ * xfrm_dst_check()
+ */
+ xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
+ dst_release_immediate(&xdst->u.dst);
xdst = NULL;
num_pols = 0;
num_xfrms = 0;
@@ -2120,11 +2127,12 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
if (xdst) {
/* The policies were stolen for newly generated bundle */
xdst->num_pols = 0;
- dst_free(&xdst->u.dst);
+ /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
+ xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
+ dst_release_immediate(&xdst->u.dst);
}
- /* Flow cache does not have reference, it dst_free()'s,
- * but we do need to return one reference for original caller */
+ /* We do need to return one reference for original caller */
dst_hold(&new_xdst->u.dst);
return &new_xdst->flo;
@@ -2147,9 +2155,11 @@ make_dummy_bundle:
inc_error:
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
error:
- if (xdst != NULL)
- dst_free(&xdst->u.dst);
- else
+ if (xdst != NULL) {
+ /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
+ xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
+ dst_release_immediate(&xdst->u.dst);
+ } else
xfrm_pols_put(pols, num_pols);
return ERR_PTR(err);
}
@@ -2221,7 +2231,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
}
dst_hold(&xdst->u.dst);
- xdst->u.dst.flags |= DST_NOCACHE;
route = xdst->route;
}
}
@@ -2636,10 +2645,12 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
* notice. That's what we are validating here via the
* stale_bundle() check.
*
- * When a policy's bundle is pruned, we dst_free() the XFRM
- * dst which causes it's ->obsolete field to be set to
- * DST_OBSOLETE_DEAD. If an XFRM dst has been pruned like
- * this, we want to force a new route lookup.
+ * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will
+ * be marked on it.
+ * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will
+ * be marked on it.
+ * Both will force stable_bundle() to fail on any xdst bundle with
+ * this dst linked in it.
*/
if (dst->obsolete < 0 && !stale_bundle(dst))
return dst;