From ebc0ffae5dfb4447e0a431ffe7fe1d467c48bbb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Oct 2010 10:41:36 +0000 Subject: fib: RCU conversion of fib_lookup() fib_lookup() converted to be called in RCU protected context, no reference taken and released on a contended cache line (fib_clntref) fib_table_lookup() and fib_semantic_match() get an additional parameter. struct fib_info gets an rcu_head field, and is freed after an rcu grace period. Stress test : (Sending 160.000.000 UDP frames on same neighbour, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_HASH) (about same results for FIB_TRIE) Before patch : real 1m31.199s user 0m13.761s sys 23m24.780s After patch: real 1m5.375s user 0m14.997s sys 15m50.115s Before patch Profile : 13044.00 15.4% __ip_route_output_key vmlinux 8438.00 10.0% dst_destroy vmlinux 5983.00 7.1% fib_semantic_match vmlinux 5410.00 6.4% fib_rules_lookup vmlinux 4803.00 5.7% neigh_lookup vmlinux 4420.00 5.2% _raw_spin_lock vmlinux 3883.00 4.6% rt_set_nexthop vmlinux 3261.00 3.9% _raw_read_lock vmlinux 2794.00 3.3% fib_table_lookup vmlinux 2374.00 2.8% neigh_resolve_output vmlinux 2153.00 2.5% dst_alloc vmlinux 1502.00 1.8% _raw_read_lock_bh vmlinux 1484.00 1.8% kmem_cache_alloc vmlinux 1407.00 1.7% eth_header vmlinux 1406.00 1.7% ipv4_dst_destroy vmlinux 1298.00 1.5% __copy_from_user_ll vmlinux 1174.00 1.4% dev_queue_xmit vmlinux 1000.00 1.2% ip_output vmlinux After patch Profile : 13712.00 15.8% dst_destroy vmlinux 8548.00 9.9% __ip_route_output_key vmlinux 7017.00 8.1% neigh_lookup vmlinux 4554.00 5.3% fib_semantic_match vmlinux 4067.00 4.7% _raw_read_lock vmlinux 3491.00 4.0% dst_alloc vmlinux 3186.00 3.7% neigh_resolve_output vmlinux 3103.00 3.6% fib_table_lookup vmlinux 2098.00 2.4% _raw_read_lock_bh vmlinux 2081.00 2.4% kmem_cache_alloc vmlinux 2013.00 2.3% _raw_spin_lock vmlinux 1763.00 2.0% __copy_from_user_ll vmlinux 1763.00 2.0% ip_output vmlinux 1761.00 2.0% ipv4_dst_destroy vmlinux 1631.00 1.9% eth_header vmlinux 1440.00 1.7% _raw_read_unlock_bh vmlinux Reference results, if IP route cache is enabled : real 0m29.718s user 0m10.845s sys 7m37.341s 25213.00 29.5% __ip_route_output_key vmlinux 9011.00 10.5% dst_release vmlinux 4817.00 5.6% ip_push_pending_frames vmlinux 4232.00 5.0% ip_finish_output vmlinux 3940.00 4.6% udp_sendmsg vmlinux 3730.00 4.4% __copy_from_user_ll vmlinux 3716.00 4.4% ip_route_output_flow vmlinux 2451.00 2.9% __xfrm_lookup vmlinux 2221.00 2.6% ip_append_data vmlinux 1718.00 2.0% _raw_spin_lock_bh vmlinux 1655.00 1.9% __alloc_skb vmlinux 1572.00 1.8% sock_wfree vmlinux 1345.00 1.6% kfree vmlinux Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_hash.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4/fib_hash.c') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 4ed7e0dea1bc..83cca68e259c 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -244,7 +244,8 @@ fn_new_zone(struct fn_hash *table, int z) } int fib_table_lookup(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res) + const struct flowi *flp, struct fib_result *res, + int fib_flags) { int err; struct fn_zone *fz; @@ -264,7 +265,7 @@ int fib_table_lookup(struct fib_table *tb, err = fib_semantic_match(&f->fn_alias, flp, res, - fz->fz_order); + fz->fz_order, fib_flags); if (err <= 0) goto out; } -- cgit v1.2.3 From 9bef83edfba72ba58b42c14fb046da2199574bc0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Oct 2010 20:53:04 +0000 Subject: fib_hash: embed initial hash table in fn_zone While looking for false sharing problems, I noticed sizeof(struct fn_zone) was small (28 bytes) and possibly sharing a cache line with an often written kernel structure. Most of the time, fn_zone uses its initial hash table of 16 slots. We can avoid the false sharing problem by embedding this initial hash table in fn_zone itself, so that sizeof(fn_zone) > L1_CACHE_BYTES We did a similar optimization in commit a6501e080c (Reduce memory needs and speedup lookups) Add a fz_revorder field to speedup fn_hash() a bit. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_hash.c | 52 +++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) (limited to 'net/ipv4/fib_hash.c') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 83cca68e259c..10001aa40692 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -54,23 +54,23 @@ struct fib_node { struct fib_alias fn_embedded_alias; }; +#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) + struct fn_zone { struct fn_zone *fz_next; /* Next not empty zone */ struct hlist_head *fz_hash; /* Hash table pointer */ - int fz_nent; /* Number of entries */ - - int fz_divisor; /* Hash divisor */ u32 fz_hashmask; /* (fz_divisor - 1) */ -#define FZ_HASHMASK(fz) ((fz)->fz_hashmask) - int fz_order; /* Zone order */ - __be32 fz_mask; + u8 fz_order; /* Zone order (0..32) */ + u8 fz_revorder; /* 32 - fz_order */ + __be32 fz_mask; /* inet_make_mask(order) */ #define FZ_MASK(fz) ((fz)->fz_mask) -}; -/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask - * can be cheaper than memory lookup, so that FZ_* macros are used. - */ + struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE]; + + int fz_nent; /* Number of entries */ + int fz_divisor; /* Hash size (mask+1) */ +}; struct fn_hash { struct fn_zone *fn_zones[33]; @@ -79,11 +79,11 @@ struct fn_hash { static inline u32 fn_hash(__be32 key, struct fn_zone *fz) { - u32 h = ntohl(key)>>(32 - fz->fz_order); + u32 h = ntohl(key) >> fz->fz_revorder; h ^= (h>>20); h ^= (h>>10); h ^= (h>>5); - h &= FZ_HASHMASK(fz); + h &= fz->fz_hashmask; return h; } @@ -147,14 +147,14 @@ static void fn_rehash_zone(struct fn_zone *fz) int old_divisor, new_divisor; u32 new_hashmask; - old_divisor = fz->fz_divisor; + new_divisor = old_divisor = fz->fz_divisor; switch (old_divisor) { - case 16: - new_divisor = 256; + case EMBEDDED_HASH_SIZE: + new_divisor *= EMBEDDED_HASH_SIZE; break; - case 256: - new_divisor = 1024; + case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE: + new_divisor *= (EMBEDDED_HASH_SIZE/2); break; default: if ((old_divisor << 1) > FZ_MAX_DIVISOR) { @@ -184,7 +184,8 @@ static void fn_rehash_zone(struct fn_zone *fz) fib_hash_genid++; write_unlock_bh(&fib_hash_lock); - fz_hash_free(old_ht, old_divisor); + if (old_ht != fz->fz_embedded_hash) + fz_hash_free(old_ht, old_divisor); } } @@ -210,18 +211,11 @@ fn_new_zone(struct fn_hash *table, int z) if (!fz) return NULL; - if (z) { - fz->fz_divisor = 16; - } else { - fz->fz_divisor = 1; - } - fz->fz_hashmask = (fz->fz_divisor - 1); - fz->fz_hash = fz_hash_alloc(fz->fz_divisor); - if (!fz->fz_hash) { - kfree(fz); - return NULL; - } + fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; + fz->fz_hashmask = fz->fz_divisor - 1; + fz->fz_hash = fz->fz_embedded_hash; fz->fz_order = z; + fz->fz_revorder = 32 - z; fz->fz_mask = inet_make_mask(z); /* Find the first not empty zone with more specific mask */ -- cgit v1.2.3 From 117a8cdea3647e8e11fac10d14eafefc20f9bda5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Oct 2010 20:53:34 +0000 Subject: fib_hash: RCU conversion phase 1 First step for RCU conversion of fib_hash : struct fn_zone are created and never deleted. Very classic conversion, using rcu_assign_pointer(), rcu_dereference() and rtnl_dereference() verbs. __rcu markers on fz_next and fn_zone_list They are created under RTNL, we dont need fib_hash_lock anymore in fn_new_zone(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_hash.c | 57 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 19 deletions(-) (limited to 'net/ipv4/fib_hash.c') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 10001aa40692..04f05a96b75b 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -57,7 +57,7 @@ struct fib_node { #define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) struct fn_zone { - struct fn_zone *fz_next; /* Next not empty zone */ + struct fn_zone __rcu *fz_next; /* Next not empty zone */ struct hlist_head *fz_hash; /* Hash table pointer */ u32 fz_hashmask; /* (fz_divisor - 1) */ @@ -73,8 +73,8 @@ struct fn_zone { }; struct fn_hash { - struct fn_zone *fn_zones[33]; - struct fn_zone *fn_zone_list; + struct fn_zone *fn_zones[33]; + struct fn_zone __rcu *fn_zone_list; }; static inline u32 fn_hash(__be32 key, struct fn_zone *fz) @@ -219,21 +219,21 @@ fn_new_zone(struct fn_hash *table, int z) fz->fz_mask = inet_make_mask(z); /* Find the first not empty zone with more specific mask */ - for (i=z+1; i<=32; i++) + for (i = z + 1; i <= 32; i++) if (table->fn_zones[i]) break; - write_lock_bh(&fib_hash_lock); - if (i>32) { + if (i > 32) { /* No more specific masks, we are the first. */ - fz->fz_next = table->fn_zone_list; - table->fn_zone_list = fz; + rcu_assign_pointer(fz->fz_next, + rtnl_dereference(table->fn_zone_list)); + rcu_assign_pointer(table->fn_zone_list, fz); } else { - fz->fz_next = table->fn_zones[i]->fz_next; - table->fn_zones[i]->fz_next = fz; + rcu_assign_pointer(fz->fz_next, + rtnl_dereference(table->fn_zones[i]->fz_next)); + rcu_assign_pointer(table->fn_zones[i]->fz_next, fz); } table->fn_zones[z] = fz; fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); return fz; } @@ -245,8 +245,11 @@ int fib_table_lookup(struct fib_table *tb, struct fn_zone *fz; struct fn_hash *t = (struct fn_hash *)tb->tb_data; + rcu_read_lock(); read_lock(&fib_hash_lock); - for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { + for (fz = rcu_dereference(t->fn_zone_list); + fz != NULL; + fz = rcu_dereference(fz->fz_next)) { struct hlist_head *head; struct hlist_node *node; struct fib_node *f; @@ -267,6 +270,7 @@ int fib_table_lookup(struct fib_table *tb, err = 1; out: read_unlock(&fib_hash_lock); + rcu_read_unlock(); return err; } @@ -362,6 +366,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) return NULL; } +/* Caller must hold RTNL. */ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; @@ -657,13 +662,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx) return found; } +/* caller must hold RTNL. */ int fib_table_flush(struct fib_table *tb) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fn_zone *fz; int found = 0; - for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { + for (fz = rtnl_dereference(table->fn_zone_list); + fz != NULL; + fz = rtnl_dereference(fz->fz_next)) { int i; for (i = fz->fz_divisor - 1; i >= 0; i--) @@ -741,23 +749,29 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { - int m, s_m; + int m = 0, s_m; struct fn_zone *fz; struct fn_hash *table = (struct fn_hash *)tb->tb_data; s_m = cb->args[2]; + rcu_read_lock(); read_lock(&fib_hash_lock); - for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { - if (m < s_m) continue; + for (fz = rcu_dereference(table->fn_zone_list); + fz != NULL; + fz = rcu_dereference(fz->fz_next), m++) { + if (m < s_m) + continue; if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { cb->args[2] = m; read_unlock(&fib_hash_lock); + rcu_read_unlock(); return -1; } memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); } read_unlock(&fib_hash_lock); + rcu_read_unlock(); cb->args[2] = m; return skb->len; } @@ -820,8 +834,9 @@ static struct fib_alias *fib_get_first(struct seq_file *seq) iter->genid = fib_hash_genid; iter->valid = 1; - for (iter->zone = table->fn_zone_list; iter->zone; - iter->zone = iter->zone->fz_next) { + for (iter->zone = rcu_dereference(table->fn_zone_list); + iter->zone != NULL; + iter->zone = rcu_dereference(iter->zone->fz_next)) { int maxslot; if (!iter->zone->fz_nent) @@ -906,7 +921,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq) } } - iter->zone = iter->zone->fz_next; + iter->zone = rcu_dereference(iter->zone->fz_next); if (!iter->zone) goto out; @@ -946,9 +961,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) static void *fib_seq_start(struct seq_file *seq, loff_t *pos) __acquires(fib_hash_lock) + __acquires(RCU) { void *v = NULL; + rcu_read_lock(); read_lock(&fib_hash_lock); if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -963,8 +980,10 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void fib_seq_stop(struct seq_file *seq, void *v) __releases(fib_hash_lock) + __releases(RCU) { read_unlock(&fib_hash_lock); + rcu_read_unlock(); } static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) -- cgit v1.2.3 From 19f572565ef66a0439574fd2299a7c804147e133 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Oct 2010 20:56:39 +0000 Subject: fib_hash: RCU conversion phase 2 Get rid of fib_hash_lock rwlock. The fn_zone hash table resize is the noticeable part of this patch. I added a seqlock per fn_zone, so that readers can restart their lookup in the (very rare) case a writer expanded the hash table. Add rcu heads in fib_alias and fib_node, use call_rcu() to defer their freeing, and use appropriate _rcu list manipulations. Stress test (160.000.000 udp frames sent, IP route cache disabled to mimic DDOS attack, FIB_HASH) Before: real 0m41.191s user 0m13.137s sys 8m55.241s After: real 0m38.091s user 0m13.189s sys 7m53.018s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_hash.c | 176 +++++++++++++++++++++++++++++--------------------- net/ipv4/fib_lookup.h | 2 - 2 files changed, 101 insertions(+), 77 deletions(-) (limited to 'net/ipv4/fib_hash.c') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 04f05a96b75b..4f1aafd3ba89 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -58,7 +58,8 @@ struct fib_node { struct fn_zone { struct fn_zone __rcu *fz_next; /* Next not empty zone */ - struct hlist_head *fz_hash; /* Hash table pointer */ + struct hlist_head __rcu *fz_hash; /* Hash table pointer */ + seqlock_t fz_lock; u32 fz_hashmask; /* (fz_divisor - 1) */ u8 fz_order; /* Zone order (0..32) */ @@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) return dst & FZ_MASK(fz); } -static DEFINE_RWLOCK(fib_hash_lock); static unsigned int fib_hash_genid; #define FZ_MAX_DIVISOR ((PAGE_SIZE<fn_hash); + hlist_del_rcu(&f->fn_hash); new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; - hlist_add_head(&f->fn_hash, new_head); + hlist_add_head_rcu(&f->fn_hash, new_head); } } } @@ -175,32 +174,55 @@ static void fn_rehash_zone(struct fn_zone *fz) ht = fz_hash_alloc(new_divisor); if (ht) { - write_lock_bh(&fib_hash_lock); + struct fn_zone nfz; + + memcpy(&nfz, fz, sizeof(nfz)); + + write_seqlock_bh(&fz->fz_lock); old_ht = fz->fz_hash; - fz->fz_hash = ht; + nfz.fz_hash = ht; + nfz.fz_hashmask = new_hashmask; + nfz.fz_divisor = new_divisor; + fn_rebuild_zone(&nfz, old_ht, old_divisor); + fib_hash_genid++; + rcu_assign_pointer(fz->fz_hash, ht); fz->fz_hashmask = new_hashmask; fz->fz_divisor = new_divisor; - fn_rebuild_zone(fz, old_ht, old_divisor); - fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); + write_sequnlock_bh(&fz->fz_lock); - if (old_ht != fz->fz_embedded_hash) + if (old_ht != fz->fz_embedded_hash) { + synchronize_rcu(); fz_hash_free(old_ht, old_divisor); + } } } -static inline void fn_free_node(struct fib_node * f) +static void fn_free_node_rcu(struct rcu_head *head) { + struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu); + kmem_cache_free(fn_hash_kmem, f); } +static inline void fn_free_node(struct fib_node *f) +{ + call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu); +} + +static void fn_free_alias_rcu(struct rcu_head *head) +{ + struct fib_alias *fa = container_of(head, struct fib_alias, rcu); + + kmem_cache_free(fn_alias_kmem, fa); +} + static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) { fib_release_info(fa->fa_info); if (fa == &f->fn_embedded_alias) fa->fa_info = NULL; else - kmem_cache_free(fn_alias_kmem, fa); + call_rcu(&fa->rcu, fn_free_alias_rcu); } static struct fn_zone * @@ -211,6 +233,7 @@ fn_new_zone(struct fn_hash *table, int z) if (!fz) return NULL; + seqlock_init(&fz->fz_lock); fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; fz->fz_hashmask = fz->fz_divisor - 1; fz->fz_hash = fz->fz_embedded_hash; @@ -246,30 +269,34 @@ int fib_table_lookup(struct fib_table *tb, struct fn_hash *t = (struct fn_hash *)tb->tb_data; rcu_read_lock(); - read_lock(&fib_hash_lock); for (fz = rcu_dereference(t->fn_zone_list); fz != NULL; fz = rcu_dereference(fz->fz_next)) { - struct hlist_head *head; + struct hlist_head __rcu *head; struct hlist_node *node; struct fib_node *f; - __be32 k = fz_key(flp->fl4_dst, fz); + __be32 k; + unsigned int seq; - head = &fz->fz_hash[fn_hash(k, fz)]; - hlist_for_each_entry(f, node, head, fn_hash) { - if (f->fn_key != k) - continue; + do { + seq = read_seqbegin(&fz->fz_lock); + k = fz_key(flp->fl4_dst, fz); + + head = &fz->fz_hash[fn_hash(k, fz)]; + hlist_for_each_entry_rcu(f, node, head, fn_hash) { + if (f->fn_key != k) + continue; - err = fib_semantic_match(&f->fn_alias, + err = fib_semantic_match(&f->fn_alias, flp, res, fz->fz_order, fib_flags); - if (err <= 0) - goto out; - } + if (err <= 0) + goto out; + } + } while (read_seqretry(&fz->fz_lock, seq)); } err = 1; out: - read_unlock(&fib_hash_lock); rcu_read_unlock(); return err; } @@ -292,11 +319,11 @@ void fib_table_select_default(struct fib_table *tb, last_resort = NULL; order = -1; - read_lock(&fib_hash_lock); - hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { + rcu_read_lock(); + hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) { struct fib_alias *fa; - list_for_each_entry(fa, &f->fn_alias, fa_list) { + list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { struct fib_info *next_fi = fa->fa_info; if (fa->fa_scope != res->scope || @@ -340,7 +367,7 @@ void fib_table_select_default(struct fib_table *tb, fib_result_assign(res, last_resort); tb->tb_default = last_idx; out: - read_unlock(&fib_hash_lock); + rcu_read_unlock(); } /* Insert node F to FZ. */ @@ -348,7 +375,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) { struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; - hlist_add_head(&f->fn_hash, head); + hlist_add_head_rcu(&f->fn_hash, head); } /* Return the node in FZ matching KEY. */ @@ -358,7 +385,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) struct hlist_node *node; struct fib_node *f; - hlist_for_each_entry(f, node, head, fn_hash) { + hlist_for_each_entry_rcu(f, node, head, fn_hash) { if (f->fn_key == key) return f; } @@ -366,6 +393,16 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) return NULL; } + +static struct fib_alias *fib_fast_alloc(struct fib_node *f) +{ + struct fib_alias *fa = &f->fn_embedded_alias; + + if (fa->fa_info != NULL) + fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + return fa; +} + /* Caller must hold RTNL. */ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { @@ -451,7 +488,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) } if (cfg->fc_nlflags & NLM_F_REPLACE) { - struct fib_info *fi_drop; u8 state; fa = fa_first; @@ -460,21 +496,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) err = 0; goto out; } - write_lock_bh(&fib_hash_lock); - fi_drop = fa->fa_info; - fa->fa_info = fi; - fa->fa_type = cfg->fc_type; - fa->fa_scope = cfg->fc_scope; + err = -ENOBUFS; + new_fa = fib_fast_alloc(f); + if (new_fa == NULL) + goto out; + + new_fa->fa_tos = fa->fa_tos; + new_fa->fa_info = fi; + new_fa->fa_type = cfg->fc_type; + new_fa->fa_scope = cfg->fc_scope; state = fa->fa_state; - fa->fa_state &= ~FA_S_ACCESSED; + new_fa->fa_state = state & ~FA_S_ACCESSED; fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); + list_replace_rcu(&fa->fa_list, &new_fa->fa_list); - fib_release_info(fi_drop); + fn_free_alias(fa, f); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, - &cfg->fc_nlinfo, NLM_F_REPLACE); + rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, + tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); return 0; } @@ -506,12 +546,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) f = new_f; } - new_fa = &f->fn_embedded_alias; - if (new_fa->fa_info != NULL) { - new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - if (new_fa == NULL) - goto out; - } + new_fa = fib_fast_alloc(f); + if (new_fa == NULL) + goto out; + new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; @@ -522,13 +560,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) * Insert new entry to the list. */ - write_lock_bh(&fib_hash_lock); if (new_f) fib_insert_node(fz, new_f); - list_add_tail(&new_fa->fa_list, + list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : &f->fn_alias)); fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); if (new_f) fz->fz_nent++; @@ -603,14 +639,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) tb->tb_id, &cfg->fc_nlinfo, 0); kill_fn = 0; - write_lock_bh(&fib_hash_lock); - list_del(&fa->fa_list); + list_del_rcu(&fa->fa_list); if (list_empty(&f->fn_alias)) { - hlist_del(&f->fn_hash); + hlist_del_rcu(&f->fn_hash); kill_fn = 1; } fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); @@ -641,14 +675,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx) struct fib_info *fi = fa->fa_info; if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - write_lock_bh(&fib_hash_lock); - list_del(&fa->fa_list); + list_del_rcu(&fa->fa_list); if (list_empty(&f->fn_alias)) { - hlist_del(&f->fn_hash); + hlist_del_rcu(&f->fn_hash); kill_f = 1; } fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); fn_free_alias(fa, f); found++; @@ -693,10 +725,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, s_i = cb->args[4]; i = 0; - hlist_for_each_entry(f, node, head, fn_hash) { + hlist_for_each_entry_rcu(f, node, head, fn_hash) { struct fib_alias *fa; - list_for_each_entry(fa, &f->fn_alias, fa_list) { + list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { if (i < s_i) goto next; @@ -714,7 +746,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, cb->args[4] = i; return -1; } - next: +next: i++; } } @@ -755,7 +787,6 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, s_m = cb->args[2]; rcu_read_lock(); - read_lock(&fib_hash_lock); for (fz = rcu_dereference(table->fn_zone_list); fz != NULL; fz = rcu_dereference(fz->fz_next), m++) { @@ -763,14 +794,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, continue; if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { cb->args[2] = m; - read_unlock(&fib_hash_lock); rcu_read_unlock(); return -1; } memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); } - read_unlock(&fib_hash_lock); rcu_read_unlock(); cb->args[2] = m; return skb->len; @@ -960,13 +989,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) } static void *fib_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(fib_hash_lock) __acquires(RCU) { void *v = NULL; rcu_read_lock(); - read_lock(&fib_hash_lock); if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; return v; @@ -979,17 +1006,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void fib_seq_stop(struct seq_file *seq, void *v) - __releases(fib_hash_lock) __releases(RCU) { - read_unlock(&fib_hash_lock); rcu_read_unlock(); } static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { static const unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, [8] = RTF_REJECT, + [7] = RTF_REJECT, + [8] = RTF_REJECT, }; unsigned flags = type2flags[type]; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b9c9a9f2aee5..5072d8effd5d 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -12,9 +12,7 @@ struct fib_alias { u8 fa_type; u8 fa_scope; u8 fa_state; -#ifdef CONFIG_IP_FIB_TRIE struct rcu_head rcu; -#endif }; #define FA_S_ACCESSED 0x01 -- cgit v1.2.3 From 9b0c290e78d667e6a483bde8c7cef7dd15f49017 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Oct 2010 22:03:38 +0000 Subject: fib: introduce fib_alias_accessed() helper Perf tools session at NFWS 2010 pointed out a false sharing on struct fib_alias that can be avoided pretty easily, if we set FA_S_ACCESSED bit only if needed (ie : not already set) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_hash.c | 3 ++- net/ipv4/fib_lookup.h | 7 +++++++ net/ipv4/fib_semantics.c | 2 +- net/ipv4/fib_trie.c | 3 ++- 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net/ipv4/fib_hash.c') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 4f1aafd3ba89..43e1c594ce8f 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -335,7 +335,8 @@ void fib_table_select_default(struct fib_table *tb, if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; - fa->fa_state |= FA_S_ACCESSED; + + fib_alias_accessed(fa); if (fi == NULL) { if (next_fi != res->fi) diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 5072d8effd5d..a29edf2219c8 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -17,6 +17,13 @@ struct fib_alias { #define FA_S_ACCESSED 0x01 +/* Dont write on fa_state unless needed, to keep it shared on all cpus */ +static inline void fib_alias_accessed(struct fib_alias *fa) +{ + if (!(fa->fa_state & FA_S_ACCESSED)) + fa->fa_state |= FA_S_ACCESSED; +} + /* Exported by fib_semantics.c */ extern int fib_semantic_match(struct list_head *head, const struct flowi *flp, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6734c9cab248..3e0da3ef6116 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -901,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, if (fa->fa_scope < flp->fl4_scope) continue; - fa->fa_state |= FA_S_ACCESSED; + fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (err == 0) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 31494f335686..cd5e13aee7d5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1838,7 +1838,8 @@ void fib_table_select_default(struct fib_table *tb, if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; - fa->fa_state |= FA_S_ACCESSED; + + fib_alias_accessed(fa); if (fi == NULL) { if (next_fi != res->fi) -- cgit v1.2.3