From 66bf79182d6531c14c1f9a507b6bbf374a2ae4cd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 7 Sep 2008 18:19:25 -0700 Subject: netfilter: nf_conntrack_sip: de-static helper pointers Helper's ->help hook can run concurrently with itself, so iterating over SIP helpers with static pointer won't work reliably. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_sip.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 2f9bbc058b48..1fa306be60fb 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1193,7 +1193,6 @@ static const struct sip_handler sip_handlers[] = { static int process_sip_response(struct sk_buff *skb, const char **dptr, unsigned int *datalen) { - static const struct sip_handler *handler; enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); unsigned int matchoff, matchlen; @@ -1214,6 +1213,8 @@ static int process_sip_response(struct sk_buff *skb, dataoff = matchoff + matchlen + 1; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + handler = &sip_handlers[i]; if (handler->response == NULL) continue; @@ -1228,13 +1229,14 @@ static int process_sip_response(struct sk_buff *skb, static int process_sip_request(struct sk_buff *skb, const char **dptr, unsigned int *datalen) { - static const struct sip_handler *handler; enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); unsigned int matchoff, matchlen; unsigned int cseq, i; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + handler = &sip_handlers[i]; if (handler->request == NULL) continue; -- cgit v1.2.3 From 887464a41fde7e9e1e11ca86748338033c502446 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 7 Sep 2008 18:20:08 -0700 Subject: netfilter: nf_conntrack_gre: more locking around keymap list gre_keymap_list should be protected in all places. (unless I'm misreading something) Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_proto_gre.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 654a4f7f12c6..b308bb4c12b9 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -97,10 +97,14 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, kmp = &help->help.ct_pptp_info.keymap[dir]; if (*kmp) { /* check whether it's a retransmission */ + read_lock_bh(&nf_ct_gre_lock); list_for_each_entry(km, &gre_keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *kmp) + if (gre_key_cmpfn(km, t) && km == *kmp) { + read_unlock_bh(&nf_ct_gre_lock); return 0; + } } + read_unlock_bh(&nf_ct_gre_lock); pr_debug("trying to override keymap_%s for ct %p\n", dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); return -EEXIST; -- cgit v1.2.3 From 51807e91a76a531d059ec7ce3395c435e4df52a8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 7 Sep 2008 18:20:36 -0700 Subject: netfilter: nf_conntrack_gre: nf_ct_gre_keymap_flush() fixlet It does "kfree(list_head)" which looks wrong because entity that was allocated is definitely not list_head. However, this all works because list_head is first item in struct nf_ct_gre_keymap. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_proto_gre.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index b308bb4c12b9..9bd03967fea4 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -45,12 +45,12 @@ static LIST_HEAD(gre_keymap_list); void nf_ct_gre_keymap_flush(void) { - struct list_head *pos, *n; + struct nf_ct_gre_keymap *km, *tmp; write_lock_bh(&nf_ct_gre_lock); - list_for_each_safe(pos, n, &gre_keymap_list) { - list_del(pos); - kfree(pos); + list_for_each_entry_safe(km, tmp, &gre_keymap_list, list) { + list_del(&km->list); + kfree(km); } write_unlock_bh(&nf_ct_gre_lock); } -- cgit v1.2.3 From e3b802ba885b54f4050164c3cfd9e0ba9c73173a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 7 Sep 2008 18:21:24 -0700 Subject: netfilter: nf_conntrack_irc: make sure string is terminated before calling simple_strtoul Alexey Dobriyan points out: 1. simple_strtoul() silently accepts all characters for given base even if result won't fit into unsigned long. This is amazing stupidity in itself, but 2. nf_conntrack_irc helper use simple_strtoul() for DCC request parsing. Data first copied into 64KB buffer, so theoretically nothing prevents reading past the end of it, since data comes from network given 1). This is not actually a problem currently since we're guaranteed to have a 0 byte in skb_shared_info or in the buffer the data is copied to, but to make this more robust, make sure the string is actually terminated. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_irc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 1b1226d6653f..20633fdf7e6b 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -68,11 +68,21 @@ static const char *const dccprotos[] = { static int parse_dcc(char *data, const char *data_end, u_int32_t *ip, u_int16_t *port, char **ad_beg_p, char **ad_end_p) { + char *tmp; + /* at least 12: "AAAAAAAA P\1\n" */ while (*data++ != ' ') if (data > data_end - 12) return -1; + /* Make sure we have a newline character within the packet boundaries + * because simple_strtoul parses until the first invalid character. */ + for (tmp = data; tmp <= data_end; tmp++) + if (*tmp == '\n') + break; + if (tmp > data_end || *tmp != '\n') + return -1; + *ad_beg_p = data; *ip = simple_strtoul(data, &data, 10); -- cgit v1.2.3 From e8a83e10d7dfe5d0841062780769b30f65417e15 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Sun, 7 Sep 2008 18:41:21 -0700 Subject: pkt_sched: Fix qdisc state in net_tx_action() net_tx_action() can skip __QDISC_STATE_SCHED bit clearing while qdisc is neither ran nor rescheduled, which may cause endless loop in dev_deactivate(). Reported-by: Denys Fedoryshchenko Tested-by: Denys Fedoryshchenko Signed-off-by: Jarek Poplawski Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 60c51f765887..e719ed29310f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1991,8 +1991,13 @@ static void net_tx_action(struct softirq_action *h) spin_unlock(root_lock); } else { if (!test_bit(__QDISC_STATE_DEACTIVATED, - &q->state)) + &q->state)) { __netif_reschedule(q); + } else { + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + } } } } -- cgit v1.2.3 From d315492b1a6ba29da0fa2860759505ae1b2db857 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 8 Sep 2008 13:17:27 -0700 Subject: netns : fix kernel panic in timewait socket destruction How to reproduce ? - create a network namespace - use tcp protocol and get timewait socket - exit the network namespace - after a moment (when the timewait socket is destroyed), the kernel panics. # BUG: unable to handle kernel NULL pointer dereference at 0000000000000007 IP: [] inet_twdr_do_twkill_work+0x6e/0xb8 PGD 119985067 PUD 11c5c0067 PMD 0 Oops: 0000 [1] SMP CPU 1 Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table] Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3 RIP: 0010:[] [] inet_twdr_do_twkill_work+0x6e/0xb8 RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246 RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30 RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00 RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200 R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00 R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000 FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 0, threadinfo ffff8800bff9e000, task ffff88011ff76690) Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a 0000000000000008 0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7 ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108 Call Trace: [] ? inet_twdr_hangman+0x0/0x9e [] ? inet_twdr_hangman+0x27/0x9e [] ? run_timer_softirq+0x12c/0x193 [] ? __do_softirq+0x5e/0xcd [] ? call_softirq+0x1c/0x28 [] ? do_softirq+0x2c/0x68 [] ? smp_apic_timer_interrupt+0x8e/0xa9 [] ? apic_timer_interrupt+0x66/0x70 [] ? default_idle+0x27/0x3b [] ? cpu_idle+0x5f/0x7d Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7 65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0 48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00 RIP [] inet_twdr_do_twkill_work+0x6e/0xb8 RSP CR2: 0000000000000007 This patch provides a function to purge all timewait sockets related to a network namespace. The timewait sockets life cycle is not tied with the network namespace, that means the timewait sockets stay alive while the network namespace dies. The timewait sockets are for avoiding to receive a duplicate packet from the network, if the network namespace is freed, the network stack is removed, so no chance to receive any packets from the outside world. Furthermore, having a pending destruction timer on these sockets with a network namespace freed is not safe and will lead to an oops if the timer callback which try to access data belonging to the namespace like for example in: inet_twdr_do_twkill_work -> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); Purging the timewait sockets at the network namespace destruction will: 1) speed up memory freeing for the namespace 2) fix kernel panic on asynchronous timewait destruction Signed-off-by: Daniel Lezcano Acked-by: Denis V. Lunev Acked-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 3 +++ net/ipv4/inet_timewait_sock.c | 35 +++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 40 insertions(+) (limited to 'net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 95c660c9719b..91324908fccd 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -208,6 +208,9 @@ extern void inet_twsk_schedule(struct inet_timewait_sock *tw, extern void inet_twsk_deschedule(struct inet_timewait_sock *tw, struct inet_timewait_death_row *twdr); +extern void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, + struct inet_timewait_death_row *twdr, int family); + static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) { diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index d985bd613d25..743f011b9a84 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -409,3 +409,38 @@ out: } EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); + +void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, + struct inet_timewait_death_row *twdr, int family) +{ + struct inet_timewait_sock *tw; + struct sock *sk; + struct hlist_node *node; + int h; + + local_bh_disable(); + for (h = 0; h < (hashinfo->ehash_size); h++) { + struct inet_ehash_bucket *head = + inet_ehash_bucket(hashinfo, h); + rwlock_t *lock = inet_ehash_lockp(hashinfo, h); +restart: + write_lock(lock); + sk_for_each(sk, node, &head->twchain) { + + tw = inet_twsk(sk); + if (!net_eq(twsk_net(tw), net) || + tw->tw_family != family) + continue; + + atomic_inc(&tw->tw_refcnt); + write_unlock(lock); + inet_twsk_deschedule(tw, twdr); + inet_twsk_put(tw); + + goto restart; + } + write_unlock(lock); + } + local_bh_enable(); +} +EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 44c1e934824b..1b4fee20fc93 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2376,6 +2376,7 @@ static int __net_init tcp_sk_init(struct net *net) static void __net_exit tcp_sk_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv4.tcp_sock); + inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5b90b369ccb2..b585c850a89a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2148,6 +2148,7 @@ static int tcpv6_net_init(struct net *net) static void tcpv6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.tcp_sk); + inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET6); } static struct pernet_operations tcpv6_net_ops = { -- cgit v1.2.3 From 8d4698f7a54a492a1b96c505b30fe750ae3e61d5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 8 Sep 2008 13:44:40 -0700 Subject: bridge: don't allow setting hello time to zero Dushan Tcholich reports that on his system ksoftirqd can consume between %6 to %10 of cpu time, and cause ~200 context switches per second. He then correlated this with a report by bdupree@techfinesse.com: http://marc.info/?l=linux-kernel&m=119613299024398&w=2 and the culprit cause seems to be starting the bridge interface. In particular, when starting the bridge interface, his scripts are specifying a hello timer interval of "0". The bridge hello time can't be safely set to values less than 1 second, otherwise it is possible to end up with a runaway timer. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_ioctl.c | 8 +++++++- net/bridge/br_sysfs_br.c | 26 ++++++++++++++++++-------- 2 files changed, 25 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index eeee218eed80..5bbf07362172 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -188,15 +188,21 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) return 0; case BRCTL_SET_BRIDGE_HELLO_TIME: + { + unsigned long t = clock_t_to_jiffies(args[1]); if (!capable(CAP_NET_ADMIN)) return -EPERM; + if (t < HZ) + return -EINVAL; + spin_lock_bh(&br->lock); - br->bridge_hello_time = clock_t_to_jiffies(args[1]); + br->bridge_hello_time = t; if (br_is_root_bridge(br)) br->hello_time = br->bridge_hello_time; spin_unlock_bh(&br->lock); return 0; + } case BRCTL_SET_BRIDGE_MAX_AGE: if (!capable(CAP_NET_ADMIN)) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 27d6a511c8c1..158dee8b4965 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -29,11 +29,12 @@ */ static ssize_t store_bridge_parm(struct device *d, const char *buf, size_t len, - void (*set)(struct net_bridge *, unsigned long)) + int (*set)(struct net_bridge *, unsigned long)) { struct net_bridge *br = to_bridge(d); char *endp; unsigned long val; + int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -43,9 +44,9 @@ static ssize_t store_bridge_parm(struct device *d, return -EINVAL; spin_lock_bh(&br->lock); - (*set)(br, val); + err = (*set)(br, val); spin_unlock_bh(&br->lock); - return len; + return err ? err : len; } @@ -56,12 +57,13 @@ static ssize_t show_forward_delay(struct device *d, return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } -static void set_forward_delay(struct net_bridge *br, unsigned long val) +static int set_forward_delay(struct net_bridge *br, unsigned long val) { unsigned long delay = clock_t_to_jiffies(val); br->forward_delay = delay; if (br_is_root_bridge(br)) br->bridge_forward_delay = delay; + return 0; } static ssize_t store_forward_delay(struct device *d, @@ -80,12 +82,17 @@ static ssize_t show_hello_time(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->hello_time)); } -static void set_hello_time(struct net_bridge *br, unsigned long val) +static int set_hello_time(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); + + if (t < HZ) + return -EINVAL; + br->hello_time = t; if (br_is_root_bridge(br)) br->bridge_hello_time = t; + return 0; } static ssize_t store_hello_time(struct device *d, @@ -104,12 +111,13 @@ static ssize_t show_max_age(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->max_age)); } -static void set_max_age(struct net_bridge *br, unsigned long val) +static int set_max_age(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); br->max_age = t; if (br_is_root_bridge(br)) br->bridge_max_age = t; + return 0; } static ssize_t store_max_age(struct device *d, struct device_attribute *attr, @@ -126,9 +134,10 @@ static ssize_t show_ageing_time(struct device *d, return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } -static void set_ageing_time(struct net_bridge *br, unsigned long val) +static int set_ageing_time(struct net_bridge *br, unsigned long val) { br->ageing_time = clock_t_to_jiffies(val); + return 0; } static ssize_t store_ageing_time(struct device *d, @@ -180,9 +189,10 @@ static ssize_t show_priority(struct device *d, struct device_attribute *attr, (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } -static void set_priority(struct net_bridge *br, unsigned long val) +static int set_priority(struct net_bridge *br, unsigned long val) { br_stp_set_bridge_priority(br, (u16) val); + return 0; } static ssize_t store_priority(struct device *d, struct device_attribute *attr, -- cgit v1.2.3