From df32cc193ad88f7b1326b90af799c927b27f7654 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 1 Nov 2010 12:55:52 -0700 Subject: net: check queue_index from sock is valid for device In dev_pick_tx recompute the queue index if the value stored in the socket is greater than or equal to the number of real queues for the device. The saved index in the sock structure is not guaranteed to be appropriate for the egress device (this could happen on a route change or in presence of tunnelling). The result of the queue index being bad would be to return a bogus queue (crash could prersumably follow). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 35dfb8318483..0dd54a69dace 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2131,7 +2131,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, } else { struct sock *sk = skb->sk; queue_index = sk_tx_queue_get(sk); - if (queue_index < 0) { + if (queue_index < 0 || queue_index >= dev->real_num_tx_queues) { queue_index = 0; if (dev->real_num_tx_queues > 1) -- cgit v1.2.3 From 86c2c0a8a4965ae5cbc0ff97ed39a4472e8e9b23 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 6 Nov 2010 20:11:38 +0000 Subject: NET: pktgen - fix compile warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should fix the following warning: net/core/pktgen.c: In function ‘pktgen_if_write’: net/core/pktgen.c:890: warning: comparison of distinct pointer types lacks a cast Signed-off-by: Dmitry Torokhov Reviewed-by: Nelson Elhage Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fbce4b05a53e..1992cd050e26 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -887,7 +887,7 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (debug) { - size_t copy = min(count, 1023); + size_t copy = min_t(size_t, count, 1023); char tb[copy + 1]; if (copy_from_user(tb, user_buffer, copy)) return -EFAULT; -- cgit v1.2.3 From eb589063ed482f5592b1378e4136d6998419af6e Mon Sep 17 00:00:00 2001 From: Junchang Wang Date: Sun, 7 Nov 2010 23:19:43 +0000 Subject: pktgen: correct uninitialized queue_map This fix a bug reported by backyes. Right the first time pktgen's using queue_map that's not been initialized by set_cur_queue_map(pkt_dev); Signed-off-by: Junchang Wang Signed-off-by: Backyes Signed-off-by: David S. Miller --- net/core/pktgen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 1992cd050e26..33bc3823ac6f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2612,8 +2612,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; datalen = (odev->hard_header_len + 16) & ~0xf; @@ -2976,8 +2976,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; skb = __netdev_alloc_skb(odev, pkt_dev->cur_pkt_size + 64 -- cgit v1.2.3 From 332dd96f7ac15e937088fe11f15cfe0210e8edd1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Nov 2010 11:46:33 -0800 Subject: net/dst: dst_dev_event() called after other notifiers Followup of commit ef885afbf8a37689 (net: use rcu_barrier() in rollback_registered_many) dst_dev_event() scans a garbage dst list that might be feeded by various network notifiers at device dismantle time. Its important to call dst_dev_event() after other notifiers, or we might enter the infamous msleep(250) in netdev_wait_allrefs(), and wait one second before calling again call_netdevice_notifiers(NETDEV_UNREGISTER, dev) to properly remove last device references. Use priority -10 to let dst_dev_notifier be called after other network notifiers (they have the default 0 priority) Reported-by: Ben Greear Reported-by: Nicolas Dichtel Reported-by: Octavian Purdila Reported-by: Benjamin LaHaise Tested-by: Ben Greear Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dst.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 8abe628b79f1..b99c7c7ffce2 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -370,6 +370,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, static struct notifier_block dst_dev_notifier = { .notifier_call = dst_dev_event, + .priority = -10, /* must be called after other network notifiers */ }; void __init dst_init(void) -- cgit v1.2.3 From 57fe93b374a6b8711995c2d466c502af9f3a08bb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 10 Nov 2010 10:38:24 -0800 Subject: filter: make sure filters dont read uninitialized memory There is a possibility malicious users can get limited information about uninitialized stack mem array. Even if sk_run_filter() result is bound to packet length (0 .. 65535), we could imagine this can be used by hostile user. Initializing mem[] array, like Dan Rosenberg suggested in his patch is expensive since most filters dont even use this array. Its hard to make the filter validation in sk_chk_filter(), because of the jumps. This might be done later. In this patch, I use a bitmap (a single long var) so that only filters using mem[] loads/stores pay the price of added security checks. For other filters, additional cost is a single instruction. [ Since we access fentry->k a lot now, cache it in a local variable and mark filter entry pointer as const. -DaveM ] Reported-by: Dan Rosenberg Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/filter.c | 64 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 7beaec36b541..23e9b2a6b4c8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -112,39 +112,41 @@ EXPORT_SYMBOL(sk_filter); */ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { - struct sock_filter *fentry; /* We walk down these */ void *ptr; u32 A = 0; /* Accumulator */ u32 X = 0; /* Index Register */ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + unsigned long memvalid = 0; u32 tmp; int k; int pc; + BUILD_BUG_ON(BPF_MEMWORDS > BITS_PER_LONG); /* * Process array of filter instructions. */ for (pc = 0; pc < flen; pc++) { - fentry = &filter[pc]; + const struct sock_filter *fentry = &filter[pc]; + u32 f_k = fentry->k; switch (fentry->code) { case BPF_S_ALU_ADD_X: A += X; continue; case BPF_S_ALU_ADD_K: - A += fentry->k; + A += f_k; continue; case BPF_S_ALU_SUB_X: A -= X; continue; case BPF_S_ALU_SUB_K: - A -= fentry->k; + A -= f_k; continue; case BPF_S_ALU_MUL_X: A *= X; continue; case BPF_S_ALU_MUL_K: - A *= fentry->k; + A *= f_k; continue; case BPF_S_ALU_DIV_X: if (X == 0) @@ -152,49 +154,49 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int A /= X; continue; case BPF_S_ALU_DIV_K: - A /= fentry->k; + A /= f_k; continue; case BPF_S_ALU_AND_X: A &= X; continue; case BPF_S_ALU_AND_K: - A &= fentry->k; + A &= f_k; continue; case BPF_S_ALU_OR_X: A |= X; continue; case BPF_S_ALU_OR_K: - A |= fentry->k; + A |= f_k; continue; case BPF_S_ALU_LSH_X: A <<= X; continue; case BPF_S_ALU_LSH_K: - A <<= fentry->k; + A <<= f_k; continue; case BPF_S_ALU_RSH_X: A >>= X; continue; case BPF_S_ALU_RSH_K: - A >>= fentry->k; + A >>= f_k; continue; case BPF_S_ALU_NEG: A = -A; continue; case BPF_S_JMP_JA: - pc += fentry->k; + pc += f_k; continue; case BPF_S_JMP_JGT_K: - pc += (A > fentry->k) ? fentry->jt : fentry->jf; + pc += (A > f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JGE_K: - pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + pc += (A >= f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JEQ_K: - pc += (A == fentry->k) ? fentry->jt : fentry->jf; + pc += (A == f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JSET_K: - pc += (A & fentry->k) ? fentry->jt : fentry->jf; + pc += (A & f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JGT_X: pc += (A > X) ? fentry->jt : fentry->jf; @@ -209,7 +211,7 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int pc += (A & X) ? fentry->jt : fentry->jf; continue; case BPF_S_LD_W_ABS: - k = fentry->k; + k = f_k; load_w: ptr = load_pointer(skb, k, 4, &tmp); if (ptr != NULL) { @@ -218,7 +220,7 @@ load_w: } break; case BPF_S_LD_H_ABS: - k = fentry->k; + k = f_k; load_h: ptr = load_pointer(skb, k, 2, &tmp); if (ptr != NULL) { @@ -227,7 +229,7 @@ load_h: } break; case BPF_S_LD_B_ABS: - k = fentry->k; + k = f_k; load_b: ptr = load_pointer(skb, k, 1, &tmp); if (ptr != NULL) { @@ -242,32 +244,34 @@ load_b: X = skb->len; continue; case BPF_S_LD_W_IND: - k = X + fentry->k; + k = X + f_k; goto load_w; case BPF_S_LD_H_IND: - k = X + fentry->k; + k = X + f_k; goto load_h; case BPF_S_LD_B_IND: - k = X + fentry->k; + k = X + f_k; goto load_b; case BPF_S_LDX_B_MSH: - ptr = load_pointer(skb, fentry->k, 1, &tmp); + ptr = load_pointer(skb, f_k, 1, &tmp); if (ptr != NULL) { X = (*(u8 *)ptr & 0xf) << 2; continue; } return 0; case BPF_S_LD_IMM: - A = fentry->k; + A = f_k; continue; case BPF_S_LDX_IMM: - X = fentry->k; + X = f_k; continue; case BPF_S_LD_MEM: - A = mem[fentry->k]; + A = (memvalid & (1UL << f_k)) ? + mem[f_k] : 0; continue; case BPF_S_LDX_MEM: - X = mem[fentry->k]; + X = (memvalid & (1UL << f_k)) ? + mem[f_k] : 0; continue; case BPF_S_MISC_TAX: X = A; @@ -276,14 +280,16 @@ load_b: A = X; continue; case BPF_S_RET_K: - return fentry->k; + return f_k; case BPF_S_RET_A: return A; case BPF_S_ST: - mem[fentry->k] = A; + memvalid |= 1UL << f_k; + mem[f_k] = A; continue; case BPF_S_STX: - mem[fentry->k] = X; + memvalid |= 1UL << f_k; + mem[f_k] = X; continue; default: WARN_ON(1); -- cgit v1.2.3 From 8d987e5c75107ca7515fa19e857cfa24aab6ec8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Nov 2010 23:24:26 +0000 Subject: net: avoid limits overflow Robin Holt tried to boot a 16TB machine and found some limits were reached : sysctl_tcp_mem[2], sysctl_udp_mem[2] We can switch infrastructure to use long "instead" of "int", now atomic_long_t primitives are available for free. Signed-off-by: Eric Dumazet Reported-by: Robin Holt Reviewed-by: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/net/dn.h | 2 +- include/net/sock.h | 4 ++-- include/net/tcp.h | 6 +++--- include/net/udp.h | 4 ++-- net/core/sock.c | 14 +++++++------- net/decnet/af_decnet.c | 2 +- net/decnet/sysctl_net_decnet.c | 4 ++-- net/ipv4/proc.c | 8 ++++---- net/ipv4/sysctl_net_ipv4.c | 5 ++--- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 11 +++++++---- net/ipv4/udp.c | 4 ++-- net/sctp/protocol.c | 2 +- net/sctp/socket.c | 4 ++-- net/sctp/sysctl.c | 4 ++-- 15 files changed, 40 insertions(+), 38 deletions(-) (limited to 'net/core') diff --git a/include/net/dn.h b/include/net/dn.h index e5469f7b67a3..a514a3cf4573 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -225,7 +225,7 @@ extern int decnet_di_count; extern int decnet_dr_count; extern int decnet_no_fc_max_cwnd; -extern int sysctl_decnet_mem[3]; +extern long sysctl_decnet_mem[3]; extern int sysctl_decnet_wmem[3]; extern int sysctl_decnet_rmem[3]; diff --git a/include/net/sock.h b/include/net/sock.h index c7a736228ca2..a6338d039857 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -762,7 +762,7 @@ struct proto { /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); - atomic_t *memory_allocated; /* Current allocated memory. */ + atomic_long_t *memory_allocated; /* Current allocated memory. */ struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* * Pressure flag: try to collapse. @@ -771,7 +771,7 @@ struct proto { * is strict, actions are advisory and have some latency. */ int *memory_pressure; - int *sysctl_mem; + long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; int max_header; diff --git a/include/net/tcp.h b/include/net/tcp.h index 4fee0424af7e..e36c874c7fb1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -224,7 +224,7 @@ extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; extern int sysctl_tcp_ecn; extern int sysctl_tcp_dsack; -extern int sysctl_tcp_mem[3]; +extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_app_win; @@ -247,7 +247,7 @@ extern int sysctl_tcp_cookie_size; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; -extern atomic_t tcp_memory_allocated; +extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; extern int tcp_memory_pressure; @@ -280,7 +280,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift) } if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) + atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) return true; return false; } diff --git a/include/net/udp.h b/include/net/udp.h index 200b82848c9a..bb967dd59bf7 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -105,10 +105,10 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, extern struct proto udp_prot; -extern atomic_t udp_memory_allocated; +extern atomic_long_t udp_memory_allocated; /* sysctl variables for udp */ -extern int sysctl_udp_mem[3]; +extern long sysctl_udp_mem[3]; extern int sysctl_udp_rmem_min; extern int sysctl_udp_wmem_min; diff --git a/net/core/sock.c b/net/core/sock.c index 3eed5424e659..fb6080111461 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1653,10 +1653,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); - int allocated; + long allocated; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = atomic_add_return(amt, prot->memory_allocated); + allocated = atomic_long_add_return(amt, prot->memory_allocated); /* Under limit. */ if (allocated <= prot->sysctl_mem[0]) { @@ -1714,7 +1714,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - atomic_sub(amt, prot->memory_allocated); + atomic_long_sub(amt, prot->memory_allocated); return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -1727,12 +1727,12 @@ void __sk_mem_reclaim(struct sock *sk) { struct proto *prot = sk->sk_prot; - atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, + atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, prot->memory_allocated); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (prot->memory_pressure && *prot->memory_pressure && - (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0])) + (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) *prot->memory_pressure = 0; } EXPORT_SYMBOL(__sk_mem_reclaim); @@ -2452,12 +2452,12 @@ static char proto_method_implemented(const void *method) static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { - seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), - proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, + proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", proto->max_header, proto->slab == NULL ? "no" : "yes", diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index d6b93d19790f..a76b78de679f 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -155,7 +155,7 @@ static const struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; -static atomic_t decnet_memory_allocated; +static atomic_long_t decnet_memory_allocated; static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index be3eb8e23288..28f8b5e5f73b 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -38,7 +38,7 @@ int decnet_log_martians = 1; int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; /* Reasonable defaults, I hope, based on tcp's defaults */ -int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; +long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; @@ -324,7 +324,7 @@ static ctl_table dn_table[] = { .data = &sysctl_decnet_mem, .maxlen = sizeof(sysctl_decnet_mem), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax }, { .procname = "decnet_rmem", diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4ae1f203f7cb..1b48eb1ed453 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) local_bh_enable(); socket_seq_show(seq); - seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", + seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, tcp_death_row.tw_count, sockets, - atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d mem %d\n", + atomic_long_read(&tcp_memory_allocated)); + seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), - atomic_read(&udp_memory_allocated)); + atomic_long_read(&udp_memory_allocated)); seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d96c1da4b17c..e91911d7aae2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -398,7 +398,7 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_mem, .maxlen = sizeof(sysctl_tcp_mem), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_doulongvec_minmax }, { .procname = "tcp_wmem", @@ -602,8 +602,7 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .proc_handler = proc_doulongvec_minmax, }, { .procname = "udp_rmem_min", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1664a0590bb8..245603c4ad48 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); -int sysctl_tcp_mem[3] __read_mostly; +long sysctl_tcp_mem[3] __read_mostly; int sysctl_tcp_wmem[3] __read_mostly; int sysctl_tcp_rmem[3] __read_mostly; @@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem); EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); -atomic_t tcp_memory_allocated; /* Current allocated memory. */ +atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3357f69e353d..6d8ab1c4efc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk) int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); - if (sk->sk_sndbuf < 3 * sndmem) - sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); + if (sk->sk_sndbuf < 3 * sndmem) { + sk->sk_sndbuf = 3 * sndmem; + if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) + sk->sk_sndbuf = sysctl_tcp_wmem[2]; + } } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); } @@ -4861,7 +4864,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ - if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) return 0; /* If we filled the congestion window, do not expand. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 28cb2d733a3c..5e0a3a582a59 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -110,7 +110,7 @@ struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); -int sysctl_udp_mem[3] __read_mostly; +long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); int sysctl_udp_rmem_min __read_mostly; @@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min); int sysctl_udp_wmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_wmem_min); -atomic_t udp_memory_allocated; +atomic_long_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 1ef29c74d85e..e58f9476f29c 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -92,7 +92,7 @@ static struct sctp_af *sctp_af_v6_specific; struct kmem_cache *sctp_chunk_cachep __read_mostly; struct kmem_cache *sctp_bucket_cachep __read_mostly; -int sysctl_sctp_mem[3]; +long sysctl_sctp_mem[3]; int sysctl_sctp_rmem[3]; int sysctl_sctp_wmem[3]; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e34ca9cc1167..6bd554323a34 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -111,12 +111,12 @@ static void sctp_sock_migrate(struct sock *, struct sock *, static char *sctp_hmac_alg = SCTP_COOKIE_HMAC_ALG; extern struct kmem_cache *sctp_bucket_cachep; -extern int sysctl_sctp_mem[3]; +extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; static int sctp_memory_pressure; -static atomic_t sctp_memory_allocated; +static atomic_long_t sctp_memory_allocated; struct percpu_counter sctp_sockets_allocated; static void sctp_enter_memory_pressure(struct sock *sk) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 832590bbe0c0..50cb57f0919e 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -54,7 +54,7 @@ static int sack_timer_max = 500; static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ static int rwnd_scale_max = 16; -extern int sysctl_sctp_mem[3]; +extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; @@ -203,7 +203,7 @@ static ctl_table sctp_table[] = { .data = &sysctl_sctp_mem, .maxlen = sizeof(sysctl_sctp_mem), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax }, { .procname = "sctp_rmem", -- cgit v1.2.3 From 369cf77a6a3e41b1110506ddf43d45804103bfde Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 11 Nov 2010 15:47:59 +0000 Subject: rtnetlink: Fix message size calculation for link messages nlmsg_total_size() calculates the length of a netlink message including header and alignment. nla_total_size() calculates the space an individual attribute consumes which was meant to be used in this context. Also, ensure to account for the attribute header for the IFLA_INFO_XSTATS attribute as implementations of get_xstats_size() seem to assume that we do so. The addition of two message headers minus the missing attribute header resulted in a calculated message size that was larger than required. Therefore we never risked running out of skb tailroom. Signed-off-by: Thomas Graf Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8121268ddbdd..841c287ef40a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -347,16 +347,17 @@ static size_t rtnl_link_get_size(const struct net_device *dev) if (!ops) return 0; - size = nlmsg_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ - nlmsg_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ + size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ + nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ - size += nlmsg_total_size(sizeof(struct nlattr)) + + size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) - size += ops->get_xstats_size(dev); /* IFLA_INFO_XSTATS */ + /* IFLA_INFO_XSTATS */ + size += nla_total_size(ops->get_xstats_size(dev)); return size; } -- cgit v1.2.3 From 7d8e76bf9ac3604897f0ce12e8bf09b68c2a2c89 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Nov 2010 19:42:53 +0000 Subject: net: zero kobject in rx_queue_release netif_set_real_num_rx_queues() can decrement and increment the number of rx queues. For example ixgbe does this as features and offloads are toggled. Presumably this could also happen across down/up on most devices if the available resources changed (cpu offlined). The kobject needs to be zero'd in this case so that the state is not preserved across kobject_put()/kobject_init_and_add(). This resolves the following error report. ixgbe 0000:03:00.0: eth2: NIC Link is Up 10 Gbps, Flow Control: RX/TX kobject (ffff880324b83210): tried to init an initialized object, something is seriously wrong. Pid: 1972, comm: lldpad Not tainted 2.6.37-rc18021qaz+ #169 Call Trace: [] kobject_init+0x3a/0x83 [] kobject_init_and_add+0x23/0x57 [] ? mark_lock+0x21/0x267 [] net_rx_queue_update_kobjects+0x63/0xc6 [] netif_set_real_num_rx_queues+0x5f/0x78 [] ixgbe_set_num_queues+0x1c6/0x1ca [ixgbe] [] ixgbe_init_interrupt_scheme+0x1e/0x79c [ixgbe] [] ixgbe_dcbnl_set_state+0x167/0x189 [ixgbe] Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a5ff5a89f376..7f902cad10f8 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -712,15 +712,21 @@ static void rx_queue_release(struct kobject *kobj) map = rcu_dereference_raw(queue->rps_map); - if (map) + if (map) { + RCU_INIT_POINTER(queue->rps_map, NULL); call_rcu(&map->rcu, rps_map_release); + } flow_table = rcu_dereference_raw(queue->rps_flow_table); - if (flow_table) + if (flow_table) { + RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); + } if (atomic_dec_and_test(&first->count)) kfree(first); + else + memset(kobj, 0, sizeof(*kobj)); } static struct kobj_type rx_queue_ktype = { -- cgit v1.2.3 From 0302b8622ce696af1cda22fcf207d3793350e896 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 18 Nov 2010 13:02:37 +0000 Subject: net: fix kernel-doc for sk_filter_rcu_release Fix kernel-doc warning for sk_filter_rcu_release(): Warning(net/core/filter.c:586): missing initial short description on line: * sk_filter_rcu_release: Release a socket filter by rcu_head Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 23e9b2a6b4c8..c1ee800bc080 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -589,7 +589,7 @@ int sk_chk_filter(struct sock_filter *filter, int flen) EXPORT_SYMBOL(sk_chk_filter); /** - * sk_filter_rcu_release: Release a socket filter by rcu_head + * sk_filter_rcu_release - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ static void sk_filter_rcu_release(struct rcu_head *rcu) -- cgit v1.2.3 From 7a1c8e5ab120a5f352e78bbc1fa5bb64e6f23639 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 20 Nov 2010 07:46:35 +0000 Subject: net: allow GFP_HIGHMEM in __vmalloc() We forgot to use __GFP_HIGHMEM in several __vmalloc() calls. In ceph, add the missing flag. In fib_trie.c, xfrm_hash.c and request_sock.c, using vzalloc() is cleaner and allows using HIGHMEM pages as well. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ceph/buffer.c | 2 +- net/core/request_sock.c | 4 +--- net/ipv4/fib_trie.c | 2 +- net/xfrm/xfrm_hash.c | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 53d8abfa25d5..bf3e6a13c215 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -19,7 +19,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (b->vec.iov_base) { b->is_vmalloc = false; } else { - b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); + b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); if (!b->vec.iov_base) { kfree(b); return NULL; diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 7552495aff7a..fceeb37d7161 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -45,9 +45,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); if (lopt_size > PAGE_SIZE) - lopt = __vmalloc(lopt_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + lopt = vzalloc(lopt_size); else lopt = kzalloc(lopt_size, GFP_KERNEL); if (lopt == NULL) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 200eb538fbb3..0f280348e0fd 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -365,7 +365,7 @@ static struct tnode *tnode_alloc(size_t size) if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + return vzalloc(size); } static void __tnode_vfree(struct work_struct *arg) diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c index a2023ec52329..1e98bc0fe0a5 100644 --- a/net/xfrm/xfrm_hash.c +++ b/net/xfrm/xfrm_hash.c @@ -19,7 +19,7 @@ struct hlist_head *xfrm_hash_alloc(unsigned int sz) if (sz <= PAGE_SIZE) n = kzalloc(sz, GFP_KERNEL); else if (hashdist) - n = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + n = vzalloc(sz); else n = (struct hlist_head *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, -- cgit v1.2.3 From 46bcf14f44d8f31ecfdc8b6708ec15a3b33316d9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2010 09:29:43 -0800 Subject: filter: fix sk_filter rcu handling Pavel Emelyanov tried to fix a race between sk_filter_(de|at)tach and sk_clone() in commit 47e958eac280c263397 Problem is we can have several clones sharing a common sk_filter, and these clones might want to sk_filter_attach() their own filters at the same time, and can overwrite old_filter->rcu, corrupting RCU queues. We can not use filter->rcu without being sure no other thread could do the same thing. Switch code to a more conventional ref-counting technique : Do the atomic decrement immediately and queue one rcu call back when last reference is released. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 4 +++- net/core/filter.c | 19 ++++++------------- 2 files changed, 9 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index a6338d039857..659d968d95c5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1155,6 +1155,8 @@ extern void sk_common_release(struct sock *sk); /* Initialise core socket variables */ extern void sock_init_data(struct socket *sock, struct sock *sk); +extern void sk_filter_release_rcu(struct rcu_head *rcu); + /** * sk_filter_release - release a socket filter * @fp: filter to remove @@ -1165,7 +1167,7 @@ extern void sock_init_data(struct socket *sock, struct sock *sk); static inline void sk_filter_release(struct sk_filter *fp) { if (atomic_dec_and_test(&fp->refcnt)) - kfree(fp); + call_rcu_bh(&fp->rcu, sk_filter_release_rcu); } static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) diff --git a/net/core/filter.c b/net/core/filter.c index c1ee800bc080..ae21a0d3c4a2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -589,23 +589,16 @@ int sk_chk_filter(struct sock_filter *filter, int flen) EXPORT_SYMBOL(sk_chk_filter); /** - * sk_filter_rcu_release - Release a socket filter by rcu_head + * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ -static void sk_filter_rcu_release(struct rcu_head *rcu) +void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - sk_filter_release(fp); -} - -static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp) -{ - unsigned int size = sk_filter_len(fp); - - atomic_sub(size, &sk->sk_omem_alloc); - call_rcu_bh(&fp->rcu, sk_filter_rcu_release); + kfree(fp); } +EXPORT_SYMBOL(sk_filter_release_rcu); /** * sk_attach_filter - attach a socket filter @@ -649,7 +642,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) - sk_filter_delayed_uncharge(sk, old_fp); + sk_filter_uncharge(sk, old_fp); return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); @@ -663,7 +656,7 @@ int sk_detach_filter(struct sock *sk) sock_owned_by_user(sk)); if (filter) { rcu_assign_pointer(sk->sk_filter, NULL); - sk_filter_delayed_uncharge(sk, filter); + sk_filter_uncharge(sk, filter); ret = 0; } return ret; -- cgit v1.2.3 From a19faf0250e09b16cac169354126404bc8aa342b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Dec 2010 18:50:32 +0000 Subject: net: fix skb_defer_rx_timestamp() After commit c1f19b51d1d8 (net: support time stamping in phy devices.), kernel might crash if CONFIG_NETWORK_PHY_TIMESTAMPING=y and skb_defer_rx_timestamp() handles a packet without an ethernet header. Fixes kernel bugzilla #24102 Reference: https://bugzilla.kernel.org/show_bug.cgi?id=24102 Reported-and-tested-by: Andrew Watts Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/timestamping.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 0ae6c22da85b..c19bb4ee405e 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -96,11 +96,13 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) struct phy_device *phydev; unsigned int type; - skb_push(skb, ETH_HLEN); + if (skb_headroom(skb) < ETH_HLEN) + return false; + __skb_push(skb, ETH_HLEN); type = classify(skb); - skb_pull(skb, ETH_HLEN); + __skb_pull(skb, ETH_HLEN); switch (type) { case PTP_CLASS_V1_IPV4: -- cgit v1.2.3