From fcbdf09d9652c8919dcf47072e3ae7dcb4eb98ac Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Thu, 16 Dec 2010 14:26:56 -0800 Subject: net: fix nulls list corruptions in sk_prot_alloc Special care is taken inside sk_port_alloc to avoid overwriting skc_node/skc_nulls_node. We should also avoid overwriting skc_bind_node/skc_portaddr_node. The patch fixes the following crash: BUG: unable to handle kernel paging request at fffffffffffffff0 IP: [] udp4_lib_lookup2+0xad/0x370 [] __udp4_lib_lookup+0x282/0x360 [] __udp4_lib_rcv+0x31e/0x700 [] ? ip_local_deliver_finish+0x65/0x190 [] ? ip_local_deliver+0x88/0xa0 [] udp_rcv+0x15/0x20 [] ip_local_deliver_finish+0x65/0x190 [] ip_local_deliver+0x88/0xa0 [] ip_rcv_finish+0x32d/0x6f0 [] ? netif_receive_skb+0x99c/0x11c0 [] ip_rcv+0x2bb/0x350 [] netif_receive_skb+0x99c/0x11c0 Signed-off-by: Leonard Crestez Signed-off-by: Octavian Purdila Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 659d968d95c5..7d3f7ce239b5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -754,6 +754,7 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); + void (*clear_sk)(struct sock *sk, int size); /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS @@ -852,6 +853,8 @@ static inline void __sk_prot_rehash(struct sock *sk) sk->sk_prot->hash(sk); } +void sk_prot_clear_portaddr_nulls(struct sock *sk, int size); + /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) -- cgit v1.2.3 From 4b8fe66300acb2fba8b16d62606e0d30204022fc Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Fri, 17 Dec 2010 12:03:14 -0800 Subject: netlink: fix gcc -Wconversion compilation warning $ cat << EOF | gcc -Wconversion -xc -S -o/dev/null - unsigned f(void) {return NLMSG_HDRLEN;} EOF : In function 'f': :3:26: warning: negative integer implicitly converted to unsigned type Signed-off-by: Dmitry V. Levin Signed-off-by: Kirill A. Shutemov Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 123566912d73..e2b9e63afa68 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -70,7 +70,7 @@ struct nlmsghdr { Check NLM_F_EXCL */ -#define NLMSG_ALIGNTO 4 +#define NLMSG_ALIGNTO 4U #define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) #define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) #define NLMSG_LENGTH(len) ((len)+NLMSG_ALIGN(NLMSG_HDRLEN)) -- cgit v1.2.3 From ad0081e43af6de3fecf308b0d098f9611835766b Mon Sep 17 00:00:00 2001 From: David Stevens Date: Fri, 17 Dec 2010 11:42:42 +0000 Subject: ipv6: Fragment locally generated tunnel-mode IPSec6 packets as needed. This patch modifies IPsec6 to fragment IPv6 packets that are locally generated as needed. This version of the patch only fragments in tunnel mode, so that fragment headers will not be obscured by ESP in transport mode. Signed-off-by: David L Stevens Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/ip6_route.h | 10 ++++++++++ net/ipv6/ip6_output.c | 12 ++---------- net/ipv6/xfrm6_output.c | 16 +++++++++++++++- 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 278312c95f96..2ab926860cd8 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -164,5 +164,15 @@ static inline int ipv6_unicast_destination(struct sk_buff *skb) return rt->rt6i_flags & RTF_LOCAL; } +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); + +static inline int ip6_skb_dst_mtu(struct sk_buff *skb) +{ + struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + + return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ? + skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); +} + #endif #endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 99157b4cd56e..94b5bf132b2e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -56,7 +56,7 @@ #include #include -static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); int __ip6_local_out(struct sk_buff *skb) { @@ -145,14 +145,6 @@ static int ip6_finish_output2(struct sk_buff *skb) return -EINVAL; } -static inline int ip6_skb_dst_mtu(struct sk_buff *skb) -{ - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; - - return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ? - skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); -} - static int ip6_finish_output(struct sk_buff *skb) { if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || @@ -601,7 +593,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } -static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 6434bd5ce088..8e688b3de9ab 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -17,6 +17,7 @@ #include #include #include +#include #include int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, @@ -88,8 +89,21 @@ static int xfrm6_output_finish(struct sk_buff *skb) return xfrm_output(skb); } +static int __xfrm6_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + if ((x && x->props.mode == XFRM_MODE_TUNNEL) && + ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || + dst_allfrag(skb_dst(skb)))) { + return ip6_fragment(skb, xfrm6_output_finish); + } + return xfrm6_output_finish(skb); +} + int xfrm6_output(struct sk_buff *skb) { return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, - skb_dst(skb)->dev, xfrm6_output_finish); + skb_dst(skb)->dev, __xfrm6_output); } -- cgit v1.2.3 From 173021072e86a0a5b3d2271347493a3e0d5f68e8 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 20 Dec 2010 04:35:30 +0000 Subject: net_sched: always clone skbs Pawel reported a panic related to handling shared skbs in ixgbe incorrectly. So we need to revert my previous patch to work around this bug. Instead of reverting the patch completely, I just revert the essential lines, so we can add the previous optimization back more easily in future. commit 3511c9132f8b1e1b5634e41a3331c44b0c13be70 Author: Changli Gao Date: Sat Oct 16 13:04:08 2010 +0000 net_sched: remove the unused parameter of qdisc_create_dflt() Reported-by: Pawel Staszewski Signed-off-by: Changli Gao Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ea1f8a83160d..79f34e2b752f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -610,11 +610,7 @@ static inline struct sk_buff *skb_act_clone(struct sk_buff *skb, gfp_t gfp_mask, { struct sk_buff *n; - if ((action == TC_ACT_STOLEN || action == TC_ACT_QUEUED) && - !skb_shared(skb)) - n = skb_get(skb); - else - n = skb_clone(skb, gfp_mask); + n = skb_clone(skb, gfp_mask); if (n) { n->tc_verd = SET_TC_VERD(n->tc_verd, 0); -- cgit v1.2.3 From 9f333281a7da4c3a59bccc0cb53f7590eb850d93 Mon Sep 17 00:00:00 2001 From: Johannes Stezenbach Date: Tue, 30 Nov 2010 16:49:23 +0100 Subject: mac80211/rt2x00: add ieee80211_tx_status_ni() All rt2x00 drivers except rt2800pci call ieee80211_tx_status() from a workqueue, which causes "NOHZ: local_softirq_pending 08" messages. To fix it, add ieee80211_tx_status_ni() similar to ieee80211_rx_ni() which can be called from process context, and call it from rt2x00lib_txdone(). For the rt2800pci special case a driver flag is introduced. https://bugzilla.kernel.org/show_bug.cgi?id=24892 Signed-off-by: Johannes Stezenbach Signed-off-by: John W. Linville --- drivers/net/wireless/rt2x00/rt2800pci.c | 1 + drivers/net/wireless/rt2x00/rt2x00.h | 1 + drivers/net/wireless/rt2x00/rt2x00dev.c | 9 ++++++--- include/net/mac80211.h | 28 ++++++++++++++++++++++++---- 4 files changed, 32 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/rt2x00/rt2800pci.c b/drivers/net/wireless/rt2x00/rt2800pci.c index b26739535986..09a67905c230 100644 --- a/drivers/net/wireless/rt2x00/rt2800pci.c +++ b/drivers/net/wireless/rt2x00/rt2800pci.c @@ -912,6 +912,7 @@ static int rt2800pci_probe_hw(struct rt2x00_dev *rt2x00dev) __set_bit(DRIVER_REQUIRE_DMA, &rt2x00dev->flags); __set_bit(DRIVER_REQUIRE_L2PAD, &rt2x00dev->flags); __set_bit(DRIVER_REQUIRE_TXSTATUS_FIFO, &rt2x00dev->flags); + __set_bit(DRIVER_REQUIRE_TASKLET_CONTEXT, &rt2x00dev->flags); if (!modparam_nohwcrypt) __set_bit(CONFIG_SUPPORT_HW_CRYPTO, &rt2x00dev->flags); __set_bit(DRIVER_SUPPORT_LINK_TUNING, &rt2x00dev->flags); diff --git a/drivers/net/wireless/rt2x00/rt2x00.h b/drivers/net/wireless/rt2x00/rt2x00.h index 94fe589acfaa..ab43e7ca2a23 100644 --- a/drivers/net/wireless/rt2x00/rt2x00.h +++ b/drivers/net/wireless/rt2x00/rt2x00.h @@ -664,6 +664,7 @@ enum rt2x00_flags { DRIVER_REQUIRE_COPY_IV, DRIVER_REQUIRE_L2PAD, DRIVER_REQUIRE_TXSTATUS_FIFO, + DRIVER_REQUIRE_TASKLET_CONTEXT, /* * Driver features diff --git a/drivers/net/wireless/rt2x00/rt2x00dev.c b/drivers/net/wireless/rt2x00/rt2x00dev.c index 5ba79b935f09..d019830ca840 100644 --- a/drivers/net/wireless/rt2x00/rt2x00dev.c +++ b/drivers/net/wireless/rt2x00/rt2x00dev.c @@ -390,9 +390,12 @@ void rt2x00lib_txdone(struct queue_entry *entry, * through a mac80211 library call (RTS/CTS) then we should not * send the status report back. */ - if (!(skbdesc_flags & SKBDESC_NOT_MAC80211)) - ieee80211_tx_status(rt2x00dev->hw, entry->skb); - else + if (!(skbdesc_flags & SKBDESC_NOT_MAC80211)) { + if (test_bit(DRIVER_REQUIRE_TASKLET_CONTEXT, &rt2x00dev->flags)) + ieee80211_tx_status(rt2x00dev->hw, entry->skb); + else + ieee80211_tx_status_ni(rt2x00dev->hw, entry->skb); + } else dev_kfree_skb_any(entry->skb); /* diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 9fdf982d1286..365359b24177 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2024,8 +2024,8 @@ static inline void ieee80211_rx_ni(struct ieee80211_hw *hw, * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls - * to this function and ieee80211_tx_status_irqsafe() may not be mixed - * for a single hardware. + * to this function, ieee80211_tx_status_ni() and ieee80211_tx_status_irqsafe() + * may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call @@ -2033,14 +2033,34 @@ static inline void ieee80211_rx_ni(struct ieee80211_hw *hw, void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb); +/** + * ieee80211_tx_status_ni - transmit status callback (in process context) + * + * Like ieee80211_tx_status() but can be called in process context. + * + * Calls to this function, ieee80211_tx_status() and + * ieee80211_tx_status_irqsafe() may not be mixed + * for a single hardware. + * + * @hw: the hardware the frame was transmitted by + * @skb: the frame that was transmitted, owned by mac80211 after this call + */ +static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw, + struct sk_buff *skb) +{ + local_bh_disable(); + ieee80211_tx_status(hw, skb); + local_bh_enable(); +} + /** * ieee80211_tx_status_irqsafe - IRQ-safe transmit status callback * * Like ieee80211_tx_status() but can be called in IRQ context * (internally defers to a tasklet.) * - * Calls to this function and ieee80211_tx_status() may not be mixed for a - * single hardware. + * Calls to this function, ieee80211_tx_status() and + * ieee80211_tx_status_ni() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call -- cgit v1.2.3 From da521b2c4f046383bc8941604174bc0e8bffb430 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 21 Dec 2010 12:43:16 -0800 Subject: net: Fix range checks in tcf_valid_offset(). This function has three bugs: 1) The offset should be valid most of the time, this is just a sanity check, therefore we should use "likely" not "unlikely" 2) This is the only place where we can check for arithmetic overflow of the pointer plus the length. 3) The existing range checks are off by one, the valid range is skb->head to skb_tail_pointer(), inclusive. Based almost entirely upon a patch by Ralph Loader. Reported-by: Ralph Loader Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index dd3031aed9d5..9fcc680ab6b9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -323,7 +323,9 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) static inline int tcf_valid_offset(const struct sk_buff *skb, const unsigned char *ptr, const int len) { - return unlikely((ptr + len) < skb_tail_pointer(skb) && ptr > skb->head); + return likely((ptr + len) <= skb_tail_pointer(skb) && + ptr >= skb->head && + (ptr <= (ptr + len))); } #ifdef CONFIG_NET_CLS_IND -- cgit v1.2.3 From 4f32e9b1f812fd6c00cc85a127583fefbdedaedc Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Wed, 22 Dec 2010 10:27:53 +0100 Subject: kthread_work: make lockdep happy spinlock in kthread_worker and wait_queue_head in kthread_work both should be lockdep sensible, so change the interface to make it suiltable for CONFIG_LOCKDEP. tj: comment update Reported-by: Nicolas Signed-off-by: Yong Zhang Signed-off-by: Andy Walls Tested-by: Andy Walls Cc: Tejun Heo Cc: Andrew Morton Signed-off-by: Tejun Heo --- include/linux/kthread.h | 45 +++++++++++++++++++++++++++++++++++---------- kernel/kthread.c | 11 +++++++++++ 2 files changed, 46 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 685ea65eb803..ce0775aa64c3 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -81,16 +81,41 @@ struct kthread_work { #define DEFINE_KTHREAD_WORK(work, fn) \ struct kthread_work work = KTHREAD_WORK_INIT(work, fn) -static inline void init_kthread_worker(struct kthread_worker *worker) -{ - *worker = (struct kthread_worker)KTHREAD_WORKER_INIT(*worker); -} - -static inline void init_kthread_work(struct kthread_work *work, - kthread_work_func_t fn) -{ - *work = (struct kthread_work)KTHREAD_WORK_INIT(*work, fn); -} +/* + * kthread_worker.lock and kthread_work.done need their own lockdep class + * keys if they are defined on stack with lockdep enabled. Use the + * following macros when defining them on stack. + */ +#ifdef CONFIG_LOCKDEP +# define KTHREAD_WORKER_INIT_ONSTACK(worker) \ + ({ init_kthread_worker(&worker); worker; }) +# define DEFINE_KTHREAD_WORKER_ONSTACK(worker) \ + struct kthread_worker worker = KTHREAD_WORKER_INIT_ONSTACK(worker) +# define KTHREAD_WORK_INIT_ONSTACK(work, fn) \ + ({ init_kthread_work((&work), fn); work; }) +# define DEFINE_KTHREAD_WORK_ONSTACK(work, fn) \ + struct kthread_work work = KTHREAD_WORK_INIT_ONSTACK(work, fn) +#else +# define DEFINE_KTHREAD_WORKER_ONSTACK(worker) DEFINE_KTHREAD_WORKER(worker) +# define DEFINE_KTHREAD_WORK_ONSTACK(work, fn) DEFINE_KTHREAD_WORK(work, fn) +#endif + +extern void __init_kthread_worker(struct kthread_worker *worker, + const char *name, struct lock_class_key *key); + +#define init_kthread_worker(worker) \ + do { \ + static struct lock_class_key __key; \ + __init_kthread_worker((worker), "("#worker")->lock", &__key); \ + } while (0) + +#define init_kthread_work(work, fn) \ + do { \ + memset((work), 0, sizeof(struct kthread_work)); \ + INIT_LIST_HEAD(&(work)->node); \ + (work)->func = (fn); \ + init_waitqueue_head(&(work)->done); \ + } while (0) int kthread_worker_fn(void *worker_ptr); diff --git a/kernel/kthread.c b/kernel/kthread.c index 2dc3786349d1..ca61bbdd44b2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -265,6 +265,17 @@ int kthreadd(void *unused) return 0; } +void __init_kthread_worker(struct kthread_worker *worker, + const char *name, + struct lock_class_key *key) +{ + spin_lock_init(&worker->lock); + lockdep_set_class_and_name(&worker->lock, key, name); + INIT_LIST_HEAD(&worker->work_list); + worker->task = NULL; +} +EXPORT_SYMBOL_GPL(__init_kthread_worker); + /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker -- cgit v1.2.3 From 4e06fd14d5fa78826397c891654a37e5a36ee827 Mon Sep 17 00:00:00 2001 From: Will Newton Date: Tue, 21 Dec 2010 17:24:29 -0800 Subject: include/linux/unaligned: pack the whole struct rather than just the field The current packed struct implementation of unaligned access adds the packed attribute only to the field within the unaligned struct rather than to the struct as a whole. This is not sufficient to enforce proper behaviour on architectures with a default struct alignment of more than one byte. For example, the current implementation of __get_unaligned_cpu16 when compiled for arm with gcc -O1 -mstructure-size-boundary=32 assumes the struct is on a 4 byte boundary so performs the load of the 16bit packed field as if it were on a 4 byte boundary: __get_unaligned_cpu16: ldrh r0, [r0, #0] bx lr Moving the packed attribute to the struct rather than the field causes the proper unaligned access code to be generated: __get_unaligned_cpu16: ldrb r3, [r0, #0] @ zero_extendqisi2 ldrb r0, [r0, #1] @ zero_extendqisi2 orr r0, r3, r0, asl #8 bx lr Signed-off-by: Will Newton Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/unaligned/packed_struct.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/unaligned/packed_struct.h b/include/linux/unaligned/packed_struct.h index 2498bb9fe002..c9a6abd972a1 100644 --- a/include/linux/unaligned/packed_struct.h +++ b/include/linux/unaligned/packed_struct.h @@ -3,9 +3,9 @@ #include -struct __una_u16 { u16 x __attribute__((packed)); }; -struct __una_u32 { u32 x __attribute__((packed)); }; -struct __una_u64 { u64 x __attribute__((packed)); }; +struct __una_u16 { u16 x; } __attribute__((packed)); +struct __una_u32 { u32 x; } __attribute__((packed)); +struct __una_u64 { u64 x; } __attribute__((packed)); static inline u16 __get_unaligned_cpu16(const void *p) { -- cgit v1.2.3 From 4be2c95d1f7706ca0e74499f2bd118e1cee19669 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 21 Dec 2010 17:24:30 -0800 Subject: taskstats: pad taskstats netlink response for aligment issues on ia64 The taskstats structure is internally aligned on 8 byte boundaries but the layout of the aggregrate reply, with two NLA headers and the pid (each 4 bytes), actually force the entire structure to be unaligned. This causes the kernel to issue unaligned access warnings on some architectures like ia64. Unfortunately, some software out there doesn't properly unroll the NLA packet and assumes that the start of the taskstats structure will always be 20 bytes from the start of the netlink payload. Aligning the start of the taskstats structure breaks this software, which we don't want. So, for now the alignment only happens on architectures that require it and those users will have to update to fixed versions of those packages. Space is reserved in the packet only when needed. This ifdef should be removed in several years e.g. 2012 once we can be confident that fixed versions are installed on most systems. We add the padding before the aggregate since the aggregate is already a defined type. Commit 85893120 ("delayacct: align to 8 byte boundary on 64-bit systems") previously addressed the alignment issues by padding out the pid field. This was supposed to be a compatible change but the circumstances described above mean that it wasn't. This patch backs out that change, since it was a hack, and introduces a new NULL attribute type to provide the padding. Padding the response with 4 bytes avoids allocating an aligned taskstats structure and copying it back. Since the structure weighs in at 328 bytes, it's too big to do it on the stack. Signed-off-by: Jeff Mahoney Reported-by: Brian Rogers Cc: Jeff Mahoney Cc: Guillaume Chazarain Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/getdelays.c | 1 + include/linux/taskstats.h | 3 +- kernel/taskstats.c | 57 ++++++++++++++++++++++++++++-------- 3 files changed, 47 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index a2976a6de033..e9c77788a39d 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -516,6 +516,7 @@ int main(int argc, char *argv[]) default: fprintf(stderr, "Unknown nla_type %d\n", na->nla_type); + case TASKSTATS_TYPE_NULL: break; } na = (struct nlattr *) (GENLMSG_DATA(&msg) + len); diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h index 341dddb55090..2466e550a41d 100644 --- a/include/linux/taskstats.h +++ b/include/linux/taskstats.h @@ -33,7 +33,7 @@ */ -#define TASKSTATS_VERSION 7 +#define TASKSTATS_VERSION 8 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -188,6 +188,7 @@ enum { TASKSTATS_TYPE_STATS, /* taskstats structure */ TASKSTATS_TYPE_AGGR_PID, /* contains pid + stats */ TASKSTATS_TYPE_AGGR_TGID, /* contains tgid + stats */ + TASKSTATS_TYPE_NULL, /* contains nothing */ __TASKSTATS_TYPE_MAX, }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index c8231fb15708..3308fd7f1b52 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -349,25 +349,47 @@ static int parse(struct nlattr *na, struct cpumask *mask) return ret; } +#ifdef CONFIG_IA64 +#define TASKSTATS_NEEDS_PADDING 1 +#endif + static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) { struct nlattr *na, *ret; int aggr; - /* If we don't pad, we end up with alignment on a 4 byte boundary. - * This causes lots of runtime warnings on systems requiring 8 byte - * alignment */ - u32 pids[2] = { pid, 0 }; - int pid_size = ALIGN(sizeof(pid), sizeof(long)); - aggr = (type == TASKSTATS_TYPE_PID) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; + /* + * The taskstats structure is internally aligned on 8 byte + * boundaries but the layout of the aggregrate reply, with + * two NLA headers and the pid (each 4 bytes), actually + * force the entire structure to be unaligned. This causes + * the kernel to issue unaligned access warnings on some + * architectures like ia64. Unfortunately, some software out there + * doesn't properly unroll the NLA packet and assumes that the start + * of the taskstats structure will always be 20 bytes from the start + * of the netlink payload. Aligning the start of the taskstats + * structure breaks this software, which we don't want. So, for now + * the alignment only happens on architectures that require it + * and those users will have to update to fixed versions of those + * packages. Space is reserved in the packet only when needed. + * This ifdef should be removed in several years e.g. 2012 once + * we can be confident that fixed versions are installed on most + * systems. We add the padding before the aggregate since the + * aggregate is already a defined type. + */ +#ifdef TASKSTATS_NEEDS_PADDING + if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) + goto err; +#endif na = nla_nest_start(skb, aggr); if (!na) goto err; - if (nla_put(skb, type, pid_size, pids) < 0) + + if (nla_put(skb, type, sizeof(pid), &pid) < 0) goto err; ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); if (!ret) @@ -456,6 +478,18 @@ out: return rc; } +static size_t taskstats_packet_size(void) +{ + size_t size; + + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); +#ifdef TASKSTATS_NEEDS_PADDING + size += nla_total_size(0); /* Padding for alignment */ +#endif + return size; +} + static int cmd_attr_pid(struct genl_info *info) { struct taskstats *stats; @@ -464,8 +498,7 @@ static int cmd_attr_pid(struct genl_info *info) u32 pid; int rc; - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) @@ -494,8 +527,7 @@ static int cmd_attr_tgid(struct genl_info *info) u32 tgid; int rc; - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) @@ -570,8 +602,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) /* * Size includes space for nested attributes */ - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); is_thread_group = !!taskstats_tgid_alloc(tsk); if (is_thread_group) { -- cgit v1.2.3 From e058464990c2ef1f3ecd6b83a154913c3c06f02a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 23 Dec 2010 12:03:57 -0800 Subject: Revert "ipv4: Allow configuring subnets as local addresses" This reverts commit 4465b469008bc03b98a1b8df4e9ae501b6c69d4b. Conflicts: net/ipv4/fib_frontend.c As reported by Ben Greear, this causes regressions: > Change 4465b469008bc03b98a1b8df4e9ae501b6c69d4b caused rules > to stop matching the input device properly because the > FLOWI_FLAG_MATCH_ANY_IIF is always defined in ip_dev_find(). > > This breaks rules such as: > > ip rule add pref 512 lookup local > ip rule del pref 0 lookup local > ip link set eth2 up > ip -4 addr add 172.16.0.102/24 broadcast 172.16.0.255 dev eth2 > ip rule add to 172.16.0.102 iif eth2 lookup local pref 10 > ip rule add iif eth2 lookup 10001 pref 20 > ip route add 172.16.0.0/24 dev eth2 table 10001 > ip route add unreachable 0/0 table 10001 > > If you had a second interface 'eth0' that was on a different > subnet, pinging a system on that interface would fail: > > [root@ct503-60 ~]# ping 192.168.100.1 > connect: Invalid argument Reported-by: Ben Greear Signed-off-by: David S. Miller --- include/net/flow.h | 1 - net/core/fib_rules.c | 3 +-- net/ipv4/fib_frontend.c | 10 ++++++++-- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index 0ac3fb5e0973..bb08692a20b0 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -49,7 +49,6 @@ struct flowi { __u8 proto; __u8 flags; #define FLOWI_FLAG_ANYSRC 0x01 -#define FLOWI_FLAG_MATCH_ANY_IIF 0x02 union { struct { __be16 sport; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 82a4369ae150..a20e5d3bbfa0 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -181,8 +181,7 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, { int ret = 0; - if (rule->iifindex && (rule->iifindex != fl->iif) && - !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF)) + if (rule->iifindex && (rule->iifindex != fl->iif)) goto out; if (rule->oifindex && (rule->oifindex != fl->oif)) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index eb6f69a8f27a..c19c1f739fba 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -163,13 +163,19 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) .daddr = addr } }, - .flags = FLOWI_FLAG_MATCH_ANY_IIF }; struct fib_result res = { 0 }; struct net_device *dev = NULL; + struct fib_table *local_table; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif rcu_read_lock(); - if (fib_lookup(net, &fl, &res)) { + local_table = fib_get_table(net, RT_TABLE_LOCAL); + if (!local_table || + fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { rcu_read_unlock(); return NULL; } -- cgit v1.2.3