From e5093aec2e6b60c3df2420057ffab9ed4a6d2792 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Wed, 11 Aug 2010 02:02:10 +0000 Subject: net: Fix a memmove bug in dev_gro_receive() >Xin Xiaohui wrote: > I looked into the code dev_gro_receive(), found the code here: > if the frags[0] is pulled to 0, then the page will be released, > and memmove() frags left. > Is that right? I'm not sure if memmove do right or not, but > frags[0].size is never set after memove at least. what I think > a simple way is not to do anything if we found frags[0].size == 0. > The patch is as followed. ... This version of the patch fixes the bug directly in memmove. Reported-by: "Xin, Xiaohui" Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1ae654391442..3721fbb9a83c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3143,7 +3143,7 @@ pull: put_page(skb_shinfo(skb)->frags[0].page); memmove(skb_shinfo(skb)->frags, skb_shinfo(skb)->frags + 1, - --skb_shinfo(skb)->nr_frags); + --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); } } -- cgit v1.2.3 From 3d3be4333fdf6faa080947b331a6a19bce1a4f57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Sep 2010 00:50:51 +0000 Subject: gro: fix different skb headrooms Packets entering GRO might have different headrooms, even for a given flow (because of implementation details in drivers, like copybreak). We cant force drivers to deliver packets with a fixed headroom. 1) fix skb_segment() skb_segment() makes the false assumption headrooms of fragments are same than the head. When CHECKSUM_PARTIAL is used, this can give csum_start errors, and crash later in skb_copy_and_csum_dev() 2) allocate a minimal skb for head of frag_list skb_gro_receive() uses netdev_alloc_skb(headroom + skb_gro_offset(p)) to allocate a fresh skb. This adds NET_SKB_PAD to a padding already provided by netdevice, depending on various things, like copybreak. Use alloc_skb() to allocate an exact padding, to reduce cache line needs: NET_SKB_PAD + NET_IP_ALIGN bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=16626 Many thanks to Plamen Petrov, testing many debugging patches ! With help of Jarek Poplawski. Reported-by: Plamen Petrov Signed-off-by: Eric Dumazet CC: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/skbuff.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3a2513f0d0c3..26396ff67cf9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2573,6 +2573,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) __copy_skb_header(nskb, skb); nskb->mac_len = skb->mac_len; + /* nskb and skb might have different headroom */ + if (nskb->ip_summed == CHECKSUM_PARTIAL) + nskb->csum_start += skb_headroom(nskb) - headroom; + skb_reset_mac_header(nskb); skb_set_network_header(nskb, skb->mac_len); nskb->transport_header = (nskb->network_header + @@ -2702,8 +2706,8 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } else if (skb_gro_len(p) != pinfo->gso_size) return -E2BIG; - headroom = skb_headroom(p); - nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); + headroom = NET_SKB_PAD + NET_IP_ALIGN; + nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); if (unlikely(!nskb)) return -ENOMEM; -- cgit v1.2.3 From 0b5d404e349c0236b11466c0a4785520c0be6982 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Thu, 2 Sep 2010 13:22:11 -0700 Subject: pkt_sched: Fix lockdep warning on est_tree_lock in gen_estimator This patch fixes a lockdep warning: [ 516.287584] ========================================================= [ 516.288386] [ INFO: possible irq lock inversion dependency detected ] [ 516.288386] 2.6.35b #7 [ 516.288386] --------------------------------------------------------- [ 516.288386] swapper/0 just changed the state of lock: [ 516.288386] (&qdisc_tx_lock){+.-...}, at: [] est_timer+0x62/0x1b4 [ 516.288386] but this lock took another, SOFTIRQ-unsafe lock in the past: [ 516.288386] (est_tree_lock){+.+...} [ 516.288386] [ 516.288386] and interrupts could create inverse lock ordering between them. ... So, est_tree_lock needs BH protection because it's taken by qdisc_tx_lock, which is used both in BH and process contexts. (Full warning with this patch at netdev, 02 Sep 2010.) Fixes commit: ae638c47dc040b8def16d05dc6acdd527628f231 ("pkt_sched: gen_estimator: add a new lock") Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/gen_estimator.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9fbe7f7429b0..6743146e4d6b 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -232,7 +232,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, est->last_packets = bstats->packets; est->avpps = rate_est->pps<<10; - spin_lock(&est_tree_lock); + spin_lock_bh(&est_tree_lock); if (!elist[idx].timer.function) { INIT_LIST_HEAD(&elist[idx].list); setup_timer(&elist[idx].timer, est_timer, idx); @@ -243,7 +243,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, list_add_rcu(&est->list, &elist[idx].list); gen_add_node(est); - spin_unlock(&est_tree_lock); + spin_unlock_bh(&est_tree_lock); return 0; } @@ -270,7 +270,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, { struct gen_estimator *e; - spin_lock(&est_tree_lock); + spin_lock_bh(&est_tree_lock); while ((e = gen_find_node(bstats, rate_est))) { rb_erase(&e->node, &est_root); @@ -281,7 +281,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, list_del_rcu(&e->list); call_rcu(&e->e_rcu, __gen_kill_estimator); } - spin_unlock(&est_tree_lock); + spin_unlock_bh(&est_tree_lock); } EXPORT_SYMBOL(gen_kill_estimator); @@ -320,9 +320,9 @@ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, ASSERT_RTNL(); - spin_lock(&est_tree_lock); + spin_lock_bh(&est_tree_lock); res = gen_find_node(bstats, rate_est) != NULL; - spin_unlock(&est_tree_lock); + spin_unlock_bh(&est_tree_lock); return res; } -- cgit v1.2.3 From deabc772f39405054a438d711f408d2d94d26d96 Mon Sep 17 00:00:00 2001 From: Helmut Schaa Date: Fri, 3 Sep 2010 02:39:56 +0000 Subject: net: fix tx queue selection for bridged devices implementing select_queue When a net device is implementing the select_queue callback and is part of a bridge, frames coming from the bridge already have a tx queue associated to the socket (introduced in commit a4ee3ce3293dc931fab19beb472a8bde1295aebe, "net: Use sk_tx_queue_mapping for connected sockets"). The call to sk_tx_queue_get will then return the tx queue used by the bridge instead of calling the select_queue callback. In case of mac80211 this broke QoS which is implemented by using the select_queue callback. Furthermore it introduced problems with rt2x00 because frames with the same TID and RA sometimes appeared on different tx queues which the hw cannot handle correctly. Fix this by always calling select_queue first if it is available and only afterwards use the socket tx queue mapping. Signed-off-by: Helmut Schaa Signed-off-by: David S. Miller --- net/core/dev.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3721fbb9a83c..b9b22a3c4c8f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2058,16 +2058,16 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { int queue_index; - struct sock *sk = skb->sk; + const struct net_device_ops *ops = dev->netdev_ops; - queue_index = sk_tx_queue_get(sk); - if (queue_index < 0) { - const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) { + queue_index = ops->ndo_select_queue(dev, skb); + queue_index = dev_cap_txqueue(dev, queue_index); + } else { + struct sock *sk = skb->sk; + queue_index = sk_tx_queue_get(sk); + if (queue_index < 0) { - if (ops->ndo_select_queue) { - queue_index = ops->ndo_select_queue(dev, skb); - queue_index = dev_cap_txqueue(dev, queue_index); - } else { queue_index = 0; if (dev->real_num_tx_queues > 1) queue_index = skb_tx_hash(dev, skb); -- cgit v1.2.3 From 64289c8e6851bca0e589e064c9a5c9fbd6ae5dd4 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Sat, 4 Sep 2010 10:34:29 +0000 Subject: gro: Re-fix different skb headrooms The patch: "gro: fix different skb headrooms" in its part: "2) allocate a minimal skb for head of frag_list" is buggy. The copied skb has p->data set at the ip header at the moment, and skb_gro_offset is the length of ip + tcp headers. So, after the change the length of mac header is skipped. Later skb_set_mac_header() sets it into the NET_SKB_PAD area (if it's long enough) and ip header is misaligned at NET_SKB_PAD + NET_IP_ALIGN offset. There is no reason to assume the original skb was wrongly allocated, so let's copy it as it was. bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=16626 fixes commit: 3d3be4333fdf6faa080947b331a6a19bce1a4f57 Reported-by: Plamen Petrov Signed-off-by: Jarek Poplawski CC: Eric Dumazet Acked-by: Eric Dumazet Tested-by: Plamen Petrov Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 26396ff67cf9..c83b421341c0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2706,7 +2706,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } else if (skb_gro_len(p) != pinfo->gso_size) return -E2BIG; - headroom = NET_SKB_PAD + NET_IP_ALIGN; + headroom = skb_headroom(p); nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); if (unlikely(!nskb)) return -ENOMEM; -- cgit v1.2.3 From ef885afbf8a37689afc1d9d545e2f3e7a8276c17 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Sep 2010 12:24:54 +0000 Subject: net: use rcu_barrier() in rollback_registered_many netdev_wait_allrefs() waits that all references to a device vanishes. It currently uses a _very_ pessimistic 250 ms delay between each probe. Some users reported that no more than 4 devices can be dismantled per second, this is a pretty serious problem for some setups. Most of the time, a refcount is about to be released by an RCU callback, that is still in flight because rollback_registered_many() uses a synchronize_rcu() call instead of rcu_barrier(). Problem is visible if number of online cpus is one, because synchronize_rcu() is then a no op. time to remove 50 ipip tunnels on a UP machine : before patch : real 11.910s after patch : real 1.250s Reported-by: Nicolas Dichtel Reported-by: Octavian Purdila Reported-by: Benjamin LaHaise Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b9b22a3c4c8f..660dd41aaaa6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4845,7 +4845,7 @@ static void rollback_registered_many(struct list_head *head) dev = list_first_entry(head, struct net_device, unreg_list); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - synchronize_net(); + rcu_barrier(); list_for_each_entry(dev, head, unreg_list) dev_put(dev); -- cgit v1.2.3 From f064af1e500a2bf4607706f0f458163bdb2a6ea5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Sep 2010 12:43:39 +0000 Subject: net: fix a lockdep splat We have for each socket : One spinlock (sk_slock.slock) One rwlock (sk_callback_lock) Possible scenarios are : (A) (this is used in net/sunrpc/xprtsock.c) read_lock(&sk->sk_callback_lock) (without blocking BH) spin_lock(&sk->sk_slock.slock); ... read_lock(&sk->sk_callback_lock); ... (B) write_lock_bh(&sk->sk_callback_lock) stuff write_unlock_bh(&sk->sk_callback_lock) (C) spin_lock_bh(&sk->sk_slock) ... write_lock_bh(&sk->sk_callback_lock) stuff write_unlock_bh(&sk->sk_callback_lock) spin_unlock_bh(&sk->sk_slock) This (C) case conflicts with (A) : CPU1 [A] CPU2 [C] read_lock(callback_lock) spin_lock_bh(slock) We have one problematic (C) use case in inet_csk_listen_stop() : local_bh_disable(); bh_lock_sock(child); // spin_lock_bh(&sk->sk_slock) WARN_ON(sock_owned_by_user(child)); ... sock_orphan(child); // write_lock_bh(&sk->sk_callback_lock) lockdep is not happy with this, as reported by Tetsuo Handa It seems only way to deal with this is to use read_lock_bh(callbacklock) everywhere. Thanks to Jarek for pointing a bug in my first attempt and suggesting this solution. Reported-by: Tetsuo Handa Tested-by: Tetsuo Handa Signed-off-by: Eric Dumazet CC: Jarek Poplawski Tested-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 8 ++++---- net/rds/tcp_connect.c | 4 ++-- net/rds/tcp_listen.c | 4 ++-- net/rds/tcp_recv.c | 4 ++-- net/rds/tcp_send.c | 4 ++-- net/sunrpc/xprtsock.c | 28 ++++++++++++++-------------- 6 files changed, 26 insertions(+), 26 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index b05b9b6ddb87..ef30e9d286e7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1351,9 +1351,9 @@ int sock_i_uid(struct sock *sk) { int uid; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); return uid; } EXPORT_SYMBOL(sock_i_uid); @@ -1362,9 +1362,9 @@ unsigned long sock_i_ino(struct sock *sk) { unsigned long ino; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); return ino; } EXPORT_SYMBOL(sock_i_ino); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index c397524c039c..c519939e8da9 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -43,7 +43,7 @@ void rds_tcp_state_change(struct sock *sk) struct rds_connection *conn; struct rds_tcp_connection *tc; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); conn = sk->sk_user_data; if (conn == NULL) { state_change = sk->sk_state_change; @@ -68,7 +68,7 @@ void rds_tcp_state_change(struct sock *sk) break; } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); state_change(sk); } diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 975183fe6950..27844f231d10 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -114,7 +114,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) rdsdebug("listen data ready sk %p\n", sk); - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (ready == NULL) { /* check for teardown race */ ready = sk->sk_data_ready; @@ -131,7 +131,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) queue_work(rds_wq, &rds_tcp_listen_work); out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); ready(sk, bytes); } diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 1aba6878fa5d..e43797404102 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -324,7 +324,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) rdsdebug("data ready sk %p bytes %d\n", sk, bytes); - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); conn = sk->sk_user_data; if (conn == NULL) { /* check for teardown race */ ready = sk->sk_data_ready; @@ -338,7 +338,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) queue_delayed_work(rds_wq, &conn->c_recv_w, 0); out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); ready(sk, bytes); } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index a28b895ff0d1..2f012a07d94d 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -224,7 +224,7 @@ void rds_tcp_write_space(struct sock *sk) struct rds_connection *conn; struct rds_tcp_connection *tc; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); conn = sk->sk_user_data; if (conn == NULL) { write_space = sk->sk_write_space; @@ -244,7 +244,7 @@ void rds_tcp_write_space(struct sock *sk) queue_delayed_work(rds_wq, &conn->c_send_w, 0); out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); /* * write_space is only called when data leaves tcp's send queue if diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index b6309db56226..fe9306bf10cc 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -800,7 +800,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) u32 _xid; __be32 *xp; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); dprintk("RPC: xs_udp_data_ready...\n"); if (!(xprt = xprt_from_sock(sk))) goto out; @@ -852,7 +852,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) dropit: skb_free_datagram(sk, skb); out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) @@ -1229,7 +1229,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes) dprintk("RPC: xs_tcp_data_ready...\n"); - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; if (xprt->shutdown) @@ -1248,7 +1248,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes) read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); } while (read > 0); out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } /* @@ -1301,7 +1301,7 @@ static void xs_tcp_state_change(struct sock *sk) { struct rpc_xprt *xprt; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); @@ -1313,7 +1313,7 @@ static void xs_tcp_state_change(struct sock *sk) switch (sk->sk_state) { case TCP_ESTABLISHED: - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1327,7 +1327,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt_wake_pending_tasks(xprt, -EAGAIN); } - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->transport_lock); break; case TCP_FIN_WAIT1: /* The client initiated a shutdown of the socket */ @@ -1365,7 +1365,7 @@ static void xs_tcp_state_change(struct sock *sk) xs_sock_mark_closed(xprt); } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } /** @@ -1376,7 +1376,7 @@ static void xs_error_report(struct sock *sk) { struct rpc_xprt *xprt; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; dprintk("RPC: %s client %p...\n" @@ -1384,7 +1384,7 @@ static void xs_error_report(struct sock *sk) __func__, xprt, sk->sk_err); xprt_wake_pending_tasks(xprt, -EAGAIN); out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static void xs_write_space(struct sock *sk) @@ -1416,13 +1416,13 @@ static void xs_write_space(struct sock *sk) */ static void xs_udp_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); /* from net/core/sock.c:sock_def_write_space */ if (sock_writeable(sk)) xs_write_space(sk); - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } /** @@ -1437,13 +1437,13 @@ static void xs_udp_write_space(struct sock *sk) */ static void xs_tcp_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); /* from net/core/stream.c:sk_stream_write_space */ if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) xs_write_space(sk); - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) -- cgit v1.2.3 From 01db403cf99f739f86903314a489fb420e0e254f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 27 Sep 2010 20:24:54 -0700 Subject: tcp: Fix >4GB writes on 64-bit. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes kernel bugzilla #16603 tcp_sendmsg() truncates iov_len to an 'int' which a 4GB write to write zero bytes, for example. There is also the problem higher up of how verify_iovec() works. It wants to prevent the total length from looking like an error return value. However it does this using 'int', but syscalls return 'long' (and thus signed 64-bit on 64-bit machines). So it could trigger false-positives on 64-bit as written. So fix it to use 'long'. Reported-by: Olaf Bonorden Reported-by: Daniel Büse Reported-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/socket.h | 2 +- net/core/iovec.c | 5 +++-- net/ipv4/tcp.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/socket.h b/include/linux/socket.h index a2fada9becb6..a8f56e1ec760 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -322,7 +322,7 @@ extern int csum_partial_copy_fromiovecend(unsigned char *kdata, int offset, unsigned int len, __wsum *csump); -extern int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode); +extern long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode); extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len); extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, int offset, int len); diff --git a/net/core/iovec.c b/net/core/iovec.c index 1cd98df412df..e6b133b77ccb 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -35,9 +35,10 @@ * in any case. */ -int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) +long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) { - int size, err, ct; + int size, ct; + long err; if (m->msg_namelen) { if (mode == VERIFY_READ) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 95d75d443927..f115ea68a4ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -943,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sg = sk->sk_route_caps & NETIF_F_SG; while (--iovlen >= 0) { - int seglen = iov->iov_len; + size_t seglen = iov->iov_len; unsigned char __user *from = iov->iov_base; iov++; -- cgit v1.2.3