From 057af70713445fad2459aa348c9c2c4ecf7db938 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 20:04:39 +0200 Subject: net: tipc: have genetlink code to parse the attrs during dumpit Benefit from the fact that the generic netlink code can parse the attrs for dumpit op and avoid need to parse it in the op callback. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/tipc/socket.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3b9f8cc328f5..d579b64705b1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3588,13 +3588,9 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) struct tipc_sock *tsk; if (!tsk_portid) { - struct nlattr **attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_SOCK]) return -EINVAL; -- cgit v1.2.3 From 3ef7cf57c72f32f61e97f8fa401bc39ea1f1a5d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Oct 2019 22:44:50 -0700 Subject: net: use skb_queue_empty_lockless() in poll() handlers Many poll() handlers are lockless. Using skb_queue_empty_lockless() instead of skb_queue_empty() is more appropriate. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/isdn/capi/capi.c | 2 +- net/atm/common.c | 2 +- net/bluetooth/af_bluetooth.c | 4 ++-- net/caif/caif_socket.c | 2 +- net/core/datagram.c | 4 ++-- net/decnet/af_decnet.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/udp.c | 2 +- net/nfc/llcp_sock.c | 4 ++-- net/phonet/socket.c | 4 ++-- net/sctp/socket.c | 4 ++-- net/tipc/socket.c | 4 ++-- net/unix/af_unix.c | 6 +++--- net/vmw_vsock/af_vsock.c | 2 +- 14 files changed, 22 insertions(+), 22 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index c92b405b7646..ba8619524231 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -744,7 +744,7 @@ capi_poll(struct file *file, poll_table *wait) poll_wait(file, &(cdev->recvwait), wait); mask = EPOLLOUT | EPOLLWRNORM; - if (!skb_queue_empty(&cdev->recvqueue)) + if (!skb_queue_empty_lockless(&cdev->recvqueue)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } diff --git a/net/atm/common.c b/net/atm/common.c index b7528e77997c..0ce530af534d 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -668,7 +668,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* writable? */ diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 94ddf19998c7..5f508c50649d 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -460,7 +460,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); @@ -470,7 +470,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == BT_CLOSED) diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 13ea920600ae..ef14da50a981 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -953,7 +953,7 @@ static __poll_t caif_poll(struct file *file, mask |= EPOLLRDHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || + if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) mask |= EPOLLIN | EPOLLRDNORM; diff --git a/net/core/datagram.c b/net/core/datagram.c index c210fc116103..5b685e110aff 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -767,7 +767,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); @@ -777,7 +777,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 0ea75286abf4..3349ea81f901 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1205,7 +1205,7 @@ static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wai struct dn_scp *scp = DN_SK(sk); __poll_t mask = datagram_poll(file, sock, wait); - if (!skb_queue_empty(&scp->other_receive_queue)) + if (!skb_queue_empty_lockless(&scp->other_receive_queue)) mask |= EPOLLRDBAND; return mask; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 42187a3b82f4..ffef502f5292 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -584,7 +584,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2cc259736c2e..345a3d43f5a6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2712,7 +2712,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; - if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) + if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ccdd790e163a..28604414dec1 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -554,11 +554,11 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == LLCP_CLOSED) diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 96ea9f254ae9..76d499f6af9a 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -338,9 +338,9 @@ static __poll_t pn_socket_poll(struct file *file, struct socket *sock, if (sk->sk_state == TCP_CLOSE) return EPOLLERR; - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; - if (!skb_queue_empty(&pn->ctrlreq_queue)) + if (!skb_queue_empty_lockless(&pn->ctrlreq_queue)) mask |= EPOLLPRI; if (!mask && sk->sk_state == TCP_CLOSE_WAIT) return EPOLLHUP; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 5ca0ec0e823c..cfb25391b8b0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8476,7 +8476,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) mask = 0; /* Is there any exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) @@ -8485,7 +8485,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLHUP; /* Is it readable? Reconsider this code with TCP-style support. */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* The association is either gone or not ready. */ diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f8bbc4aab213..4b92b196cfa6 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -740,7 +740,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, /* fall through */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) revents |= EPOLLIN | EPOLLRDNORM; break; case TIPC_OPEN: @@ -748,7 +748,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, revents |= EPOLLOUT; if (!tipc_sk_type_connectionless(sk)) break; - if (skb_queue_empty(&sk->sk_receive_queue)) + if (skb_queue_empty_lockless(&sk->sk_receive_queue)) break; revents |= EPOLLIN | EPOLLRDNORM; break; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 67e87db5877f..0d8da809bea2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2599,7 +2599,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -2628,7 +2628,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); @@ -2638,7 +2638,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 2ab43b2bba31..582a3e4dfce2 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -870,7 +870,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, * the queue and write as long as the socket isn't shutdown for * sending. */ - if (!skb_queue_empty(&sk->sk_receive_queue) || + if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) { mask |= EPOLLIN | EPOLLRDNORM; } -- cgit v1.2.3 From f73b12812a3d1d798b7517547ccdcf864844d2cd Mon Sep 17 00:00:00 2001 From: Hoang Le Date: Tue, 29 Oct 2019 07:51:21 +0700 Subject: tipc: improve throughput between nodes in netns Currently, TIPC transports intra-node user data messages directly socket to socket, hence shortcutting all the lower layers of the communication stack. This gives TIPC very good intra node performance, both regarding throughput and latency. We now introduce a similar mechanism for TIPC data traffic across network namespaces located in the same kernel. On the send path, the call chain is as always accompanied by the sending node's network name space pointer. However, once we have reliably established that the receiving node is represented by a namespace on the same host, we just replace the namespace pointer with the receiving node/namespace's ditto, and follow the regular socket receive patch though the receiving node. This technique gives us a throughput similar to the node internal throughput, several times larger than if we let the traffic go though the full network stacks. As a comparison, max throughput for 64k messages is four times larger than TCP throughput for the same type of traffic. To meet any security concerns, the following should be noted. - All nodes joining a cluster are supposed to have been be certified and authenticated by mechanisms outside TIPC. This is no different for nodes/namespaces on the same host; they have to auto discover each other using the attached interfaces, and establish links which are supervised via the regular link monitoring mechanism. Hence, a kernel local node has no other way to join a cluster than any other node, and have to obey to policies set in the IP or device layers of the stack. - Only when a sender has established with 100% certainty that the peer node is located in a kernel local namespace does it choose to let user data messages, and only those, take the crossover path to the receiving node/namespace. - If the receiving node/namespace is removed, its namespace pointer is invalidated at all peer nodes, and their neighbor link monitoring will eventually note that this node is gone. - To ensure the "100% certainty" criteria, and prevent any possible spoofing, received discovery messages must contain a proof that the sender knows a common secret. We use the hash mix of the sending node/namespace for this purpose, since it can be accessed directly by all other namespaces in the kernel. Upon reception of a discovery message, the receiver checks this proof against all the local namespaces'hash_mix:es. If it finds a match, that, along with a matching node id and cluster id, this is deemed sufficient proof that the peer node in question is in a local namespace, and a wormhole can be opened. - We should also consider that TIPC is intended to be a cluster local IPC mechanism (just like e.g. UNIX sockets) rather than a network protocol, and hence we think it can justified to allow it to shortcut the lower protocol layers. Regarding traceability, we should notice that since commit 6c9081a3915d ("tipc: add loopback device tracking") it is possible to follow the node internal packet flow by just activating tcpdump on the loopback interface. This will be true even for this mechanism; by activating tcpdump on the involved nodes' loopback interfaces their inter-name space messaging can easily be tracked. v2: - update 'net' pointer when node left/rejoined v3: - grab read/write lock when using node ref obj v4: - clone traffics between netns to loopback Suggested-by: Jon Maloy Acked-by: Jon Maloy Signed-off-by: Hoang Le Signed-off-by: David S. Miller --- net/tipc/core.c | 16 ++++++ net/tipc/core.h | 6 ++ net/tipc/discover.c | 4 +- net/tipc/msg.h | 14 +++++ net/tipc/name_distr.c | 2 +- net/tipc/node.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++-- net/tipc/node.h | 5 +- net/tipc/socket.c | 6 +- 8 files changed, 197 insertions(+), 11 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/core.c b/net/tipc/core.c index 23cb379a93d6..ab648dd150ee 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -105,6 +105,15 @@ static void __net_exit tipc_exit_net(struct net *net) tipc_sk_rht_destroy(net); } +static void __net_exit tipc_pernet_pre_exit(struct net *net) +{ + tipc_node_pre_cleanup_net(net); +} + +static struct pernet_operations tipc_pernet_pre_exit_ops = { + .pre_exit = tipc_pernet_pre_exit, +}; + static struct pernet_operations tipc_net_ops = { .init = tipc_init_net, .exit = tipc_exit_net, @@ -151,6 +160,10 @@ static int __init tipc_init(void) if (err) goto out_pernet_topsrv; + err = register_pernet_subsys(&tipc_pernet_pre_exit_ops); + if (err) + goto out_register_pernet_subsys; + err = tipc_bearer_setup(); if (err) goto out_bearer; @@ -158,6 +171,8 @@ static int __init tipc_init(void) pr_info("Started in single node mode\n"); return 0; out_bearer: + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); +out_register_pernet_subsys: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: tipc_socket_stop(); @@ -177,6 +192,7 @@ out_netlink: static void __exit tipc_exit(void) { tipc_bearer_cleanup(); + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); diff --git a/net/tipc/core.h b/net/tipc/core.h index 60d829581068..8776d32a4a47 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -59,6 +59,7 @@ #include #include #include +#include struct tipc_node; struct tipc_bearer; @@ -185,6 +186,11 @@ static inline int in_range(u16 val, u16 min, u16 max) return !less(val, min) && !more(val, max); } +static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand) +{ + return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand; +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index c138d68e8a69..b043e8c6397a 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -94,6 +94,7 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); + msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } @@ -242,7 +243,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, if (!tipc_in_scope(legacy, b->domain, src)) return; tipc_node_check_dest(net, src, peer_id, b, caps, signature, - &maddr, &respond, &dupl_addr); + msg_peer_net_hash(hdr), &maddr, &respond, + &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 0daa6f04ca81..2d7cb66a6912 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1026,6 +1026,20 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } +/* Word 13 + */ +static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 13, n); +} + +static inline u32 msg_peer_net_hash(struct tipc_msg *m) +{ + return msg_word(m, 13); +} + +/* Word 14 + */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 836e629e8f4a..5feaf3b67380 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -146,7 +146,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; diff --git a/net/tipc/node.c b/net/tipc/node.c index f2e3cf70c922..4b60928049ea 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -126,6 +126,8 @@ struct tipc_node { struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; + struct net *peer_net; + u32 peer_hash_mix; }; /* Node FSM states and events: @@ -184,7 +186,7 @@ static struct tipc_link *node_active_link(struct tipc_node *n, int sel) return n->links[bearer_id].link; } -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; @@ -194,6 +196,14 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) if (unlikely(!n)) return mtu; + /* Allow MAX_MSG_SIZE when building connection oriented message + * if they are in the same core network + */ + if (n->peer_net && connected) { + tipc_node_put(n); + return mtu; + } + bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; @@ -360,8 +370,37 @@ static void tipc_node_write_unlock(struct tipc_node *n) } } +static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) +{ + int net_id = tipc_netid(n->net); + struct tipc_net *tn_peer; + struct net *tmp; + u32 hash_chk; + + if (n->peer_net) + return; + + for_each_net_rcu(tmp) { + tn_peer = tipc_net(tmp); + if (!tn_peer) + continue; + /* Integrity checking whether node exists in namespace or not */ + if (tn_peer->net_id != net_id) + continue; + if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) + continue; + hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); + if (hash_mixes ^ hash_chk) + continue; + n->peer_net = tmp; + n->peer_hash_mix = hash_mixes; + break; + } +} + static struct tipc_node *tipc_node_create(struct net *net, u32 addr, - u8 *peer_id, u16 capabilities) + u8 *peer_id, u16 capabilities, + u32 signature, u32 hash_mixes) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; @@ -372,6 +411,8 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr); if (n) { + if (n->peer_hash_mix ^ hash_mixes) + tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ @@ -389,6 +430,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -399,6 +441,10 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, n->addr = addr; memcpy(&n->peer_id, peer_id, 16); n->net = net; + n->peer_net = NULL; + n->peer_hash_mix = 0; + /* Assign kernel local namespace if exists */ + tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); @@ -426,6 +472,10 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, tipc_bc_sndlink(net), &n->bc_entry.link)) { pr_warn("Broadcast rcv link creation failed, no memory\n"); + if (n->peer_net) { + n->peer_net = NULL; + n->peer_hash_mix = 0; + } kfree(n); n = NULL; goto exit; @@ -979,7 +1029,7 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { @@ -998,7 +1048,8 @@ void tipc_node_check_dest(struct net *net, u32 addr, *dupl_addr = false; *respond = false; - n = tipc_node_create(net, addr, peer_id, capabilities); + n = tipc_node_create(net, addr, peer_id, capabilities, signature, + hash_mixes); if (!n) return; @@ -1343,6 +1394,10 @@ static void node_lost_contact(struct tipc_node *n, /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; + if (n->peer_net) { + n->peer_net = NULL; + n->peer_hash_mix = 0; + } /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, @@ -1424,6 +1479,56 @@ msg_full: return -EMSGSIZE; } +static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) +{ + struct tipc_msg *hdr = buf_msg(skb_peek(list)); + struct sk_buff_head inputq; + + switch (msg_user(hdr)) { + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + if (msg_connected(hdr) || msg_named(hdr)) { + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + } + if (msg_mcast(hdr)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + return; + } + return; + case MSG_FRAGMENTER: + if (tipc_msg_assemble(list)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + } + return; + case GROUP_PROTOCOL: + case CONN_MANAGER: + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + case LINK_PROTOCOL: + case NAME_DISTRIBUTOR: + case TUNNEL_PROTOCOL: + case BCAST_PROTOCOL: + return; + default: + return; + }; +} + /** * tipc_node_xmit() is the general link level function for message sending * @net: the applicable net namespace @@ -1439,6 +1544,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; + bool node_up = false; int bearer_id; int rc; @@ -1456,6 +1562,17 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, } tipc_node_read_lock(n); + node_up = node_is_up(n); + if (node_up && n->peer_net && check_net(n->peer_net)) { + /* xmit inner linux container */ + tipc_lxc_xmit(n->peer_net, list); + if (likely(skb_queue_empty(list))) { + tipc_node_read_unlock(n); + tipc_node_put(n); + return 0; + } + } + bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); @@ -2587,3 +2704,33 @@ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) return i; } + +void tipc_node_pre_cleanup_net(struct net *exit_net) +{ + struct tipc_node *n; + struct tipc_net *tn; + struct net *tmp; + + rcu_read_lock(); + for_each_net_rcu(tmp) { + if (tmp == exit_net) + continue; + tn = tipc_net(tmp); + if (!tn) + continue; + spin_lock_bh(&tn->node_list_lock); + list_for_each_entry_rcu(n, &tn->node_list, list) { + if (!n->peer_net) + continue; + if (n->peer_net != exit_net) + continue; + tipc_node_write_lock(n); + n->peer_net = NULL; + n->peer_hash_mix = 0; + tipc_node_write_unlock_fast(n); + break; + } + spin_unlock_bh(&tn->node_list_lock); + } + rcu_read_unlock(); +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 291d0ecd4101..30563c4f35d5 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -75,7 +75,7 @@ u32 tipc_node_get_addr(struct tipc_node *node); u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); @@ -92,7 +92,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr); void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected); bool tipc_node_is_up(struct net *net, u32 addr); u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); @@ -107,4 +107,5 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb); +void tipc_node_pre_cleanup_net(struct net *exit_net); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 35e32ffc2b90..2bcacd6022d5 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -854,7 +854,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, /* Build message as chain of buffers */ __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1388,7 +1388,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return rc; __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1526,7 +1526,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); tipc_set_sk_state(sk, TIPC_ESTABLISHED); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); - tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); + tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); __skb_queue_purge(&sk->sk_write_queue); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) -- cgit v1.2.3 From c0bceb97db9efc72629dd00cd0d9812f24d4ba2d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 30 Oct 2019 14:00:41 +0100 Subject: tipc: add smart nagle feature We introduce a feature that works like a combination of TCP_NAGLE and TCP_CORK, but without some of the weaknesses of those. In particular, we will not observe long delivery delays because of delayed acks, since the algorithm itself decides if and when acks are to be sent from the receiving peer. - The nagle property as such is determined by manipulating a new 'maxnagle' field in struct tipc_sock. If certain conditions are met, 'maxnagle' will define max size of the messages which can be bundled. If it is set to zero no messages are ever bundled, implying that the nagle property is disabled. - A socket with the nagle property enabled enters nagle mode when more than 4 messages have been sent out without receiving any data message from the peer. - A socket leaves nagle mode whenever it receives a data message from the peer. In nagle mode, messages smaller than 'maxnagle' are accumulated in the socket write queue. The last buffer in the queue is marked with a new 'ack_required' bit, which forces the receiving peer to send a CONN_ACK message back to the sender upon reception. The accumulated contents of the write queue is transmitted when one of the following events or conditions occur. - A CONN_ACK message is received from the peer. - A data message is received from the peer. - A SOCK_WAKEUP pseudo message is received from the link level. - The write queue contains more than 64 1k blocks of data. - The connection is being shut down. - There is no CONN_ACK message to expect. I.e., there is currently no outstanding message where the 'ack_required' bit was set. As a consequence, the first message added after we enter nagle mode is always sent directly with this bit set. This new feature gives a 50-100% improvement of throughput for small (i.e., less than MTU size) messages, while it might add up to one RTT to latency time when the socket is in nagle mode. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 1 + net/tipc/msg.c | 53 +++++++++++++++++++++ net/tipc/msg.h | 12 +++++ net/tipc/node.h | 7 ++- net/tipc/socket.c | 117 +++++++++++++++++++++++++++++++++++++++------- 5 files changed, 170 insertions(+), 20 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 7df026ea6aff..76421b878767 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -191,6 +191,7 @@ struct sockaddr_tipc { #define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ #define TIPC_GROUP_LEAVE 136 /* No argument */ #define TIPC_SOCK_RECVQ_USED 137 /* Default: none (read only) */ +#define TIPC_NODELAY 138 /* Default: false */ /* * Flag values diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 922d262e153f..973795a1a968 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -190,6 +190,59 @@ err: return 0; } +/** + * tipc_msg_append(): Append data to tail of an existing buffer queue + * @hdr: header to be used + * @m: the data to be appended + * @mss: max allowable size of buffer + * @dlen: size of data to be appended + * @txq: queue to appand to + * Returns the number og 1k blocks appended or errno value + */ +int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq) +{ + struct sk_buff *skb, *prev; + int accounted, total, curr; + int mlen, cpy, rem = dlen; + struct tipc_msg *hdr; + + skb = skb_peek_tail(txq); + accounted = skb ? msg_blocks(buf_msg(skb)) : 0; + total = accounted; + + while (rem) { + if (!skb || skb->len >= mss) { + prev = skb; + skb = tipc_buf_acquire(mss, GFP_KERNEL); + if (unlikely(!skb)) + return -ENOMEM; + skb_orphan(skb); + skb_trim(skb, MIN_H_SIZE); + hdr = buf_msg(skb); + skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE); + msg_set_hdr_sz(hdr, MIN_H_SIZE); + msg_set_size(hdr, MIN_H_SIZE); + __skb_queue_tail(txq, skb); + total += 1; + if (prev) + msg_set_ack_required(buf_msg(prev), 0); + msg_set_ack_required(hdr, 1); + } + hdr = buf_msg(skb); + curr = msg_blocks(hdr); + mlen = msg_size(hdr); + cpy = min_t(int, rem, mss - mlen); + if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter)) + return -EFAULT; + msg_set_size(hdr, mlen + cpy); + skb_put(skb, cpy); + rem -= cpy; + total += msg_blocks(hdr) - curr; + } + return total - accounted; +} + /* tipc_msg_validate - validate basic format of received message * * This routine ensures a TIPC message has an acceptable header, and at least diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 2d7cb66a6912..0435dda4b90c 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -290,6 +290,16 @@ static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) msg_set_bits(m, 0, 18, 1, d); } +static inline int msg_ack_required(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_ack_required(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 18, 1, d); +} + static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); @@ -1079,6 +1089,8 @@ int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); +int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); diff --git a/net/tipc/node.h b/net/tipc/node.h index 30563c4f35d5..c39cd861c07d 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -54,7 +54,8 @@ enum { TIPC_LINK_PROTO_SEQNO = (1 << 6), TIPC_MCAST_RBCTL = (1 << 7), TIPC_GAP_ACK_BLOCK = (1 << 8), - TIPC_TUNNEL_ENHANCED = (1 << 9) + TIPC_TUNNEL_ENHANCED = (1 << 9), + TIPC_NAGLE = (1 << 10) }; #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ @@ -66,7 +67,9 @@ enum { TIPC_LINK_PROTO_SEQNO | \ TIPC_MCAST_RBCTL | \ TIPC_GAP_ACK_BLOCK | \ - TIPC_TUNNEL_ENHANCED) + TIPC_TUNNEL_ENHANCED | \ + TIPC_NAGLE) + #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 2bcacd6022d5..3e99a122e321 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -75,6 +75,7 @@ struct sockaddr_pair { * @conn_instance: TIPC instance used when connection was established * @published: non-zero if port has one or more associated names * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages * #cong_links: list of congested links @@ -97,6 +98,7 @@ struct tipc_sock { u32 conn_instance; int published; u32 max_pkt; + u32 maxnagle; u32 portid; struct tipc_msg phdr; struct list_head cong_links; @@ -116,6 +118,10 @@ struct tipc_sock { struct tipc_mc_method mc_method; struct rcu_head rcu; struct tipc_group *group; + u32 oneway; + u16 snd_backlog; + bool expect_ack; + bool nodelay; bool group_is_open; }; @@ -137,6 +143,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); +static void tipc_sk_push_backlog(struct tipc_sock *tsk); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -227,6 +234,26 @@ static u16 tsk_inc(struct tipc_sock *tsk, int msglen) return 1; } +/* tsk_set_nagle - enable/disable nagle property by manipulating maxnagle + */ +static void tsk_set_nagle(struct tipc_sock *tsk) +{ + struct sock *sk = &tsk->sk; + + tsk->maxnagle = 0; + if (sk->sk_type != SOCK_STREAM) + return; + if (tsk->nodelay) + return; + if (!(tsk->peer_caps & TIPC_NAGLE)) + return; + /* Limit node local buffer size to avoid receive queue overflow */ + if (tsk->max_pkt == MAX_MSG_SIZE) + tsk->maxnagle = 1500; + else + tsk->maxnagle = tsk->max_pkt; +} + /** * tsk_advance_rx_queue - discard first buffer in socket receive queue * @@ -446,6 +473,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; + tsk->maxnagle = 0; INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; @@ -512,8 +540,12 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); - /* Remove any pending SYN message */ - __skb_queue_purge(&sk->sk_write_queue); + /* Push out unsent messages or remove if pending SYN */ + skb = skb_peek(&sk->sk_write_queue); + if (skb && !msg_is_syn(buf_msg(skb))) + tipc_sk_push_backlog(tsk); + else + __skb_queue_purge(&sk->sk_write_queue); /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). @@ -1208,6 +1240,27 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, tipc_sk_rcv(net, inputq); } +/* tipc_sk_push_backlog(): send accumulated buffers in socket write queue + * when socket is in Nagle mode + */ +static void tipc_sk_push_backlog(struct tipc_sock *tsk) +{ + struct sk_buff_head *txq = &tsk->sk.sk_write_queue; + struct net *net = sock_net(&tsk->sk); + u32 dnode = tsk_peer_node(tsk); + int rc; + + if (skb_queue_empty(txq) || tsk->cong_link_cnt) + return; + + tsk->snt_unacked += tsk->snd_backlog; + tsk->snd_backlog = 0; + tsk->expect_ack = true; + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); + if (rc == -ELINKCONG) + tsk->cong_link_cnt = 1; +} + /** * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket @@ -1221,7 +1274,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, u32 onode = tsk_own_node(tsk); struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); - bool conn_cong; + bool was_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, hdr)) { @@ -1254,11 +1307,13 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, __skb_queue_tail(xmitq, skb); return; } else if (mtyp == CONN_ACK) { - conn_cong = tsk_conn_cong(tsk); + was_cong = tsk_conn_cong(tsk); + tsk->expect_ack = false; + tipc_sk_push_backlog(tsk); tsk->snt_unacked -= msg_conn_ack(hdr); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) tsk->snd_win = msg_adv_win(hdr); - if (conn_cong) + if (was_cong && !tsk_conn_cong(tsk)) sk->sk_write_space(sk); } else if (mtyp != CONN_PROBE_REPLY) { pr_warn("Received unknown CONN_PROTO msg\n"); @@ -1437,15 +1492,15 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + struct sk_buff_head *txq = &sk->sk_write_queue; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - struct sk_buff_head pkts; u32 dnode = tsk_peer_node(tsk); + int maxnagle = tsk->maxnagle; + int maxpkt = tsk->max_pkt; int send, sent = 0; - int rc = 0; - - __skb_queue_head_init(&pkts); + int blocks, rc = 0; if (unlikely(dlen > INT_MAX)) return -EMSGSIZE; @@ -1467,21 +1522,35 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) tipc_sk_connected(sk))); if (unlikely(rc)) break; - send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts); - if (unlikely(rc != send)) - break; - - trace_tipc_sk_sendstream(sk, skb_peek(&pkts), + blocks = tsk->snd_backlog; + if (tsk->oneway++ >= 4 && send <= maxnagle) { + rc = tipc_msg_append(hdr, m, send, maxnagle, txq); + if (unlikely(rc < 0)) + break; + blocks += rc; + if (blocks <= 64 && tsk->expect_ack) { + tsk->snd_backlog = blocks; + sent += send; + break; + } + tsk->expect_ack = true; + } else { + rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq); + if (unlikely(rc != send)) + break; + blocks += tsk_inc(tsk, send + MIN_H_SIZE); + } + trace_tipc_sk_sendstream(sk, skb_peek(txq), TIPC_DUMP_SK_SNDQ, " "); - rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tsk->cong_link_cnt = 1; rc = 0; } if (likely(!rc)) { - tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE); + tsk->snt_unacked += blocks; + tsk->snd_backlog = 0; sent += send; } } while (sent < dlen && !rc); @@ -1528,6 +1597,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + tsk_set_nagle(tsk); __skb_queue_purge(&sk->sk_write_queue); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) return; @@ -1848,6 +1918,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, bool peek = flags & MSG_PEEK; int offset, required, copy, copied = 0; int hlen, dlen, err, rc; + bool ack = false; long timeout; /* Catch invalid receive attempts */ @@ -1892,6 +1963,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Copy data if msg ok, otherwise return error/partial data */ if (likely(!err)) { + ack = msg_ack_required(hdr); offset = skb_cb->bytes_read; copy = min_t(int, dlen - offset, buflen - copied); rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); @@ -1919,7 +1991,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); - if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)) + if (ack || tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); /* Exit if all requested data or FIN/error received */ @@ -1990,6 +2062,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, smp_wmb(); tsk->cong_link_cnt--; wakeup = true; + tipc_sk_push_backlog(tsk); break; case GROUP_PROTOCOL: tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); @@ -2029,6 +2102,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) if (unlikely(msg_mcast(hdr))) return false; + tsk->oneway = 0; switch (sk->sk_state) { case TIPC_CONNECTING: @@ -2074,6 +2148,8 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) return true; return false; case TIPC_ESTABLISHED: + if (!skb_queue_empty(&sk->sk_write_queue)) + tipc_sk_push_backlog(tsk); /* Accept only connection-based messages sent by peer */ if (likely(con_msg && !err && pport == oport && pnode == onode)) return true; @@ -2959,6 +3035,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_SRC_DROPPABLE: case TIPC_DEST_DROPPABLE: case TIPC_CONN_TIMEOUT: + case TIPC_NODELAY: if (ol < sizeof(value)) return -EINVAL; if (get_user(value, (u32 __user *)ov)) @@ -3007,6 +3084,10 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_GROUP_LEAVE: res = tipc_sk_leave(tsk); break; + case TIPC_NODELAY: + tsk->nodelay = !!value; + tsk_set_nagle(tsk); + break; default: res = -EINVAL; } -- cgit v1.2.3 From ab818362c9054beb950b97a09ce7b0d56f5a32a1 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 22 Nov 2019 08:15:19 +0000 Subject: net: use rhashtable_lookup() instead of rhashtable_lookup_fast() rhashtable_lookup_fast() internally calls rcu_read_lock() then, calls rhashtable_lookup(). So if rcu_read_lock() is already held, rhashtable_lookup() is enough. Signed-off-by: Taehee Yoo Signed-off-by: Jakub Kicinski --- drivers/infiniband/hw/hfi1/sdma.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 4 ++-- net/tipc/socket.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index c61b6022575e..5774dfc22e18 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -881,8 +881,8 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, cpu_id = smp_processor_id(); rcu_read_lock(); - rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu_id, - sdma_rht_params); + rht_node = rhashtable_lookup(dd->sdma_rht, &cpu_id, + sdma_rht_params); if (rht_node && rht_node->map[vl]) { struct sdma_rht_map_elem *map = rht_node->map[vl]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 761fc35c4aab..0d5d84b5fa23 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3876,7 +3876,7 @@ int mlx5e_delete_flower(struct net_device *dev, struct mlx5e_priv *priv, int err; rcu_read_lock(); - flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params); + flow = rhashtable_lookup(tc_ht, &f->cookie, tc_ht_params); if (!flow || !same_flow_direction(flow, flags)) { err = -EINVAL; goto errout; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 06927ba5a3ae..95a0d3910e31 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -458,8 +458,8 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data, return -EINVAL; rcu_read_lock(); - record = rhashtable_lookup_fast(&bpf->maps_neutral, &map_id, - nfp_bpf_maps_neutral_params); + record = rhashtable_lookup(&bpf->maps_neutral, &map_id, + nfp_bpf_maps_neutral_params); if (!record || map_id_full > U32_MAX) { rcu_read_unlock(); cmsg_warn(bpf, "perf event: map id %lld (0x%llx) not recognized, dropping event\n", diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5d7859aac78e..a1c8d722ca20 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2880,7 +2880,7 @@ static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) struct tipc_sock *tsk; rcu_read_lock(); - tsk = rhashtable_lookup_fast(&tn->sk_rht, &portid, tsk_rht_params); + tsk = rhashtable_lookup(&tn->sk_rht, &portid, tsk_rht_params); if (tsk) sock_hold(&tsk->sk); rcu_read_unlock(); -- cgit v1.2.3 From 2fe97a578d7bad3116a89dc8a6692a51e6fc1d9c Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:05 +0700 Subject: tipc: fix potential memory leak in __tipc_sendmsg() When initiating a connection message to a server side, the connection message is cloned and added to the socket write queue. However, if the cloning is failed, only the socket write queue is purged. It causes memory leak because the original connection message is not freed. This commit fixes it by purging the list of connection message when it cannot be cloned. Fixes: 6787927475e5 ("tipc: buffer overflow handling in listener socket") Reported-by: Hoang Le Signed-off-by: Tung Nguyen Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a1c8d722ca20..7baed2c2c93d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1447,8 +1447,10 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; - if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) + if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) { + __skb_queue_purge(&pkts); return -ENOMEM; + } trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " "); rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); -- cgit v1.2.3 From 91a4a3eb433e4d786420c41f3c08d1d16c605962 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:06 +0700 Subject: tipc: fix wrong socket reference counter after tipc_sk_timeout() returns When tipc_sk_timeout() is executed but user space is grabbing ownership, this function rearms itself and returns. However, the socket reference counter is not reduced. This causes potential unexpected behavior. This commit fixes it by calling sock_put() before tipc_sk_timeout() returns in the above-mentioned case. Fixes: afe8792fec69 ("tipc: refactor function tipc_sk_timeout()") Signed-off-by: Tung Nguyen Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7baed2c2c93d..fb5595081a05 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2759,6 +2759,7 @@ static void tipc_sk_timeout(struct timer_list *t) if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); bh_unlock_sock(sk); + sock_put(sk); return; } -- cgit v1.2.3 From 12db3c8083fcab4270866a88191933f2d9f24f89 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:07 +0700 Subject: tipc: fix wrong timeout input for tipc_wait_for_cond() In function __tipc_shutdown(), the timeout value passed to tipc_wait_for_cond() is not jiffies. This commit fixes it by converting that value from milliseconds to jiffies. Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") Signed-off-by: Tung Nguyen Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index fb5595081a05..da5fb84852a6 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -532,7 +532,7 @@ static void __tipc_shutdown(struct socket *sock, int error) struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); - long timeout = CONN_TIMEOUT_DEFAULT; + long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); u32 dnode = tsk_peer_node(tsk); struct sk_buff *skb; -- cgit v1.2.3 From d34910e1751be79672db9cc61ca8892fbb4763f2 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:08 +0700 Subject: tipc: fix duplicate SYN messages under link congestion Scenario: 1. A client socket initiates a SYN message to a listening socket. 2. The send link is congested, the SYN message is put in the send link and a wakeup message is put in wakeup queue. 3. The congestion situation is abated, the wakeup message is pulled out of the wakeup queue. Function tipc_sk_push_backlog() is called to send out delayed messages by Nagle. However, the client socket is still in CONNECTING state. So, it sends the SYN message in the socket write queue to the listening socket again. 4. The listening socket receives the first SYN message and creates first server socket. The client socket receives ACK- and establishes a connection to the first server socket. The client socket closes its connection with the first server socket. 5. The listening socket receives the second SYN message and creates second server socket. The second server socket sends ACK- to the client socket, but it has been closed. It results in connection reset error when reading from the server socket in user space. Solution: return from function tipc_sk_push_backlog() immediately if there is pending SYN message in the socket write queue. Fixes: c0bceb97db9e ("tipc: add smart nagle feature") Signed-off-by: Tung Nguyen Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index da5fb84852a6..41688da233ab 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -540,12 +540,10 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); - /* Push out unsent messages or remove if pending SYN */ - skb = skb_peek(&sk->sk_write_queue); - if (skb && !msg_is_syn(buf_msg(skb))) - tipc_sk_push_backlog(tsk); - else - __skb_queue_purge(&sk->sk_write_queue); + /* Push out delayed messages if in Nagle mode */ + tipc_sk_push_backlog(tsk); + /* Remove pending SYN */ + __skb_queue_purge(&sk->sk_write_queue); /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). @@ -1248,9 +1246,14 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk) struct sk_buff_head *txq = &tsk->sk.sk_write_queue; struct net *net = sock_net(&tsk->sk); u32 dnode = tsk_peer_node(tsk); + struct sk_buff *skb = skb_peek(txq); int rc; - if (skb_queue_empty(txq) || tsk->cong_link_cnt) + if (!skb || tsk->cong_link_cnt) + return; + + /* Do not send SYN again after congestion */ + if (msg_is_syn(buf_msg(skb))) return; tsk->snt_unacked += tsk->snd_backlog; -- cgit v1.2.3 From abc9b4e0549b93fdaff56e9532bc49a2d7b04955 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 10 Dec 2019 15:21:04 +0700 Subject: tipc: fix retrans failure due to wrong destination When a user message is sent, TIPC will check if the socket has faced a congestion at link layer. If that happens, it will make a sleep to wait for the congestion to disappear. This leaves a gap for other users to take over the socket (e.g. multi threads) since the socket is released as well. Also, in case of connectionless (e.g. SOCK_RDM), user is free to send messages to various destinations (e.g. via 'sendto()'), then the socket's preformatted header has to be updated correspondingly prior to the actual payload message building. Unfortunately, the latter action is done before the first action which causes a condition issue that the destination of a certain message can be modified incorrectly in the middle, leading to wrong destination when that message is built. Consequently, when the message is sent to the link layer, it gets stuck there forever because the peer node will simply reject it. After a number of retransmission attempts, the link is eventually taken down and the retransmission failure is reported. This commit fixes the problem by rearranging the order of actions to prevent the race condition from occurring, so the message building is 'atomic' and its header will not be modified by anyone. Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/socket.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 41688da233ab..6552f986774c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1364,8 +1364,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; - u32 dport, dnode = 0; - u32 type, inst; + u32 dport = 0, dnode = 0; + u32 type = 0, inst = 0; int mtu, rc; if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) @@ -1418,23 +1418,11 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) type = dest->addr.name.name.type; inst = dest->addr.name.name.instance; dnode = dest->addr.name.domain; - msg_set_type(hdr, TIPC_NAMED_MSG); - msg_set_hdr_sz(hdr, NAMED_H_SIZE); - msg_set_nametype(hdr, type); - msg_set_nameinst(hdr, inst); - msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); dport = tipc_nametbl_translate(net, type, inst, &dnode); - msg_set_destnode(hdr, dnode); - msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; - msg_set_type(hdr, TIPC_DIRECT_MSG); - msg_set_lookup_scope(hdr, 0); - msg_set_destnode(hdr, dnode); - msg_set_destport(hdr, dest->addr.id.ref); - msg_set_hdr_sz(hdr, BASIC_H_SIZE); } else { return -EINVAL; } @@ -1445,6 +1433,22 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(rc)) return rc; + if (dest->addrtype == TIPC_ADDR_NAME) { + msg_set_type(hdr, TIPC_NAMED_MSG); + msg_set_hdr_sz(hdr, NAMED_H_SIZE); + msg_set_nametype(hdr, type); + msg_set_nameinst(hdr, inst); + msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dport); + } else { /* TIPC_ADDR_ID */ + msg_set_type(hdr, TIPC_DIRECT_MSG); + msg_set_lookup_scope(hdr, 0); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dest->addr.id.ref); + msg_set_hdr_sz(hdr, BASIC_H_SIZE); + } + __skb_queue_head_init(&pkts); mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); -- cgit v1.2.3