From 4b9d07a44015a0e940448fa3885b894349e8b162 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Jan 2017 16:55:12 +0100 Subject: net: introduce keepalive function in struct proto Direct call of tcp_set_keepalive() function from protocol-agnostic sock_setsockopt() function in net/core/sock.c violates network layering. And newly introduced protocol (SMC-R) will need its own keepalive function. Therefore, add "keepalive" function pointer to "struct proto", and call it from sock_setsockopt() via this pointer. Signed-off-by: Ursula Braun Reviewed-by: Utz Bacher Signed-off-by: David S. Miller --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index f0e867f58722..99deda67eba0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1024,6 +1024,7 @@ struct proto { int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); + void (*keepalive)(struct sock *sk, int valbool); #ifdef CONFIG_COMPAT int (*compat_setsockopt)(struct sock *sk, int level, -- cgit v1.2.3 From ac7138746e14137a451f8539614cdd349153e0c0 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Jan 2017 16:55:13 +0100 Subject: smc: establish new socket family * enable smc module loading and unloading * register new socket family * basic smc socket creation and deletion * use backing TCP socket to run CLC (Connection Layer Control) handshake of SMC protocol * Setup for infiniband traffic is implemented in follow-on patches. For now fallback to TCP socket is always used. Signed-off-by: Ursula Braun Reviewed-by: Utz Bacher Signed-off-by: David S. Miller --- MAINTAINERS | 7 + include/linux/socket.h | 7 +- net/Kconfig | 1 + net/Makefile | 1 + net/core/sock.c | 6 +- net/smc/Kconfig | 11 + net/smc/Makefile | 2 + net/smc/af_smc.c | 620 +++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc.h | 37 +++ 9 files changed, 688 insertions(+), 4 deletions(-) create mode 100644 net/smc/Kconfig create mode 100644 net/smc/Makefile create mode 100644 net/smc/af_smc.c create mode 100644 net/smc/smc.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index d42a37a351f8..c8df0e1ef833 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10850,6 +10850,13 @@ S: Maintained F: drivers/staging/media/st-cec/ F: Documentation/devicetree/bindings/media/stih-cec.txt +SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS +M: Ursula Braun +L: linux-s390@vger.kernel.org +W: http://www.ibm.com/developerworks/linux/linux390/ +S: Supported +F: net/smc/ + SYNOPSYS DESIGNWARE DMAC DRIVER M: Viresh Kumar M: Andy Shevchenko diff --git a/include/linux/socket.h b/include/linux/socket.h index c06438023d79..082027457825 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -202,8 +202,12 @@ struct ucred { #define AF_VSOCK 40 /* vSockets */ #define AF_KCM 41 /* Kernel Connection Multiplexor*/ #define AF_QIPCRTR 42 /* Qualcomm IPC Router */ +#define AF_SMC 43 /* smc sockets: reserve number for + * PF_SMC protocol family that + * reuses AF_INET address family + */ -#define AF_MAX 43 /* For now.. */ +#define AF_MAX 44 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC @@ -251,6 +255,7 @@ struct ucred { #define PF_VSOCK AF_VSOCK #define PF_KCM AF_KCM #define PF_QIPCRTR AF_QIPCRTR +#define PF_SMC AF_SMC #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ diff --git a/net/Kconfig b/net/Kconfig index a1005007224c..2e9ee61822e2 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -57,6 +57,7 @@ source "net/packet/Kconfig" source "net/unix/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" +source "net/smc/Kconfig" config INET bool "TCP/IP networking" diff --git a/net/Makefile b/net/Makefile index 4cafaa2b4667..5d6e0e5ff7f8 100644 --- a/net/Makefile +++ b/net/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_MAC80211) += mac80211/ obj-$(CONFIG_TIPC) += tipc/ obj-$(CONFIG_NETLABEL) += netlabel/ obj-$(CONFIG_IUCV) += iucv/ +obj-$(CONFIG_SMC) += smc/ obj-$(CONFIG_RFKILL) += rfkill/ obj-$(CONFIG_NET_9P) += 9p/ obj-$(CONFIG_CAIF) += caif/ diff --git a/net/core/sock.c b/net/core/sock.c index 5018703ee2c2..31f72f3a3559 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , - "sk_lock-AF_MAX" + "sk_lock-AF_SMC" , "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , - "slock-AF_MAX" + "slock-AF_SMC" , "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , - "clock-AF_MAX" + "closck-AF_smc" , "clock-AF_MAX" }; /* diff --git a/net/smc/Kconfig b/net/smc/Kconfig new file mode 100644 index 000000000000..bc029803e728 --- /dev/null +++ b/net/smc/Kconfig @@ -0,0 +1,11 @@ +config SMC + tristate "SMC socket protocol family" + depends on INET && INFINIBAND + ---help--- + SMC-R provides a "sockets over RDMA" solution making use of + RDMA over Converged Ethernet (RoCE) technology to upgrade + AF_INET TCP connections transparently. + The Linux implementation of the SMC-R solution is designed as + a separate socket family SMC. + + Select this option if you want to run SMC socket applications diff --git a/net/smc/Makefile b/net/smc/Makefile new file mode 100644 index 000000000000..c285c868dd82 --- /dev/null +++ b/net/smc/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_SMC) += smc.o +smc-y := af_smc.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c new file mode 100644 index 000000000000..7fd773fc6238 --- /dev/null +++ b/net/smc/af_smc.c @@ -0,0 +1,620 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * AF_SMC protocol family socket handler keeping the AF_INET sock address type + * applies to SOCK_STREAM sockets only + * offers an alternative communication option for TCP-protocol sockets + * applicable with RoCE-cards only + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + * based on prototype from Frank Blaschka + */ + +#define KMSG_COMPONENT "smc" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include + +#include "smc.h" + +static void smc_set_keepalive(struct sock *sk, int val) +{ + struct smc_sock *smc = smc_sk(sk); + + smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); +} + +static struct proto smc_proto = { + .name = "SMC", + .owner = THIS_MODULE, + .keepalive = smc_set_keepalive, + .obj_size = sizeof(struct smc_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, +}; + +static int smc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + + if (!sk) + goto out; + + smc = smc_sk(sk); + lock_sock(sk); + + sk->sk_state = SMC_CLOSED; + if (smc->clcsock) { + sock_release(smc->clcsock); + smc->clcsock = NULL; + } + + /* detach socket */ + sock_orphan(sk); + sock->sk = NULL; + release_sock(sk); + + sock_put(sk); +out: + return 0; +} + +static void smc_destruct(struct sock *sk) +{ + if (sk->sk_state != SMC_CLOSED) + return; + if (!sock_flag(sk, SOCK_DEAD)) + return; + + sk_refcnt_debug_dec(sk); +} + +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) +{ + struct smc_sock *smc; + struct sock *sk; + + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0); + if (!sk) + return NULL; + + sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ + sk->sk_state = SMC_INIT; + sk->sk_destruct = smc_destruct; + sk->sk_protocol = SMCPROTO_SMC; + sk_refcnt_debug_inc(sk); + + smc = smc_sk(sk); + + return sk; +} + +static int smc_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sk); + + /* replicate tests from inet_bind(), to be safe wrt. future changes */ + rc = -EINVAL; + if (addr_len < sizeof(struct sockaddr_in)) + goto out; + + rc = -EAFNOSUPPORT; + /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ + if ((addr->sin_family != AF_INET) && + ((addr->sin_family != AF_UNSPEC) || + (addr->sin_addr.s_addr != htonl(INADDR_ANY)))) + goto out; + + lock_sock(sk); + + /* Check if socket is already active */ + rc = -EINVAL; + if (sk->sk_state != SMC_INIT) + goto out_rel; + + smc->clcsock->sk->sk_reuse = sk->sk_reuse; + rc = kernel_bind(smc->clcsock, uaddr, addr_len); + +out_rel: + release_sock(sk); +out: + return rc; +} + +static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + /* options we don't get control via setsockopt for */ + nsk->sk_type = osk->sk_type; + nsk->sk_sndbuf = osk->sk_sndbuf; + nsk->sk_rcvbuf = osk->sk_rcvbuf; + nsk->sk_sndtimeo = osk->sk_sndtimeo; + nsk->sk_rcvtimeo = osk->sk_rcvtimeo; + nsk->sk_mark = osk->sk_mark; + nsk->sk_priority = osk->sk_priority; + nsk->sk_rcvlowat = osk->sk_rcvlowat; + nsk->sk_bound_dev_if = osk->sk_bound_dev_if; + nsk->sk_err = osk->sk_err; + + nsk->sk_flags &= ~mask; + nsk->sk_flags |= osk->sk_flags & mask; +} + +#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_BROADCAST) | \ + (1UL << SOCK_TIMESTAMP) | \ + (1UL << SOCK_DBG) | \ + (1UL << SOCK_RCVTSTAMP) | \ + (1UL << SOCK_RCVTSTAMPNS) | \ + (1UL << SOCK_LOCALROUTE) | \ + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ + (1UL << SOCK_RXQ_OVFL) | \ + (1UL << SOCK_WIFI_STATUS) | \ + (1UL << SOCK_NOFCS) | \ + (1UL << SOCK_FILTER_LOCKED)) +/* copy only relevant settings and flags of SOL_SOCKET level from smc to + * clc socket (since smc is not called for these options from net/core) + */ +static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) +{ + smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); +} + +#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_DBG)) +/* copy only settings and flags relevant for smc from clc to smc socket */ +static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) +{ + smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); +} + +static int smc_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EINVAL; + + smc = smc_sk(sk); + + /* separate smc parameter checking to be safe */ + if (alen < sizeof(addr->sa_family)) + goto out_err; + if (addr->sa_family != AF_INET) + goto out_err; + + lock_sock(sk); + switch (sk->sk_state) { + default: + goto out; + case SMC_ACTIVE: + rc = -EISCONN; + goto out; + case SMC_INIT: + rc = 0; + break; + } + + smc_copy_sock_settings_to_clc(smc); + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc) + goto out; + + sk->sk_state = SMC_ACTIVE; + + /* always use TCP fallback as transport mechanism for now; + * This will change once RDMA transport is implemented + */ + smc->use_fallback = true; + +out: + release_sock(sk); +out_err: + return rc; +} + +static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) +{ + struct sock *sk = &lsmc->sk; + struct socket *new_clcsock; + struct sock *new_sk; + int rc; + + new_sk = smc_sock_alloc(sock_net(sk), NULL); + if (!new_sk) { + rc = -ENOMEM; + lsmc->sk.sk_err = ENOMEM; + *new_smc = NULL; + goto out; + } + *new_smc = smc_sk(new_sk); + + rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); + if (rc) { + sock_put(new_sk); + *new_smc = NULL; + goto out; + } + + (*new_smc)->clcsock = new_clcsock; +out: + return rc; +} + +static int smc_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sk); + lock_sock(sk); + + rc = -EINVAL; + if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN)) + goto out; + + rc = 0; + if (sk->sk_state == SMC_LISTEN) { + sk->sk_max_ack_backlog = backlog; + goto out; + } + /* some socket options are handled in core, so we could not apply + * them to the clc socket -- copy smc socket options to clc socket + */ + smc_copy_sock_settings_to_clc(smc); + + rc = kernel_listen(smc->clcsock, backlog); + if (rc) + goto out; + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = SMC_LISTEN; + +out: + release_sock(sk); + return rc; +} + +static int smc_accept(struct socket *sock, struct socket *new_sock, + int flags) +{ + struct smc_sock *new_smc; + struct sock *sk = sock->sk; + struct smc_sock *lsmc; + int rc; + + lsmc = smc_sk(sk); + lock_sock(sk); + + if (lsmc->sk.sk_state != SMC_LISTEN) { + rc = -EINVAL; + goto out; + } + + rc = smc_clcsock_accept(lsmc, &new_smc); + if (rc) + goto out; + sock_graft(&new_smc->sk, new_sock); + new_smc->sk.sk_state = SMC_ACTIVE; + + smc_copy_sock_settings_to_smc(new_smc); + + /* always use TCP fallback as transport mechanism for now; + * This will change once RDMA transport is implemented + */ + new_smc->use_fallback = true; + +out: + release_sock(sk); + return rc; +} + +static int smc_getname(struct socket *sock, struct sockaddr *addr, + int *len, int peer) +{ + struct smc_sock *smc; + + if (peer && (sock->sk->sk_state != SMC_ACTIVE)) + return -ENOTCONN; + + smc = smc_sk(sock->sk); + + return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer); +} + +static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EPIPE; + + smc = smc_sk(sk); + lock_sock(sk); + if (sk->sk_state != SMC_ACTIVE) + goto out; + if (smc->use_fallback) + rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); + else + rc = sock_no_sendmsg(sock, msg, len); +out: + release_sock(sk); + return rc; +} + +static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -ENOTCONN; + + smc = smc_sk(sk); + lock_sock(sk); + if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED)) + goto out; + + if (smc->use_fallback) + rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); + else + rc = sock_no_recvmsg(sock, msg, len, flags); +out: + release_sock(sk); + return rc; +} + +static unsigned int smc_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask = 0; + struct smc_sock *smc; + + smc = smc_sk(sock->sk); + if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) || + smc->use_fallback) { + mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); + /* if non-blocking connect finished ... */ + lock_sock(sk); + if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) { + sk->sk_state = SMC_ACTIVE; + /* always use TCP fallback as transport mechanism; + * This will change once RDMA transport is implemented + */ + smc->use_fallback = true; + } + release_sock(sk); + } else { + mask = sock_no_poll(file, sock, wait); + } + + return mask; +} + +static int smc_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EINVAL; + + smc = smc_sk(sk); + + if ((how < SHUT_RD) || (how > SHUT_RDWR)) + goto out_err; + + lock_sock(sk); + + rc = -ENOTCONN; + if (sk->sk_state == SMC_CLOSED) + goto out; + if (smc->use_fallback) { + rc = kernel_sock_shutdown(smc->clcsock, how); + sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; + if (sk->sk_shutdown == SHUTDOWN_MASK) + sk->sk_state = SMC_CLOSED; + } else { + rc = sock_no_shutdown(sock, how); + } + +out: + release_sock(sk); + +out_err: + return rc; +} + +static int smc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + + smc = smc_sk(sk); + + /* generic setsockopts reaching us here always apply to the + * CLC socket + */ + return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); +} + +static int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + + smc = smc_sk(sock->sk); + /* socket options apply to the CLC socket */ + return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + optval, optlen); +} + +static int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct smc_sock *smc; + + smc = smc_sk(sock->sk); + if (smc->use_fallback) + return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); + else + return sock_no_ioctl(sock, cmd, arg); +} + +static ssize_t smc_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EPIPE; + + smc = smc_sk(sk); + lock_sock(sk); + if (sk->sk_state != SMC_ACTIVE) + goto out; + if (smc->use_fallback) + rc = kernel_sendpage(smc->clcsock, page, offset, + size, flags); + else + rc = sock_no_sendpage(sock, page, offset, size, flags); + +out: + release_sock(sk); + return rc; +} + +static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -ENOTCONN; + + smc = smc_sk(sk); + lock_sock(sk); + if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED)) + goto out; + if (smc->use_fallback) { + rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, + pipe, len, flags); + } else { + rc = -EOPNOTSUPP; + } +out: + release_sock(sk); + return rc; +} + +/* must look like tcp */ +static const struct proto_ops smc_sock_ops = { + .family = PF_SMC, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .sendpage = smc_sendpage, + .splice_read = smc_splice_read, +}; + +static int smc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct smc_sock *smc; + struct sock *sk; + int rc; + + rc = -ESOCKTNOSUPPORT; + if (sock->type != SOCK_STREAM) + goto out; + + rc = -EPROTONOSUPPORT; + if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP)) + goto out; + + rc = -ENOBUFS; + sock->ops = &smc_sock_ops; + sk = smc_sock_alloc(net, sock); + if (!sk) + goto out; + + /* create internal TCP socket for CLC handshake and fallback */ + smc = smc_sk(sk); + rc = sock_create_kern(net, PF_INET, SOCK_STREAM, + IPPROTO_TCP, &smc->clcsock); + if (rc) + sk_common_release(sk); + +out: + return rc; +} + +static const struct net_proto_family smc_sock_family_ops = { + .family = PF_SMC, + .owner = THIS_MODULE, + .create = smc_create, +}; + +static int __init smc_init(void) +{ + int rc; + + rc = proto_register(&smc_proto, 1); + if (rc) { + pr_err("%s: proto_register fails with %d\n", __func__, rc); + goto out; + } + + rc = sock_register(&smc_sock_family_ops); + if (rc) { + pr_err("%s: sock_register fails with %d\n", __func__, rc); + goto out_proto; + } + + return 0; + +out_proto: + proto_unregister(&smc_proto); +out: + return rc; +} + +static void __exit smc_exit(void) +{ + sock_unregister(PF_SMC); + proto_unregister(&smc_proto); +} + +module_init(smc_init); +module_exit(smc_exit); + +MODULE_AUTHOR("Ursula Braun "); +MODULE_DESCRIPTION("smc socket address family"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_SMC); diff --git a/net/smc/smc.h b/net/smc/smc.h new file mode 100644 index 000000000000..fcea7399e2eb --- /dev/null +++ b/net/smc/smc.h @@ -0,0 +1,37 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the SMC module (socket related) + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + */ +#ifndef __SMC_H +#define __SMC_H + +#include +#include +#include + +#define SMCPROTO_SMC 0 /* SMC protocol */ + +enum smc_state { /* possible states of an SMC socket */ + SMC_ACTIVE = 1, + SMC_INIT = 2, + SMC_CLOSED = 7, + SMC_LISTEN = 10, +}; + +struct smc_sock { /* smc sock container */ + struct sock sk; + struct socket *clcsock; /* internal tcp socket */ + bool use_fallback; /* fallback to tcp */ +}; + +static inline struct smc_sock *smc_sk(const struct sock *sk) +{ + return (struct smc_sock *)sk; +} + +#endif /* __SMC_H */ -- cgit v1.2.3 From 6812baabf24d5c299c13223366a23c269408f4d0 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 9 Jan 2017 16:55:15 +0100 Subject: smc: establish pnet table management Connection creation with SMC-R starts through an internal TCP-connection. The Ethernet interface for this TCP-connection is not restricted to the Ethernet interface of a RoCE device. Any existing Ethernet interface belonging to the same physical net can be used, as long as there is a defined relation between the Ethernet interface and some RoCE devices. This relation is defined with the help of an identification string called "Physical Net ID" or short "pnet ID". Information about defined pnet IDs and their related Ethernet interfaces and RoCE devices is stored in the SMC-R pnet table. A pnet table entry consists of the identifying pnet ID and the associated network and IB device. This patch adds pnet table configuration support using the generic netlink message interface referring to network and IB device by their names. Commands exist to add, delete, and display pnet table entries, and to flush or display the entire pnet table. There are cross-checks to verify whether the ethernet interfaces or infiniband devices really exist in the system. If either device is not available, the pnet ID entry is not created. Loss of network devices and IB devices is also monitored; a pnet ID entry is removed when an associated network or IB device is removed. Signed-off-by: Thomas Richter Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/uapi/linux/smc.h | 35 ++++ net/smc/Makefile | 2 +- net/smc/af_smc.c | 11 +- net/smc/smc_ib.c | 2 + net/smc/smc_pnet.c | 534 +++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_pnet.h | 23 ++ 6 files changed, 604 insertions(+), 3 deletions(-) create mode 100644 include/uapi/linux/smc.h create mode 100644 net/smc/smc_pnet.c create mode 100644 net/smc/smc_pnet.h (limited to 'include') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h new file mode 100644 index 000000000000..ab1dea8e53ee --- /dev/null +++ b/include/uapi/linux/smc.h @@ -0,0 +1,35 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for generic netlink based configuration of an SMC-R PNET table + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter + */ + +#ifndef _UAPI_LINUX_SMC_H_ +#define _UAPI_LINUX_SMC_H_ + +/* Netlink SMC_PNETID attributes */ +enum { + SMC_PNETID_UNSPEC, + SMC_PNETID_NAME, + SMC_PNETID_ETHNAME, + SMC_PNETID_IBNAME, + SMC_PNETID_IBPORT, + __SMC_PNETID_MAX, + SMC_PNETID_MAX = __SMC_PNETID_MAX - 1 +}; + +enum { /* SMC PNET Table commands */ + SMC_PNETID_GET = 1, + SMC_PNETID_ADD, + SMC_PNETID_DEL, + SMC_PNETID_FLUSH +}; + +#define SMCR_GENL_FAMILY_NAME "SMC_PNETID" +#define SMCR_GENL_FAMILY_VERSION 1 + +#endif /* _UAPI_LINUX_SMC_H */ diff --git a/net/smc/Makefile b/net/smc/Makefile index 56f2170e6f64..50f39ffec8c9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_SMC) += smc.o -smc-y := af_smc.o smc_ib.o +smc-y := af_smc.o smc_pnet.o smc_ib.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 50492ee495ce..8b059b2fc34d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -21,6 +21,7 @@ #include "smc.h" #include "smc_ib.h" +#include "smc_pnet.h" static void smc_set_keepalive(struct sock *sk, int val) { @@ -586,10 +587,14 @@ static int __init smc_init(void) { int rc; + rc = smc_pnet_init(); + if (rc) + return rc; + rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register fails with %d\n", __func__, rc); - goto out; + goto out_pnet; } rc = sock_register(&smc_sock_family_ops); @@ -610,7 +615,8 @@ out_sock: sock_unregister(PF_SMC); out_proto: proto_unregister(&smc_proto); -out: +out_pnet: + smc_pnet_exit(); return rc; } @@ -619,6 +625,7 @@ static void __exit smc_exit(void) smc_ib_unregister_client(); sock_unregister(PF_SMC); proto_unregister(&smc_proto); + smc_pnet_exit(); } module_init(smc_init); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 7222b7ede900..5b037f435bc1 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -14,6 +14,7 @@ #include #include +#include "smc_pnet.h" #include "smc_ib.h" #include "smc.h" @@ -123,6 +124,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + smc_pnet_remove_by_ibdev(smcibdev); kfree(smcibdev); } diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c new file mode 100644 index 000000000000..9d3e7fb8348d --- /dev/null +++ b/net/smc/smc_pnet.c @@ -0,0 +1,534 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic netlink support functions to configure an SMC-R PNET table + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "smc_pnet.h" +#include "smc_ib.h" + +#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */ + +static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { + [SMC_PNETID_NAME] = { + .type = NLA_NUL_STRING, + .len = SMC_MAX_PNET_ID_LEN - 1 + }, + [SMC_PNETID_ETHNAME] = { + .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 + }, + [SMC_PNETID_IBNAME] = { + .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1 + }, + [SMC_PNETID_IBPORT] = { .type = NLA_U8 } +}; + +static struct genl_family smc_pnet_nl_family; + +/** + * struct smc_pnettable - SMC PNET table anchor + * @lock: Lock for list action + * @pnetlist: List of PNETIDs + */ +static struct smc_pnettable { + rwlock_t lock; + struct list_head pnetlist; +} smc_pnettable = { + .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist), + .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock) +}; + +/** + * struct smc_pnetentry - pnet identifier name entry + * @list: List node. + * @pnet_name: Pnet identifier name + * @ndev: pointer to network device. + * @smcibdev: Pointer to IB device. + */ +struct smc_pnetentry { + struct list_head list; + char pnet_name[SMC_MAX_PNET_ID_LEN + 1]; + struct net_device *ndev; + struct smc_ib_device *smcibdev; + u8 ib_port; +}; + +/* Check if two RDMA device entries are identical. Use device name and port + * number for comparison. + */ +static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname, + u8 ibport) +{ + return pnetelem->ib_port == ibport && + !strncmp(pnetelem->smcibdev->ibdev->name, ibname, + sizeof(pnetelem->smcibdev->ibdev->name)); +} + +/* Find a pnetid in the pnet table. + */ +static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name) +{ + struct smc_pnetentry *pnetelem, *found_pnetelem = NULL; + + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (!strncmp(pnetelem->pnet_name, pnet_name, + sizeof(pnetelem->pnet_name))) { + found_pnetelem = pnetelem; + break; + } + } + read_unlock(&smc_pnettable.lock); + return found_pnetelem; +} + +/* Remove a pnetid from the pnet table. + */ +static int smc_pnet_remove_by_pnetid(char *pnet_name) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (!strncmp(pnetelem->pnet_name, pnet_name, + sizeof(pnetelem->pnet_name))) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Remove a pnet entry mentioning a given network device from the pnet table. + */ +static int smc_pnet_remove_by_ndev(struct net_device *ndev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (pnetelem->ndev == ndev) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Remove a pnet entry mentioning a given ib device from the pnet table. + */ +int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (pnetelem->smcibdev == ibdev) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. + */ +static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) +{ + struct smc_pnetentry *pnetelem; + int rc = -EEXIST; + + write_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name, + sizeof(new_pnetelem->pnet_name)) || + !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name, + sizeof(new_pnetelem->ndev->name)) || + smc_pnet_same_ibname(pnetelem, + new_pnetelem->smcibdev->ibdev->name, + new_pnetelem->ib_port)) + goto found; + } + list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); + rc = 0; +found: + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* The limit for pnetid is 16 characters. + * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. + * Lower case letters are converted to upper case. + * Interior blanks should not be used. + */ +static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) +{ + char *bf = skip_spaces(pnet_name); + size_t len = strlen(bf); + char *end = bf + len; + + if (!len) + return false; + while (--end >= bf && isspace(*end)) + ; + if (end - bf >= SMC_MAX_PNET_ID_LEN) + return false; + while (bf <= end) { + if (!isalnum(*bf)) + return false; + *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; + bf++; + } + *pnetid = '\0'; + return true; +} + +/* Find an infiniband device by a given name. The device might not exist. */ +struct smc_ib_device *smc_pnet_find_ib(char *ib_name) +{ + struct smc_ib_device *ibdev; + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (!strncmp(ibdev->ibdev->name, ib_name, + sizeof(ibdev->ibdev->name))) { + goto out; + } + } + ibdev = NULL; +out: + spin_unlock(&smc_ib_devices.lock); + return ibdev; +} + +/* Parse the supplied netlink attributes and fill a pnetentry structure. + * For ethernet and infiniband device names verify that the devices exist. + */ +static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem, + struct nlattr *tb[]) +{ + char *string, *ibname = NULL; + int rc = 0; + + memset(pnetelem, 0, sizeof(*pnetelem)); + INIT_LIST_HEAD(&pnetelem->list); + if (tb[SMC_PNETID_NAME]) { + string = (char *)nla_data(tb[SMC_PNETID_NAME]); + if (!smc_pnetid_valid(string, pnetelem->pnet_name)) { + rc = -EINVAL; + goto error; + } + } + if (tb[SMC_PNETID_ETHNAME]) { + string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); + pnetelem->ndev = dev_get_by_name(net, string); + if (!pnetelem->ndev) + return -ENOENT; + } + if (tb[SMC_PNETID_IBNAME]) { + ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + ibname = strim(ibname); + pnetelem->smcibdev = smc_pnet_find_ib(ibname); + if (!pnetelem->smcibdev) { + rc = -ENOENT; + goto error; + } + } + if (tb[SMC_PNETID_IBPORT]) { + pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (pnetelem->ib_port > SMC_MAX_PORTS) { + rc = -EINVAL; + goto error; + } + } + return 0; + +error: + if (pnetelem->ndev) + dev_put(pnetelem->ndev); + return rc; +} + +/* Convert an smc_pnetentry to a netlink attribute sequence */ +static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem) +{ + if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) || + nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) || + nla_put_string(msg, SMC_PNETID_IBNAME, + pnetelem->smcibdev->ibdev->name) || + nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) + return -1; + return 0; +} + +/* Retrieve one PNETID entry */ +static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) +{ + struct smc_pnetentry *pnetelem; + struct sk_buff *msg; + void *hdr; + int rc; + + pnetelem = smc_pnet_find_pnetid( + (char *)nla_data(info->attrs[SMC_PNETID_NAME])); + if (!pnetelem) + return -ENOENT; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &smc_pnet_nl_family, 0, SMC_PNETID_GET); + if (!hdr) { + rc = -EMSGSIZE; + goto err_out; + } + + if (smc_pnet_set_nla(msg, pnetelem)) { + rc = -ENOBUFS; + goto err_out; + } + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +err_out: + nlmsg_free(msg); + return rc; +} + +static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct smc_pnetentry *pnetelem; + int rc; + + pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); + if (!pnetelem) + return -ENOMEM; + rc = smc_pnet_fill_entry(net, pnetelem, info->attrs); + if (!rc) + rc = smc_pnet_enter(pnetelem); + if (rc) { + kfree(pnetelem); + return rc; + } + rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port); + if (rc) + smc_pnet_remove_by_pnetid(pnetelem->pnet_name); + return rc; +} + +static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) +{ + return smc_pnet_remove_by_pnetid( + (char *)nla_data(info->attrs[SMC_PNETID_NAME])); +} + +static int smc_pnet_dump_start(struct netlink_callback *cb) +{ + cb->args[0] = 0; + return 0; +} + +static int smc_pnet_dumpinfo(struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, + struct smc_pnetentry *pnetelem) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, + flags, SMC_PNETID_GET); + if (!hdr) + return -ENOMEM; + if (smc_pnet_set_nla(skb, pnetelem) < 0) { + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; + } + genlmsg_end(skb, hdr); + return 0; +} + +static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_pnetentry *pnetelem; + int idx = 0; + + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (idx++ < cb->args[0]) + continue; + if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + pnetelem)) { + --idx; + break; + } + } + cb->args[0] = idx; + read_unlock(&smc_pnettable.lock); + return skb->len; +} + +/* Remove and delete all pnetids from pnet table. + */ +static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + } + write_unlock(&smc_pnettable.lock); + return 0; +} + +/* SMC_PNETID generic netlink operation definition */ +static const struct genl_ops smc_pnet_ops[] = { + { + .cmd = SMC_PNETID_GET, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_get, + .dumpit = smc_pnet_dump, + .start = smc_pnet_dump_start + }, + { + .cmd = SMC_PNETID_ADD, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_add + }, + { + .cmd = SMC_PNETID_DEL, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_del + }, + { + .cmd = SMC_PNETID_FLUSH, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_flush + } +}; + +/* SMC_PNETID family definition */ +static struct genl_family smc_pnet_nl_family = { + .hdrsize = 0, + .name = SMCR_GENL_FAMILY_NAME, + .version = SMCR_GENL_FAMILY_VERSION, + .maxattr = SMC_PNETID_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = smc_pnet_ops, + .n_ops = ARRAY_SIZE(smc_pnet_ops) +}; + +static int smc_pnet_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_REBOOT: + case NETDEV_UNREGISTER: + smc_pnet_remove_by_ndev(event_dev); + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block smc_netdev_notifier = { + .notifier_call = smc_pnet_netdev_event +}; + +int __init smc_pnet_init(void) +{ + int rc; + + rc = genl_register_family(&smc_pnet_nl_family); + if (rc) + return rc; + rc = register_netdevice_notifier(&smc_netdev_notifier); + if (rc) + genl_unregister_family(&smc_pnet_nl_family); + return rc; +} + +void smc_pnet_exit(void) +{ + smc_pnet_flush(NULL, NULL); + unregister_netdevice_notifier(&smc_netdev_notifier); + genl_unregister_family(&smc_pnet_nl_family); +} + +/* PNET table analysis for a given sock: + * determine ib_device and port belonging to used internal TCP socket + * ethernet interface. + */ +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport) +{ + struct dst_entry *dst = sk_dst_get(sk); + struct smc_pnetentry *pnetelem; + + *smcibdev = NULL; + *ibport = 0; + + if (!dst) + return; + if (!dst->dev) + goto out_rel; + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (dst->dev == pnetelem->ndev) { + *smcibdev = pnetelem->smcibdev; + *ibport = pnetelem->ib_port; + break; + } + } + read_unlock(&smc_pnettable.lock); +out_rel: + dst_release(dst); +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h new file mode 100644 index 000000000000..32ab3df928ca --- /dev/null +++ b/net/smc/smc_pnet.h @@ -0,0 +1,23 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * PNET table queries + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter + */ + +#ifndef _SMC_PNET_H +#define _SMC_PNET_H + +struct smc_ib_device; + +int smc_pnet_init(void) __init; +void smc_pnet_exit(void); +int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); +struct smc_ib_device *smc_pnet_find_ib(char *ib_name); +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport); + +#endif -- cgit v1.2.3 From f16a7dd5cf27eeda187425c9c7d96802a549f9c4 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Jan 2017 16:55:26 +0100 Subject: smc: netlink interface for SMC sockets Support for SMC socket monitoring via netlink sockets of protocol NETLINK_SOCK_DIAG. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/net/smc.h | 20 ++++ include/net/sock.h | 3 + include/uapi/linux/netlink.h | 1 + include/uapi/linux/smc_diag.h | 85 +++++++++++++++++ net/smc/Kconfig | 9 ++ net/smc/Makefile | 1 + net/smc/af_smc.c | 43 ++++++++- net/smc/smc.h | 2 + net/smc/smc_close.c | 1 + net/smc/smc_diag.c | 215 ++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 379 insertions(+), 1 deletion(-) create mode 100644 include/net/smc.h create mode 100644 include/uapi/linux/smc_diag.h create mode 100644 net/smc/smc_diag.c (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h new file mode 100644 index 000000000000..12d26358ad9f --- /dev/null +++ b/include/net/smc.h @@ -0,0 +1,20 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the SMC module (socket related) + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + */ +#ifndef _SMC_H +#define _SMC_H + +struct smc_hashinfo { + rwlock_t lock; + struct hlist_head ht; +}; + +int smc_hash_sk(struct sock *sk); +void smc_unhash_sk(struct sock *sk); +#endif /* _SMC_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 99deda67eba0..389a0a619b45 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -70,6 +70,7 @@ #include #include #include +#include /* * This structure really needs to be cleaned up. @@ -986,6 +987,7 @@ struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; +struct smc_hashinfo; struct module; /* @@ -1094,6 +1096,7 @@ struct proto { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; + struct smc_hashinfo *smc_hash; } h; struct module *owner; diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 0dba4e4ed2be..f3946a27bd07 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -27,6 +27,7 @@ #define NETLINK_ECRYPTFS 19 #define NETLINK_RDMA 20 #define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_SMC 22 /* SMC monitoring */ #define NETLINK_INET_DIAG NETLINK_SOCK_DIAG diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h new file mode 100644 index 000000000000..0063919fea34 --- /dev/null +++ b/include/uapi/linux/smc_diag.h @@ -0,0 +1,85 @@ +#ifndef _UAPI_SMC_DIAG_H_ +#define _UAPI_SMC_DIAG_H_ + +#include +#include +#include + +/* Request structure */ +struct smc_diag_req { + __u8 diag_family; + __u8 pad[2]; + __u8 diag_ext; /* Query extended information */ + struct inet_diag_sockid id; +}; + +/* Base info structure. It contains socket identity (addrs/ports/cookie) based + * on the internal clcsock, and more SMC-related socket data + */ +struct smc_diag_msg { + __u8 diag_family; + __u8 diag_state; + __u8 diag_fallback; + __u8 diag_shutdown; + struct inet_diag_sockid id; + + __u32 diag_uid; + __u64 diag_inode; +}; + +/* Extensions */ + +enum { + SMC_DIAG_NONE, + SMC_DIAG_CONNINFO, + SMC_DIAG_LGRINFO, + SMC_DIAG_SHUTDOWN, + __SMC_DIAG_MAX, +}; + +#define SMC_DIAG_MAX (__SMC_DIAG_MAX - 1) + +/* SMC_DIAG_CONNINFO */ + +struct smc_diag_cursor { + __u16 reserved; + __u16 wrap; + __u32 count; +}; + +struct smc_diag_conninfo { + __u32 token; /* unique connection id */ + __u32 sndbuf_size; /* size of send buffer */ + __u32 rmbe_size; /* size of RMB element */ + __u32 peer_rmbe_size; /* size of peer RMB element */ + /* local RMB element cursors */ + struct smc_diag_cursor rx_prod; /* received producer cursor */ + struct smc_diag_cursor rx_cons; /* received consumer cursor */ + /* peer RMB element cursors */ + struct smc_diag_cursor tx_prod; /* sent producer cursor */ + struct smc_diag_cursor tx_cons; /* sent consumer cursor */ + __u8 rx_prod_flags; /* received producer flags */ + __u8 rx_conn_state_flags; /* recvd connection flags*/ + __u8 tx_prod_flags; /* sent producer flags */ + __u8 tx_conn_state_flags; /* sent connection flags*/ + /* send buffer cursors */ + struct smc_diag_cursor tx_prep; /* prepared to be sent cursor */ + struct smc_diag_cursor tx_sent; /* sent cursor */ + struct smc_diag_cursor tx_fin; /* confirmed sent cursor */ +}; + +/* SMC_DIAG_LINKINFO */ + +struct smc_diag_linkinfo { + __u8 link_id; /* link identifier */ + __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ + __u8 ibport; /* RDMA device port number */ + __u8 gid[40]; /* local GID */ + __u8 peer_gid[40]; /* peer GID */ +}; + +struct smc_diag_lgrinfo { + struct smc_diag_linkinfo lnk[1]; + __u8 role; +}; +#endif /* _UAPI_SMC_DIAG_H_ */ diff --git a/net/smc/Kconfig b/net/smc/Kconfig index bc029803e728..c717ef0896aa 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -9,3 +9,12 @@ config SMC a separate socket family SMC. Select this option if you want to run SMC socket applications + +config SMC_DIAG + tristate "SMC: socket monitoring interface" + depends on SMC + ---help--- + Support for SMC socket monitoring interface used by tools such as + smcss. + + if unsure, say Y. diff --git a/net/smc/Makefile b/net/smc/Makefile index 5cf0cafaa208..188104654b54 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_SMC) += smc.o +obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3f543d58bc5c..5d4208ad029e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "smc.h" #include "smc_clc.h" @@ -59,13 +60,48 @@ static void smc_set_keepalive(struct sock *sk, int val) smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } -static struct proto smc_proto = { +static struct smc_hashinfo smc_v4_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), +}; + +int smc_hash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + struct hlist_head *head; + + head = &h->ht; + + write_lock_bh(&h->lock); + sk_add_node(sk, head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(smc_hash_sk); + +void smc_unhash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + + write_lock_bh(&h->lock); + if (sk_del_node_init(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(smc_unhash_sk); + +struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_DESTROY_BY_RCU, }; +EXPORT_SYMBOL_GPL(smc_proto); static int smc_release(struct socket *sock) { @@ -109,6 +145,7 @@ static int smc_release(struct socket *sock) schedule_delayed_work(&smc->sock_put_work, SMC_CLOSE_SOCK_PUT_DELAY); } + sk->sk_prot->unhash(sk); release_sock(sk); sock_put(sk); @@ -144,6 +181,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work); + sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); return sk; @@ -536,6 +574,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) lsmc->sk.sk_err = -rc; new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); sock_put(new_sk); *new_smc = NULL; goto out; @@ -545,6 +584,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); sock_put(new_sk); *new_smc = NULL; goto out; @@ -1320,6 +1360,7 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto; } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { diff --git a/net/smc/smc.h b/net/smc/smc.h index 959a5d2014ab..ee5fbea24549 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,6 +21,8 @@ #define SMC_MAX_PORTS 2 /* Max # of ports */ +extern struct proto smc_proto; + #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index d70c05b57021..03dfcc6b7661 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -384,6 +384,7 @@ void smc_close_sock_put_work(struct work_struct *work) struct smc_sock, sock_put_work); + smc->sk.sk_prot->unhash(&smc->sk); sock_put(&smc->sk); } diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c new file mode 100644 index 000000000000..d2d01cf70224 --- /dev/null +++ b/net/smc/smc_diag.c @@ -0,0 +1,215 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Monitoring SMC transport protocol sockets + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "smc.h" +#include "smc_core.h" + +static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +{ + sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + be16_to_cpu(((__be16 *)gid_raw)[0]), + be16_to_cpu(((__be16 *)gid_raw)[1]), + be16_to_cpu(((__be16 *)gid_raw)[2]), + be16_to_cpu(((__be16 *)gid_raw)[3]), + be16_to_cpu(((__be16 *)gid_raw)[4]), + be16_to_cpu(((__be16 *)gid_raw)[5]), + be16_to_cpu(((__be16 *)gid_raw)[6]), + be16_to_cpu(((__be16 *)gid_raw)[7])); +} + +static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + r->diag_family = sk->sk_family; + if (!smc->clcsock) + return; + r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); + r->id.idiag_dport = smc->clcsock->sk->sk_dport; + r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; + r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; +} + +static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, + struct smc_diag_msg *r, + struct user_namespace *user_ns) +{ + if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) + return 1; + + r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->diag_inode = sock_i_ino(sk); + return 0; +} + +static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct smc_diag_req *req, + struct nlattr *bc) +{ + struct smc_sock *smc = smc_sk(sk); + struct user_namespace *user_ns; + struct smc_diag_msg *r; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + smc_diag_msg_common_fill(r, sk); + r->diag_state = sk->sk_state; + r->diag_fallback = smc->use_fallback; + user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); + if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) + goto errout; + + if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) { + struct smc_connection *conn = &smc->conn; + struct smc_diag_conninfo cinfo = { + .token = conn->alert_token_local, + .sndbuf_size = conn->sndbuf_size, + .rmbe_size = conn->rmbe_size, + .peer_rmbe_size = conn->peer_rmbe_size, + + .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, + .rx_prod.count = conn->local_rx_ctrl.prod.count, + .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, + .rx_cons.count = conn->local_rx_ctrl.cons.count, + + .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, + .tx_prod.count = conn->local_tx_ctrl.prod.count, + .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, + .tx_cons.count = conn->local_tx_ctrl.cons.count, + + .tx_prod_flags = + *(u8 *)&conn->local_tx_ctrl.prod_flags, + .tx_conn_state_flags = + *(u8 *)&conn->local_tx_ctrl.conn_state_flags, + .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, + .rx_conn_state_flags = + *(u8 *)&conn->local_rx_ctrl.conn_state_flags, + + .tx_prep.wrap = conn->tx_curs_prep.wrap, + .tx_prep.count = conn->tx_curs_prep.count, + .tx_sent.wrap = conn->tx_curs_sent.wrap, + .tx_sent.count = conn->tx_curs_sent.count, + .tx_fin.wrap = conn->tx_curs_fin.wrap, + .tx_fin.count = conn->tx_curs_fin.count, + }; + + if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) + goto errout; + } + + if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) { + struct smc_diag_lgrinfo linfo = { + .role = smc->conn.lgr->role, + .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport, + .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id, + }; + + memcpy(linfo.lnk[0].ibname, + smc->conn.lgr->lnk[0].smcibdev->ibdev->name, + sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); + smc_gid_be16_convert(linfo.lnk[0].gid, + smc->conn.lgr->lnk[0].gid.raw); + smc_gid_be16_convert(linfo.lnk[0].peer_gid, + smc->conn.lgr->lnk[0].peer_gid); + + if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) + goto errout; + } + + nlmsg_end(skb, nlh); + return 0; + +errout: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *bc = NULL; + struct hlist_head *head; + struct sock *sk; + int rc = 0; + + read_lock(&smc_proto.h.smc_hash->lock); + head = &smc_proto.h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc) + break; + } + +out: + read_unlock(&smc_proto.h.smc_hash->lock); + return rc; +} + +static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + struct net *net = sock_net(skb->sk); + + if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && + h->nlmsg_flags & NLM_F_DUMP) { + { + struct netlink_dump_control c = { + .dump = smc_diag_dump, + .min_dump_alloc = SKB_WITH_OVERHEAD(32768), + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + } + return 0; +} + +static const struct sock_diag_handler smc_diag_handler = { + .family = AF_SMC, + .dump = smc_diag_handler_dump, +}; + +static int __init smc_diag_init(void) +{ + return sock_diag_register(&smc_diag_handler); +} + +static void __exit smc_diag_exit(void) +{ + sock_diag_unregister(&smc_diag_handler); +} + +module_init(smc_diag_init); +module_exit(smc_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); -- cgit v1.2.3