summary refs log tree commit diff
path: root/net/smc/af_smc.c
diff options
context:
space:
mode:
authorUrsula Braun <ubraun@linux.vnet.ibm.com>2017-01-09 16:55:16 +0100
committerDavid S. Miller <davem@davemloft.net>2017-01-09 16:07:39 -0500
commita046d57da19f812216f393e7c535f5858f793ac3 (patch)
tree9b3c535ff6cd3da3cf06b3164cd727839df25f38 /net/smc/af_smc.c
parent6812baabf24d5c299c13223366a23c269408f4d0 (diff)
downloadlinux-a046d57da19f812216f393e7c535f5858f793ac3.tar.gz
smc: CLC handshake (incl. preparation steps)
* CLC (Connection Layer Control) handshake

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/smc/af_smc.c')
-rw-r--r--net/smc/af_smc.c464
1 files changed, 433 insertions, 31 deletions
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 8b059b2fc34d..05c705a688e5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -6,6 +6,13 @@
  *  offers an alternative communication option for TCP-protocol sockets
  *  applicable with RoCE-cards only
  *
+ *  Initial restrictions:
+ *    - non-blocking connect postponed
+ *    - IPv6 support postponed
+ *    - support for alternate links postponed
+ *    - partial support for non-blocking sockets only
+ *    - support for urgent data postponed
+ *
  *  Copyright IBM Corp. 2016
  *
  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
@@ -17,12 +24,18 @@
 
 #include <linux/module.h>
 #include <linux/socket.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
 #include <net/sock.h>
+#include <net/tcp.h>
 
 #include "smc.h"
+#include "smc_clc.h"
 #include "smc_ib.h"
 #include "smc_pnet.h"
 
+static void smc_tcp_listen_work(struct work_struct *);
+
 static void smc_set_keepalive(struct sock *sk, int val)
 {
 	struct smc_sock *smc = smc_sk(sk);
@@ -88,9 +101,11 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 	sk->sk_state = SMC_INIT;
 	sk->sk_destruct = smc_destruct;
 	sk->sk_protocol = SMCPROTO_SMC;
-	sk_refcnt_debug_inc(sk);
-
 	smc = smc_sk(sk);
+	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+	INIT_LIST_HEAD(&smc->accept_q);
+	spin_lock_init(&smc->accept_q_lock);
+	sk_refcnt_debug_inc(sk);
 
 	return sk;
 }
@@ -184,6 +199,119 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
 }
 
+/* determine subnet and mask of internal TCP socket */
+int smc_netinfo_by_tcpsk(struct socket *clcsock,
+			 __be32 *subnet, u8 *prefix_len)
+{
+	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	struct sockaddr_in addr;
+	int rc = -ENOENT;
+	int len;
+
+	if (!dst) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	if (!dst->dev) {
+		rc = -ENODEV;
+		goto out_rel;
+	}
+
+	/* get address to which the internal TCP socket is bound */
+	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
+	/* analyze IPv4 specific data of net_device belonging to TCP socket */
+	for_ifa(dst->dev->ip_ptr) {
+		if (ifa->ifa_address != addr.sin_addr.s_addr)
+			continue;
+		*prefix_len = inet_mask_len(ifa->ifa_mask);
+		*subnet = ifa->ifa_address & ifa->ifa_mask;
+		rc = 0;
+		break;
+	} endfor_ifa(dst->dev->ip_ptr);
+
+out_rel:
+	dst_release(dst);
+out:
+	return rc;
+}
+
+/* setup for RDMA connection of client */
+static int smc_connect_rdma(struct smc_sock *smc)
+{
+	struct smc_clc_msg_accept_confirm aclc;
+	struct smc_ib_device *smcibdev;
+	int reason_code = 0;
+	int rc = 0;
+	u8 ibport;
+
+	/* IPSec connections opt out of SMC-R optimizations */
+	if (using_ipsec(smc)) {
+		reason_code = SMC_CLC_DECL_IPSEC;
+		goto decline_rdma;
+	}
+
+	/* PNET table look up: search active ib_device and port
+	 * within same PNETID that also contains the ethernet device
+	 * used for the internal TCP socket
+	 */
+	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
+	if (!smcibdev) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+
+	/* do inband token exchange */
+	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
+	if (reason_code < 0) {
+		rc = reason_code;
+		goto out_err;
+	}
+	if (reason_code > 0) /* configuration error */
+		goto decline_rdma;
+	/* receive SMC Accept CLC message */
+	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
+				       SMC_CLC_ACCEPT);
+	if (reason_code < 0) {
+		rc = reason_code;
+		goto out_err;
+	}
+	if (reason_code > 0)
+		goto decline_rdma;
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * create connection, link group, link
+	 */
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * create rmbs, map rmbs, rtoken_handling, modify_qp
+	 */
+
+	rc = smc_clc_send_confirm(smc);
+	if (rc)
+		goto out_err;
+
+	/* tbd in follow-on patch: llc_confirm */
+
+out_connected:
+	smc_copy_sock_settings_to_clc(smc);
+	smc->sk.sk_state = SMC_ACTIVE;
+
+	return rc;
+
+decline_rdma:
+	/* RDMA setup failed, switch back to TCP */
+	smc->use_fallback = true;
+	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+		rc = smc_clc_send_decline(smc, reason_code, 0);
+		if (rc < sizeof(struct smc_clc_msg_decline))
+			goto out_err;
+	}
+	goto out_connected;
+
+out_err:
+	return rc;
+}
+
 static int smc_connect(struct socket *sock, struct sockaddr *addr,
 		       int alen, int flags)
 {
@@ -198,6 +326,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 		goto out_err;
 	if (addr->sa_family != AF_INET)
 		goto out_err;
+	smc->addr = addr;	/* needed for nonblocking connect */
 
 	lock_sock(sk);
 	switch (sk->sk_state) {
@@ -216,12 +345,12 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 	if (rc)
 		goto out;
 
-	sk->sk_state = SMC_ACTIVE;
-
-	/* always use TCP fallback as transport mechanism for now;
-	 * This will change once RDMA transport is implemented
-	 */
-	smc->use_fallback = true;
+	/* setup RDMA connection */
+	rc = smc_connect_rdma(smc);
+	if (rc < 0)
+		goto out;
+	else
+		rc = 0; /* success cases including fallback */
 
 out:
 	release_sock(sk);
@@ -236,17 +365,32 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 	struct sock *new_sk;
 	int rc;
 
+	release_sock(&lsmc->sk);
 	new_sk = smc_sock_alloc(sock_net(sk), NULL);
 	if (!new_sk) {
 		rc = -ENOMEM;
 		lsmc->sk.sk_err = ENOMEM;
 		*new_smc = NULL;
+		lock_sock(&lsmc->sk);
 		goto out;
 	}
 	*new_smc = smc_sk(new_sk);
 
 	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
-	if (rc) {
+	lock_sock(&lsmc->sk);
+	if  (rc < 0) {
+		lsmc->sk.sk_err = -rc;
+		new_sk->sk_state = SMC_CLOSED;
+		sock_set_flag(new_sk, SOCK_DEAD);
+		sock_put(new_sk);
+		*new_smc = NULL;
+		goto out;
+	}
+	if (lsmc->sk.sk_state == SMC_CLOSED) {
+		if (new_clcsock)
+			sock_release(new_clcsock);
+		new_sk->sk_state = SMC_CLOSED;
+		sock_set_flag(new_sk, SOCK_DEAD);
 		sock_put(new_sk);
 		*new_smc = NULL;
 		goto out;
@@ -257,6 +401,216 @@ out:
 	return rc;
 }
 
+/* add a just created sock to the accept queue of the listen sock as
+ * candidate for a following socket accept call from user space
+ */
+static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+	struct smc_sock *par = smc_sk(parent);
+
+	sock_hold(sk);
+	spin_lock(&par->accept_q_lock);
+	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
+	spin_unlock(&par->accept_q_lock);
+	sk_acceptq_added(parent);
+}
+
+/* remove a socket from the accept queue of its parental listening socket */
+static void smc_accept_unlink(struct sock *sk)
+{
+	struct smc_sock *par = smc_sk(sk)->listen_smc;
+
+	spin_lock(&par->accept_q_lock);
+	list_del_init(&smc_sk(sk)->accept_q);
+	spin_unlock(&par->accept_q_lock);
+	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
+	sock_put(sk);
+}
+
+/* remove a sock from the accept queue to bind it to a new socket created
+ * for a socket accept call from user space
+ */
+static struct sock *smc_accept_dequeue(struct sock *parent,
+				       struct socket *new_sock)
+{
+	struct smc_sock *isk, *n;
+	struct sock *new_sk;
+
+	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
+		new_sk = (struct sock *)isk;
+
+		smc_accept_unlink(new_sk);
+		if (new_sk->sk_state == SMC_CLOSED) {
+			/* tbd in follow-on patch: close this sock */
+			continue;
+		}
+		if (new_sock)
+			sock_graft(new_sk, new_sock);
+		return new_sk;
+	}
+	return NULL;
+}
+
+/* clean up for a created but never accepted sock */
+static void smc_close_non_accepted(struct sock *sk)
+{
+	struct smc_sock *smc = smc_sk(sk);
+
+	sock_hold(sk);
+	if (smc->clcsock) {
+		struct socket *tcp;
+
+		tcp = smc->clcsock;
+		smc->clcsock = NULL;
+		sock_release(tcp);
+	}
+	/* more closing stuff to be added with socket closing patch */
+	sock_put(sk);
+}
+
+/* setup for RDMA connection of server */
+static void smc_listen_work(struct work_struct *work)
+{
+	struct smc_sock *new_smc = container_of(work, struct smc_sock,
+						smc_listen_work);
+	struct socket *newclcsock = new_smc->clcsock;
+	struct smc_sock *lsmc = new_smc->listen_smc;
+	struct smc_clc_msg_accept_confirm cclc;
+	struct sock *newsmcsk = &new_smc->sk;
+	struct smc_clc_msg_proposal pclc;
+	struct smc_ib_device *smcibdev;
+	struct sockaddr_in peeraddr;
+	int reason_code = 0;
+	int rc = 0, len;
+	__be32 subnet;
+	u8 prefix_len;
+	u8 ibport;
+
+	/* do inband token exchange -
+	 *wait for and receive SMC Proposal CLC message
+	 */
+	reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
+				       SMC_CLC_PROPOSAL);
+	if (reason_code < 0)
+		goto out_err;
+	if (reason_code > 0)
+		goto decline_rdma;
+
+	/* IPSec connections opt out of SMC-R optimizations */
+	if (using_ipsec(new_smc)) {
+		reason_code = SMC_CLC_DECL_IPSEC;
+		goto decline_rdma;
+	}
+
+	/* PNET table look up: search active ib_device and port
+	 * within same PNETID that also contains the ethernet device
+	 * used for the internal TCP socket
+	 */
+	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
+	if (!smcibdev) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+
+	/* determine subnet and mask from internal TCP socket */
+	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+	if ((pclc.outgoing_subnet != subnet) ||
+	    (pclc.prefix_len != prefix_len)) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+
+	/* get address of the peer connected to the internal TCP socket */
+	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * create connection, link_group, link
+	 */
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * create rmbs, map rmbs
+	 */
+
+	rc = smc_clc_send_accept(new_smc);
+	if (rc)
+		goto out_err;
+
+	/* receive SMC Confirm CLC message */
+	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+				       SMC_CLC_CONFIRM);
+	if (reason_code < 0)
+		goto out_err;
+	if (reason_code > 0)
+		goto decline_rdma;
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * rtoken_handling, modify_qp
+	 */
+
+out_connected:
+	sk_refcnt_debug_inc(newsmcsk);
+	newsmcsk->sk_state = SMC_ACTIVE;
+enqueue:
+	lock_sock(&lsmc->sk);
+	if (lsmc->sk.sk_state == SMC_LISTEN) {
+		smc_accept_enqueue(&lsmc->sk, newsmcsk);
+	} else { /* no longer listening */
+		smc_close_non_accepted(newsmcsk);
+	}
+	release_sock(&lsmc->sk);
+
+	/* Wake up accept */
+	lsmc->sk.sk_data_ready(&lsmc->sk);
+	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
+	return;
+
+decline_rdma:
+	/* RDMA setup failed, switch back to TCP */
+	new_smc->use_fallback = true;
+	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+		rc = smc_clc_send_decline(new_smc, reason_code, 0);
+		if (rc < sizeof(struct smc_clc_msg_decline))
+			goto out_err;
+	}
+	goto out_connected;
+
+out_err:
+	newsmcsk->sk_state = SMC_CLOSED;
+	goto enqueue; /* queue new sock with sk_err set */
+}
+
+static void smc_tcp_listen_work(struct work_struct *work)
+{
+	struct smc_sock *lsmc = container_of(work, struct smc_sock,
+					     tcp_listen_work);
+	struct smc_sock *new_smc;
+	int rc = 0;
+
+	lock_sock(&lsmc->sk);
+	while (lsmc->sk.sk_state == SMC_LISTEN) {
+		rc = smc_clcsock_accept(lsmc, &new_smc);
+		if (rc)
+			goto out;
+		if (!new_smc)
+			continue;
+
+		new_smc->listen_smc = lsmc;
+		new_smc->use_fallback = false; /* assume rdma capability first*/
+		sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
+		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
+		smc_copy_sock_settings_to_smc(new_smc);
+		schedule_work(&new_smc->smc_listen_work);
+	}
+
+out:
+	release_sock(&lsmc->sk);
+	lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
+}
+
 static int smc_listen(struct socket *sock, int backlog)
 {
 	struct sock *sk = sock->sk;
@@ -286,6 +640,8 @@ static int smc_listen(struct socket *sock, int backlog)
 	sk->sk_max_ack_backlog = backlog;
 	sk->sk_ack_backlog = 0;
 	sk->sk_state = SMC_LISTEN;
+	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+	schedule_work(&smc->tcp_listen_work);
 
 out:
 	release_sock(sk);
@@ -295,10 +651,11 @@ out:
 static int smc_accept(struct socket *sock, struct socket *new_sock,
 		      int flags)
 {
-	struct smc_sock *new_smc;
-	struct sock *sk = sock->sk;
+	struct sock *sk = sock->sk, *nsk;
+	DECLARE_WAITQUEUE(wait, current);
 	struct smc_sock *lsmc;
-	int rc;
+	long timeo;
+	int rc = 0;
 
 	lsmc = smc_sk(sk);
 	lock_sock(sk);
@@ -308,18 +665,30 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 		goto out;
 	}
 
-	rc = smc_clcsock_accept(lsmc, &new_smc);
-	if (rc)
-		goto out;
-	sock_graft(&new_smc->sk, new_sock);
-	new_smc->sk.sk_state = SMC_ACTIVE;
-
-	smc_copy_sock_settings_to_smc(new_smc);
+	/* Wait for an incoming connection */
+	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!timeo) {
+			rc = -EAGAIN;
+			break;
+		}
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		/* wakeup by sk_data_ready in smc_listen_work() */
+		sched_annotate_sleep();
+		lock_sock(sk);
+		if (signal_pending(current)) {
+			rc = sock_intr_errno(timeo);
+			break;
+		}
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
 
-	/* always use TCP fallback as transport mechanism for now;
-	 * This will change once RDMA transport is implemented
-	 */
-	new_smc->use_fallback = true;
+	if (!rc)
+		rc = sock_error(nsk);
 
 out:
 	release_sock(sk);
@@ -379,29 +748,61 @@ out:
 	return rc;
 }
 
+static unsigned int smc_accept_poll(struct sock *parent)
+{
+	struct smc_sock *isk;
+	struct sock *sk;
+
+	lock_sock(parent);
+	list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
+		sk = (struct sock *)isk;
+
+		if (sk->sk_state == SMC_ACTIVE) {
+			release_sock(parent);
+			return POLLIN | POLLRDNORM;
+		}
+	}
+	release_sock(parent);
+
+	return 0;
+}
+
 static unsigned int smc_poll(struct file *file, struct socket *sock,
 			     poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	unsigned int mask = 0;
 	struct smc_sock *smc;
+	int rc;
 
 	smc = smc_sk(sock->sk);
-	if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) ||
-	    smc->use_fallback) {
+	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
+		/* delegate to CLC child sock */
 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
 		/* if non-blocking connect finished ... */
 		lock_sock(sk);
 		if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
-			sk->sk_state = SMC_ACTIVE;
-			/* always use TCP fallback as transport mechanism;
-			 * This will change once RDMA transport is implemented
-			 */
-			smc->use_fallback = true;
+			sk->sk_err = smc->clcsock->sk->sk_err;
+			if (sk->sk_err) {
+				mask |= POLLERR;
+			} else {
+				rc = smc_connect_rdma(smc);
+				if (rc < 0)
+					mask |= POLLERR;
+				else
+					/* success cases including fallback */
+					mask |= POLLOUT | POLLWRNORM;
+			}
 		}
 		release_sock(sk);
 	} else {
-		mask = sock_no_poll(file, sock, wait);
+		sock_poll_wait(file, sk_sleep(sk), wait);
+		if (sk->sk_state == SMC_LISTEN)
+			/* woken up by sk_data_ready in smc_listen_work() */
+			mask |= smc_accept_poll(sk);
+		if (sk->sk_err)
+			mask |= POLLERR;
+		/* for now - to be enhanced in follow-on patch */
 	}
 
 	return mask;
@@ -568,6 +969,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
 
 	/* create internal TCP socket for CLC handshake and fallback */
 	smc = smc_sk(sk);
+	smc->use_fallback = false; /* assume rdma capability first */
 	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
 			      IPPROTO_TCP, &smc->clcsock);
 	if (rc)