summary refs log tree commit diff
path: root/kernel/bpf/sockmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf/sockmap.c')
-rw-r--r--kernel/bpf/sockmap.c187
1 files changed, 115 insertions, 72 deletions
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0314d1783d77..48c33417d13c 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -86,9 +86,10 @@ struct smap_psock {
 	struct work_struct tx_work;
 	struct work_struct gc_work;
 
+	struct proto *sk_proto;
+	void (*save_close)(struct sock *sk, long timeout);
 	void (*save_data_ready)(struct sock *sk);
 	void (*save_write_space)(struct sock *sk);
-	void (*save_state_change)(struct sock *sk);
 };
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -96,12 +97,102 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+static struct proto tcp_bpf_proto;
+static int bpf_tcp_init(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	if (unlikely(psock->sk_proto)) {
+		rcu_read_unlock();
+		return -EBUSY;
+	}
+
+	psock->save_close = sk->sk_prot->close;
+	psock->sk_proto = sk->sk_prot;
+	sk->sk_prot = &tcp_bpf_proto;
+	rcu_read_unlock();
+	return 0;
+}
+
+static void bpf_tcp_release(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+
+	if (likely(psock)) {
+		sk->sk_prot = psock->sk_proto;
+		psock->sk_proto = NULL;
+	}
+	rcu_read_unlock();
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+	void (*close_fun)(struct sock *sk, long timeout);
+	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock *psock;
+	struct sock *osk;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		return sk->sk_prot->close(sk, timeout);
+	}
+
+	/* The psock may be destroyed anytime after exiting the RCU critial
+	 * section so by the time we use close_fun the psock may no longer
+	 * be valid. However, bpf_tcp_close is called with the sock lock
+	 * held so the close hook and sk are still valid.
+	 */
+	close_fun = psock->save_close;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		osk = cmpxchg(e->entry, sk, NULL);
+		if (osk == sk) {
+			list_del(&e->list);
+			smap_release_sock(psock, sk);
+		}
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+	rcu_read_unlock();
+	close_fun(sk, timeout);
+}
+
 enum __sk_action {
 	__SK_DROP = 0,
 	__SK_PASS,
 	__SK_REDIRECT,
 };
 
+static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
+	.name		= "bpf_tcp",
+	.uid		= TCP_ULP_BPF,
+	.user_visible	= false,
+	.owner		= NULL,
+	.init		= bpf_tcp_init,
+	.release	= bpf_tcp_release,
+};
+
+static int bpf_tcp_ulp_register(void)
+{
+	tcp_bpf_proto = tcp_prot;
+	tcp_bpf_proto.close = bpf_tcp_close;
+	return tcp_register_ulp(&bpf_tcp_ulp_ops);
+}
+
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -166,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err)
 	sk->sk_error_report(sk);
 }
 
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-
-/* Called with lock_sock(sk) held */
-static void smap_state_change(struct sock *sk)
-{
-	struct smap_psock_map_entry *e, *tmp;
-	struct smap_psock *psock;
-	struct socket_wq *wq;
-	struct sock *osk;
-
-	rcu_read_lock();
-
-	/* Allowing transitions into an established syn_recv states allows
-	 * for early binding sockets to a smap object before the connection
-	 * is established.
-	 */
-	switch (sk->sk_state) {
-	case TCP_SYN_SENT:
-	case TCP_SYN_RECV:
-	case TCP_ESTABLISHED:
-		break;
-	case TCP_CLOSE_WAIT:
-	case TCP_CLOSING:
-	case TCP_LAST_ACK:
-	case TCP_FIN_WAIT1:
-	case TCP_FIN_WAIT2:
-	case TCP_LISTEN:
-		break;
-	case TCP_CLOSE:
-		/* Only release if the map entry is in fact the sock in
-		 * question. There is a case where the operator deletes
-		 * the sock from the map, but the TCP sock is closed before
-		 * the psock is detached. Use cmpxchg to verify correct
-		 * sock is removed.
-		 */
-		psock = smap_psock_sk(sk);
-		if (unlikely(!psock))
-			break;
-		write_lock_bh(&sk->sk_callback_lock);
-		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-			osk = cmpxchg(e->entry, sk, NULL);
-			if (osk == sk) {
-				list_del(&e->list);
-				smap_release_sock(psock, sk);
-			}
-		}
-		write_unlock_bh(&sk->sk_callback_lock);
-		break;
-	default:
-		psock = smap_psock_sk(sk);
-		if (unlikely(!psock))
-			break;
-		smap_report_sk_error(psock, EPIPE);
-		break;
-	}
-
-	wq = rcu_dereference(sk->sk_wq);
-	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_all(&wq->wait);
-	rcu_read_unlock();
-}
-
 static void smap_read_sock_strparser(struct strparser *strp,
 				     struct sk_buff *skb)
 {
@@ -322,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
 		return;
 	sk->sk_data_ready = psock->save_data_ready;
 	sk->sk_write_space = psock->save_write_space;
-	sk->sk_state_change = psock->save_state_change;
 	psock->save_data_ready = NULL;
 	psock->save_write_space = NULL;
-	psock->save_state_change = NULL;
 	strp_stop(&psock->strp);
 	psock->strp_enabled = false;
 }
@@ -350,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 	if (psock->refcnt)
 		return;
 
+	tcp_cleanup_ulp(sock);
 	smap_stop_sock(psock, sock);
 	clear_bit(SMAP_TX_RUNNING, &psock->state);
 	rcu_assign_sk_user_data(sock, NULL);
@@ -427,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
 		return;
 	psock->save_data_ready = sk->sk_data_ready;
 	psock->save_write_space = sk->sk_write_space;
-	psock->save_state_change = sk->sk_state_change;
 	sk->sk_data_ready = smap_data_ready;
 	sk->sk_write_space = smap_write_space;
-	sk->sk_state_change = smap_state_change;
 	psock->strp_enabled = true;
 }
 
@@ -509,6 +535,10 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
 
+	err = bpf_tcp_ulp_register();
+	if (err && err != -EEXIST)
+		return ERR_PTR(err);
+
 	stab = kzalloc(sizeof(*stab), GFP_USER);
 	if (!stab)
 		return ERR_PTR(-ENOMEM);
@@ -590,11 +620,6 @@ static void sock_map_free(struct bpf_map *map)
 	}
 	rcu_read_unlock();
 
-	if (stab->bpf_verdict)
-		bpf_prog_put(stab->bpf_verdict);
-	if (stab->bpf_parse)
-		bpf_prog_put(stab->bpf_parse);
-
 	sock_map_remove_complete(stab);
 }
 
@@ -754,6 +779,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 			goto out_progs;
 		}
 
+		err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+		if (err)
+			goto out_progs;
+
 		set_bit(SMAP_TX_RUNNING, &psock->state);
 	}
 
@@ -866,6 +895,19 @@ static int sock_map_update_elem(struct bpf_map *map,
 	return err;
 }
 
+static void sock_map_release(struct bpf_map *map, struct file *map_file)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_prog *orig;
+
+	orig = xchg(&stab->bpf_parse, NULL);
+	if (orig)
+		bpf_prog_put(orig);
+	orig = xchg(&stab->bpf_verdict, NULL);
+	if (orig)
+		bpf_prog_put(orig);
+}
+
 const struct bpf_map_ops sock_map_ops = {
 	.map_alloc = sock_map_alloc,
 	.map_free = sock_map_free,
@@ -873,6 +915,7 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_get_next_key = sock_map_get_next_key,
 	.map_update_elem = sock_map_update_elem,
 	.map_delete_elem = sock_map_delete_elem,
+	.map_release = sock_map_release,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,