From e11ecddf5128011c936cc5360780190cbc901fdc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Sep 2014 04:19:51 -0700
Subject: tcp: use TCP_SKB_CB(skb)->tcp_flags in input path

Input path of TCP do not currently uses TCP_SKB_CB(skb)->tcp_flags,
which is only used in output path.

tcp_recvmsg(), looks at tcp_hdr(skb)->syn for every skb found in receive queue,
and its unfortunate because this bit is located in a cache line right before
the payload.

We can simplify TCP by copying tcp flags into TCP_SKB_CB(skb)->tcp_flags.

This patch does so, and avoids the cache line miss in tcp_recvmsg()

Following patches will
- allow a segment with FIN being coalesced in tcp_try_coalesce()
- simplify tcp_collapse() by not copying the headers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 541f26a67ba2..070aeff1b131 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1510,9 +1510,9 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 
 	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
 		offset = seq - TCP_SKB_CB(skb)->seq;
-		if (tcp_hdr(skb)->syn)
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 			offset--;
-		if (offset < skb->len || tcp_hdr(skb)->fin) {
+		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
 			*off = offset;
 			return skb;
 		}
@@ -1585,7 +1585,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			if (offset + 1 != skb->len)
 				continue;
 		}
-		if (tcp_hdr(skb)->fin) {
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
 			sk_eat_skb(sk, skb, false);
 			++seq;
 			break;
@@ -1722,11 +1722,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				break;
 
 			offset = *seq - TCP_SKB_CB(skb)->seq;
-			if (tcp_hdr(skb)->syn)
+			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 				offset--;
 			if (offset < skb->len)
 				goto found_ok_skb;
-			if (tcp_hdr(skb)->fin)
+			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 				goto found_fin_ok;
 			WARN(!(flags & MSG_PEEK),
 			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
@@ -1959,7 +1959,7 @@ skip_copy:
 		if (used + offset < skb->len)
 			continue;
 
-		if (tcp_hdr(skb)->fin)
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			goto found_fin_ok;
 		if (!(flags & MSG_PEEK)) {
 			sk_eat_skb(sk, skb, copied_early);
@@ -2160,8 +2160,10 @@ void tcp_close(struct sock *sk, long timeout)
 	 *  reader process may not have drained the data yet!
 	 */
 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
-		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
-			  tcp_hdr(skb)->fin;
+		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+			len--;
 		data_was_unread += len;
 		__kfree_skb(skb);
 	}
-- 
cgit 1.4.1


From f4a775d14489a801a5b8b0540e23ab82e2703091 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Sep 2014 16:29:32 -0700
Subject: net: introduce __skb_header_release()

While profiling TCP stack, I noticed one useless atomic operation
in tcp_sendmsg(), caused by skb_header_release().

It turns out all current skb_header_release() users have a fresh skb,
that no other user can see, so we can avoid one atomic operation.

Introduce __skb_header_release() to clearly document this.

This gave me a 1.5 % improvement on TCP_RR workload.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 15 +++++++++++++++
 net/core/skbuff.c      |  4 ++--
 net/ipv4/tcp.c         |  4 ++--
 net/ipv4/tcp_output.c  | 10 +++++-----
 4 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f1bfa3781c75..8eaa62400fca 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1083,6 +1083,7 @@ static inline int skb_header_cloned(const struct sk_buff *skb)
  *	Drop a reference to the header part of the buffer.  This is done
  *	by acquiring a payload reference.  You must not read from the header
  *	part of skb->data after this.
+ *	Note : Check if you can use __skb_header_release() instead.
  */
 static inline void skb_header_release(struct sk_buff *skb)
 {
@@ -1091,6 +1092,20 @@ static inline void skb_header_release(struct sk_buff *skb)
 	atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref);
 }
 
+/**
+ *	__skb_header_release - release reference to header
+ *	@skb: buffer to operate on
+ *
+ *	Variant of skb_header_release() assuming skb is private to caller.
+ *	We can avoid one atomic operation.
+ */
+static inline void __skb_header_release(struct sk_buff *skb)
+{
+	skb->nohdr = 1;
+	atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT));
+}
+
+
 /**
  *	skb_shared - is the buffer shared
  *	@skb: buffer to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 06a8feb10099..512dc7dcbc32 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3179,7 +3179,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	skb_shinfo(nskb)->frag_list = p;
 	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
 	pinfo->gso_size = 0;
-	skb_header_release(p);
+	__skb_header_release(p);
 	NAPI_GRO_CB(nskb)->last = p;
 
 	nskb->data_len += p->len;
@@ -3211,7 +3211,7 @@ merge:
 	else
 		NAPI_GRO_CB(p)->last->next = skb;
 	NAPI_GRO_CB(p)->last = skb;
-	skb_header_release(skb);
+	__skb_header_release(skb);
 	lp = p;
 
 done:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 070aeff1b131..553b01f52f71 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -609,7 +609,7 @@ static inline bool forced_push(const struct tcp_sock *tp)
 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 }
 
-static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
+static void skb_entail(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -618,7 +618,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 	tcb->seq     = tcb->end_seq = tp->write_seq;
 	tcb->tcp_flags = TCPHDR_ACK;
 	tcb->sacked  = 0;
-	skb_header_release(skb);
+	__skb_header_release(skb);
 	tcp_add_write_queue_tail(sk, skb);
 	sk->sk_wmem_queued += skb->truesize;
 	sk_mem_charge(sk, skb->truesize);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8c61a7c0c889..f173b1c4f815 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -995,7 +995,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 
 	/* Advance write_seq and place onto the write_queue. */
 	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
-	skb_header_release(skb);
+	__skb_header_release(skb);
 	tcp_add_write_queue_tail(sk, skb);
 	sk->sk_wmem_queued += skb->truesize;
 	sk_mem_charge(sk, skb->truesize);
@@ -1167,7 +1167,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	}
 
 	/* Link BUFF into the send queue. */
-	skb_header_release(buff);
+	__skb_header_release(buff);
 	tcp_insert_write_queue_after(skb, buff, sk);
 
 	return 0;
@@ -1671,7 +1671,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	tcp_set_skb_tso_segs(sk, buff, mss_now);
 
 	/* Link BUFF into the send queue. */
-	skb_header_release(buff);
+	__skb_header_release(buff);
 	tcp_insert_write_queue_after(skb, buff, sk);
 
 	return 0;
@@ -2772,7 +2772,7 @@ int tcp_send_synack(struct sock *sk)
 			if (nskb == NULL)
 				return -ENOMEM;
 			tcp_unlink_write_queue(skb, sk);
-			skb_header_release(nskb);
+			__skb_header_release(nskb);
 			__tcp_add_write_queue_head(sk, nskb);
 			sk_wmem_free_skb(sk, skb);
 			sk->sk_wmem_queued += nskb->truesize;
@@ -2947,7 +2947,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
 	tcb->end_seq += skb->len;
-	skb_header_release(skb);
+	__skb_header_release(skb);
 	__tcp_add_write_queue_tail(sk, skb);
 	sk->sk_wmem_queued += skb->truesize;
 	sk_mem_charge(sk, skb->truesize);
-- 
cgit 1.4.1


From cd7d8498c9a5d510c64db38d9f4f4fbc41790f09 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 24 Sep 2014 04:11:22 -0700
Subject: tcp: change tcp_skb_pcount() location

Our goal is to access no more than one cache line access per skb in
a write or receive queue when doing the various walks.

After recent TCP_SKB_CB() reorganizations, it is almost done.

Last part is tcp_skb_pcount() which currently uses
skb_shinfo(skb)->gso_segs, which is a terrible choice, because it needs
3 cache lines in current kernel (skb->head, skb->end, and
shinfo->gso_segs are all in 3 different cache lines, far from skb->cb)

This very simple patch reuses space currently taken by tcp_tw_isn
only in input path, as tcp_skb_pcount is only needed for skb stored in
write queue.

This considerably speeds up tcp_ack(), granted we avoid shinfo->tx_flags
to get SKBTX_ACK_TSTAMP, which seems possible.

This also speeds up all sack processing in general.

This speeds up tcp_sendmsg() because it no longer has to access/dirty
shinfo.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 23 +++++++++++++++++++++--
 net/ipv4/tcp.c        |  4 ++--
 net/ipv4/tcp_input.c  |  8 ++++----
 net/ipv4/tcp_output.c |  9 ++++++---
 4 files changed, 33 insertions(+), 11 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4dc6641ee990..02a9a2c366bf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -698,7 +698,16 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 struct tcp_skb_cb {
 	__u32		seq;		/* Starting sequence number	*/
 	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
-	__u32		tcp_tw_isn;	/* isn chosen by tcp_timewait_state_process() */
+	union {
+		/* Note : tcp_tw_isn is used in input path only
+		 *	  (isn chosen by tcp_timewait_state_process())
+		 *
+		 * 	  tcp_gso_segs is used in write queue only,
+		 *	  cf tcp_skb_pcount()
+		 */
+		__u32		tcp_tw_isn;
+		__u32		tcp_gso_segs;
+	};
 	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/
 
 	__u8		sacked;		/* State flags for SACK/FACK.	*/
@@ -746,7 +755,17 @@ TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
  */
 static inline int tcp_skb_pcount(const struct sk_buff *skb)
 {
-	return skb_shinfo(skb)->gso_segs;
+	return TCP_SKB_CB(skb)->tcp_gso_segs;
+}
+
+static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
+{
+	TCP_SKB_CB(skb)->tcp_gso_segs = segs;
+}
+
+static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
+{
+	TCP_SKB_CB(skb)->tcp_gso_segs += segs;
 }
 
 /* This is valid iff tcp_skb_pcount() > 1. */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 553b01f52f71..87289e51be00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -963,7 +963,7 @@ new_segment:
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		tp->write_seq += copy;
 		TCP_SKB_CB(skb)->end_seq += copy;
-		skb_shinfo(skb)->gso_segs = 0;
+		tcp_skb_pcount_set(skb, 0);
 
 		if (!copied)
 			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
@@ -1261,7 +1261,7 @@ new_segment:
 
 			tp->write_seq += copy;
 			TCP_SKB_CB(skb)->end_seq += copy;
-			skb_shinfo(skb)->gso_segs = 0;
+			tcp_skb_pcount_set(skb, 0);
 
 			from += copy;
 			copied += copy;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f3f016a15c5a..2c0af90231cf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1295,9 +1295,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	TCP_SKB_CB(prev)->end_seq += shifted;
 	TCP_SKB_CB(skb)->seq += shifted;
 
-	skb_shinfo(prev)->gso_segs += pcount;
-	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
-	skb_shinfo(skb)->gso_segs -= pcount;
+	tcp_skb_pcount_add(prev, pcount);
+	BUG_ON(tcp_skb_pcount(skb) < pcount);
+	tcp_skb_pcount_add(skb, -pcount);
 
 	/* When we're adding to gso_segs == 1, gso_size will be zero,
 	 * in theory this shouldn't be necessary but as long as DSACK
@@ -1310,7 +1310,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	}
 
 	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
-	if (skb_shinfo(skb)->gso_segs <= 1) {
+	if (tcp_skb_pcount(skb) <= 1) {
 		skb_shinfo(skb)->gso_size = 0;
 		skb_shinfo(skb)->gso_type = 0;
 	}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a462fb1db896..4d92703df4c6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -384,7 +384,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 	TCP_SKB_CB(skb)->tcp_flags = flags;
 	TCP_SKB_CB(skb)->sacked = 0;
 
-	shinfo->gso_segs = 1;
+	tcp_skb_pcount_set(skb, 1);
 	shinfo->gso_size = 0;
 	shinfo->gso_type = 0;
 
@@ -972,6 +972,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
 			      tcp_skb_pcount(skb));
 
+	/* OK, its time to fill skb_shinfo(skb)->gso_segs */
+	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+
 	/* Our usage of tstamp should remain private */
 	skb->tstamp.tv64 = 0;
 
@@ -1019,11 +1022,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
-		shinfo->gso_segs = 1;
+		tcp_skb_pcount_set(skb, 1);
 		shinfo->gso_size = 0;
 		shinfo->gso_type = 0;
 	} else {
-		shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
 		shinfo->gso_size = mss_now;
 		shinfo->gso_type = sk->sk_gso_type;
 	}
-- 
cgit 1.4.1


From 55d8694fa82c9b5858ae5a78a210353961f908f9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 26 Sep 2014 22:37:32 +0200
Subject: net: tcp: assign tcp cong_ops when tcp sk is created

Split assignment and initialization from one into two functions.

This is required by followup patches that add Datacenter TCP
(DCTCP) congestion control algorithm - we need to be able to
determine if the connection is moderated by DCTCP before the
3WHS has finished.

As we walk the available congestion control list during the
assignment, we are always guaranteed to have Reno present as
it's fixed compiled-in. Therefore, since we're doing the
early assignment, we don't have a real use for the Reno alias
tcp_init_congestion_ops anymore and can thus remove it.

Actual usage of the congestion control operations are being
made after the 3WHS has finished, in some cases however we
can access get_info() via diag if implemented, therefore we
need to zero out the private area for those modules.

Joint work with Daniel Borkmann and Glenn Judd.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h        |  2 +-
 net/ipv4/tcp.c           |  6 ++----
 net/ipv4/tcp_cong.c      | 46 ++++++++++++++++++++++------------------------
 net/ipv4/tcp_minisocks.c |  5 ++---
 4 files changed, 27 insertions(+), 32 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 02a9a2c366bf..f99b0c072ee5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -824,6 +824,7 @@ struct tcp_congestion_ops {
 int tcp_register_congestion_control(struct tcp_congestion_ops *type);
 void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
 
+void tcp_assign_congestion_control(struct sock *sk);
 void tcp_init_congestion_control(struct sock *sk);
 void tcp_cleanup_congestion_control(struct sock *sk);
 int tcp_set_default_congestion_control(const char *name);
@@ -835,7 +836,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name);
 int tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
 
-extern struct tcp_congestion_ops tcp_init_congestion_ops;
 u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 87289e51be00..cf5e508e1ef5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk)
 
 	tp->reordering = sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
-	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+	tcp_assign_congestion_control(sk);
 
 	tp->tsoffset = 0;
 
@@ -3258,8 +3258,6 @@ void __init tcp_init(void)
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
 	tcp_metrics_init();
-
-	tcp_register_congestion_control(&tcp_reno);
-
+	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
 	tcp_tasklet_init();
 }
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 80248f56c89f..a6c8a5775624 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
 /* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct sock *sk)
+void tcp_assign_congestion_control(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_congestion_ops *ca;
 
-	/* if no choice made yet assign the current value set as default */
-	if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
-		rcu_read_lock();
-		list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
-			if (try_module_get(ca->owner)) {
-				icsk->icsk_ca_ops = ca;
-				break;
-			}
-
-			/* fallback to next available */
+	rcu_read_lock();
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (likely(try_module_get(ca->owner))) {
+			icsk->icsk_ca_ops = ca;
+			goto out;
 		}
-		rcu_read_unlock();
+		/* Fallback to next available. The last really
+		 * guaranteed fallback is Reno from this list.
+		 */
 	}
+out:
+	rcu_read_unlock();
+
+	/* Clear out private data before diag gets it and
+	 * the ca has not been initialized.
+	 */
+	if (ca->get_info)
+		memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+}
+
+void tcp_init_congestion_control(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (icsk->icsk_ca_ops->init)
 		icsk->icsk_ca_ops->init(sk);
@@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = {
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
 };
-
-/* Initial congestion control used (until SYN)
- * really reno under another name so we can tell difference
- * during tcp_set_default_congestion_control
- */
-struct tcp_congestion_ops tcp_init_congestion_ops  = {
-	.name		= "",
-	.owner		= THIS_MODULE,
-	.ssthresh	= tcp_reno_ssthresh,
-	.cong_avoid	= tcp_reno_cong_avoid,
-};
-EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a058f411d3a6..47b73506b77e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->snd_cwnd = TCP_INIT_CWND;
 		newtp->snd_cwnd_cnt = 0;
 
-		if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
-		    !try_module_get(newicsk->icsk_ca_ops->owner))
-			newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
+		if (!try_module_get(newicsk->icsk_ca_ops->owner))
+			tcp_assign_congestion_control(newsk);
 
 		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
-- 
cgit 1.4.1


From 47d7a88c188f06ffaea3a539f84fe10cb4e77787 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Wed, 1 Oct 2014 18:27:50 +0200
Subject: tcp: add __init to tcp_init_mem

tcp_init_mem is only called by __init tcp_init.

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index cf5e508e1ef5..5c170340f684 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3172,7 +3172,7 @@ static int __init set_thash_entries(char *str)
 }
 __setup("thash_entries=", set_thash_entries);
 
-static void tcp_init_mem(void)
+static void __init tcp_init_mem(void)
 {
 	unsigned long limit = nr_free_buffer_pages() / 8;
 	limit = max(limit, 128UL);
-- 
cgit 1.4.1


From b248230c34970a6c1c17c591d63b464e8d2cfc33 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 29 Sep 2014 13:20:38 -0700
Subject: tcp: abort orphan sockets stalling on zero window probes

Currently we have two different policies for orphan sockets
that repeatedly stall on zero window ACKs. If a socket gets
a zero window ACK when it is transmitting data, the RTO is
used to probe the window. The socket is aborted after roughly
tcp_orphan_retries() retries (as in tcp_write_timeout()).

But if the socket was idle when it received the zero window ACK,
and later wants to send more data, we use the probe timer to
probe the window. If the receiver always returns zero window ACKs,
icsk_probes keeps getting reset in tcp_ack() and the orphan socket
can stall forever until the system reaches the orphan limit (as
commented in tcp_probe_timer()). This opens up a simple attack
to create lots of hanging orphan sockets to burn the memory
and the CPU, as demonstrated in the recent netdev post "TCP
connection will hang in FIN_WAIT1 after closing if zero window is
advertised." http://www.spinics.net/lists/netdev/msg296539.html

This patch follows the design in RTO-based probe: we abort an orphan
socket stalling on zero window when the probe timer reaches both
the maximum backoff and the maximum RTO. For example, an 100ms RTT
connection will timeout after roughly 153 seconds (0.3 + 0.6 +
.... + 76.8) if the receiver keeps the window shut. If the orphan
socket passes this check, but the system already has too many orphans
(as in tcp_out_of_resources()), we still abort it but we'll also
send an RST packet as the connection may still be active.

In addition, we change TCP_USER_TIMEOUT to cover (life or dead)
sockets stalled on zero-window probes. This changes the semantics
of TCP_USER_TIMEOUT slightly because it previously only applies
when the socket has pending transmission.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reported-by: Andrey Dmitrov <andrey.dmitrov@oktetlabs.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c       |  2 +-
 net/ipv4/tcp_timer.c | 41 +++++++++++++++++++++--------------------
 2 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5c170340f684..26a6f113f00c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2693,7 +2693,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		break;
 #endif
 	case TCP_USER_TIMEOUT:
-		/* Cap the max timeout in ms TCP will retry/retrans
+		/* Cap the max time in ms TCP will retry or probe the window
 		 * before giving up and aborting (ETIMEDOUT) a connection.
 		 */
 		if (val < 0)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b24360f6e293..9b21ae8b2e31 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -52,7 +52,7 @@ static void tcp_write_err(struct sock *sk)
  *    limit.
  * 2. If we have strong memory pressure.
  */
-static int tcp_out_of_resources(struct sock *sk, int do_reset)
+static int tcp_out_of_resources(struct sock *sk, bool do_reset)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int shift = 0;
@@ -72,7 +72,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
 		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
 		    /*  2. Window is closed. */
 		    (!tp->snd_wnd && !tp->packets_out))
-			do_reset = 1;
+			do_reset = true;
 		if (do_reset)
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 		tcp_done(sk);
@@ -270,40 +270,41 @@ static void tcp_probe_timer(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_probes;
+	u32 start_ts;
 
 	if (tp->packets_out || !tcp_send_head(sk)) {
 		icsk->icsk_probes_out = 0;
 		return;
 	}
 
-	/* *WARNING* RFC 1122 forbids this
-	 *
-	 * It doesn't AFAIK, because we kill the retransmit timer -AK
-	 *
-	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
-	 * this behaviour in Solaris down as a bug fix. [AC]
-	 *
-	 * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
-	 * even if they advertise zero window. Hence, connection is killed only
-	 * if we received no ACKs for normal connection timeout. It is not killed
-	 * only because window stays zero for some time, window may be zero
-	 * until armageddon and even later. We are in full accordance
-	 * with RFCs, only probe timer combines both retransmission timeout
-	 * and probe timeout in one bottle.				--ANK
+	/* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
+	 * long as the receiver continues to respond probes. We support this by
+	 * default and reset icsk_probes_out with incoming ACKs. But if the
+	 * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
+	 * kill the socket when the retry count and the time exceeds the
+	 * corresponding system limit. We also implement similar policy when
+	 * we use RTO to probe window in tcp_retransmit_timer().
 	 */
-	max_probes = sysctl_tcp_retries2;
+	start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+	if (!start_ts)
+		skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp);
+	else if (icsk->icsk_user_timeout &&
+		 (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
+		goto abort;
 
+	max_probes = sysctl_tcp_retries2;
 	if (sock_flag(sk, SOCK_DEAD)) {
 		const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
 
 		max_probes = tcp_orphan_retries(sk, alive);
-
-		if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
+		if (!alive && icsk->icsk_backoff >= max_probes)
+			goto abort;
+		if (tcp_out_of_resources(sk, true))
 			return;
 	}
 
 	if (icsk->icsk_probes_out > max_probes) {
-		tcp_write_err(sk);
+abort:		tcp_write_err(sk);
 	} else {
 		/* Only send another probe if we didn't close things up. */
 		tcp_send_probe0(sk);
-- 
cgit 1.4.1