patch-1.3.63 linux/net/ipv4/tcp_input.c

Next file: linux/net/ipv4/tcp_output.c
Previous file: linux/net/ipv4/tcp.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v1.3.62/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
@@ -24,10 +24,97 @@
 #include <net/tcp.h>
 
 /*
+ *	Policy code extracted so its now seperate
+ */
+
+/*
+ *	Called each time to estimate the delayed ack timeout. This is
+ *	how it should be done so a fast link isnt impacted by ack delay.
+ */
+ 
+extern __inline__ void tcp_delack_estimator(struct sock *sk)
+{
+	/*
+	 *	Delayed ACK time estimator.
+	 */
+	
+	if (sk->lrcvtime == 0) 
+	{
+		sk->lrcvtime = jiffies;
+		sk->ato = HZ/3;
+	}
+	else 
+	{
+		int m;
+		
+		m = jiffies - sk->lrcvtime;
+
+		sk->lrcvtime = jiffies;
+
+		if (m <= 0)
+			m = 1;
+
+		if (m > (sk->rtt >> 3)) 
+		{
+			sk->ato = sk->rtt >> 3;
+			/*
+			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
+			 */
+		}
+		else 
+		{
+			sk->ato = (sk->ato >> 1) + m;
+			/*
+			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
+			 */
+		}
+	}
+}
+
+/*
+ *	Called on frames that were known _not_ to have been
+ *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 
+ *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
+ */
+ 
+extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
+{
+	long m;
+	/*
+	 *	The following amusing code comes from Jacobson's
+	 *	article in SIGCOMM '88.  Note that rtt and mdev
+	 *	are scaled versions of rtt and mean deviation.
+	 *	This is designed to be as fast as possible 
+	 *	m stands for "measurement".
+	 */
+	
+	m = jiffies - oskb->when;  /* RTT */
+	if(m<=0)
+		m=1;		/* IS THIS RIGHT FOR <0 ??? */
+	m -= (sk->rtt >> 3);    /* m is now error in rtt est */
+	sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
+	if (m < 0)
+		m = -m;		/* m is now abs(error) */
+	m -= (sk->mdev >> 2);   /* similar update on mdev */
+	sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
+
+	/*
+	 *	Now update timeout.  Note that this removes any backoff.
+	 */
+			 
+	sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
+	if (sk->rto > 120*HZ)
+		sk->rto = 120*HZ;
+	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
+		sk->rto = HZ/5;
+	sk->backoff = 0;
+}
+
+/*
  *	Cached last hit socket
  */
  
-static volatile unsigned long 	th_cache_saddr,th_cache_daddr;
+static volatile unsigned long 	th_cache_saddr, th_cache_daddr;
 static volatile unsigned short  th_cache_dport, th_cache_sport;
 static volatile struct sock *th_cache_sk;
 
@@ -37,8 +124,10 @@
 }
 
 /*
- *	Find the socket, using the last hit cache if applicable.
+ *	Find the socket, using the last hit cache if applicable. The cache is not quite
+ *	right...
  */
+
 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
 {
 	struct sock * sk;
@@ -61,6 +150,7 @@
 /*
  * React to a out-of-window TCP sequence number in an incoming packet
  */
+ 
 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
 	     struct options *opt, unsigned long saddr, struct device *dev)
 {
@@ -79,8 +169,16 @@
 		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
 		return;
 	}
-
-	/* Try to resync things. */
+	
+	/*
+	 *	4.3reno machines look for these kind of acks so they can do fast
+	 *	recovery. Three identical 'old' acks lets it know that one frame has
+	 *	been lost and should be resent. Because this is before the whole window
+	 *	of data has timed out it can take one lost frame per window without
+	 *	stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
+	 *
+	 *	We also should be spotting triple bad sequences.
+	 */
 	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
 	return;
 }
@@ -99,20 +197,29 @@
 }
 
 /*
- *	When we get a reset we do this.
+ *	When we get a reset we do this. This probably is a tcp_output routine
+ *	really.
  */
 
 static int tcp_reset(struct sock *sk, struct sk_buff *skb)
 {
 	sk->zapped = 1;
+	/*
+	 *	We want the right error as BSD sees it (and indeed as we do).
+	 */
 	sk->err = ECONNRESET;
 	if (sk->state == TCP_SYN_SENT)
 		sk->err = ECONNREFUSED;
 	if (sk->state == TCP_CLOSE_WAIT)
 		sk->err = EPIPE;
-#ifdef TCP_DO_RFC1337		
+#ifdef CONFIG_TCP_RFC1337
 	/*
 	 *	Time wait assassination protection [RFC1337]
+	 *
+	 *	This is a good idea, but causes more sockets to take time to close.
+	 *
+	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
+	 *	bug in question.
 	 */
 	if(sk->state!=TCP_TIME_WAIT)
 	{	
@@ -228,8 +335,11 @@
 	}
 
 	/*
-	 * Make sure we can accept more.  This will prevent a
-	 * flurry of syns from eating up all our memory.
+	 *	Make sure we can accept more.  This will prevent a
+	 *	flurry of syns from eating up all our memory.
+	 *
+	 *	BSD does some funnies here and allows 3/2 times the
+	 *	set backlog as a fudge factor. Thats just too gross.
 	 */
 
 	if (sk->ack_backlog >= sk->max_ack_backlog) 
@@ -259,21 +369,24 @@
 	memcpy(newsk, sk, sizeof(*newsk));
 	newsk->opt = NULL;
 	newsk->ip_route_cache  = NULL;
-	if (opt && opt->optlen) {
-	  sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
-	  if (!sk->opt) {
-	        kfree_s(newsk, sizeof(struct sock));
-		tcp_statistics.TcpAttemptFails++;
-		kfree_skb(skb, FREE_READ);
-		return;
-	  }
-	  if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
-		kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
-	        kfree_s(newsk, sizeof(struct sock));
-		tcp_statistics.TcpAttemptFails++;
-		kfree_skb(skb, FREE_READ);
-		return;
-	  }
+	if (opt && opt->optlen) 
+	{
+		sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
+		if (!sk->opt) 
+		{
+	        	kfree_s(newsk, sizeof(struct sock));
+			tcp_statistics.TcpAttemptFails++;
+			kfree_skb(skb, FREE_READ);
+			return;
+		}
+		if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) 
+		{
+			kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
+	        	kfree_s(newsk, sizeof(struct sock));
+			tcp_statistics.TcpAttemptFails++;
+			kfree_skb(skb, FREE_READ);
+			return;
+		}
 	}
 	skb_queue_head_init(&newsk->write_queue);
 	skb_queue_head_init(&newsk->receive_queue);
@@ -336,15 +449,6 @@
 	newsk->rcv_saddr = daddr;
 
 	put_sock(newsk->num,newsk);
-	newsk->dummy_th.res1 = 0;
-	newsk->dummy_th.doff = 6;
-	newsk->dummy_th.fin = 0;
-	newsk->dummy_th.syn = 0;
-	newsk->dummy_th.rst = 0;	
-	newsk->dummy_th.psh = 0;
-	newsk->dummy_th.ack = 0;
-	newsk->dummy_th.urg = 0;
-	newsk->dummy_th.res2 = 0;
 	newsk->acked_seq = skb->seq + 1;
 	newsk->copied_seq = skb->seq + 1;
 	newsk->socket = NULL;
@@ -409,14 +513,83 @@
 	tcp_send_synack(newsk, sk, skb);
 }
 
+
+/*
+ * Handle a TCP window that shrunk on us. It shouldn't happen,
+ * but..
+ *
+ * We may need to move packets from the send queue
+ * to the write queue, if the window has been shrunk on us.
+ * The RFC says you are not allowed to shrink your window
+ * like this, but if the other end does, you must be able
+ * to deal with it.
+ */
+void tcp_window_shrunk(struct sock * sk, u32 window_seq)
+{
+	struct sk_buff *skb;
+	struct sk_buff *skb2;
+	struct sk_buff *wskb = NULL;
+ 	
+	skb2 = sk->send_head;
+	sk->send_head = NULL;
+	sk->send_tail = NULL;
+
+	/*
+	 *	This is an artifact of a flawed concept. We want one
+	 *	queue and a smarter send routine when we send all.
+	 */
+	cli();
+	while (skb2 != NULL) 
+	{
+		skb = skb2;
+		skb2 = skb->link3;
+		skb->link3 = NULL;
+		if (after(skb->end_seq, window_seq)) 
+		{
+			if (sk->packets_out > 0) 
+				sk->packets_out--;
+			/* We may need to remove this from the dev send list. */
+			if (skb->next != NULL) 
+			{
+				skb_unlink(skb);				
+			}
+			/* Now add it to the write_queue. */
+			if (wskb == NULL)
+				skb_queue_head(&sk->write_queue,skb);
+			else
+				skb_append(wskb,skb);
+			wskb = skb;
+		} 
+		else 
+		{
+			if (sk->send_head == NULL) 
+			{
+				sk->send_head = skb;
+				sk->send_tail = skb;
+			}
+			else
+			{
+				sk->send_tail->link3 = skb;
+				sk->send_tail = skb;
+			}
+			skb->link3 = NULL;
+		}
+	}
+	sti();
+}
+
+
 /*
  *	This routine deals with incoming acks, but not outgoing ones.
+ *
+ *	This routine is totally _WRONG_. The list structuring is wrong,
+ *	the algorithm is wrong, the code is wrong.
  */
 
 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
 {
 	int flag = 0;
-	unsigned window;
+	u32 window_seq;
 
 	/* 
 	 * 1 - there was data in packet as well as ack or new data is sent or 
@@ -429,29 +602,11 @@
 		return(1);	/* Dead, cant ack any more so why bother */
 
 	/*
-	 *	Have we discovered a larger window
-	 */
-	 
-	window = ntohs(th->window);
-
-	if (window > sk->max_window) 
-	{
-  		sk->max_window = window;
-#ifdef CONFIG_INET_PCTCP
-		/* Hack because we don't send partial packets to non SWS
-		   handling hosts */
-		sk->mss = min(window>>1, sk->mtu);
-#else
-		sk->mss = min(window, sk->mtu);
-#endif	
-	}
-
-	/*
 	 *	We have dropped back to keepalive timeouts. Thus we have
 	 *	no retransmits pending.
 	 */
 	 
-	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
+	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
 	  	sk->retransmits = 0;
 
 	/*
@@ -460,30 +615,7 @@
 	 */
 	 
 	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
-	{
-		if(sk->debug)
-			printk("Ack ignored %u %u\n",ack,sk->sent_seq);
-			
-		/*
-		 *	Keepalive processing.
-		 */
-		 
-		if (after(ack, sk->sent_seq)) 
-		{
-			return(0);
-		}
-		
-		/*
-		 *	Restart the keepalive timer.
-		 */
-		 
-		if (sk->keepopen) 
-		{
-			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
-				tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
-		}
-		return(1);
-	}
+		goto uninteresting_ack;
 
 	/*
 	 *	If there is data set flag 1
@@ -493,78 +625,38 @@
 		flag |= 1;
 
 	/*
-	 *	See if our window has been shrunk. 
+	 *	Have we discovered a larger window
 	 */
-
-	if (after(sk->window_seq, ack+window)) 
+	window_seq = ntohs(th->window);
+	if (window_seq > sk->max_window) 
 	{
-		/*
-		 * We may need to move packets from the send queue
-		 * to the write queue, if the window has been shrunk on us.
-		 * The RFC says you are not allowed to shrink your window
-		 * like this, but if the other end does, you must be able
-		 * to deal with it.
-		 */
-		struct sk_buff *skb;
-		struct sk_buff *skb2;
-		struct sk_buff *wskb = NULL;
-  	
-		skb2 = sk->send_head;
-		sk->send_head = NULL;
-		sk->send_tail = NULL;
-	
-		/*
-		 *	This is an artifact of a flawed concept. We want one
-		 *	queue and a smarter send routine when we send all.
-		 */
-	
-		flag |= 4;	/* Window changed */
-	
-		sk->window_seq = ack + window;
-		cli();
-		while (skb2 != NULL) 
-		{
-			skb = skb2;
-			skb2 = skb->link3;
-			skb->link3 = NULL;
-			if (after(skb->end_seq, sk->window_seq)) 
-			{
-				if (sk->packets_out > 0) 
-					sk->packets_out--;
-				/* We may need to remove this from the dev send list. */
-				if (skb->next != NULL) 
-				{
-					skb_unlink(skb);				
-				}
-				/* Now add it to the write_queue. */
-				if (wskb == NULL)
-					skb_queue_head(&sk->write_queue,skb);
-				else
-					skb_append(wskb,skb);
-				wskb = skb;
-			} 
-			else 
-			{
-				if (sk->send_head == NULL) 
-				{
-					sk->send_head = skb;
-					sk->send_tail = skb;
-				}
-				else
-				{
-					sk->send_tail->link3 = skb;
-					sk->send_tail = skb;
-				}
-				skb->link3 = NULL;
-			}
-		}
-		sti();
+  		sk->max_window = window_seq;
+#ifdef CONFIG_INET_PCTCP
+		/* Hack because we don't send partial packets to non SWS
+		   handling hosts */
+		sk->mss = min(window_seq>>1, sk->mtu);
+#else
+		sk->mss = min(window_seq, sk->mtu);
+#endif	
 	}
+	window_seq += ack;
 
 	/*
-	 *	Pipe has emptied
+	 *	See if our window has been shrunk. 
 	 */
-	 
+	if (after(sk->window_seq, window_seq)) {
+		flag |= 4;
+		tcp_window_shrunk(sk, window_seq);
+	}
+
+	/*
+	 *	Update the right hand window edge of the host
+	 */
+	sk->window_seq = window_seq;
+
+	/*
+	 *	Pipe has emptied
+	 */	 
 	if (sk->send_tail == NULL || sk->send_head == NULL) 
 	{
 		sk->send_head = NULL;
@@ -573,18 +665,13 @@
 	}
 
 	/*
-	 *	Update the right hand window edge of the host
-	 */
-	 
-	sk->window_seq = ack + window;
-
-	/*
 	 *	We don't want too many packets out there. 
 	 */
 	 
 	if (sk->ip_xmit_timeout == TIME_WRITE && 
 		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
 	{
+		
 		/* 
 		 * This is Jacobson's slow start and congestion avoidance. 
 		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
@@ -722,38 +809,7 @@
 			oskb = sk->send_head;
 
 			if (!(flag&2)) 	/* Not retransmitting */
-			{
-				long m;
-	
-				/*
-				 *	The following amusing code comes from Jacobson's
-				 *	article in SIGCOMM '88.  Note that rtt and mdev
-				 *	are scaled versions of rtt and mean deviation.
-				 *	This is designed to be as fast as possible 
-				 *	m stands for "measurement".
-				 */
-	
-				m = jiffies - oskb->when;  /* RTT */
-				if(m<=0)
-					m=1;		/* IS THIS RIGHT FOR <0 ??? */
-				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
-				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
-				if (m < 0)
-					m = -m;		/* m is now abs(error) */
-				m -= (sk->mdev >> 2);   /* similar update on mdev */
-				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
-	
-				/*
-				 *	Now update timeout.  Note that this removes any backoff.
-				 */
-			 
-				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
-				if (sk->rto > 120*HZ)
-					sk->rto = 120*HZ;
-				if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
-					sk->rto = HZ/5;
-				sk->backoff = 0;
-			}
+				tcp_rtt_estimator(sk,oskb);
 			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt 
 					   In this case as we just set it up */
 			cli();
@@ -1000,7 +1056,31 @@
 		}
 	}
 
-	return(1);
+	return 1;
+
+uninteresting_ack:
+	if(sk->debug)
+		printk("Ack ignored %u %u\n",ack,sk->sent_seq);
+			
+	/*
+	 *	Keepalive processing.
+	 */
+		 
+	if (after(ack, sk->sent_seq)) 
+	{
+		return 0;
+	}
+		
+	/*
+	 *	Restart the keepalive timer.
+	 */
+		 
+	if (sk->keepopen) 
+	{
+		if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
+			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
+	}
+	return 1;
 }
 
 
@@ -1515,12 +1595,14 @@
 		/*
 		 *	Pull up the IP header.
 		 */
+	
 		skb_pull(skb, skb->h.raw-skb->data);
 
 		/*
 		 *	Try to use the device checksum if provided.
 		 */
-		switch (skb->ip_summed) {
+		switch (skb->ip_summed) 
+		{
 			case CHECKSUM_NONE:
 				skb->csum = csum_partial((char *)th, len, 0);
 			case CHECKSUM_HW:
@@ -1539,7 +1621,7 @@
 
 		skb->acked = 0;
 		skb->used = 0;
-		skb->free = 0;
+		skb->free = 1;
 		skb->saddr = daddr;
 		skb->daddr = saddr;
 	
@@ -1580,7 +1662,11 @@
 	 
 	skb->sk=sk;
 	sk->rmem_alloc += skb->truesize;
-
+	
+	/*
+	 *	We should now do header prediction.
+	 */
+	 
 	/*
 	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
 	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
@@ -1627,13 +1713,19 @@
 			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
 			 *	it, that fits the spec precisely and avoids incompatibilities. It
 			 *	would be nice in future to drop through and process the data.
+			 *
+			 *	Now TTCP is starting to use we ought to queue this data.
 			 */
 			 
 			release_sock(sk);
 			return 0;
 		}
 	
-		/* retransmitted SYN? */
+		/* 
+		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
+		 *	then its a new connection
+		 */
+		 
 		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
 		{
 			kfree_skb(skb, FREE_READ);
@@ -1643,7 +1735,8 @@
 		
 		/*
 		 *	SYN sent means we have to look for a suitable ack and either reset
-		 *	for bad matches or go to connected 
+		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
+		 *	not be in line code. [AC]
 		 */
 	   
 		if(sk->state==TCP_SYN_SENT)
@@ -1796,43 +1889,8 @@
 		return tcp_reset(sk,skb);	
 	}
 
-
-	/*
-	 *	Delayed ACK time estimator.
-	 */
+	tcp_delack_estimator(sk);
 	
-	if (sk->lrcvtime == 0) 
-	{
-		sk->lrcvtime = jiffies;
-		sk->ato = HZ/3;
-	}
-	else 
-	{
-		int m;
-		
-		m = jiffies - sk->lrcvtime;
-
-		sk->lrcvtime = jiffies;
-
-		if (m <= 0)
-			m = 1;
-
-		if (m > (sk->rtt >> 3)) 
-		{
-			sk->ato = sk->rtt >> 3;
-			/*
-			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
-			 */
-		}
-		else 
-		{
-			sk->ato = (sk->ato >> 1) + m;
-			/*
-			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
-			 */
-		}
-	}
-	  
 	/*
 	 *	Process the ACK
 	 */
@@ -1880,11 +1938,7 @@
 	 */
 	
 	if(tcp_data(skb,sk, saddr, len))
-	{
 		kfree_skb(skb, FREE_READ);
-		release_sock(sk);
-		return 0;
-	}
 
 	/*
 	 *	And done

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov with Sam's (original) version
of this