patch-2.1.8 linux/net/ipv4/tcp_output.c


diff -u --recursive --new-file v2.1.7/linux/net/ipv4/tcp_output.c linux/net/ipv4/tcp_output.c
@@ -18,89 +18,73 @@
  *		Matthew Dillon, <dillon@apollo.west.oic.com>
  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  *		Jorge Cwik, <jorge@laser.satlink.net>
- *
- * Fixes:	Eric Schenk	: avoid multiple retransmissions in one
- *				: round trip timeout.
  */
 
-#include <linux/config.h>
-#include <net/tcp.h>
-#include <linux/ip_fw.h>
-#include <linux/firewall.h>
-#include <linux/interrupt.h>
-
-
 /*
- * RFC 1122 says:
+ * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
+ *				:	Fragmentation on mtu decrease
+ *				:	Segment collapse on retransmit
+ *				:	AF independence
  *
- * "the suggested [SWS] avoidance algorithm for the receiver is to keep
- *  RECV.NEXT + RCV.WIN fixed until:
- *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
+ *		Linus Torvalds	:	send_delayed_ack
  *
- * Experiments against BSD and Solaris machines show that following
- * these rules results in the BSD and Solaris machines making very
- * bad guesses about how much data they can have in flight.
- *
- * Instead we follow the BSD lead and offer a window that gives
- * the size of the current free space, truncated to a multiple
- * of 1024 bytes. If the window is smaller than
- * 	min(sk->mss, MAX_WINDOW/2)
- * then we advertise the window as having size 0, unless this
- * would shrink the window we offered last time.
- * This results in as much as double the throughput as the original
- * implementation.
- *
- * We do BSD style SWS avoidance -- note that RFC1122 only says we
- * must do silly window avoidance, it does not require that we use
- * the suggested algorithm.
- *
- * The "rcvbuf" and "rmem_alloc" values are shifted by 1, because
- * they also contain buffer handling overhead etc, so the window
- * we actually use is essentially based on only half those values.
  */
-int tcp_new_window(struct sock * sk)
-{
-	unsigned long window;
-	unsigned long minwin, maxwin;
 
-	/* Get minimum and maximum window values.. */
-	minwin = sk->mss;
-	if (!minwin)
-		minwin = sk->mtu;
-	maxwin = sk->window_clamp;
-	if (!maxwin)
-		maxwin = MAX_WINDOW;
-	if (minwin > maxwin/2)
-		minwin = maxwin/2;
-
-	/* Get current rcvbuf size.. */
-	window = sk->rcvbuf/2;
-	if (window < minwin) {
-		sk->rcvbuf = minwin*2;
-		window = minwin;
-	}
-
-	/* Check rcvbuf against used and minimum window */
-	window -= sk->rmem_alloc/2;
-	if ((long)(window - minwin) < 0)		/* SWS avoidance */
-		window = 0;
-
-	if (window > 1023)
-		window &= ~1023;
-	if (window > maxwin)
-		window = maxwin;
-	return window;
-}
+#include <net/tcp.h>
 
 /*
  *	Get rid of any delayed acks, we sent one already..
  */
 static __inline__ void clear_delayed_acks(struct sock * sk)
 {
-	sk->ack_timed = 0;
+	sk->delayed_acks = 0;
 	sk->ack_backlog = 0;
 	sk->bytes_rcv = 0;
-	del_timer(&sk->delack_timer);
+	tcp_clear_xmit_timer(sk, TIME_DACK);
+}
+
+static __inline__ void update_send_head(struct sock *sk)
+{
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+	
+	tp->send_head = tp->send_head->next;
+
+	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
+	{
+		tp->send_head = NULL;
+	}
+
+}
+
+static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	int nagle_check = 1;
+	int len;
+
+	/*
+	 *	RFC 1122 - section 4.2.3.4
+	 *
+	 *	We must queue if
+	 *
+	 *	a) The right edge of this frame exceeds the window
+	 *	b) There are packets in flight and we have a small segment
+	 *	   [SWS avoidance and Nagle algorithm]
+	 *	   (part of SWS is done on packetization)
+	 *	c) We are retransmitting [Nagle]
+	 *	d) We have too many packets 'in flight'
+	 */
+		
+	len = skb->end_seq - skb->seq;
+
+	if (!sk->nonagle && len < (sk->mss >> 1) && sk->packets_out)
+	{
+		nagle_check = 0;
+	}
+
+	return (nagle_check && sk->packets_out < sk->cong_window &&
+		!after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
+		sk->retransmits == 0);
 }
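
The queue-or-send predicate above folds four checks into one: Nagle's rule for small segments, the advertised window, the congestion window, and the retransmit state. A minimal user-space sketch of the same decision follows; the struct and field names are illustrative stand-ins, not the kernel's actual socket state, and sequence-number wraparound (handled by after() in the kernel) is ignored.

#include <stdio.h>

/* Hypothetical snapshot of the sender state; names are illustrative only. */
struct snd_state {
	unsigned int mss;          /* sender maximum segment size          */
	unsigned int packets_out;  /* segments in flight                   */
	unsigned int cong_window;  /* congestion window, in segments       */
	unsigned int snd_una;      /* oldest unacknowledged sequence       */
	unsigned int snd_wnd;      /* receiver-advertised window, in bytes */
	int retransmits;           /* nonzero while recovering             */
	int nonagle;               /* TCP_NODELAY set                      */
};

/* Mirror of the queue-or-send decision: 1 = send now, 0 = queue. */
static int can_send_now(const struct snd_state *s,
			unsigned int seq, unsigned int end_seq)
{
	unsigned int len = end_seq - seq;

	/* b) Nagle: small segment while data is still in flight */
	if (!s->nonagle && len < (s->mss >> 1) && s->packets_out)
		return 0;
	/* d) congestion window full */
	if (s->packets_out >= s->cong_window)
		return 0;
	/* a) right edge beyond the offered window (ignoring wraparound) */
	if (end_seq > s->snd_una + s->snd_wnd)
		return 0;
	/* c) retransmitting */
	if (s->retransmits)
		return 0;
	return 1;
}

int main(void)
{
	struct snd_state s = { 1460, 1, 4, 1000, 8760, 0, 0 };

	/* 100-byte segment with one packet outstanding: Nagle queues it */
	printf("%d\n", can_send_now(&s, 1000, 1100));   /* prints 0 */
	/* full-sized segment passes all four checks */
	printf("%d\n", can_send_now(&s, 1000, 2460));   /* prints 1 */
	return 0;
}
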
 
 /*
@@ -108,10 +92,11 @@
  *	having checked it is sane seeming.
  */
  
-void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_send_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int size;
 	struct tcphdr * th = skb->h.th;
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	int size;
 
 	/*
 	 *	length of packet (not counting length of pre-tcp headers) 
@@ -125,10 +110,10 @@
 	 
 	if (size < sizeof(struct tcphdr) || size > skb->len) 
 	{
-		printk(KERN_ERR "tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
+		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
 			skb, skb->data, th, skb->len);
 		kfree_skb(skb, FREE_WRITE);
-		return;
+		return 0;
 	}
 
 	/*
@@ -138,165 +123,245 @@
 	 
 	if (size == sizeof(struct tcphdr)) 
 	{
-		/* If it's got a syn or fin it's notionally included in the size..*/
+		/*
+		 * A data-less segment must carry a SYN or FIN;
+		 * otherwise discard it.
+		 */
 		if(!th->syn && !th->fin) 
 		{
-			printk(KERN_ERR "tcp_send_skb: attempt to queue a bogon.\n");
+			printk("tcp_send_skb: attempt to queue a bogon.\n");
 			kfree_skb(skb,FREE_WRITE);
-			return;
+			return 0;
 		}
 	}
 
-	/*
-	 * Jacobson recommends this in the appendix of his SIGCOMM'88 paper.
-	 * The idea is to do a slow start again if we haven't been doing
-	 * anything for a long time, in which case we have no reason to
-	 * believe that our congestion window is still correct.
-	 */
-	if (sk->send_head == 0 && (jiffies - sk->idletime) > sk->rto)
-		sk->cong_window = 1;
 
 	/*
 	 *	Actual processing.
 	 */
-
+	 
 	tcp_statistics.TcpOutSegs++;  
 	skb->seq = ntohl(th->seq);
 	skb->end_seq = skb->seq + size - 4*th->doff;
 
-	/*
-	 *	We must queue if
-	 *
-	 *	a) The right edge of this frame exceeds the window
-	 *	b) We are retransmitting (Nagle's rule)
-	 *	c) We have too many packets 'in flight'
-	 */
-	 
-	if (after(skb->end_seq, sk->window_seq) ||
-	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
-	     sk->packets_out >= sk->cong_window) 
-	{
-		/* checksum will be supplied by tcp_write_xmit.  So
-		 * we shouldn't need to set it at all.  I'm being paranoid */
-		th->check = 0;
-		if (skb->next != NULL) 
+	
+	if (tp->send_head || !tcp_snd_test(sk, skb))
+	{
+		/* 
+		 * Remember where we must start sending
+		 */
+
+		if (tp->send_head == NULL)
+			tp->send_head = skb;
+
+		skb_queue_tail(&sk->write_queue, skb);
+
+		if (sk->packets_out == 0 && !tp->pending)
 		{
-			printk(KERN_ERR "tcp_send_partial: next != NULL\n");
-			skb_unlink(skb);
+			tp->pending = TIME_PROBE0;
+			tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
 		}
-		skb_queue_tail(&sk->write_queue, skb);
-		
-		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
-		    sk->send_head == NULL && sk->ack_backlog == 0)
-			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
+
 	}
-	else 
+	else
 	{
+		struct sk_buff * buff;
+
 		/*
 		 *	This is going straight out
 		 */
-		clear_delayed_acks(sk);		 
-		th->ack_seq = htonl(sk->acked_seq);
+
+		skb_queue_tail(&sk->write_queue, skb);
+
+		clear_delayed_acks(sk);
+		 
+		th->ack_seq = htonl(tp->rcv_nxt);
 		th->window = htons(tcp_select_window(sk));
 
-		tcp_send_check(th, sk->saddr, sk->daddr, size, skb);
+		tp->af_specific->send_check(sk, th, size, skb);
 
-		sk->sent_seq = sk->write_seq;
+		tp->snd_nxt = skb->end_seq;
+		
+		atomic_inc(&sk->packets_out);
 
-		/*
-		 *	This is mad. The tcp retransmit queue is put together
-		 *	by the ip layer. This causes half the problems with
-		 *	unroutable FIN's and other things.
-		 */
-		 
-		sk->prot->queue_xmit(sk, skb->dev, skb, 0);
+		skb->when = jiffies;
 		
-		/*
-		 *	Set for next retransmit based on expected ACK time
-		 *	of the first packet in the resend queue.
-		 *	This is no longer a window behind.
-		 */
+		buff = skb_clone(skb, GFP_ATOMIC);
+		atomic_add(buff->truesize, &sk->wmem_alloc);
+
+		tp->af_specific->queue_xmit(sk, skb->dev, buff, 1);
 
-		tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+		if (!tcp_timer_is_set(sk, TIME_RETRANS))
+			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 	}
+
+	return 0;
 }
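
Note the pattern on the direct-send path above: the original skb is queued on write_queue and a clone goes down to the IP layer, so the retransmit queue always keeps a copy until the data is acked. In the kernel, skb_clone() shares the payload rather than duplicating it; the toy sketch below simply copies, but shows the same ownership idea. All names here are hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-ins for sk_buff and the write queue; illustrative only. */
struct buf {
	char data[64];
	struct buf *next;
};

static struct buf *queue_head;          /* the retransmit queue */

/* Queue the original, hand an independent copy to the lower layer,
 * mirroring the skb_clone() call in tcp_send_skb(). */
static void send_keep_original(struct buf *b)
{
	struct buf *copy = malloc(sizeof(*copy));

	if (!copy)
		return;                  /* kernel code would drop or defer */
	memcpy(copy, b, sizeof(*copy));

	b->next = queue_head;            /* original stays queued ...   */
	queue_head = b;

	printf("transmitting: %s\n", copy->data);  /* ... copy goes out */
	free(copy);                      /* normally freed by the driver */
}

int main(void)
{
	struct buf *b = malloc(sizeof(*b));

	strcpy(b->data, "segment 1");
	send_keep_original(b);
	printf("still queued: %s\n", queue_head->data);
	free(b);
	return 0;
}
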
 
 /*
- *	Locking problems lead us to a messy situation where we can have
- *	multiple partially complete buffers queued up. This is really bad
- *	as we don't want to be sending partial buffers. Fix this with
- *	a semaphore or similar to lock tcp_write per socket.
- *
- *	These routines are pretty self descriptive.
+ *	Function to create two new tcp segments.
+ *	Shrinks the given segment to the specified size and appends a new
+ *	segment with the rest of the packet to the list.
+ *	This won't be called frequently, I hope... 
  */
- 
-struct sk_buff * tcp_dequeue_partial(struct sock * sk)
+
+static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 {
-	struct sk_buff * skb;
-	unsigned long flags;
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	struct sk_buff *buff;
+	struct tcphdr *th, *nth;	
+	int nsize;
+	int tmp;
+
+	th = skb->h.th;
+
+	/* size of new segment */
+	nsize = skb->tail - ((unsigned char *) (th + 1)) - len;
 
-	save_flags(flags);
-	cli();
-	skb = sk->partial;
-	if (skb) {
-		sk->partial = NULL;
-		del_timer(&sk->partial_timer);
+	if (nsize <= 0)
+	{
+		printk(KERN_DEBUG "tcp_fragment: bug size <= 0\n");
+		return -1;
+	}
+
+	/*
+	 *	Get a new skb... force flag on
+	 */
+	buff = sock_wmalloc(sk, nsize + 128 + sk->prot->max_header + 15, 1, 
+			    GFP_ATOMIC);
+
+	if (buff == NULL)
+		return -1;
+
+	buff->sk = sk;
+	buff->localroute = sk->localroute;
+	    	
+	/*
+	 *	Put headers on the new packet
+	 */
+
+	tmp = tp->af_specific->build_net_header(sk, buff);
+
+	if (tmp < 0)
+	{
+		sock_wfree(sk, buff);
+		return -1;
+	}
+		
+	/*
+	 *	Move the TCP header over
+	 */
+	
+	nth = (struct tcphdr *) skb_put(buff, sizeof(*th));
+
+	buff->h.th = nth;
+	
+	memcpy(nth, th, sizeof(*th));
+	
+	/*
+	 *	Correct the new header
+	 */
+	
+	buff->seq = skb->seq + len;
+	buff->end_seq = skb->end_seq;
+	nth->seq = htonl(buff->seq);
+	nth->check = 0;
+	nth->doff  = 5; 
+	
+	/* urg data is always a headache */
+	if (th->urg)
+	{
+		if (th->urg_ptr > len)
+		{
+			th->urg = 0;
+			nth->urg_ptr -= len;
+		}
+		else
+		{
+			nth->urg = 0;
+		}
 	}
-	restore_flags(flags);
-	return skb;
+
+	/*
+	 *	Copy TCP options and data start to our new buffer
+	 */
+	
+	buff->csum = csum_partial_copy(((u8 *)(th + 1)) + len,
+				       skb_put(buff, nsize),
+				       nsize, 0);
+       
+
+	skb->end_seq -= nsize;
+
+	skb_trim(skb, skb->len - nsize);
+
+	/* remember to checksum this packet afterwards */
+	th->check = 0;
+	skb->csum = csum_partial((u8*) (th + 1), skb->tail - ((u8 *) (th + 1)),
+				 0);
+
+	skb_append(skb, buff);
+
+	return 0;
 }
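
tcp_fragment() splits one segment at byte offset len: the original keeps the first len bytes, the new buffer inherits everything from seq+len up to the old right edge, and the urgent pointer is patched up. The sequence arithmetic alone, as a small sketch with plain integers standing in for the skb fields:

#include <assert.h>
#include <stdio.h>

/* Sequence bookkeeping for splitting [seq, end_seq) at offset len,
 * as tcp_fragment() does; plain integers stand in for skb fields. */
struct seg { unsigned int seq, end_seq; };

static void split(struct seg *orig, struct seg *tail, unsigned int len)
{
	assert(len < orig->end_seq - orig->seq);

	tail->seq = orig->seq + len;     /* new segment starts after len */
	tail->end_seq = orig->end_seq;   /* and inherits the right edge  */
	orig->end_seq = orig->seq + len; /* original is trimmed to len   */
}

int main(void)
{
	struct seg a = { 1000, 3000 }, b;

	split(&a, &b, 1460);             /* e.g. an MSS-sized first part */
	printf("a: [%u,%u) b: [%u,%u)\n", a.seq, a.end_seq, b.seq, b.end_seq);
	/* a: [1000,2460) b: [2460,3000) */
	return 0;
}
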
 
-/*
- *	Empty the partial queue
- */
- 
-void tcp_send_partial(struct sock *sk)
+static void tcp_wrxmit_prob(struct sock *sk, struct sk_buff *skb)
 {
-	struct sk_buff *skb;
+	/*
+	 *	This is acked data. We can discard it. This 
+	 *	cannot currently occur.
+	 */
 
-	if (sk == NULL)
-		return;
-	while ((skb = tcp_dequeue_partial(sk)) != NULL)
-		tcp_send_skb(sk, skb);
+	sk->retransmits = 0;
+
+	printk(KERN_DEBUG "tcp_write_xmit: bug skb in write queue\n");
+
+	update_send_head(sk);
+
+	skb_unlink(skb);	
+	skb->sk = NULL;
+	skb->free = 1;
+	kfree_skb(skb, FREE_WRITE);
+
+	if (!sk->dead)
+		sk->write_space(sk);
 }
 
-/*
- *	Queue a partial frame
- */
- 
-void tcp_enqueue_partial(struct sock * sk, struct sk_buff * skb)
+static int tcp_wrxmit_frag(struct sock *sk, struct sk_buff *skb, int size)
 {
-	struct sk_buff * tmp;
-	unsigned long flags;
-
-	save_flags(flags);
-	cli();
-	tmp = sk->partial;
-	if (tmp)
-		del_timer(&sk->partial_timer);
-	sk->partial = skb;
-	init_timer(&sk->partial_timer);
-	/*
-	 *	Wait up to 1 second for the buffer to fill.
-	 */
-	sk->partial_timer.expires = jiffies+HZ/10;
-	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
-	sk->partial_timer.data = (unsigned long) sk;
-	add_timer(&sk->partial_timer);
-	restore_flags(flags);
-	if (tmp)
-		tcp_send_skb(sk, tmp);
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+	
+	printk(KERN_DEBUG "tcp_write_xmit: frag needed size=%d mss=%d\n", 
+	       size, sk->mss);
+				
+	if (tcp_fragment(sk, skb, sk->mss))
+	{
+		/* tcp_fragment failed! */
+		tp->send_head = skb;
+		atomic_dec(&sk->packets_out);
+		return -1;
+	}
+	else
+	{
+		/* 
+		 * If tcp_fragment succeeded then
+		 * the send head is the resulting
+		 * fragment
+		 */
+		tp->send_head = skb->next;
+	}
+	return 0;
 }
 
 /*
- * 	This routine takes stuff off of the write queue,
- *	and puts it in the xmit queue. This happens as incoming acks
- *	open up the remote window for us.
+ * 	This routine writes packets to the network.
+ *	It advances the send_head.
+ *	This happens as incoming acks open up the remote window for us.
  */
  
 void tcp_write_xmit(struct sock *sk)
 {
 	struct sk_buff *skb;
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
 	/*
 	 *	The bytes will have to remain here. In time closedown will
@@ -308,64 +373,43 @@
 
 	/*
 	 *	Anything on the transmit queue that fits the window can
-	 *	be added providing we are not
+	 *	be added providing we are:
 	 *
-	 *	a) retransmitting (Nagle's rule)
-	 *	b) exceeding our congestion window.
+	 *	a) following SWS avoidance [and Nagle algorithm]
+	 *	b) not exceeding our congestion window.
+	 *	c) not retransmitting [Nagle]
 	 */
-	 
-	while((skb = skb_peek(&sk->write_queue)) != NULL &&
-		!after(skb->end_seq, sk->window_seq) &&
-		(sk->retransmits == 0 ||
-		 sk->ip_xmit_timeout != TIME_WRITE ||
-		 !after(skb->end_seq, sk->rcv_ack_seq))
-		&& sk->packets_out < sk->cong_window) 
+
+	start_bh_atomic();
+
+	while((skb = tp->send_head) && tcp_snd_test(sk, skb))
 	{
 		IS_SKB(skb);
-		skb_unlink(skb);
-		
+				
 		/*
-		 *	See if we really need to send the whole packet. 
+		 *	See if we really need to send the packet. 
 		 */
 		 
-		if (before(skb->end_seq, sk->rcv_ack_seq +1)) {
-			/*
-			 *	This is acked data. We can discard it.
-			 *	This implies the packet was sent out
-			 *	of the write queue by a zero window probe.
-			 */
-			 
-			sk->retransmits = 0;
-			kfree_skb(skb, FREE_WRITE);
-			if (!sk->dead) 
-				sk->write_space(sk);
-		} else {
+		if (!after(skb->end_seq, tp->snd_una)) 
+		{
+			tcp_wrxmit_prob(sk, skb);
+		} 
+		else
+		{
 			struct tcphdr *th;
-			struct iphdr *iph;
+			struct sk_buff *buff;
 			int size;
 
-			iph = skb->ip_hdr;
-			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
+			/* 
+			 * Advance the send_head
+			 * This one is going out.
+			 */
+
+			update_send_head(sk);
+
+			atomic_inc(&sk->packets_out);
 
-                        /* See if we need to shrink the leading packet on
-                         * the retransmit queue. Strictly speaking, we
-                         * should never need to do this, but some buggy TCP
-                         * implementations get confused if you send them
-                         * a packet that contains both old and new data. (Feh!)
-                         * Soooo, we have this uglyness here.
-                         */
-			if (after(sk->rcv_ack_seq,skb->seq+th->syn+th->fin))
-				tcp_shrink_skb(sk,skb,sk->rcv_ack_seq);
 
-			size = skb->len - (((unsigned char *) th) - skb->data);
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
-			if (size > sk->mtu - sizeof(struct iphdr))
-			{
-				iph->frag_off &= ~htons(IP_DF);
-				ip_send_check(iph);
-			}
-#endif
-			
 /*
  * put in the ack seq and window at this point rather than earlier,
  * in order to keep them monotonic.  We really want to avoid taking
@@ -373,79 +417,140 @@
  * Ack and window will in general have changed since this packet was put
  * on the write queue.
  */
-			th->ack_seq = htonl(sk->acked_seq);
-			th->window = htons(tcp_select_window(sk));
 
-			tcp_send_check(th, sk->saddr, sk->daddr, size, skb);
+			th = skb->h.th;
+			size = skb->len - (((unsigned char *) th) - skb->data);
 
-			sk->sent_seq = skb->end_seq;
+			if (size - (th->doff << 2) > sk->mss)
+			{
+				if (tcp_wrxmit_frag(sk, skb, size))
+					break;
+			}
 			
-			/*
-			 *	IP manages our queue for some crazy reason
-			 */
-			 
-			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
+			th->ack_seq = htonl(tp->rcv_nxt);
+			th->window = htons(tcp_select_window(sk));
+
+			tp->af_specific->send_check(sk, th, size, skb);
 
+			if (before(skb->end_seq, tp->snd_nxt)) 
+				printk(KERN_DEBUG "tcp_write_xmit:"
+				       " sending already sent seq\n");
+			else
+				tp->snd_nxt = skb->end_seq;
+			
 			clear_delayed_acks(sk);
+			
+			skb->when = jiffies;
+
+			buff = skb_clone(skb, GFP_ATOMIC);
+			atomic_add(buff->truesize, &sk->wmem_alloc);
 
-			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+			tp->af_specific->queue_xmit(sk, skb->dev, buff, 1);
+			
+			if (!tcp_timer_is_set(sk, TIME_RETRANS))
+			{
+				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+			}
 		}
 	}
+
+	end_bh_atomic();
+}
+
+static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th1, *th2;
+	int size1, size2, avail;
+	struct sk_buff *buff = skb->next;
+
+	th1 = skb->h.th;
+
+	if (th1->urg)
+		return -1;
+
+	avail = skb->end - skb->tail;
+
+	/*
+	 *  size of tcp payload
+	 */
+
+	size1 = skb->tail - (u8 *) (th1 + 1);
+	
+	th2 = buff->h.th;
+
+	size2 = buff->tail - (u8 *) (th2 + 1); 
+
+	if (size2 > avail || size1 + size2 > sk->mss )
+		return -1;
+
+	/*
+	 *  ok. we will be able to collapse the packet
+	 */
+
+	skb_unlink(buff);
+
+	memcpy(skb_put(skb, size2), ((char *) th2) + (th2->doff << 2), size2);
+	
+	/*
+	 * update sizes on original skb. both TCP and IP
+	 */
+ 
+	skb->end_seq += size2;
+
+	if (th2->urg)
+	{
+		th1->urg = 1;
+		th1->urg_ptr = th2->urg_ptr + size1;
+	}
+
+	/*
+	 * ... and off you go.
+	 */
+
+	buff->free = 1;
+	kfree_skb(buff, FREE_WRITE);
+	atomic_dec(&sk->packets_out);
+
+	/* 
+	 *	Header checksum will be set by the retransmit procedure
+	 *	after calling rebuild header
+	 */
+
+	th1->check = 0;
+	skb->csum = csum_partial((u8*) (th1+1), size1 + size2, 0);
+
+	return 0;
 }
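
Collapsing is gated purely on sizes: the second segment's payload must fit in the tail room of the first buffer, the combined payload must not exceed the MSS, and urgent data in the first segment blocks the merge (urgent data in the second is folded in with an adjusted pointer). A sketch of just that fit test; parameter names are illustrative:

#include <stdio.h>

/* Decide whether segment 2 can be folded into segment 1 on retransmit,
 * mirroring the size checks in tcp_retrans_try_collapse().
 * avail is the tail room left in the first buffer. */
static int can_collapse(int size1, int size2, int avail, int mss, int urg1)
{
	if (urg1)                        /* urgent data in seg 1: leave it */
		return 0;
	if (size2 > avail)               /* second payload must fit ...    */
		return 0;
	if (size1 + size2 > mss)         /* ... and the sum must be <= MSS */
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", can_collapse(400, 500, 1000, 1460, 0));  /* 1 */
	printf("%d\n", can_collapse(400, 1200, 1000, 1460, 0)); /* 0 */
	return 0;
}
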
 
 
 /*
  *	A socket has timed out on its send queue and wants to do a
- *	little retransmitting. Currently this means TCP.
+ *	little retransmitting.
+ *	retransmit_head can be different from the head of the write_queue
+ *	if we are doing fast retransmit.
  */
 
 void tcp_do_retransmit(struct sock *sk, int all)
 {
 	struct sk_buff * skb;
-	struct proto *prot;
-	struct device *dev;
-	struct rtable *rt;
+	int ct=0;
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
-	prot = sk->prot;
-	if (!all) {
-		/*
-		 * If we are just retransmitting one packet reset
-		 * to the start of the queue.
-		 */
-		sk->send_next = sk->send_head;
-		sk->packets_out = 0;
-	}
-	skb = sk->send_next;
+	start_bh_atomic();
 
-	while (skb != NULL)
+	if (tp->retrans_head == NULL)
+		tp->retrans_head = skb_peek(&sk->write_queue);
+
+	if (tp->retrans_head == tp->send_head)
+		tp->retrans_head = NULL;
+	
+	while ((skb = tp->retrans_head) != NULL)
 	{
 		struct tcphdr *th;
-		struct iphdr *iph;
-		int size;
+		u32 tcp_size;
 
-		dev = skb->dev;
 		IS_SKB(skb);
-		skb->when = jiffies;
-
-		/* dl1bke 960201 - @%$$! Hope this cures strange race conditions    */
-		/*		   with AX.25 mode VC. (esp. DAMA)		    */
-		/*		   if the buffer is locked we should not retransmit */
-		/*		   anyway, so we don't need all the fuss to prepare */
-		/*		   the buffer in this case. 			    */
-		/*		   (the skb_pull() changes skb->data while we may   */
-		/*		   actually try to send the data. Ouch. A side	    */
-		/*		   effect is that we'll send some unnecessary data, */
-		/*		   but the alternative is disastrous...	    */
 		
-		if (skb_device_locked(skb))
-			break;
-
-		/*
-		 *	Discard the surplus MAC header
-		 */
-		 
-		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
-
 		/*
 		 * In general it's OK just to use the old packet.  However we
 		 * need to use the current ack and window fields.  Urg and
@@ -455,143 +560,76 @@
 		 * changing the packet, we have to issue a new IP identifier.
 		 */
 
-		iph = (struct iphdr *)skb->data;
-		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
-		size = ntohs(iph->tot_len) - (iph->ihl<<2);
-		
-		/*
-		 *	Note: We ought to check for window limits here but
-		 *	currently this is done (less efficiently) elsewhere.
-		 */
+		th = skb->h.th;
 
-		/*
-		 *	Put a MAC header back on (may cause ARPing)
-		 */
-		 
-	        {
-			/* ANK: UGLY, but the bug, that was here, should be fixed.
-			 */
-			struct options *  opt = (struct options*)skb->proto_priv;
-			rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
-	        }
-
-		iph->id = htons(ip_id_count++);
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
-		if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
-			iph->frag_off &= ~htons(IP_DF);
-#endif
-		ip_send_check(iph);
-			
-		if (rt==NULL)	/* Deep poo */
+		tcp_size = skb->tail - ((unsigned char *) (th + 1));
+
+		if (tcp_size > sk->mss)
 		{
-			if(skb->sk)
+			if (tcp_fragment(sk, skb, sk->mss))
 			{
-				skb->sk->err_soft=ENETUNREACH;
-				skb->sk->error_report(skb->sk);
+				printk(KERN_DEBUG "tcp_fragment failed\n");
+				return;
 			}
-			/* Can't transmit this packet, no reason
-			 * to transmit the later ones, even if
-			 * the congestion window allows.
-			 */
-			break;
+			atomic_inc(&sk->packets_out);
 		}
-		else
+
+		if (!th->syn &&
+		    tcp_size < (sk->mss >> 1) &&
+		    skb->next != tp->send_head &&
+		    skb->next != (struct sk_buff *)&sk->write_queue)
 		{
-			dev=rt->rt_dev;
-			skb->raddr=rt->rt_gateway;
-			skb->dev=dev;
-			skb->arp=1;
-#ifdef CONFIG_FIREWALL
-        		if (call_out_firewall(PF_INET, skb->dev, iph, NULL) < FW_ACCEPT) {
-				/* The firewall wants us to dump the packet.
-			 	* We have to check this here, because
-			 	* the drop in ip_queue_xmit only catches the
-			 	* first time we send it. We must drop on
-				* every resend as well.
-			 	*/
-				break;
-        		}
-#endif 
-			if (rt->rt_hh)
-			{
-				memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
-				if (!rt->rt_hh->hh_uptodate)
-				{
-					skb->arp = 0;
-#if RT_CACHE_DEBUG >= 2
-					printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
-#endif
-				}
-			}
-			else if (dev->hard_header)
-			{
-				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
-					skb->arp=0;
-			}
-		
+			tcp_retrans_try_collapse(sk, skb);
+		}	       		
+
+		if (tp->af_specific->rebuild_header(sk, skb) == 0) 
+		{
+			struct sk_buff *buff;
+			int size;
+
+			if (sk->debug)
+				printk("retransmit sending\n");
+
 			/*
-			 *	This is not the right way to handle this. We have to
-			 *	issue an up to date window and ack report with this 
-			 *	retransmit to keep the odd buggy tcp that relies on 
-			 *	the fact BSD does this happy. 
-			 *	We don't however need to recalculate the entire 
-			 *	checksum, so someone wanting a small problem to play
-			 *	with might like to implement RFC1141/RFC1624 and speed
-			 *	this up by avoiding a full checksum.
+			 *	update ack and window
 			 */
-		 
-			th->ack_seq = htonl(sk->acked_seq);
-			clear_delayed_acks(sk);
+			th->ack_seq = htonl(tp->rcv_nxt);
 			th->window = ntohs(tcp_select_window(sk));
-			tcp_send_check(th, sk->saddr, sk->daddr, size, skb);
-		
-			/*
-			 *	If the interface is (still) up and running, kick it.
-			 */
-	
-			if (dev->flags & IFF_UP)
-			{
-				/*
-				 *	If the packet is still being sent by the device/protocol
-				 *	below then don't retransmit. This is both needed, and good -
-				 *	especially with connected mode AX.25 where it stops resends
-				 *	occurring of an as yet unsent anyway frame!
-				 *	We still add up the counts as the round trip time wants
-				 *	adjusting.
-				 */
-				if (sk && !skb_device_locked(skb))
-				{
-					/* Remove it from any existing driver queue first! */
-					skb_unlink(skb);
-					/* Now queue it */
-					ip_statistics.IpOutRequests++;
-					dev_queue_xmit(skb, dev, sk->priority);
-					sk->packets_out++;
-				}
-			}
+
+			size = skb->tail - (unsigned char *) th;
+			tp->af_specific->send_check(sk, th, size, skb);
+
+			skb->when = jiffies;
+			buff = skb_clone(skb, GFP_ATOMIC);
+			atomic_add(buff->truesize, &sk->wmem_alloc);
+
+			clear_delayed_acks(sk);
+
+			tp->af_specific->queue_xmit(sk, skb->dev, buff, 1);
+		}
+		else
+		{
+			printk(KERN_DEBUG "tcp_do_rebuild_header failed\n");
+			break;
 		}
 
 		/*
 		 *	Count retransmissions
 		 */
 		 
-		sk->prot->retransmits++;
+		ct++;
+		sk->prot->retransmits ++;
 		tcp_statistics.TcpRetransSegs++;
 
 		/*
 		 * Record the high sequence number to help avoid doing
 		 * too much fast retransmission.
 		 */
+
 		if (sk->retransmits)
-			sk->high_seq = sk->sent_seq;
+		       tp->high_seq = tp->snd_nxt;
 		
 		/*
-	         * Advance the send_next pointer so we don't keep
-		 * retransmitting the same stuff every time we get an ACK.
-		 */
-		sk->send_next = skb->link3;
-
-		/*
 		 *	Only one retransmit requested.
 		 */
 	
@@ -602,87 +640,22 @@
 		 *	This should cut it off before we send too many packets.
 		 */
 
-		if (sk->packets_out >= sk->cong_window)
+		if (ct >= sk->cong_window)
 			break;
 
-		skb = skb->link3;
-	}
-}
-
-/*
- *	This routine will send an RST to the other tcp. 
- */
- 
-void tcp_send_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
-	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
-{
-	struct sk_buff *buff;
-	struct tcphdr *t1;
-	int tmp;
-	struct device *ndev=NULL;
-
-	/*
-	 *	Cannot reset a reset (Think about it).
-	 */
-	 
-	if(th->rst)
-		return;
-  
-	/*
-	 * We need to grab some memory, and put together an RST,
-	 * and then put it into the queue to be sent.
-	 */
-
-	buff = alloc_skb(MAX_RESET_SIZE, GFP_ATOMIC);
-	if (buff == NULL) 
-	  	return;
-
-	buff->sk = NULL;
-	buff->dev = dev;
-	buff->localroute = 0;
-	buff->csum = 0;
-
-	/*
-	 *	Put in the IP header and routing stuff. 
-	 */
-
-	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
-			   sizeof(struct tcphdr),tos,ttl,NULL);
-	if (tmp < 0) 
-	{
-  		buff->free = 1;
-		sock_wfree(NULL, buff);
-		return;
-	}
-
-	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
-	memset(t1, 0, sizeof(*t1));
-
-	/*
-	 *	Swap the send and the receive. 
-	 */
-
-	t1->dest = th->source;
-	t1->source = th->dest;
-	t1->doff = sizeof(*t1)/4;
-	t1->rst = 1;
-  
-	if(th->ack)
-	{
-	  	t1->seq = th->ack_seq;
-	}
-	else
-	{
-		t1->ack = 1;
-	  	if(!th->syn)
-			t1->ack_seq = th->seq;
-		else
-			t1->ack_seq = htonl(ntohl(th->seq)+1);
+		/*
+		 *	Advance the pointer
+		 */
+		
+		tp->retrans_head = skb->next;
+		if ((tp->retrans_head == tp->send_head) ||
+		    (tp->retrans_head == (struct sk_buff *) &sk->write_queue))
+		{
+			tp->retrans_head = NULL;
+		}
 	}
 
-	tcp_send_check(t1, saddr, daddr, sizeof(*t1), buff);
-	prot->queue_xmit(NULL, ndev, buff, 1);
-	tcp_statistics.TcpOutSegs++;
+	end_bh_atomic();
 }
 
 /*
@@ -691,19 +664,19 @@
 
 void tcp_send_fin(struct sock *sk)
 {
-	struct proto *prot =(struct proto *)sk->prot;
 	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);	
 	struct tcphdr *t1;
 	struct sk_buff *buff;
-	struct device *dev=NULL;
 	int tmp;
+	
 		
-	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
+	buff = sock_wmalloc(sk, MAX_RESET_SIZE, 1, GFP_KERNEL);
 
 	if (buff == NULL)
 	{
 		/* This is a disaster if it occurs */
-		printk(KERN_CRIT "tcp_send_fin: Impossible malloc failure");
+		printk("tcp_send_fin: Impossible malloc failure");
 		return;
 	}
 
@@ -719,9 +692,8 @@
 	 *	Put in the IP header and routing stuff. 
 	 */
 
-	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
-			   IPPROTO_TCP, sk->opt,
-			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+	tmp = tp->af_specific->build_net_header(sk, buff);
+
 	if (tmp < 0) 
 	{
 		int t;
@@ -747,126 +719,115 @@
 	 */
 
 	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
-	buff->dev = dev;
+	buff->h.th =  t1;
+
 	memcpy(t1, th, sizeof(*t1));
 	buff->seq = sk->write_seq;
 	sk->write_seq++;
 	buff->end_seq = sk->write_seq;
 	t1->seq = htonl(buff->seq);
-	t1->ack_seq = htonl(sk->acked_seq);
+	t1->ack_seq = htonl(tp->rcv_nxt);
 	t1->window = htons(tcp_select_window(sk));
 	t1->fin = 1;
-	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), buff);
+
+	tp->af_specific->send_check(sk, t1, sizeof(*t1), buff);
 
 	/*
-	 * If there is data in the write queue, the fin must be appended to
-	 * the write queue.
+	 * The fin can only be transmitted after the data.
  	 */
  	
- 	if (skb_peek(&sk->write_queue) != NULL) 
- 	{
-  		buff->free = 0;
-		if (buff->next != NULL) 
-		{
-			printk(KERN_ERR "tcp_send_fin: next != NULL\n");
-			skb_unlink(buff);
-		}
-		skb_queue_tail(&sk->write_queue, buff);
-  	} 
-  	else 
-  	{
-        	sk->sent_seq = sk->write_seq;
-		sk->prot->queue_xmit(sk, dev, buff, 0);
-		tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+	skb_queue_tail(&sk->write_queue, buff);
+
+ 	if (tp->send_head == NULL)
+	{
+		struct sk_buff *skb1;
+
+		atomic_inc(&sk->packets_out);
+		tp->snd_nxt = sk->write_seq;
+		buff->when = jiffies;
+
+		skb1 = skb_clone(buff, GFP_KERNEL);
+		atomic_add(skb1->truesize, &sk->wmem_alloc);
+
+		tp->af_specific->queue_xmit(sk, skb1->dev, skb1, 1);
+
+                if (!tcp_timer_is_set(sk, TIME_RETRANS))
+			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 	}
 }
 
-
-void tcp_send_synack(struct sock * newsk, struct sock * sk, struct sk_buff * skb)
+int tcp_send_synack(struct sock *sk)
 {
-	struct tcphdr *t1;
-	unsigned char *ptr;
+	struct tcp_opt * tp = &(sk->tp_pinfo.af_tcp);
+	struct sk_buff * skb;	
 	struct sk_buff * buff;
-	struct device *ndev=NULL;
+	struct tcphdr *th;
+	unsigned char *ptr;
 	int tmp;
+	
+	skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
 
-	buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
-	if (buff == NULL) 
+	if (skb == NULL) 
 	{
-		sk->err = ENOMEM;
-		destroy_sock(newsk);
-		kfree_skb(skb, FREE_READ);
-		tcp_statistics.TcpAttemptFails++;
-		return;
+		return -ENOMEM;
 	}
-  
-	buff->sk = newsk;
-	buff->localroute = newsk->localroute;
-
-	/*
-	 *	Put in the IP header and routing stuff. 
-	 */
-
-	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
-			       IPPROTO_TCP, newsk->opt, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
 
-	/*
-	 *	Something went wrong. 
-	 */
+	skb->sk = sk;
+	skb->localroute = sk->localroute;
 
-	if (tmp < 0) 
+	tmp = tp->af_specific->build_net_header(sk, skb);
+	
+	if (tmp < 0)
 	{
-		sk->err = tmp;
-		buff->free = 1;
-		kfree_skb(buff,FREE_WRITE);
-		destroy_sock(newsk);
-		skb->sk = sk;
-		kfree_skb(skb, FREE_READ);
-		tcp_statistics.TcpAttemptFails++;
-		return;
+		skb->free = 1;
+		kfree_skb(skb, FREE_WRITE);
+		return tmp;
 	}
 
-	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
-  
-	memcpy(t1, skb->h.th, sizeof(*t1));
-	buff->seq = newsk->write_seq++;
-	buff->end_seq = newsk->write_seq;
-	/*
-	 *	Swap the send and the receive. 
-	 */
-	t1->dest = skb->h.th->source;
-	t1->source = newsk->dummy_th.source;
-	t1->seq = ntohl(buff->seq);
-	newsk->sent_seq = newsk->write_seq;
-	t1->window = ntohs(tcp_select_window(newsk));
-	t1->syn = 1;
-	t1->ack = 1;
-	t1->urg = 0;
-	t1->rst = 0;
-	t1->psh = 0;
-	t1->ack_seq = htonl(newsk->acked_seq);
-	t1->doff = sizeof(*t1)/4+1;
-	ptr = skb_put(buff,4);
-	ptr[0] = 2;
-	ptr[1] = 4;
-	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
-	ptr[3] =(newsk->mtu) & 0xff;
-	buff->csum = csum_partial(ptr, 4, 0);
-	tcp_send_check(t1, newsk->saddr, newsk->daddr, sizeof(*t1)+4, buff);
-	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
-	tcp_reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
-	skb->sk = newsk;
+	th =(struct tcphdr *) skb_put(skb, sizeof(struct tcphdr));
+	skb->h.th = th;
+	memset(th, 0, sizeof(struct tcphdr));
 
-	/*
-	 *	Charge the sock_buff to newsk. 
-	 */
-	 
-	atomic_sub(skb->truesize, &sk->rmem_alloc);
-	atomic_add(skb->truesize, &newsk->rmem_alloc);
+	th->syn = 1;
+	th->ack = 1;
+
+	th->source = sk->dummy_th.source;
+	th->dest = sk->dummy_th.dest;
+	       
+	skb->seq = tp->snd_una;
+	skb->end_seq = skb->seq + 1 /* th->syn */ ;
+	th->seq = ntohl(skb->seq);
+
+	th->window = ntohs(tp->rcv_wnd);
+
+	th->ack_seq = htonl(tp->rcv_nxt);
+	th->doff = sizeof(*th)/4 + 1;
+
+	ptr = skb_put(skb, TCPOLEN_MSS);
+	ptr[0] = TCPOPT_MSS;
+	ptr[1] = TCPOLEN_MSS;
+	ptr[2] = ((sk->mss) >> 8) & 0xff;
+	ptr[3] = (sk->mss) & 0xff;
+	skb->csum = csum_partial(ptr, TCPOLEN_MSS, 0);
+
+	tp->af_specific->send_check(sk, th, sizeof(*th)+4, skb);
+
+	skb_queue_tail(&sk->write_queue, skb);
+
+	atomic_inc(&sk->packets_out);
 	
-	skb_queue_tail(&sk->receive_queue,skb);
-	sk->ack_backlog++;
+	skb->when = jiffies;
+	buff = skb_clone(skb, GFP_ATOMIC);
+
+	atomic_add(skb->truesize, &sk->wmem_alloc);
+
+	tp->af_specific->queue_xmit(sk, skb->dev, buff, 1);
+
+	tcp_reset_xmit_timer(sk, TIME_RETRANS, TCP_TIMEOUT_INIT);
+
 	tcp_statistics.TcpOutSegs++;
+
+	return 0;
 }
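
The SYN-ACK advertises the MSS as a four-byte TCP option: kind 2 (TCPOPT_MSS), length 4 (TCPOLEN_MSS), then the 16-bit MSS high byte first, with th->doff bumped by one 32-bit word to cover it. A standalone encoder for that layout:

#include <stdio.h>

/* Encode a TCP MSS option the way tcp_send_synack() lays it out:
 * kind (2), length (4), then the MSS in network byte order. */
static void put_mss_option(unsigned char *ptr, unsigned int mss)
{
	ptr[0] = 2;                      /* TCPOPT_MSS  */
	ptr[1] = 4;                      /* TCPOLEN_MSS */
	ptr[2] = (mss >> 8) & 0xff;      /* high byte first (big endian) */
	ptr[3] = mss & 0xff;
}

int main(void)
{
	unsigned char opt[4];

	put_mss_option(opt, 1460);
	printf("%02x %02x %02x %02x\n", opt[0], opt[1], opt[2], opt[3]);
	/* prints: 02 04 05 b4 */
	return 0;
}
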
 
 /*
@@ -876,31 +837,31 @@
  *      - delay time <= 0.5 HZ
  *      - must send at least every 2 full sized packets
  *      - we don't have a window update to send
- *
- * 	additional thoughts:
- *	- we should not delay sending an ACK if we have ato > 0.5 HZ.
- *	  My thinking about this is that in this case we will just be
- *	  systematically skewing the RTT calculation. (The rule about
- *	  sending every two full sized packets will never need to be
- *	  invoked, the delayed ack will be sent before the ATO timeout
- *	  every time. Of course, the relies on our having a good estimate
- *	  for packet interarrival times.)
  */
-void tcp_send_delayed_ack(struct sock * sk, int max_timeout, unsigned long timeout)
+
+void tcp_send_delayed_ack(struct sock * sk, int max_timeout)
 {
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+	unsigned long timeout, now;
+
 	/* Calculate new timeout */
-	if (timeout > max_timeout)
-		timeout = max_timeout;
-	if (sk->bytes_rcv >= sk->max_unacked)
-		timeout = 0;
-	timeout += jiffies;
-
-	/* Use new timeout only if there wasn't an older one earlier  */
-	if (!del_timer(&sk->delack_timer) || timeout < sk->delack_timer.expires)
-		sk->delack_timer.expires = timeout;
+	now = jiffies;
+	timeout = tp->ato;
 
-	sk->ack_backlog++;
-	add_timer(&sk->delack_timer);
+	if (timeout > max_timeout || sk->bytes_rcv > (sk->mss << 2))
+	{
+		timeout = now;
+	}
+	else
+		timeout += now;
+
+	/* Use new timeout only if there wasn't an older one earlier */
+	if (!del_timer(&tp->delack_timer) || timeout < tp->delack_timer.expires)
+	{
+		tp->delack_timer.expires = timeout;
+	}
+
+	add_timer(&tp->delack_timer);
 }
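
The delayed-ACK timer fires after the ATO, but the ACK is pushed out immediately when the ATO exceeds the caller's cap or more than four segments' worth of data (sk->mss << 2) sit unacknowledged, and an already-armed earlier timer is never pushed later. A sketch of just the expiry selection, in jiffies, with hypothetical parameters standing in for the socket fields:

#include <stdio.h>

/* Pick the delayed-ACK expiry the way tcp_send_delayed_ack() does.
 * All times are in jiffies; names are illustrative, not kernel API. */
static unsigned long delack_expiry(unsigned long now, unsigned long ato,
				   unsigned long max_timeout,
				   unsigned long bytes_rcv, unsigned int mss,
				   unsigned long pending,  /* armed expiry */
				   int timer_pending)
{
	unsigned long timeout;

	/* ACK at once if the ATO is too long or >4 MSS is unacked */
	if (ato > max_timeout || bytes_rcv > (mss << 2))
		timeout = now;
	else
		timeout = now + ato;

	/* never push an already-armed, earlier timer later */
	if (timer_pending && pending < timeout)
		timeout = pending;
	return timeout;
}

int main(void)
{
	/* ato 40 < cap 250 and little data received: fire at now+40 */
	printf("%lu\n", delack_expiry(1000, 40, 250, 100, 1460, 0, 0));
	/* 6000 bytes unacked with mss 1460: fire immediately */
	printf("%lu\n", delack_expiry(1000, 40, 250, 6000, 1460, 0, 0));
	return 0;
}
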
 
 
@@ -912,29 +873,15 @@
 void tcp_send_ack(struct sock *sk)
 {
 	struct sk_buff *buff;
-	struct tcphdr *t1;
-	struct device *dev = NULL;
+	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+	struct tcphdr *th;
 	int tmp;
 
+	
 	if(sk->zapped)
-		return;		/* We have been reset, we may not send again */
-		
-	/*
-	 *	If we have nothing queued for transmit and the transmit timer
-	 *	is on we are just doing an ACK timeout and need to switch
-	 *	to a keepalive.
-	 */
-
-	clear_delayed_acks(sk);
-
-	if (sk->send_head == NULL
-	    && skb_queue_empty(&sk->write_queue)
-	    && sk->ip_xmit_timeout == TIME_WRITE)
 	{
-		if (sk->keepopen)
-			tcp_reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
-		else
-			del_timer(&sk->retransmit_timer);
+		/* We have been reset, we may not send again */
+		return;		
 	}
 
 	/*
@@ -951,11 +898,13 @@
 		 *	bandwidth on slow links to send a spare ack than
 		 *	resend packets. 
 		 */
-
-		tcp_send_delayed_ack(sk, HZ/2, HZ/2);
+		 
+		tcp_send_delayed_ack(sk, HZ/2);
 		return;
 	}
 
+	clear_delayed_acks(sk);
+
 	/*
 	 *	Assemble a suitable TCP frame
 	 */
@@ -968,35 +917,39 @@
 	 *	Put in the IP header and routing stuff. 
 	 */
 	 
-	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
-				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+	tmp = tp->af_specific->build_net_header(sk, buff);
+
 	if (tmp < 0) 
 	{
   		buff->free = 1;
 		sock_wfree(sk, buff);
 		return;
 	}
-#if 0	/* why does this result in problems? */
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
-	buff->ip_hdr->frag_off |= htons(IP_DF);
-#endif
-#endif
 
-	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+	th =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+
+	memcpy(th, &sk->dummy_th, sizeof(struct tcphdr));
+
+	/*
+	 *	Swap the send and the receive. 
+	 */
+	 
+	th->window	= ntohs(tcp_select_window(sk));
+	th->seq		= ntohl(tp->snd_nxt);
+	th->ack_seq	= ntohl(tp->rcv_nxt);
 
   	/*
   	 *	Fill in the packet and send it
   	 */
-  	 
-	memcpy(t1, &sk->dummy_th, sizeof(*t1));
-	t1->seq     = htonl(sk->sent_seq);
-  	t1->ack_seq = htonl(sk->acked_seq);
-	t1->window  = htons(tcp_select_window(sk));
 
-  	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), buff);
+	tp->af_specific->send_check(sk, th, sizeof(struct tcphdr), buff);
+
   	if (sk->debug)
-  		 printk(KERN_ERR "\rtcp_ack: seq %x ack %x\n", sk->sent_seq, sk->acked_seq);
-  	sk->prot->queue_xmit(sk, dev, buff, 1);
+  		 printk("\rtcp_send_ack: seq %x ack %x\n", 
+			tp->snd_nxt, tp->rcv_nxt);
+
+	tp->af_specific->queue_xmit(sk, buff->dev, buff, 1);
+
   	tcp_statistics.TcpOutSegs++;
 }
 
@@ -1007,9 +960,9 @@
 
 void tcp_write_wakeup(struct sock *sk)
 {
-	struct sk_buff *buff,*skb;
+	struct sk_buff *buff, *skb;
 	struct tcphdr *t1;
-	struct device *dev=NULL;
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	int tmp;
 
 	if (sk->zapped)
@@ -1030,115 +983,56 @@
 	{
 		return;
 	}
-	if ( before(sk->sent_seq, sk->window_seq) && 
-	    (skb=skb_peek(&sk->write_queue)))
+
+	if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && 
+	    (skb=tp->send_head))
 	{
 		/*
 	    	 * We are probing the opening of a window
 	    	 * but the window size is != 0
 	    	 * must have been a result of SWS avoidance (sender)
 	    	 */
-	    
-	    	struct iphdr *iph;
-	    	struct tcphdr *th;
-	    	struct tcphdr *nth;
-	    	unsigned long win_size;
-#if 0
-		unsigned long ow_size;
-#endif
-	
-		/*
-		 *	How many bytes can we send ?
-		 */
-		 
-		win_size = sk->window_seq - sk->sent_seq;
-
-		/*
-		 *	Recover the buffer pointers
-		 */
-		 
-	    	iph = (struct iphdr *)skb->ip_hdr;
-	    	th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
 
-		/*
-		 *	Grab the data for a temporary frame
-		 */
-		 
-	    	buff = sock_wmalloc(sk, win_size + th->doff * 4 + 
-				     (iph->ihl << 2) +
-				     sk->prot->max_header + 15, 
-				     1, GFP_ATOMIC);
-	    	if ( buff == NULL )
-	    		return;
-
-	 	/* 
-	 	 *	If we strip the packet on the write queue we must
-	 	 *	be ready to retransmit this one 
-	 	 */
-	    
-	    	buff->free = /*0*/1;
+		struct tcphdr *th;
+		unsigned long win_size;
 
-	    	buff->sk = sk;
-	    	buff->localroute = sk->localroute;
-	    	
-	    	/*
-	    	 *	Put headers on the new packet
-	    	 */
+		win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
 
-	    	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
-					 IPPROTO_TCP, sk->opt, buff->truesize,
-					 sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
-	    	if (tmp < 0) 
-	    	{
-			sock_wfree(sk, buff);
-			return;
+		if (win_size < skb->end_seq - skb->seq)
+		{
+			if (tcp_fragment(sk, skb, win_size))
+			{
+				printk(KERN_DEBUG "tcp_write_wakeup: "
+				       "fragment failed\n");
+				return;
+			}
 		}
+
+			    	
+		th = skb->h.th;
 		
-		/*
-		 *	Move the TCP header over
-		 */
+		tp->af_specific->send_check(sk, th, th->doff * 4 + win_size, 
+					    skb);
 
-		buff->dev = dev;
+		buff = skb_clone(skb, GFP_ATOMIC);
 
-		nth = (struct tcphdr *) skb_put(buff,sizeof(*th));
+		atomic_add(buff->truesize, &sk->wmem_alloc);
+		atomic_inc(&sk->packets_out);
 
-		memcpy(nth, th, sizeof(*th));
-		
-		/*
-		 *	Correct the new header
-		 */
-		 
-		nth->ack = 1; 
-		nth->ack_seq = htonl(sk->acked_seq);
-		nth->window = htons(tcp_select_window(sk));
-		nth->check = 0;
+		clear_delayed_acks(sk);
 
-		/*
-		 *	Copy TCP options and data start to our new buffer
-		 */
-		 
-		buff->csum = csum_partial_copy((void *)(th + 1), skb_put(buff,win_size),
-				win_size + th->doff*4 - sizeof(*th), 0);
-		
-		/*
-		 *	Remember our right edge sequence number.
-		 */
-		 
-	    	buff->end_seq = sk->sent_seq + win_size;
-	    	sk->sent_seq = buff->end_seq;		/* Hack */
-		if(th->urg && ntohs(th->urg_ptr) < win_size)
-			nth->urg = 0;
+		if (!tcp_timer_is_set(sk, TIME_RETRANS))
+			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 
-		/*
-		 *	Checksum the split buffer
-		 */
-		 
-	    	tcp_send_check(nth, sk->saddr, sk->daddr, 
-			   nth->doff * 4 + win_size , buff);
+		skb->when = jiffies;
+
+		update_send_head(sk);
+
+		tp->snd_nxt = skb->end_seq;
 	}
 	else
 	{	
-		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
+		buff = sock_wmalloc(sk,MAX_ACK_SIZE, 1, GFP_ATOMIC);
 		if (buff == NULL) 
 			return;
 
@@ -1151,15 +1045,15 @@
 		 *	Put in the IP header and routing stuff. 
 		 */
 		 
-		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
-				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+		tmp = tp->af_specific->build_net_header(sk, buff);
+
 		if (tmp < 0) 
 		{
 			sock_wfree(sk, buff);
 			return;
 		}
 
-		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+		t1 = (struct tcphdr *) skb_put(buff, sizeof(struct tcphdr));
 		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
 
 		/*
@@ -1167,89 +1061,43 @@
 		 *	This should cause the other end to send an ack.
 		 */
 	 
-		t1->seq = htonl(sk->sent_seq-1);
+		t1->seq = htonl(tp->snd_nxt-1);
 /*		t1->fin = 0;	-- We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
-		t1->ack_seq = htonl(sk->acked_seq);
+		t1->ack_seq = htonl(tp->rcv_nxt);
 		t1->window = htons(tcp_select_window(sk));
-		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), buff);
-
+		
+		tp->af_specific->send_check(sk, t1, sizeof(*t1), buff);
 	}		
 
 	/*
 	 *	Send it.
 	 */
-	
-	sk->prot->queue_xmit(sk, dev, buff, 1);
+
+	tp->af_specific->queue_xmit(sk, buff->dev, buff, 1);
 	tcp_statistics.TcpOutSegs++;
 }
 
 /*
  *	A window probe timeout has occurred.
+ *	If the window is not closed, send a partial packet;
+ *	otherwise send a zero-window probe.
  */
 
 void tcp_send_probe0(struct sock *sk)
 {
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
 	if (sk->zapped)
 		return;		/* After a valid reset we can send no more */
 
-	tcp_write_wakeup(sk);
-
-	sk->backoff++;
-	sk->rto = min(sk->rto << 1, 120*HZ);
-	sk->retransmits++;
-	sk->prot->retransmits ++;
-	tcp_reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
-}
-
-/*
- * Remove the portion of a packet that has already been sent.
- * Needed to deal with buggy TCP implementations that can't deal
- * with seeing a packet that contains some data that has already
- * been received.
- */
-void tcp_shrink_skb(struct sock *sk, struct sk_buff *skb, u32 ack)
-{
-	struct iphdr *iph;
-	struct tcphdr *th;
-	unsigned char *old, *new;
-	unsigned long len;
-	int diff;
 
-	/*
-	 *	Recover the buffer pointers
-	 */
-	 
-	iph = (struct iphdr *)skb->ip_hdr;
-	th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
+	tcp_write_wakeup(sk);
 
-	/* how much data are we droping from the tcp frame */
-	diff = ack - skb->seq;
-	/* how much data are we keeping in the tcp frame */
-	len = (skb->end_seq - (th->fin + th->syn)) - ack;
-
-	/* pointers to new start of remaining data, and old start */
-	new = (unsigned char *)th + th->doff*4;
-	old = new+diff;
-
-	/* Update our starting seq number */
-	skb->seq = ack;
-	th->seq = htonl(ack);
-	iph->tot_len = htons(ntohs(iph->tot_len)-diff);
-
-	/* Get the partial checksum for the IP options */
-	if (th->doff*4 - sizeof(*th) > 0)
-		skb->csum = csum_partial((void *)(th+1),
-				th->doff*4-sizeof(*th),0);
-	else
-		skb->csum = 0;
+	tp->pending = TIME_PROBE0;
 
-	/* Copy the good data down and get it's checksum */
-	skb->csum = csum_partial_copy((void *)old,(void *)new,len,skb->csum);
+	tp->backoff++;
+	tp->probes_out++;
 
-	/* shorten the skb */
-	skb_trim(skb,skb->len-diff);
-	 
-	/* Checksum the shrunk buffer */
-	tcp_send_check(th, sk->saddr, sk->daddr, 
-		   th->doff * 4 + len , skb);
+	tcp_reset_xmit_timer (sk, TIME_PROBE0, 
+			      min(tp->rto << tp->backoff, 120*HZ));
 }
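
Zero-window probing backs off exponentially: each timeout bumps tp->backoff, and the next probe is scheduled at rto << backoff, clamped to two minutes (120*HZ). A sketch of the resulting schedule, assuming HZ=100:

#include <stdio.h>

#define HZ 100                          /* assume 100 ticks per second */

/* Probe interval after n backoffs, as tcp_send_probe0() computes it. */
static unsigned long probe_interval(unsigned long rto, unsigned int backoff)
{
	unsigned long t = rto << backoff;

	return t < 120 * HZ ? t : 120 * HZ;   /* clamp at two minutes */
}

int main(void)
{
	unsigned int i;

	/* starting from a 3 s RTO the probes back off: 6, 12, 24 ... 120 s */
	for (i = 1; i <= 8; i++)
		printf("probe %u: %lu jiffies\n", i, probe_interval(3 * HZ, i));
	return 0;
}
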
