patch-2.1.28 linux/net/ipv4/af_inet.c

Next file: linux/net/ipv4/icmp.c
Previous file: linux/net/core/sock.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.1.27/linux/net/ipv4/af_inet.c linux/net/ipv4/af_inet.c
@@ -50,6 +50,8 @@
  *		Alan Cox	:	Loosened bind a little.
  *		Mike McLagan	:	ADD/DEL DLCI Ioctls
  *	Willy Konynenberg	:	Transparent proxying support.
+ *		David S. Miller	:	New socket lookup architecture.
+ *					Some other random speedups.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -119,10 +121,6 @@
 extern int udp_get_info(char *, char **, off_t, int, int);
 
 
-struct sock * tcp_sock_array[SOCK_ARRAY_SIZE];
-struct sock * udp_sock_array[SOCK_ARRAY_SIZE];
-struct sock * raw_sock_array[SOCK_ARRAY_SIZE];
-
 #ifdef CONFIG_DLCI
 extern int dlci_ioctl(unsigned int, void*);
 #endif
@@ -134,293 +132,94 @@
 int (*rarp_ioctl_hook)(unsigned int,void*) = NULL;
 
 /*
- *	See if a socket number is in use.
+ *	Destroy an AF_INET socket
  */
  
-static int sk_inuse(struct proto *prot, int num)
+static __inline__ void kill_sk_queues(struct sock *sk)
 {
-	struct sock *sk;
+	struct sk_buff *skb;
 
-	for(sk = prot->sock_array[num & (SOCK_ARRAY_SIZE -1 )];
-		sk != NULL;  sk=sk->next) 
-	{
-		if (sk->num == num) 
-			return(1);
+	/* First the read buffer. */
+	while((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
+		/* This will take care of closing sockets that were
+		 * listening and didn't accept everything.
+		 */
+		if (skb->sk != NULL && skb->sk != sk)
+			skb->sk->prot->close(skb->sk, 0);
+		kfree_skb(skb, FREE_READ);
 	}
-	return(0);
-}
 
+	/* Next, the error queue. */
+	while((skb = skb_dequeue(&sk->error_queue)) != NULL)
+		kfree_skb(skb, FREE_READ);
 
-/*
- *	Pick a new socket number
- */
+  	/* Now the backlog. */
+  	while((skb=skb_dequeue(&sk->back_log)) != NULL)
+		kfree_skb(skb, FREE_READ);
+}
 
-unsigned short get_new_socknum(struct proto *prot, unsigned short base)
+static __inline__ void kill_sk_now(struct sock *sk)
 {
-	static int start=0;
-
-	/*
-	 * Used to cycle through the port numbers so the
-	 * chances of a confused connection drop.
-	 */
-	 
-	int i, j;
-	int best = 0;
-	int size = 32767; /* a big num. */
-	struct sock *sk;
+	/* No longer exists. */
+	del_from_prot_sklist(sk);
 
-	if (base == 0) 
-		base = PROT_SOCK+1+(start & 1023);
-	if (base <= PROT_SOCK) 
-	{
-		base += PROT_SOCK+(start & 1023);
-	}
-
-	/*
-	 *	Now look through the entire array and try to find an empty ptr. 
-	 */
-	 
-	for(i=0; i < SOCK_ARRAY_SIZE; i++) 
-	{
-		j = 0;
-		sk = prot->sock_array[(i+base+1) &(SOCK_ARRAY_SIZE -1)];
-		while(sk != NULL) 
-		{
-			sk = sk->next;
-			j++;
-		}
-		if (j == 0) 
-		{
-			start =(i+1+start )&1023;
-			return(i+base+1);
-		}
-		if (j < size) 
-		{
-			best = i;
-			size = j;
-		}
-	}
+	/* This is gross, but needed for SOCK_PACKET -DaveM */
+	if(sk->prot->unhash)
+		sk->prot->unhash(sk);
 
-	/* Now make sure the one we want is not in use. */
-
-	while(sk_inuse(prot, base +best+1)) 
-	{
-		best += SOCK_ARRAY_SIZE;
-	}
-	return(best+base+1);
+	if(sk->opt)
+		kfree(sk->opt);
+	dst_release(sk->dst_cache);
+	sk_free(sk);
 }
 
-/*
- *	Add a socket into the socket tables by number.
- */
-
-void inet_put_sock(unsigned short num, struct sock *sk)
+static __inline__ void kill_sk_later(struct sock *sk)
 {
-	struct sock **skp, *tmp;
-	int mask;
-	unsigned long flags;
-	
-	if(sk->type==SOCK_PACKET)
-		return;
-
-	sk->num = num;
-	sk->next = NULL;
-	num = num &(SOCK_ARRAY_SIZE -1);
-
+	/* this should never happen. */
+	/* actually it can if an ack has just been sent. */
 	/* 
-	 *	We can't have an interrupt re-enter here. 
-	 */
-	 
-	save_flags(flags);
-	cli();
-
-	sk->prot->inuse += 1;
-	if (sk->prot->highestinuse < sk->prot->inuse)
-		sk->prot->highestinuse = sk->prot->inuse;
-
-	if (sk->prot->sock_array[num] == NULL) 
-	{
-		sk->prot->sock_array[num] = sk;
-		restore_flags(flags);
-		return;
-	}
-	
-	restore_flags(flags);
-	for(mask = 0xff000000; mask != 0xffffffff; mask = (mask >> 8) | mask) 
-	{
-		if ((mask & sk->rcv_saddr) &&
-		    (mask & sk->rcv_saddr) != (mask & 0xffffffff)) 
-		{
-			mask = mask << 8;
-			break;
-		}
-	}
-
-	/*
-	 * add the socket to the sock_array[]..
+	 * It's more normal than that...
+	 * It can happen because a skb is still in the device queues
+	 * [PR]
 	 */
-	skp = sk->prot->sock_array + num;
-	cli();
-	while ((tmp = *skp) != NULL) {
-		if (!(tmp->rcv_saddr & mask))
-			break;
-		skp = &tmp->next;
-	}
-	sk->next = tmp;
-	*skp = sk;
-	sti();
-}
-
-/*
- *	Remove a socket from the socket tables.
- */
-
-void inet_remove_sock(struct sock *sk1)
-{
-	struct sock **p;
-	unsigned long flags;
-
-	if (sk1->type==SOCK_PACKET)
-		return;
-		
-	if (!sk1->prot) 
-	{
-		NETDEBUG(printk("sock.c: remove_sock: sk1->prot == NULL\n"));
-		return;
-	}
+		  
+	printk("Socket destroy delayed (r=%d w=%d)\n",
+	       sk->rmem_alloc, sk->wmem_alloc);
 
-	/* We can't have this changing out from under us. */
-	save_flags(flags);
-	cli();
-	
-	p=&(sk1->prot->sock_array[sk1->num & (SOCK_ARRAY_SIZE -1)]);
-	
-	while(*p!=NULL)
-	{
-		if(*p==sk1)
-		{
-			sk1->prot->inuse--;
-			*p=sk1->next;
-			break;
-		}
-		p=&((*p)->next);
-	}
-	restore_flags(flags);
+	sk->destroy = 1;
+	sk->ack_backlog = 0;
+	release_sock(sk);
+	net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME);
 }
 
-/*
- *	Destroy an AF_INET socket
- */
- 
 void destroy_sock(struct sock *sk)
 {
-	struct sk_buff *skb;
-
 	lock_sock(sk);			/* just to be safe. */
 
-  	/*
-  	 *	Now we can no longer get new packets or once the
-  	 *	timers are killed, send them.
+  	/* Now we can no longer get new packets or once the
+  	 * timers are killed, send them.
   	 */
-  	 
   	net_delete_timer(sk);
 
 	if (sk->prot->destroy)
 		sk->prot->destroy(sk);
 
-  	/*
-  	 *	Clean up the read buffer.
-  	 */
-
-	while((skb=skb_dequeue(&sk->receive_queue))!=NULL) 
-	{
-		/*
-		 * This will take care of closing sockets that were
-		 * listening and didn't accept everything.
-		 */
-		if (skb->sk != NULL && skb->sk != sk) 
-		{
-			IS_SKB(skb);
-			skb->sk->prot->close(skb->sk, 0);
-		}
-		IS_SKB(skb);
-		kfree_skb(skb, FREE_READ);
-	}
-
-  	/*
-  	 *	Clean up the error queue.
-  	 */
-
-	while((skb=skb_dequeue(&sk->error_queue))!=NULL) 
-	{
-		IS_SKB(skb);
-		kfree_skb(skb, FREE_READ);
-	}
-
-  	/*
-  	 *	Now the backlog. 
-  	 */
-  	 
-  	while((skb=skb_dequeue(&sk->back_log))!=NULL) 
-  	{
-		IS_SKB(skb);
-		kfree_skb(skb, FREE_READ);
-	}
+	kill_sk_queues(sk);
 
-	/*
-	 *	Now if it has a half accepted/ closed socket. 
-	 */
-	 
-	if (sk->pair) 
-	{
+	/* Now if it has a half accepted/ closed socket. */
+	if (sk->pair) {
 		sk->pair->prot->close(sk->pair, 0);
 		sk->pair = NULL;
   	}
 
-	/*
-	 * Now if everything is gone we can free the socket
+	/* Now if everything is gone we can free the socket
 	 * structure, otherwise we need to keep it around until
 	 * everything is gone.
 	 */
-
-	if (sk->rmem_alloc == 0 && sk->wmem_alloc == 0) 
-	{
-/*
- *	It is wrong! We MUST unlink socket from socket table
- *	even earlier, than it used to be.
- *	F.e. TCP socket must be unlinked at the moment, when
- *	it goes to TCP_CLOSE. --ANK
- */
-	    	inet_remove_sock(sk);
-
-		if(sk->opt)
-			kfree(sk->opt);
-		dst_release(sk->dst_cache);
-		/*
-		 *	This one is pure paranoia. I'll take it out
-		 *	later once I know the bug is buried.
-		 */
-		tcp_cache_zap();
-		sk_free(sk);
-	} 
-	else 
-	{
-		/* this should never happen. */
-		/* actually it can if an ack has just been sent. */
-		/* 
-		 * It's more normal than that...
-		 * It can happen because a skb is still in the device queues
-		 * [PR]
-		 */
-		  
-		printk("Socket destroy delayed (r=%d w=%d)\n",
- 			sk->rmem_alloc, sk->wmem_alloc);
-
-		sk->destroy = 1;
-		sk->ack_backlog = 0;
-		release_sock(sk);
-		net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME);
-  	}
+	if (sk->rmem_alloc == 0 && sk->wmem_alloc == 0)
+		kill_sk_now(sk);
+	else
+		kill_sk_later(sk);
 }
 
 /*
@@ -467,15 +266,13 @@
 static int inet_autobind(struct sock *sk)
 {
 	/* We may need to bind the socket. */
-	if (sk->num == 0) 
-	{
-		sk->num = get_new_socknum(sk->prot, 0);
+	if (sk->num == 0) {
+		sk->num = sk->prot->good_socknum();
 		if (sk->num == 0) 
 			return(-EAGAIN);
-		udp_cache_zap();
-		tcp_cache_zap();
-		inet_put_sock(sk->num, sk);
-		sk->dummy_th.source = ntohs(sk->num);
+		sk->dummy_th.source = htons(sk->num);
+		sk->prot->hash(sk);
+		add_to_prot_sklist(sk);
 	}
 	return 0;
 }
@@ -491,7 +288,7 @@
 	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
 		return(-EINVAL);
 
-	if (inet_autobind(sk)!=0)
+	if (inet_autobind(sk) != 0)
 		return -EAGAIN;
 
 	/* We might as well re use these. */ 
@@ -506,10 +303,11 @@
 	if ((unsigned) backlog > SOMAXCONN)
 		backlog = SOMAXCONN;
 	sk->max_ack_backlog = backlog;
-	if (sk->state != TCP_LISTEN)
-	{
+	if (sk->state != TCP_LISTEN) {
 		sk->ack_backlog = 0;
 		sk->state = TCP_LISTEN;
+		sk->prot->rehash(sk);
+		add_to_prot_sklist(sk);
 	}
 	sk->socket->flags |= SO_ACCEPTCON;
 	return(0);
@@ -526,87 +324,44 @@
 {
 	struct sock *sk;
 	struct proto *prot;
-	int err;
 
 	sock->state = SS_UNCONNECTED;
 	sk = sk_alloc(GFP_KERNEL);
 	if (sk == NULL) 
-		return(-ENOBUFS);
+		goto do_oom;
 
-	/*
-	 *	Note for tcp that also wiped the dummy_th block for us.
-	 */
-	switch (sock->type) 
-	{
-		case SOCK_STREAM:
-		case SOCK_SEQPACKET:
-			if (protocol && protocol != IPPROTO_TCP) 
-			{
-				sk_free(sk);
-				return(-EPROTONOSUPPORT);
-			}
-			protocol = IPPROTO_TCP;
-			sk->no_check = TCP_NO_CHECK;
-			if (ipv4_config.no_pmtu_disc)
-				sk->ip_pmtudisc = IP_PMTUDISC_DONT;
-			else
-				sk->ip_pmtudisc = IP_PMTUDISC_WANT;
-			prot = &tcp_prot;
-			sock->ops = &inet_stream_ops;
-			break;
-
-		case SOCK_DGRAM:
-			if (protocol && protocol != IPPROTO_UDP) 
-			{
-				sk_free(sk);
-				return(-EPROTONOSUPPORT);
-			}
-			protocol = IPPROTO_UDP;
-			sk->no_check = UDP_NO_CHECK;
+	/* Note for tcp that also wiped the dummy_th block for us. */
+	if(sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET) {
+		if (protocol && protocol != IPPROTO_TCP)
+			goto free_and_noproto;
+		protocol = IPPROTO_TCP;
+		sk->no_check = TCP_NO_CHECK;
+		if (ipv4_config.no_pmtu_disc)
 			sk->ip_pmtudisc = IP_PMTUDISC_DONT;
-			prot=&udp_prot;
-			sock->ops = &inet_dgram_ops;
-			break;
-      
-		case SOCK_RAW:
-			if (!suser()) 
-			{
-				sk_free(sk);
-				return(-EPERM);
-			}
-			if (!protocol) 
-			{
-				sk_free(sk);
-				return(-EPROTONOSUPPORT);
-			}
-			prot = &raw_prot;
-			sk->reuse = 1;
-			sk->ip_pmtudisc = IP_PMTUDISC_DONT;
-			sk->num = protocol;
-			sock->ops = &inet_dgram_ops;
-			break;
-
-		case SOCK_PACKET:
-			if (!suser()) 
-			{
-				sk_free(sk);
-				return(-EPERM);
-			}
-			if (!protocol) 
-			{
-				sk_free(sk);
-				return(-EPROTONOSUPPORT);
-			}
-			prot = &packet_prot;
-			sk->reuse = 1;
-			sk->ip_pmtudisc = IP_PMTUDISC_DONT;
-			sk->num = protocol;
-			sock->ops = &inet_dgram_ops;
-			break;
-
-		default:
-			sk_free(sk);
-			return(-ESOCKTNOSUPPORT);
+		else
+			sk->ip_pmtudisc = IP_PMTUDISC_WANT;
+		prot = &tcp_prot;
+		sock->ops = &inet_stream_ops;
+	} else if(sock->type == SOCK_DGRAM) {
+		if (protocol && protocol != IPPROTO_UDP)
+			goto free_and_noproto;
+		protocol = IPPROTO_UDP;
+		sk->no_check = UDP_NO_CHECK;
+		sk->ip_pmtudisc = IP_PMTUDISC_DONT;
+		prot=&udp_prot;
+		sock->ops = &inet_dgram_ops;
+	} else if(sock->type == SOCK_RAW || sock->type == SOCK_PACKET) {
+		if (!suser())
+			goto free_and_badperm;
+		if (!protocol)
+			goto free_and_noproto;
+		prot = (sock->type == SOCK_RAW) ? &raw_prot : &packet_prot;
+		sk->reuse = 1;
+		sk->ip_pmtudisc = IP_PMTUDISC_DONT;
+		sk->num = protocol;
+		sock->ops = &inet_dgram_ops;
+	} else {
+		goto free_and_badtype;
 	}
 
 	sock_init_data(sock,sk);
@@ -636,33 +391,47 @@
 	sk->ip_mc_index=0;
 	sk->ip_mc_list=NULL;
 	
-	/*
-	 *	Speed up by setting some standard state for the dummy_th
+	/*	Speed up by setting some standard state for the dummy_th
 	 *	if TCP uses it (maybe move to tcp_init later)
 	 */
   	
-	if (sk->num) 
-	{
-		/*
-		 * It assumes that any protocol which allows
+	if (sk->num) {
+		/* It assumes that any protocol which allows
 		 * the user to assign a number at socket
 		 * creation time automatically
 		 * shares.
 		 */
-		inet_put_sock(sk->num, sk);
 		sk->dummy_th.source = ntohs(sk->num);
+
+		/* This is gross, but needed for SOCK_PACKET -DaveM */
+		if(sk->prot->hash)
+			sk->prot->hash(sk);
+		add_to_prot_sklist(sk);
 	}
 
-	if (sk->prot->init) 
-	{
-		err = sk->prot->init(sk);
-		if (err != 0) 
-		{
+	if (sk->prot->init) {
+		int err = sk->prot->init(sk);
+		if (err != 0) {
 			destroy_sock(sk);
 			return(err);
 		}
 	}
 	return(0);
+
+free_and_badtype:
+	sk_free(sk);
+	return -ESOCKTNOSUPPORT;
+
+free_and_badperm:
+	sk_free(sk);
+	return -EPERM;
+
+free_and_noproto:
+	sk_free(sk);
+	return -EPROTONOSUPPORT;
+
+do_oom:
+	return -ENOBUFS;
 }
 
 
@@ -684,193 +453,99 @@
 int inet_release(struct socket *sock, struct socket *peersock)
 {
 	struct sock *sk = sock->sk;
-	unsigned long timeout;
-
-	if (sk==NULL)
-		return 0;
-
-	if (sock->state != SS_UNCONNECTED)
-		sock->state = SS_DISCONNECTING;
-
-	sk->state_change(sk);
-
-	/* Start closing the connection.  This may take a while. */
 
-	/* Applications forget to leave groups before exiting */
-	ip_mc_drop_socket(sk);
+	if (sk) {
+		unsigned long timeout;
 
-	/*
-	 * If linger is set, we don't return until the close
-	 * is complete.  Otherwise we return immediately. The
-	 * actually closing is done the same either way.
-	 *
-	 * If the close is due to the process exiting, we never
-	 * linger..
-	 */
-	timeout = 0;
-	if (sk->linger)
-	{
-		timeout = ~0UL;
-		if (!sk->lingertime)
-			timeout = jiffies + HZ*sk->lingertime;
-	}
-	if (current->flags & PF_EXITING)
+		/* Begin closedown and wake up sleepers. */
+		if (sock->state != SS_UNCONNECTED)
+			sock->state = SS_DISCONNECTING;
+		sk->state_change(sk);
+
+		/* Applications forget to leave groups before exiting */
+		ip_mc_drop_socket(sk);
+
+		/* If linger is set, we don't return until the close
+		 * is complete.  Otherwise we return immediately. The
+		 * actually closing is done the same either way.
+		 *
+		 * If the close is due to the process exiting, we never
+		 * linger..
+		 */
 		timeout = 0;
+		if (sk->linger && !(current->flags & PF_EXITING)) {
+			timeout = ~0UL;
 
-	sock->sk = NULL;
-	sk->socket = NULL;
-
-	sk->prot->close(sk, timeout);
+			/* XXX This makes no sense whatsoever... -DaveM */
+			if (!sk->lingertime)
+				timeout = jiffies + HZ*sk->lingertime;
+		}
+		sock->sk = NULL;
+		sk->socket = NULL;
+		sk->prot->close(sk, timeout);
+	}
 	return(0);
 }
 
-
-static int inet_bind(struct socket *sock, struct sockaddr *uaddr,
-		     int addr_len)
+static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in *addr=(struct sockaddr_in *)uaddr;
-	struct sock *sk=sock->sk, *sk2;
-	unsigned short snum = 0 /* Stoopid compiler.. this IS ok */;
+	struct sock *sk=sock->sk;
+	unsigned short snum;
 	int chk_addr_ret;
 
-	/*
-	 *	If the socket has its own bind function then use it.
-	 */
-	 
+	/* If the socket has its own bind function then use it. (RAW and PACKET) */
 	if(sk->prot->bind)
-		return sk->prot->bind(sk,uaddr, addr_len);
+		return sk->prot->bind(sk, uaddr, addr_len);
 		
-	/* check this error. */
-	if (sk->state != TCP_CLOSE)
-		return(-EINVAL);
-	if(addr_len<sizeof(struct sockaddr_in))
+	/* Check these errors (active socket, bad address length, double bind). */
+	if ((sk->state != TCP_CLOSE)			||
+	    (addr_len < sizeof(struct sockaddr_in))	||
+	    (sk->num != 0))
 		return -EINVAL;
 		
-	if (sock->type != SOCK_RAW)
-	{
-		if (sk->num != 0) 
-			return(-EINVAL);
-
-		snum = ntohs(addr->sin_port);
-		
+	snum = ntohs(addr->sin_port);
 #ifdef CONFIG_IP_MASQUERADE
-		/*
-		 *	The kernel masquerader needs some ports
-		 */		
-		if(snum>=PORT_MASQ_BEGIN && snum<=PORT_MASQ_END)
-			return -EADDRINUSE;
+	/* The kernel masquerader needs some ports. */
+	if((snum >= PORT_MASQ_BEGIN) && (snum <= PORT_MASQ_END))
+		return -EADDRINUSE;
 #endif		 
-
-		if (snum == 0) 
-			snum = get_new_socknum(sk->prot, 0);
-		if (snum < PROT_SOCK && !suser()) 
-			return(-EACCES);
-	}
+	if (snum == 0) 
+		snum = sk->prot->good_socknum();
+	if (snum < PROT_SOCK && !suser())
+		return(-EACCES);
 	
 	chk_addr_ret = __ip_chk_addr(addr->sin_addr.s_addr);
+	if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR &&
+	    chk_addr_ret != IS_MULTICAST && chk_addr_ret != IS_BROADCAST) {
 #ifdef CONFIG_IP_TRANSPARENT_PROXY
-	/*
-	 * Superuser may bind to any address to allow transparent proxying.
-	 */
-	if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && chk_addr_ret != IS_MULTICAST && chk_addr_ret != IS_BROADCAST && !suser())
-#else
-	if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && chk_addr_ret != IS_MULTICAST && chk_addr_ret != IS_BROADCAST)
-#endif
-		return(-EADDRNOTAVAIL);	/* Source address MUST be ours! */
-
-#ifndef CONFIG_IP_TRANSPARENT_PROXY
-	/*
-	 * Am I just thick or is this test really always true after the one
-	 * above?  Just taking the test out appears to be the easiest way to
-	 * make binds to remote addresses for transparent proxying work.
-	 */
-	if (chk_addr_ret || addr->sin_addr.s_addr == 0)
-	{
+		/* Superuser may bind to any address to allow transparent proxying. */
+		if(!suser())
 #endif
-		/*
-		 *      We keep a pair of addresses. rcv_saddr is the one
-		 *      used by get_sock_*(), and saddr is used for transmit.
-		 *
-		 *      In the BSD API these are the same except where it
-		 *      would be illegal to use them (multicast/broadcast) in
-		 *      which case the sending device address is used.
-		 */
-		sk->rcv_saddr = addr->sin_addr.s_addr;
-		if(chk_addr_ret==IS_MULTICAST||chk_addr_ret==IS_BROADCAST)
-			sk->saddr = 0;  /* Use device */
-		else
-			sk->saddr = addr->sin_addr.s_addr;
-#ifndef CONFIG_IP_TRANSPARENT_PROXY
+			return -EADDRNOTAVAIL;	/* Source address MUST be ours! */
 	}
-#endif
-	if (sock->type != SOCK_RAW)
-	{
-		/* Make sure we are allowed to bind here. */
-		cli();
-		for(sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)];
-					sk2 != NULL; sk2 = sk2->next) 
-		{
-			/*
-			 *	Hash collision or real match ?
-			 */
-			 
-			if (sk2->num != snum) 
-				continue;
-				
-			/*
-			 *	Either bind on the port is wildcard means
-			 *	they will overlap and thus be in error
-			 */			
-			 
-			if (!sk2->rcv_saddr || !sk->rcv_saddr)
-			{
-				/*
-				 *	Allow only if both are setting reuse.
-				 */
-				if(sk2->reuse && sk->reuse && sk2->state!=TCP_LISTEN)
-					continue;
-				sti();
-				return(-EADDRINUSE);
-			}
-
-			/*
-			 *	Two binds match ?
-			 */
-
-			if (sk2->rcv_saddr != sk->rcv_saddr) 
-				continue;
-			/*
-			 *	Reusable port ?
-			 */
-
-			if (!sk->reuse)
-			{
-				sti();
-				return(-EADDRINUSE);
-			}
-			
-			/*
-			 *	Reuse ?
-			 */
-			 
-			if (!sk2->reuse || sk2->state==TCP_LISTEN)
-			{
-				sti();
-				return(-EADDRINUSE);
-			}
-		}
-		sti();
 
-		inet_remove_sock(sk);
-		if (sock->type==SOCK_DGRAM)
-			udp_cache_zap();
-		if (sock->type==SOCK_STREAM)
-			tcp_cache_zap();
-		inet_put_sock(snum, sk);
-		sk->dummy_th.source = ntohs(sk->num);
-		sk->daddr = 0;
-		sk->dummy_th.dest = 0;
-	}
+	/*      We keep a pair of addresses. rcv_saddr is the one
+	 *      used by hash lookups, and saddr is used for transmit.
+	 *
+	 *      In the BSD API these are the same except where it
+	 *      would be illegal to use them (multicast/broadcast) in
+	 *      which case the sending device address is used.
+	 */
+	sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
+	if(chk_addr_ret == IS_MULTICAST || chk_addr_ret == IS_BROADCAST)
+		sk->saddr = 0;  /* Use device */
+
+	/* Make sure we are allowed to bind here. */
+	if(sk->prot->verify_bind(sk, snum))
+		return -EADDRINUSE;
+
+	sk->num = snum;
+	sk->dummy_th.source = ntohs(snum);
+	sk->daddr = 0;
+	sk->dummy_th.dest = 0;
+	sk->prot->rehash(sk);
+	add_to_prot_sklist(sk);
 	dst_release(sk->dst_cache);
 	sk->dst_cache=NULL;
 	return(0);
@@ -882,7 +557,7 @@
 	struct sock *sk=sock->sk;
 	int err;
 
-	if (inet_autobind(sk)!=0)
+	if (inet_autobind(sk) != 0)
 		return(-EAGAIN);
 	if (sk->prot->connect == NULL) 
 		return(-EOPNOTSUPP);
@@ -903,85 +578,61 @@
 	struct sock *sk=sock->sk;
 	int err;
 
-	switch (sock->state)
-	{
-		case SS_UNCONNECTED:
-			/* This is ok... continue with connect */
-			break;
-		case SS_CONNECTED:
-			/* Socket is already connected */
+	if(sock->state != SS_UNCONNECTED && sock->state != SS_CONNECTING) {
+		if(sock->state == SS_CONNECTED)
 			return -EISCONN;
-		case SS_CONNECTING:
-			/* Not yet connected... we will check this. */
-		
-			/*
-			 *	FIXME:  for all protocols what happens if you start
-			 *	an async connect fork and both children connect. Clean
-			 *	this up in the protocols!
-			 */
-			break;
-		default:
-			return(-EINVAL);
-	}
-
-	if (sock->state == SS_CONNECTING && tcp_connected(sk->state))
-	{
-		sock->state = SS_CONNECTED;
-		/* Connection completing after a connect/EINPROGRESS/select/connect */
-		return 0;	/* Rock and roll */
+		return -EINVAL;
 	}
 
-	if (sock->state == SS_CONNECTING && sk->protocol == IPPROTO_TCP && (flags & O_NONBLOCK))
-	{
-		if(sk->err!=0)
-			return sock_error(sk);
-		return -EALREADY;	/* Connecting is currently in progress */
-  	}
-	if (sock->state != SS_CONNECTING) 
-	{
+	if(sock->state == SS_CONNECTING) {
+		if(tcp_connected(sk->state)) {
+			sock->state = SS_CONNECTED;
+			return 0;
+		}
+		if(sk->protocol == IPPROTO_TCP && (flags & O_NONBLOCK)) {
+			if(sk->err)
+				return sock_error(sk);
+			return -EALREADY;
+		}
+	} else {
 		/* We may need to bind the socket. */
-		if (inet_autobind(sk)!=0)
+		if (inet_autobind(sk) != 0)
 			return(-EAGAIN);
 		if (sk->prot->connect == NULL) 
 			return(-EOPNOTSUPP);
 		err = sk->prot->connect(sk, uaddr, addr_len);
-		if (err < 0) 
+		if (err < 0)
 			return(err);
   		sock->state = SS_CONNECTING;
 	}
 	
-	if (sk->state > TCP_FIN_WAIT2 && sock->state==SS_CONNECTING)
-	{
-		sock->state=SS_UNCONNECTED;
+	if (sk->state > TCP_FIN_WAIT2 && sock->state == SS_CONNECTING) {
+		sock->state = SS_UNCONNECTED;
 		return sock_error(sk);
 	}
 
 	if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) 
 	  	return (-EINPROGRESS);
 
-	cli(); /* avoid the race condition */
-	while(sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
-	{
+	cli();
+	while(sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
 		interruptible_sleep_on(sk->sleep);
-		if (current->signal & ~current->blocked) 
-		{
+		if (current->signal & ~current->blocked) {
 			sti();
 			return(-ERESTARTSYS);
 		}
 		/* This fixes a nasty in the tcp/ip code. There is a hideous hassle with
 		   icmp error packets wanting to close a tcp or udp socket. */
-		if (sk->err && sk->protocol == IPPROTO_TCP)
-		{
+		if (sk->err && sk->protocol == IPPROTO_TCP) {
 			sock->state = SS_UNCONNECTED;
 			sti();
 			return sock_error(sk); /* set by tcp_err() */
 		}
 	}
 	sti();
-	sock->state = SS_CONNECTED;
 
-	if (sk->state != TCP_ESTABLISHED && sk->err) 
-	{
+	sock->state = SS_CONNECTED;
+	if ((sk->state != TCP_ESTABLISHED) && sk->err) {
 		sock->state = SS_UNCONNECTED;
 		return sock_error(sk);
 	}
@@ -994,32 +645,24 @@
 
 int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 {
-	struct sock *sk1 = sock->sk;
+	struct sock *sk1 = sock->sk, *sk2;
 	struct sock *newsk = newsock->sk;
-	struct sock *sk2;
-	int err;
+	int err = -EINVAL;
 
-	if (sock->state != SS_UNCONNECTED)
-		return -EINVAL;
-	if (!(sock->flags & SO_ACCEPTCON)) 
-		return -EINVAL;
-	if (sk1->prot->accept == NULL) 
-		return -EOPNOTSUPP;
+	if (sock->state != SS_UNCONNECTED || !(sock->flags & SO_ACCEPTCON))
+		goto do_err;
 
-	/*
-	 *	Restore the state if we have been interrupted, and then returned. 
-	 */
-	 
-	if (sk1->pair != NULL ) 
-	{
+	err = -EOPNOTSUPP;
+	if (sk1->prot->accept == NULL)
+		goto do_err;
+
+	/* Restore the state if we have been interrupted, and then returned. */
+	if (sk1->pair != NULL) {
 		sk2 = sk1->pair;
 		sk1->pair = NULL;
-	} 
-	else
-	{
-		sk2 = sk1->prot->accept(sk1,flags);
-		if (sk2 == NULL) 
-			return sock_error(sk1);
+	} else {
+		if((sk2 = sk1->prot->accept(sk1,flags)) == NULL)
+			goto do_sk1_err;
 	}
 
 	/*
@@ -1027,7 +670,6 @@
 	 *	We need to free it up because the tcp module creates
 	 *	its own when it accepts one.
 	 */
-
 	sk2->sleep = newsk->sleep;
 
 	newsock->sk = sk2;
@@ -1035,56 +677,54 @@
 	newsk->socket = NULL;
 
 	if (flags & O_NONBLOCK)
-	{
-		destroy_sock(newsk);
-		return(0);
-	}
+		goto do_half_success;
 
-	cli(); /* avoid the race. */
-	while (sk2->state == TCP_SYN_RECV) 
-	{
+	cli();
+	while (sk2->state == TCP_SYN_RECV) {
 		interruptible_sleep_on(sk2->sleep);
-		if (current->signal & ~current->blocked) 
-		{
-			sti();
-			sk1->pair = sk2;
-			sk2->sleep = NULL;
-			sk2->socket = NULL;
-
-			newsock->sk = newsk;
-			newsk->socket = newsock;
-			return -ERESTARTSYS;
-		}
+		if (current->signal & ~current->blocked)
+			goto do_interrupted;
 	}
 	sti();
-
-	if (sk2->state != TCP_ESTABLISHED && sk2->err > 0) 
-	{
-		err = sock_error(sk2);
-		sk2->sleep = NULL;
-		sk2->socket = NULL;
-		destroy_sock(sk2);
-
-		newsock->sk = newsk;
-		newsk->socket = newsock;
-
-		return err;
-	}
+	if(sk2->state == TCP_ESTABLISHED)
+		goto do_full_success;
+	if(sk2->err > 0)
+		goto do_connect_err;
+	err = -ECONNABORTED;
 	if (sk2->state == TCP_CLOSE)
-	{
-		sk2->sleep = NULL;
-		sk2->socket = NULL;
-		destroy_sock(sk2);
-
-		newsock->sk = newsk;
-		newsk->socket = newsock;
-
-		return -ECONNABORTED;
-	}
-
+		goto do_bad_connection;
+do_full_success:
 	destroy_sock(newsk);
 	newsock->state = SS_CONNECTED;
+	return 0;
+
+do_half_success:
+	destroy_sock(newsk);
 	return(0);
+
+do_connect_err:
+	err = sock_error(sk2);
+do_bad_connection:
+	sk2->sleep = NULL;
+	sk2->socket = NULL;
+	destroy_sock(sk2);
+	newsock->sk = newsk;
+	newsk->socket = newsock;
+	return err;
+
+do_interrupted:
+	sti();
+	sk1->pair = sk2;
+	sk2->sleep = NULL;
+	sk2->socket = NULL;
+	newsock->sk = newsk;
+	newsk->socket = newsock;
+	err = -ERESTARTSYS;
+do_err:
+	return err;
+do_sk1_err:
+	err = sock_error(sk1);
+	return err;
 }
 
 
@@ -1095,19 +735,16 @@
 static int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		 int *uaddr_len, int peer)
 {
-	struct sock *sk=sock->sk;
-	struct sockaddr_in *sin=(struct sockaddr_in *)uaddr;
+	struct sock *sk		= sock->sk;
+	struct sockaddr_in *sin	= (struct sockaddr_in *)uaddr;
   
 	sin->sin_family = AF_INET;
-	if (peer) 
-	{
+	if (peer) {
 		if (!tcp_connected(sk->state)) 
 			return(-ENOTCONN);
 		sin->sin_port = sk->dummy_th.dest;
 		sin->sin_addr.s_addr = sk->daddr;
-	} 
-	else 
-	{
+	} else {
 		__u32 addr = sk->rcv_saddr;
 		if (!addr)
 			addr = sk->saddr;
@@ -1134,7 +771,7 @@
 	if (sk->err)
 		return sock_error(sk);
 	/* We may need to bind the socket. */
-	if (inet_autobind(sk)!=0)
+	if (inet_autobind(sk) != 0)
 		return(-EAGAIN);
 	err = sk->prot->recvmsg(sk, msg, size, flags&MSG_DONTWAIT,
 				flags&~MSG_DONTWAIT, &addr_len);
@@ -1157,9 +794,11 @@
 		return(-EOPNOTSUPP);
 	if(sk->err)
 		return sock_error(sk);
+
 	/* We may need to bind the socket. */
-	if(inet_autobind(sk)!=0)
+	if(inet_autobind(sk) != 0)
 		return -EAGAIN;
+
 	return sk->prot->sendmsg(sk, msg, size);
 }
 
@@ -1168,8 +807,7 @@
 {
 	struct sock *sk = sock->sk;
 
-	/*
-	 * This should really check to make sure
+	/* This should really check to make sure
 	 * the socket is a TCP socket. (WHY AC...)
 	 */
 	how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
@@ -1329,256 +967,6 @@
 	return(0);
 }
 
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-/*
- * Some routines for the for loop in get_sock which sometimes needs to walk
- * two linked lists in sequence.  Could use macros as well.
- * Does anyone know a nicer way to code this?
- */
-static __inline__ struct sock *secondlist(unsigned short hpnum, struct sock *s,
-				int *pfirstpass, struct proto *prot)
-{
-	if (s == NULL && (*pfirstpass)-- )
-		return prot->sock_array[hpnum & (SOCK_ARRAY_SIZE - 1)];
-	else
-		return s;
-}
-static __inline__ struct sock *get_sock_loop_init(unsigned short hnum,
-			unsigned short hpnum, struct sock *s,
-			int *pfirstpass, struct proto *prot)
-{
-	s = prot->sock_array[hnum & (SOCK_ARRAY_SIZE - 1)];
-	return secondlist(hpnum, s, pfirstpass, prot);
-}
-static __inline__ struct sock *get_sock_loop_next(unsigned short hnum,
-			unsigned short hpnum, struct sock *s,
-			int *pfirstpass, struct proto *prot)
-{
-	s = s->next;
-	return secondlist(hpnum, s, pfirstpass, prot);
-}
-
-struct sock *get_sock_proxy(struct proto *prot, unsigned short num,
-				unsigned long raddr,
-				unsigned short rnum, unsigned long laddr,
-				unsigned long paddr, unsigned short pnum)
-{
-	struct sock *s = 0;
-	struct sock *result = NULL;
-	int badness = -1;
-	unsigned short hnum;
-	unsigned short hpnum;
-	int firstpass = 1;
-
-	hnum = ntohs(num);
-	hpnum = ntohs(pnum);
-
-	/*
-	 * SOCK_ARRAY_SIZE must be a power of two.  This will work better
-	 * than a prime unless 3 or more sockets end up using the same
-	 * array entry.  This should not be a problem because most
-	 * well known sockets don't overlap that much, and for
-	 * the other ones, we can just be careful about picking our
-	 * socket number when we choose an arbitrary one.
-	 */
-
-	for(s = get_sock_loop_init(hnum, hpnum, s, &firstpass, prot);
-		s != NULL;
-		s = get_sock_loop_next(hnum, hpnum, s, &firstpass, prot))
-	{
-		int score = 0;
-
-		/* accept the addressed port or the redirect (proxy) port */
-		if (s->num != hnum && s->num != hpnum)
-			continue;
-
-		if(s->dead && (s->state == TCP_CLOSE))
-			continue;
-		/* local address matches? */
-		if (s->rcv_saddr) {
-			/*
-			 * If this is redirected traffic, it must either
-			 * match on the redirected port/ip-address or on
-			 * the actual destination, not on a mixture.
-			 * There must be a simpler way to express this...
-			 */
-			if ((s->num != hpnum || s->rcv_saddr != paddr)
-			    && (s->num != hnum || s->rcv_saddr != laddr))
-				continue;
-			score++;
-		}
-		/* remote address matches? */
-		if (s->daddr) {
-			if (s->daddr != raddr)
-				continue;
-			score++;
-		}
-		/* remote port matches? */
-		if (s->dummy_th.dest) {
-			if (s->dummy_th.dest != rnum)
-				continue;
-			score++;
-		}
-		/* perfect match? */
-		if (score == 3 && s->num == hnum)
-			return s;
-		/* no, check if this is the best so far.. */
-		if (score <= badness)
-			continue;
-		/* don't accept near matches on the actual destination
-		 * port with IN_ADDR_ANY for redirected traffic, but do
-		 * allow explicit remote address listens.  (disputable)
-		 */
-		if (s->num != hpnum && !s->rcv_saddr)
-			continue;
-		result = s;
-		badness = score;
-  	}
-  	return result;
-}
-#endif
-
-/*
- * This routine must find a socket given a TCP or UDP header.
- * Everything is assumed to be in net order.
- *
- * We give priority to more closely bound ports: if some socket
- * is bound to a particular foreign address, it will get the packet
- * rather than somebody listening to any address..
- */
-
-struct sock *get_sock(struct proto *prot, unsigned short num,
-				unsigned long raddr,
-				unsigned short rnum, unsigned long laddr)
-{
-	struct sock *s = 0;
-	struct sock *result = NULL;
-	int badness = -1;
-	unsigned short hnum;
-
-	hnum = ntohs(num);
-
-	/*
-	 * SOCK_ARRAY_SIZE must be a power of two.  This will work better
-	 * than a prime unless 3 or more sockets end up using the same
-	 * array entry.  This should not be a problem because most
-	 * well known sockets don't overlap that much, and for
-	 * the other ones, we can just be careful about picking our
-	 * socket number when we choose an arbitrary one.
-	 */
-
-	for(s = prot->sock_array[hnum & (SOCK_ARRAY_SIZE - 1)];
-			s != NULL; s = s->next) 
-	{
-		int score = 0;
-
-		if (s->num != hnum) 
-			continue;
-
-		if(s->dead && (s->state == TCP_CLOSE))
-			continue;
-		/* local address matches? */
-		if (s->rcv_saddr) {
-			if (s->rcv_saddr != laddr)
-				continue;
-			score++;
-		}
-		/* remote address matches? */
-		if (s->daddr) {
-			if (s->daddr != raddr)
-				continue;
-			score++;
-		}
-		/* remote port matches? */
-		if (s->dummy_th.dest) {
-			if (s->dummy_th.dest != rnum)
-				continue;
-			score++;
-		}
-		/* perfect match? */
-		if (score == 3)
-			return s;
-		/* no, check if this is the best so far.. */
-		if (score <= badness)
-			continue;
-		result = s;
-		badness = score;
-  	}
-  	return result;
-}
-
-
-/*
- *	Deliver a datagram to raw sockets.
- */
- 
-struct sock *get_sock_raw(struct sock *sk, 
-				unsigned short num,
-				unsigned long raddr,
-				unsigned long laddr)
-{
-	struct sock *s;
-
-	s=sk;
-
-	for(; s != NULL; s = s->next) 
-	{
-		if (s->num != num) 
-			continue;
-		if(s->dead && (s->state == TCP_CLOSE))
-			continue;
-		if(s->daddr && s->daddr!=raddr)
-			continue;
- 		if(s->rcv_saddr && s->rcv_saddr != laddr)
-			continue;
-		return(s);
-  	}
-  	return(NULL);
-}
-
-/*
- *	Deliver a datagram to broadcast/multicast sockets.
- */
- 
-struct sock *get_sock_mcast(struct sock *sk, 
-				unsigned short num,
-				unsigned long raddr,
-				unsigned short rnum, unsigned long laddr)
-{
-	struct sock *s;
-	unsigned short hnum;
-
-	hnum = ntohs(num);
-
-	/*
-	 * SOCK_ARRAY_SIZE must be a power of two.  This will work better
-	 * than a prime unless 3 or more sockets end up using the same
-	 * array entry.  This should not be a problem because most
-	 * well known sockets don't overlap that much, and for
-	 * the other ones, we can just be careful about picking our
-	 * socket number when we choose an arbitrary one.
-	 */
-	
-	s=sk;
-
-	for(; s != NULL; s = s->next) 
-	{
-		if (s->num != hnum) 
-			continue;
-		if(s->dead && (s->state == TCP_CLOSE))
-			continue;
-		if(s->daddr && s->daddr!=raddr)
-			continue;
-		if (s->dummy_th.dest != rnum && s->dummy_th.dest != 0) 
-			continue;
- 		if(s->rcv_saddr  && s->rcv_saddr != laddr)
-			continue;
-		return(s);
-  	}
-  	return(NULL);
-}
-
-
 struct proto_ops inet_stream_ops = {
 	AF_INET,
 
@@ -1621,9 +1009,6 @@
 	inet_recvmsg
 };
 
-
-
-
 struct net_proto_family inet_family_ops = {
 	AF_INET,
 	inet_create
@@ -1681,8 +1066,6 @@
 {
 	struct sk_buff *dummy_skb;
 	struct inet_protocol *p;
-	int i;
-
 
 	printk("Swansea University Computer Society TCP/IP for NET3.037\n");
 
@@ -1703,22 +1086,6 @@
 	/*
 	 *	Add all the protocols. 
 	 */
-	 
-	for(i = 0; i < SOCK_ARRAY_SIZE; i++) 
-	{
-		tcp_sock_array[i] = NULL;
-		udp_sock_array[i] = NULL;
-		raw_sock_array[i] = NULL;
-  	}
-	tcp_prot.inuse = 0;
-	tcp_prot.highestinuse = 0;
-	tcp_prot.sock_array = tcp_sock_array;
-	udp_prot.inuse = 0;
-	udp_prot.highestinuse = 0;
-	udp_prot.sock_array = udp_sock_array;
-	raw_prot.inuse = 0;
-	raw_prot.highestinuse = 0;
-	raw_prot.sock_array = raw_sock_array;
 
 	printk("IP Protocols: ");
 	for(p = inet_protocol_base; p != NULL;) 

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov