patch-2.4.19 linux-2.4.19/net/ipv4/tcp_ipv4.c
- Lines: 441
- Date: Fri Aug 2 17:39:46 2002
- Orig file: linux-2.4.18/net/ipv4/tcp_ipv4.c
- Orig date: Mon Feb 25 11:38:14 2002
diff -urN linux-2.4.18/net/ipv4/tcp_ipv4.c linux-2.4.19/net/ipv4/tcp_ipv4.c
@@ -64,6 +64,8 @@
#include <linux/ipsec.h>
extern int sysctl_ip_dynaddr;
+extern int sysctl_ip_default_ttl;
+int sysctl_tcp_tw_reuse = 0;
/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8
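The new sysctl_tcp_tw_reuse flag (off by default) gates recycling of TIME-WAIT sockets for new outgoing connections; the check itself lives in __tcp_v4_check_established() below. For orientation, a minimal sketch of the companion ctl_table entry one would expect in net/ipv4/sysctl_net_ipv4.c (that hunk is not part of this file's diff, so the NET_TCP_TW_REUSE name and the 0644 mode are assumptions):

	/* Assumed companion entry, exposing the flag as
	 * /proc/sys/net/ipv4/tcp_tw_reuse (2.4-style ctl_table). */
	{NET_TCP_TW_REUSE, "tcp_tw_reuse", &sysctl_tcp_tw_reuse,
	 sizeof(int), 0644, NULL, &proc_dointvec},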
@@ -162,23 +164,24 @@
local_bh_enable();
}
-static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
-{
- sk->num = snum;
+static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
+{
+ sk->num = snum;
if ((sk->bind_next = tb->owners) != NULL)
tb->owners->bind_pprev = &sk->bind_next;
tb->owners = sk;
sk->bind_pprev = &tb->owners;
sk->prev = (struct sock *) tb;
-}
+}
static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
-{
+{
struct sock *sk2 = tb->owners;
int sk_reuse = sk->reuse;
for( ; sk2 != NULL; sk2 = sk2->bind_next) {
if (sk != sk2 &&
+ sk2->reuse <= 1 &&
sk->bound_dev_if == sk2->bound_dev_if) {
if (!sk_reuse ||
!sk2->reuse ||
@@ -190,8 +193,8 @@
}
}
}
- return sk2 != NULL;
-}
+ return sk2 != NULL;
+}
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
@@ -244,12 +247,14 @@
break;
}
if (tb != NULL && tb->owners != NULL) {
- if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
+ if (sk->reuse > 1)
+ goto success;
+ if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
goto success;
} else {
- ret = 1;
+ ret = 1;
if (tcp_bind_conflict(sk, tb))
- goto fail_unlock;
+ goto fail_unlock;
}
}
ret = 1;
@@ -266,7 +271,7 @@
tb->fastreuse = 0;
success:
if (sk->prev == NULL)
- tcp_bind_hash(sk, tb, snum);
+ tcp_bind_hash(sk, tb, snum);
BUG_TRAP(sk->prev == (struct sock *) tb);
ret = 0;
@@ -337,13 +342,13 @@
}
}
-static __inline__ void __tcp_v4_hash(struct sock *sk)
+static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
struct sock **skp;
rwlock_t *lock;
BUG_TRAP(sk->pprev==NULL);
- if(sk->state == TCP_LISTEN) {
+ if(listen_possible && sk->state == TCP_LISTEN) {
skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
lock = &tcp_lhash_lock;
tcp_listen_wlock();
@@ -358,7 +363,7 @@
sk->pprev = skp;
sock_prot_inc_use(sk->prot);
write_unlock(lock);
- if (sk->state == TCP_LISTEN)
+ if (listen_possible && sk->state == TCP_LISTEN)
wake_up(&tcp_lhash_wait);
}
@@ -366,7 +371,7 @@
{
if (sk->state != TCP_CLOSE) {
local_bh_disable();
- __tcp_v4_hash(sk);
+ __tcp_v4_hash(sk, 1);
local_bh_enable();
}
}
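Since listen_possible is a compile-time constant at every call site, the inline expansion can drop the listening-table branch and the tcp_lhash_wait wakeup wherever a listener is impossible. The two call patterns, shown here purely for illustration:

	__tcp_v4_hash(sk, 1);	/* generic hash path: sk may be in TCP_LISTEN */
	__tcp_v4_hash(sk, 0);	/* connect/accept paths: never a listener */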
@@ -375,6 +380,9 @@
{
rwlock_t *lock;
+ if (!sk->pprev)
+ goto ende;
+
if (sk->state == TCP_LISTEN) {
local_bh_disable();
tcp_listen_wlock();
@@ -393,6 +401,8 @@
sock_prot_dec_use(sk->prot);
}
write_unlock_bh(lock);
+
+ ende:
if (sk->state == TCP_LISTEN)
wake_up(&tcp_lhash_wait);
}
@@ -530,19 +540,21 @@
skb->h.th->source);
}
-static int tcp_v4_check_established(struct sock *sk)
+/* called with local bh disabled */
+static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
+ struct tcp_tw_bucket **twp)
{
u32 daddr = sk->rcv_saddr;
u32 saddr = sk->daddr;
int dif = sk->bound_dev_if;
TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
- __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
- int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
+ __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
+ int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
struct tcp_ehash_bucket *head = &tcp_ehash[hash];
struct sock *sk2, **skp;
struct tcp_tw_bucket *tw;
- write_lock_bh(&head->lock);
+ write_lock(&head->lock);
/* Check TIME-WAIT sockets first. */
for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
@@ -566,7 +578,9 @@
fall back to VJ's scheme and use initial
timestamp retrieved from peer table.
*/
- if (tw->ts_recent_stamp) {
+ if (tw->ts_recent_stamp &&
+ (!twp || (sysctl_tcp_tw_reuse &&
+ xtime.tv_sec - tw->ts_recent_stamp > 1))) {
if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
tp->write_seq = 1;
tp->ts_recent = tw->ts_recent;
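Restated outside the loop: on the old path (twp == NULL) a timestamped TIME-WAIT bucket can always be taken over, while the new connect-time port search additionally requires sysctl_tcp_tw_reuse to be enabled and the bucket's last timestamp to be more than one second old, so PAWS timestamps stay monotonic across incarnations. A paraphrase of the condition as a standalone helper (illustrative only, not part of the patch):

	static inline int tw_reusable(struct tcp_tw_bucket *tw,
				      struct tcp_tw_bucket **twp)
	{
		return tw->ts_recent_stamp &&		/* peer sent timestamps */
		       (twp == NULL ||			/* old unconditional path */
			(sysctl_tcp_tw_reuse &&		/* admin opted in */
			 xtime.tv_sec - tw->ts_recent_stamp > 1));
	}

The write_seq assignment that follows then starts the new incarnation beyond the old window (tw->snd_nxt + 65535 + 2), so stray segments from the previous connection cannot land inside the new sequence space.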
@@ -587,6 +601,10 @@
}
unique:
+ /* Must record num and sport now. Otherwise we will see
+ * in the hash table a socket with a funny identity. */

+ sk->num = lport;
+ sk->sport = htons(lport);
BUG_TRAP(sk->pprev==NULL);
if ((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
@@ -595,15 +613,16 @@
sk->pprev = skp;
sk->hashent = hash;
sock_prot_inc_use(sk->prot);
- write_unlock_bh(&head->lock);
+ write_unlock(&head->lock);
- if (tw) {
+ if (twp) {
+ *twp = tw;
+ NET_INC_STATS_BH(TimeWaitRecycled);
+ } else if (tw) {
/* Silly. Should hash-dance instead... */
- local_bh_disable();
tcp_tw_deschedule(tw);
tcp_timewait_kill(tw);
NET_INC_STATS_BH(TimeWaitRecycled);
- local_bh_enable();
tcp_tw_put(tw);
}
@@ -611,34 +630,120 @@
return 0;
not_unique:
- write_unlock_bh(&head->lock);
+ write_unlock(&head->lock);
return -EADDRNOTAVAIL;
}
-/* Hash SYN-SENT socket to established hash table after
- * checking that it is unique. Note, that without kernel lock
- * we MUST make these two operations atomically.
- *
- * Optimization: if it is bound and tcp_bind_bucket has the only
- * owner (us), we need not to scan established bucket.
+/*
+ * Bind a port for a connect operation and hash it.
*/
-
-int tcp_v4_hash_connecting(struct sock *sk)
+static int tcp_v4_hash_connect(struct sock *sk)
{
unsigned short snum = sk->num;
- struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
- struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
+ struct tcp_bind_hashbucket *head;
+ struct tcp_bind_bucket *tb;
+
+ if (snum == 0) {
+ int rover;
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = (high - low) + 1;
+ struct tcp_tw_bucket *tw = NULL;
+
+ local_bh_disable();
+ /* TODO. Actually it is not such a bad idea to remove
+ * tcp_portalloc_lock before the next submission to Linus.
+ * As soon as we touch this place at all, it is time to think.
+ *
+ * Right now it protects a single _advisory_ variable,
+ * tcp_port_rover, hence it is mostly useless.
+ * The code will work nicely if we just delete it, but
+ * I am afraid that in the contended case it will work no
+ * better or even worse: another cpu will just hit the same
+ * bucket and spin there.
+ * So some per-cpu salt could remove both the contention and
+ * the memory ping-pong. Any ideas how to do this in a nice way?
+ */
+ spin_lock(&tcp_portalloc_lock);
+ rover = tcp_port_rover;
+
+ do {
+ rover++;
+ if ((rover < low) || (rover > high))
+ rover = low;
+ head = &tcp_bhash[tcp_bhashfn(rover)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ for (tb = head->chain; tb; tb = tb->next) {
+ if (tb->port == rover) {
+ BUG_TRAP(tb->owners != NULL);
+ if (tb->fastreuse >= 0)
+ goto next_port;
+ if (!__tcp_v4_check_established(sk, rover, &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = tcp_bucket_create(head, rover);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+ }
+ tb->fastreuse = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ } while (--remaining > 0);
+ tcp_port_rover = rover;
+ spin_unlock(&tcp_portalloc_lock);
+
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ ok:
+ /* All locks still held and bhs disabled */
+ tcp_port_rover = rover;
+ spin_unlock(&tcp_portalloc_lock);
+
+ tcp_bind_hash(sk, tb, rover);
+ if (!sk->pprev) {
+ sk->sport = htons(rover);
+ __tcp_v4_hash(sk, 0);
+ }
+ spin_unlock(&head->lock);
+
+ if (tw) {
+ tcp_tw_deschedule(tw);
+ tcp_timewait_kill(tw);
+ tcp_tw_put(tw);
+ }
+
+ local_bh_enable();
+ return 0;
+ }
+
+ head = &tcp_bhash[tcp_bhashfn(snum)];
+ tb = (struct tcp_bind_bucket *)sk->prev;
spin_lock_bh(&head->lock);
if (tb->owners == sk && sk->bind_next == NULL) {
- __tcp_v4_hash(sk);
+ __tcp_v4_hash(sk, 0);
spin_unlock_bh(&head->lock);
return 0;
} else {
- spin_unlock_bh(&head->lock);
-
+ int ret;
+ spin_unlock(&head->lock);
/* No definite answer... Walk to established hash table */
- return tcp_v4_check_established(sk);
+ ret = __tcp_v4_check_established(sk, snum, NULL);
+ local_bh_enable();
+ return ret;
}
}
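Stripped of locking and bucket bookkeeping, the ephemeral-port search above reduces to a single pass of the rover over [low, high]: buckets created by a normal bind() (fastreuse >= 0) are skipped outright, and occupied buckets are taken only if the established-table check succeeds. A simplified model (illustrative only; in_use() and unique() stand in for the bind-bucket walk and __tcp_v4_check_established()):

	static int pick_port(int low, int high, int *rover,
			     int (*in_use)(int port), int (*unique)(int port))
	{
		int remaining = high - low + 1;
		int port = *rover;

		do {
			if (++port < low || port > high)
				port = low;		/* wrap around */
			if (!in_use(port) || unique(port)) {
				*rover = port;		/* remember for next time */
				return port;
			}
		} while (--remaining > 0);
		return -1;	/* maps to -EADDRNOTAVAIL above */
	}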
@@ -647,7 +752,6 @@
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
- struct sk_buff *buff;
struct rtable *rt;
u32 daddr, nexthop;
int tmp;
@@ -682,12 +786,6 @@
if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
daddr = rt->rt_dst;
- err = -ENOBUFS;
- buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
-
- if (buff == NULL)
- goto failure;
-
if (!sk->saddr)
sk->saddr = rt->rt_src;
sk->rcv_saddr = sk->saddr;
@@ -718,22 +816,36 @@
sk->dport = usin->sin_port;
sk->daddr = daddr;
+ tp->ext_header_len = 0;
+ if (sk->protinfo.af_inet.opt)
+ tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
+
+ tp->mss_clamp = 536;
+
+ /* Socket identity is still unknown (sport may be zero).
+ * However we set the state to SYN-SENT and, without releasing the
+ * socket lock, select a source port, enter ourselves into the hash
+ * tables and complete initialization after this.
+ */
+ tcp_set_state(sk, TCP_SYN_SENT);
+ err = tcp_v4_hash_connect(sk);
+ if (err)
+ goto failure;
+
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
sk->sport, usin->sin_port);
- tp->ext_header_len = 0;
- if (sk->protinfo.af_inet.opt)
- tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
sk->protinfo.af_inet.id = tp->write_seq^jiffies;
- tp->mss_clamp = 536;
+ err = tcp_connect(sk);
+ if (err)
+ goto failure;
- err = tcp_connect(sk, buff);
- if (err == 0)
- return 0;
+ return 0;
failure:
+ tcp_set_state(sk, TCP_CLOSE);
__sk_dst_reset(sk);
sk->route_caps = 0;
sk->dport = 0;
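Taken together, the connect() rework reads as the following sequence; the SYN skb allocation that used to live here now happens inside tcp_connect() itself, which is why the alloc_skb() call was removed above. A summary of the hunks, not additional patch content:

	/* 1. route lookup, choose daddr (and saddr if still unbound)
	 * 2. tcp_set_state(sk, TCP_SYN_SENT)	identity not final yet
	 * 3. tcp_v4_hash_connect(sk)		pick sport, hash the sock
	 * 4. secure_tcp_sequence_number(...)	4-tuple is now known
	 * 5. tcp_connect(sk)			builds and sends the SYN
	 *
	 * Any failure after step 2 rolls the socket back to TCP_CLOSE
	 * in the failure: path.
	 */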
@@ -786,7 +898,6 @@
req->expires = jiffies + TCP_TIMEOUT_INIT;
req->retrans = 0;
req->sk = NULL;
- req->index = h;
req->dl_next = lopt->syn_table[h];
write_lock(&tp->syn_wait_lock);
@@ -1072,6 +1183,7 @@
arg.n_iov = 1;
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+ tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
TCP_INC_STATS_BH(TcpOutSegs);
@@ -1387,7 +1499,6 @@
NETDEBUG(if (net_ratelimit()) \
printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
NIPQUAD(saddr), ntohs(skb->h.th->source)));
- TCP_INC_STATS_BH(TcpAttemptFails);
dst_release(dst);
goto drop_and_free;
}
@@ -1456,7 +1567,7 @@
newtp->advmss = dst->advmss;
tcp_initialize_rcv_mss(newsk);
- __tcp_v4_hash(newsk);
+ __tcp_v4_hash(newsk, 0);
__tcp_inherit_port(sk, newsk);
return newsk;
@@ -1876,7 +1987,6 @@
tcp_v4_rebuild_header,
tcp_v4_conn_request,
tcp_v4_syn_recv_sock,
- tcp_v4_hash_connecting,
tcp_v4_remember_stamp,
sizeof(struct iphdr),