patch-2.1.68 linux/net/ipv4/route.c

Next file: linux/net/ipv4/syncookies.c
Previous file: linux/net/ipv4/raw.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.1.67/linux/net/ipv4/route.c linux/net/ipv4/route.c
@@ -5,7 +5,7 @@
  *
  *		ROUTE - implementation of the IP router.
  *
- * Version:	@(#)route.c	1.0.14	05/31/93
+ * Version:	$Id: route.c,v 1.33 1997/10/24 17:16:08 kuznet Exp $
  *
  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -68,27 +68,27 @@
 #include <linux/in.h>
 #include <linux/inet.h>
 #include <linux/netdevice.h>
-#include <linux/if_arp.h>
 #include <linux/proc_fs.h>
 #include <linux/init.h>
-#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/pkt_sched.h>
+#include <linux/mroute.h>
 #include <net/protocol.h>
+#include <net/ip.h>
 #include <net/route.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
 #include <net/arp.h>
 #include <net/tcp.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
 #include <net/icmp.h>
-#include <linux/net_alias.h>
-
-/* Compile time configuretion flags */
 
-#define CONFIG_IP_LOCAL_RT_POLICY 1
+#define RTprint(a...)	printk(KERN_DEBUG a)
 
-static void rt_run_flush(unsigned long);
-  
 static struct timer_list rt_flush_timer =
-	{ NULL, NULL, RT_FLUSH_DELAY, 0L, rt_run_flush };
+	{ NULL, NULL, RT_FLUSH_DELAY, 0L, NULL };
 
 /*
  *	Interface to generic destination cache.
@@ -108,6 +108,24 @@
 	ipv4_dst_destroy
 };
 
+__u8 ip_tos2prio[16] = {
+	TC_PRIO_FILLER,
+	TC_PRIO_BESTEFFORT,
+	TC_PRIO_FILLER,
+	TC_PRIO_FILLER,
+	TC_PRIO_BULK,
+	TC_PRIO_FILLER,
+	TC_PRIO_BULK,
+	TC_PRIO_FILLER,
+	TC_PRIO_INTERACTIVE,
+	TC_PRIO_FILLER,
+	TC_PRIO_INTERACTIVE,
+	TC_PRIO_FILLER,
+	TC_PRIO_INTERACTIVE_BULK,
+	TC_PRIO_FILLER,
+	TC_PRIO_INTERACTIVE_BULK,
+	TC_PRIO_FILLER
+};
 
 /*
  * Route cache.
@@ -162,8 +180,10 @@
 				r->u.dst.dev ? r->u.dst.dev->name : "*",
 				(unsigned long)r->rt_dst,
 				(unsigned long)r->rt_gateway,
-				r->rt_flags, atomic_read(&r->u.dst.refcnt),
-				atomic_read(&r->u.dst.use), 0,
+				r->rt_flags,
+				atomic_read(&r->u.dst.use),
+				atomic_read(&r->u.dst.refcnt),
+				0,
 				(unsigned long)r->rt_src, (int)r->u.dst.pmtu,
 				r->u.dst.window,
 				(int)r->u.dst.rtt, r->key.tos,
@@ -202,8 +222,6 @@
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
 
-	start_bh_atomic();
-
 	for (i=0; i<RT_HASH_DIVISOR/5; i++) {
 		rover = (rover + 1) & (RT_HASH_DIVISOR-1);
 		rthp = &rt_hash_table[rover];
@@ -229,61 +247,24 @@
 			if (!rth_next)
 				break;
 
-			/*
-			 * Pseudo-LRU ordering.
-			 * Really we should teach it to move
-			 * rarely used but permanently living entries
-			 * (f.e. rdisc, igmp etc.) to the end of list.
-			 */
-
 			if ( rth_next->u.dst.lastuse - rth->u.dst.lastuse > RT_CACHE_BUBBLE_THRESHOLD ||
 			    (rth->u.dst.lastuse - rth_next->u.dst.lastuse < 0 &&
-			     atomic_read(&rth->u.dst.use) < atomic_read(&rth_next->u.dst.use))) {
+			     atomic_read(&rth->u.dst.refcnt) < atomic_read(&rth_next->u.dst.refcnt))) {
 #if RT_CACHE_DEBUG >= 2
 				printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst);
 #endif
 				*rthp = rth_next;
  				rth->u.rt_next = rth_next->u.rt_next;
 				rth_next->u.rt_next = rth;
-				sti();
 				rthp = &rth_next->u.rt_next;
 				continue;
 			}
 			rthp = &rth->u.rt_next;
 		}
 	}
-
-	end_bh_atomic();
 }
-  
-  
-void rt_cache_flush(int how)
-{
-	start_bh_atomic();
-	if (rt_flush_timer.expires) {
-		if (jiffies - rt_flush_timer.expires > 0 ||
-		    rt_flush_timer.expires - jiffies > RT_FLUSH_DELAY/2)
-			how = 1;
-	}
-	if (how) {
-		if (rt_flush_timer.expires)
-			del_timer(&rt_flush_timer);
-		rt_flush_timer.expires = 0;
-		end_bh_atomic();
-		rt_run_flush(0);
-		return;
-	}
-	if (rt_flush_timer.expires) {
-		end_bh_atomic();
-		return;
-	}
-	del_timer(&rt_flush_timer);
-	rt_flush_timer.expires = jiffies + RT_FLUSH_DELAY;
-	add_timer(&rt_flush_timer);
-	end_bh_atomic();
-}
-  
-void rt_run_flush(unsigned long dummy)
+
+static void rt_run_flush(unsigned long dummy)
 {
 	int i;
 	struct rtable * rth, * next;
@@ -313,6 +294,30 @@
 #endif
 	}
 }
+  
+void rt_cache_flush(int delay)
+{
+	start_bh_atomic();
+	if (delay && rt_flush_timer.function &&
+	    rt_flush_timer.expires - jiffies < delay) {
+		end_bh_atomic();
+		return;
+	}
+	if (rt_flush_timer.function) {
+		del_timer(&rt_flush_timer);
+		rt_flush_timer.function = NULL;
+	}
+	if (delay == 0) {
+		end_bh_atomic();
+		rt_run_flush(0);
+		return;
+	}
+	rt_flush_timer.function = rt_run_flush;
+	rt_flush_timer.expires = jiffies + delay;
+	add_timer(&rt_flush_timer);
+	end_bh_atomic();
+}
+
 
 static void rt_garbage_collect(void)
 {
@@ -327,7 +332,7 @@
 
 	/*
 	 * Garbage collection is pretty expensive,
-	 * do not make it too frequently.
+	 * do not run it too frequently, but just increase the expire strength.
 	 */
 	if (now - last_gc < 1*HZ) {
 		expire >>= 1;
@@ -342,7 +347,7 @@
 			continue;
 		for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next)	{
 			if (atomic_read(&rth->u.dst.use) ||
-			    (now - rth->u.dst.lastuse > expire))
+			    now - rth->u.dst.lastuse < expire)
 				continue;
 			atomic_dec(&rt_cache_size);
 			*rthp = rth->u.rt_next;
@@ -465,115 +470,94 @@
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
 		    u32 saddr, u8 tos, struct device *dev)
 {
-	int i;
-	int  off_link = 0;
-	struct fib_info *fi;
+	int i, k;
+	struct in_device *in_dev = dev->ip_ptr;
 	struct rtable *rth, **rthp;
-	u32  skeys[2] = { saddr, 0, };
-	struct device *pdev = net_alias_main_dev(dev);
+	u32  skeys[2] = { saddr, 0 };
+	int  ikeys[2] = { dev->ifindex, 0 };
 
 	tos &= IPTOS_TOS_MASK;
 
-	if (new_gw == old_gw || !ipv4_config.accept_redirects
+	if (!in_dev || new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
 		goto reject_redirect;
 
-	if ((new_gw^dev->pa_addr)&dev->pa_mask)
-		off_link = 1;
-
-	if (!ipv4_config.rfc1620_redirects) {
-		if (off_link)
+	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
+		if (ip_fib_check_default(new_gw, dev))
 			goto reject_redirect;
-		if (ipv4_config.secure_redirects && ip_fib_chk_default_gw(new_gw, dev))
+	} else {
+		if (inet_addr_type(new_gw) != RTN_UNICAST)
 			goto reject_redirect;
 	}
 
-	fi = fib_lookup_info(new_gw, 0, 0, &loopback_dev, NULL);
-	if (fi == NULL || fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_NAT))
-		goto reject_redirect;
-
 	for (i=0; i<2; i++) {
-		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+		for (k=0; k<2; k++) {
+			unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);
 
-		rthp=&rt_hash_table[hash];
+			rthp=&rt_hash_table[hash];
 
-		while ( (rth = *rthp) != NULL) {
-			struct rtable *rt;
+			while ( (rth = *rthp) != NULL) {
+				struct rtable *rt;
 
-			if (rth->key.dst != daddr ||
-			    rth->key.src != skeys[i] ||
-			    rth->key.tos != tos ||
-			    rth->key.dst_dev != NULL ||
-			    rth->key.src_dev != NULL) {
-				rthp = &rth->u.rt_next;
-				continue;
-			}
-
-			if (rth->rt_dst != daddr ||
-			    rth->rt_src != saddr ||
-			    rth->rt_flags&RTF_REJECT ||
-			    rth->rt_gateway != old_gw ||
-			    rth->u.dst.dev != dev)
-				break;
+				if (rth->key.dst != daddr ||
+				    rth->key.src != skeys[i] ||
+				    rth->key.tos != tos ||
+				    rth->key.oif != ikeys[k] ||
+				    rth->key.iif != 0) {
+					rthp = &rth->u.rt_next;
+					continue;
+				}
 
-			rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
-			if (rt == NULL)
-				return;
+				if (rth->rt_dst != daddr ||
+				    rth->rt_src != saddr ||
+				    rth->u.dst.error ||
+				    rth->rt_gateway != old_gw ||
+				    rth->u.dst.dev != dev)
+					break;
 
-			/*
-			 * Copy all the information.
-			 */
-			atomic_set(&rt->u.dst.refcnt, 1);
-			rt->u.dst.dev = dev;
-			rt->u.dst.input = rth->u.dst.input;
-			rt->u.dst.output = rth->u.dst.output;
-			rt->u.dst.pmtu = dev->mtu;
-			rt->u.dst.rtt = TCP_TIMEOUT_INIT;
-			rt->u.dst.window = 0;
-			atomic_set(&rt->u.dst.use, 1);
-			rt->u.dst.lastuse = jiffies;
-
-			rt->rt_flags = rth->rt_flags|RTF_DYNAMIC|RTF_MODIFIED;
-			rt->rt_flags &= ~RTF_GATEWAY;
-			if (new_gw != daddr)
-				rt->rt_flags |= RTF_GATEWAY;
-
-			rt->rt_src = rth->rt_src;
-			rt->rt_dst = rth->rt_dst;
-			rt->rt_src_dev = rth->rt_src_dev;
-			rt->rt_spec_dst = rth->rt_spec_dst;
-			rt->key = rth->key;
-
-			/* But gateway is different ... */
-			rt->rt_gateway = new_gw;
-
-			if (off_link) {
-				if (fi->fib_dev != dev &&
-				    net_alias_main_dev(fi->fib_dev) == pdev)
-					rt->u.dst.dev = fi->fib_dev;
-			}
+				rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+				if (rt == NULL)
+					return;
+
+				/*
+				 * Copy all the information.
+				 */
+				*rt = *rth;
+				atomic_set(&rt->u.dst.refcnt, 1);
+				atomic_set(&rt->u.dst.use, 1);
+				rt->u.dst.lastuse = jiffies;
+				rt->u.dst.neighbour = NULL;
+				rt->u.dst.hh = NULL;
+
+				rt->rt_flags |= RTCF_REDIRECTED;
+
+				/* Gateway is different ... */
+				rt->rt_gateway = new_gw;
+
+				if (!rt_ll_bind(rt)) {
+					ip_rt_put(rt);
+					rt_free(rt);
+					break;
+				}
 
-			if (ipv4_config.rfc1620_redirects && !rt_ll_bind(rt)) {
+				*rthp = rth->u.rt_next;
+				rt_free(rth);
+				rt = rt_intern_hash(hash, rt, ETH_P_IP);
 				ip_rt_put(rt);
-				rt_free(rt);
 				break;
 			}
-
-			*rthp = rth->u.rt_next;
-			rt_free(rth);
-			rt = rt_intern_hash(hash, rt, ETH_P_IP);
-			ip_rt_put(rt);
-			break;
 		}
 	}
 	return;
 
 reject_redirect:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
 	if (ipv4_config.log_martians && net_ratelimit())
 		printk(KERN_INFO "Redirect from %lX/%s to %lX ignored."
 		       "Path = %lX -> %lX, tos %02x\n",
 		       ntohl(old_gw), dev->name, ntohl(new_gw),
 		       ntohl(saddr), ntohl(daddr), tos);
+#endif
 }
 
 
@@ -585,7 +569,7 @@
 		return;
 
 	start_bh_atomic();
-	if ((rt = *rp) != NULL && (rt->rt_flags&(RTF_DYNAMIC|RTF_MODIFIED))) {
+	if ((rt = *rp) != NULL && (rt->rt_flags&RTCF_REDIRECTED)) {
 #if RT_CACHE_DEBUG >= 1
 		printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos);
 #endif
@@ -602,7 +586,7 @@
  *	1. The first RT_REDIRECT_NUMBER redirects are sent
  *	   with exponential backoff, then we stop sending them at all,
  *	   assuming that the host ignores our redirects.
- *	2. If we did not see a packets requiring redirects
+ *	2. If we did not see packets requiring redirects
  *	   during RT_REDIRECT_SILENCE, we assume that the host
  *	   forgot redirected route and start to send redirects again.
  *
@@ -637,9 +621,12 @@
 	if (jiffies - rt->last_error > (RT_REDIRECT_LOAD<<rt->errors)) {
 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 		rt->last_error = jiffies;
-		if (ipv4_config.log_martians && ++rt->errors == RT_REDIRECT_NUMBER && net_ratelimit())
-			printk(KERN_WARNING "host %08x/%s ignores redirects for %08x to %08x.\n",
-			       rt->rt_src, rt->rt_src_dev->name, rt->rt_dst, rt->rt_gateway);
+		++rt->errors;
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+		if (ipv4_config.log_martians && rt->errors == RT_REDIRECT_NUMBER && net_ratelimit())
+			printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
+			       rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
+#endif
 	}
 }
 
@@ -653,6 +640,9 @@
 	default:
 		kfree_skb(skb, FREE_READ);
 		return 0;
+	case EHOSTUNREACH:
+		code = ICMP_HOST_UNREACH;
+		break;
 	case ENETUNREACH:
 		code = ICMP_NET_UNREACH;
 		break;
@@ -668,37 +658,24 @@
 	return 0;
 } 
 
+/*
+ *	The last two values are not from the RFC but
+ *	are needed for AMPRnet AX.25 paths.
+ */
+
+static unsigned short mtu_plateau[] =
+{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
 
 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
 {
-	if (old_mtu > 32000)
-		return 32000;
-	else if (old_mtu > 17914)
-		return 17914;
-	else if (old_mtu > 8166)
-		return 8166;
-	else if (old_mtu > 4352)
-		return 4352;
-	else if (old_mtu > 2002)
-		return 2002;
-	else if (old_mtu > 1492)
-		return 1492;
-	else if (old_mtu > 576)
-		return 576;
-	else if (old_mtu > 296)
-		return 296;
-	/*
-	 *	These two are not from the RFC but
-	 *	are needed for AMPRnet AX.25 paths.
-	 */
-	else if (old_mtu > 216)
-		return 216;
-	else if (old_mtu > 128)
-		return 128;
+	int i;
+	
+	for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
+		if (old_mtu > mtu_plateau[i])
+			return mtu_plateau[i];
 	return 68;
 }
 
-
 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
 {
 	int i;
@@ -721,8 +698,8 @@
 			    rth->rt_dst == daddr &&
 			    rth->rt_src == iph->saddr &&
 			    rth->key.tos == tos &&
-			    !rth->key.src_dev &&
-			    !(rth->rt_flags&RTF_NOPMTUDISC)) {
+			    rth->key.iif == 0 &&
+			    !(rth->rt_flags&RTCF_NOPMTUDISC)) {
 				unsigned short mtu = new_mtu;
 
 				if (new_mtu < 68 || new_mtu >= old_mtu) {
@@ -770,177 +747,227 @@
 	return NULL;
 }
 
-int
-ip_check_mc(struct device *dev, u32 mc_addr)
+static int ip_rt_bug(struct sk_buff *skb)
 {
-	struct ip_mc_list *ip_mc;
+	printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
+	       skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
+	kfree_skb(skb, FREE_WRITE);
+	return 0;
+}
 
-	if (mc_addr==htonl(INADDR_ALLHOSTS_GROUP))
-		return 1;
+/*
+   We do not cache the source address of the outgoing interface,
+   because it is used only by the IP RR, TS and SRR options,
+   so it is off the fast path.
 
-	for (ip_mc=dev->ip_mc_list; ip_mc; ip_mc=ip_mc->next)
-		if (ip_mc->multiaddr == mc_addr)
-			return 1;
-	return 0;
+   BTW remember: "addr" may be unaligned inside IP options,
+   which is why it is only ever written with memcpy()!
+ */
+
+void ip_rt_get_source(u8 *addr, struct rtable *rt)
+{
+	u32 src;
+	struct fib_result res;
+
+	if (rt->key.iif == 0) {
+		memcpy(addr, &rt->rt_src, 4);
+		return;
+	}
+	if (fib_lookup(&rt->key, &res) == 0) {
+		src = FIB_RES_PREFSRC(res);
+		memcpy(addr, &src, 4);
+		return;
+	}
+	src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+	memcpy(addr, &src, 4);
 }
 
-static int ip_rt_bug(struct sk_buff *skb)
+static int
+ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
+		  u8 tos, struct device *dev, int our)
 {
-	kfree_skb(skb, FREE_WRITE);
-	printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
-	       skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
+	unsigned hash;
+	struct rtable *rth;
+	u32 spec_dst;
+	struct in_device *in_dev = dev->ip_ptr;
+
+	/* Primary sanity checks. */
+
+	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
+	    in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
+		return -EINVAL;
+
+	if (ZERONET(saddr)) {
+		if (!LOCAL_MCAST(daddr))
+			return -EINVAL;
+		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+	} else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst) < 0)
+		return -EINVAL;
+
+	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+	if (!rth)
+		return -ENOBUFS;
+
+	rth->u.dst.output= ip_rt_bug;
+
+	atomic_set(&rth->u.dst.use, 1);
+	rth->key.dst	= daddr;
+	rth->rt_dst	= daddr;
+	rth->key.tos	= tos;
+	rth->key.src	= saddr;
+	rth->rt_src	= saddr;
+#ifdef CONFIG_IP_ROUTE_NAT
+	rth->rt_dst_map	= daddr;
+	rth->rt_src_map	= saddr;
+#endif
+	rth->rt_iif	=
+	rth->key.iif	= dev->ifindex;
+	rth->u.dst.dev	= &loopback_dev;
+	rth->key.oif	= 0;
+	rth->rt_gateway	= daddr;
+	rth->rt_spec_dst= spec_dst;
+	rth->rt_type	= RTN_MULTICAST;
+	rth->rt_flags	= RTCF_MULTICAST;
+	if (our) {
+		rth->u.dst.input= ip_local_deliver;
+		rth->rt_flags |= RTCF_LOCAL;
+	}
+
+#ifdef CONFIG_IP_MROUTE
+	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+		rth->u.dst.input = ip_mr_input;
+#endif
+
+	hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
+	skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0);
 	return 0;
 }
 
 /*
- *	This function is called ONLY FROM NET BH. No locking!
- *
  *	NOTE. We drop all the packets that has local source
  *	addresses, because every properly looped back packet
  *	must have correct destination already attached by output routine.
  *
  *	Such approach solves two big problems:
- *	1. Not simplex devices (if they exist 8)) are handled properly.
+ *	1. Non-simplex devices are handled properly.
  *	2. IP spoofing attempts are filtered with 100% of guarantee.
  */
 
 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
-			u8 tos, struct device *pdev)
+			u8 tos, struct device *dev)
 {
-	struct device * dev = pdev;
-	struct fib_info *fi = NULL;
-	struct fib_info *src_fi = NULL;
+	struct rt_key	key;
+	struct fib_result res;
+	struct in_device *in_dev = dev->ip_ptr;
+	struct in_device *out_dev;
 	unsigned	flags = 0;
-	struct	device	*devout;
 	struct rtable * rth;
 	unsigned	hash;
-	struct fib_result res;
-	u32	src_key = saddr;
-	u32	dst_key = daddr;
-	int	err = -EINVAL;
-	int	log = 0;
+	u32		spec_dst;
+	int		err = -EINVAL;
 
-	hash = rt_hash_code(daddr, saddr^(unsigned long)pdev, tos);
+	/*
+	 *	IP on this device is disabled.
+	 */
+
+	if (!in_dev)
+		return -EINVAL;
+
+	key.dst = daddr;
+	key.src = saddr;
+	key.tos = tos;
+	key.iif = dev->ifindex;
+	key.oif = 0;
+	key.scope = RT_SCOPE_UNIVERSE;
 
-	/*	Check for martians... */
+	hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);
+
+	/* Check for the weirdest martians, which may not be detected
+	   by fib_lookup.
+	 */
 
 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
 		goto martian_source;
-	if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
-		goto mc_input;
 
-	/* Accept zero addresses only to limited broadcast/multicasts;
-	 * I even do not know to fix it or not.
+	if (daddr == 0xFFFFFFFF)
+		goto brd_input;
+
+	/* Accept zero source addresses only for limited broadcast;
+	 * I do not even know whether to fix it or not. Waiting for complaints :-)
 	 */
 	if (ZERONET(saddr))
 		goto martian_source;
+
 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
 		goto martian_destination;
 
 	/*
-	 * Device is not yet initialized, accept all addresses as ours.
+	 *	Now we are ready to route packet.
 	 */
-	if (ZERONET(dev->pa_addr))
-		goto promisc_ip;
-
-	/*
-	 *	Now we are able to route packet.
-	 */
-	if ((err = fib_lookup(&res, daddr, saddr, tos, pdev, NULL)) < 0) {
-		if (!IS_ROUTER)
+	if ((err = fib_lookup(&key, &res))) {
+		if (!IN_DEV_FORWARD(in_dev))
 			return -EINVAL;
 		goto no_route;
 	}
 
-	fi = res.f->fib_info;
-	flags  = fi->fib_flags;
-	devout = fi->fib_dev;
-
-	if (flags&RTF_NAT) {
-		daddr = htonl((ntohl(daddr)&((1<<res.fm)-1)))|fi->fib_gateway;
-		fi = fib_lookup_info(daddr, saddr, tos, pdev, NULL);
-		if (!fi || fi->fib_flags&(RTF_NAT|RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST))
-			return -EINVAL;
-		devout = fi->fib_dev;
-		flags = fi->fib_flags|RTCF_NAT|RTF_NAT;
-	}
-
-	switch (res.fr->cl_action) {
-	case RTP_NAT:
-		/* Packet is from  translated source; remember it */
-		saddr = (saddr&~res.fr->cl_srcmask)|res.fr->cl_srcmap;
-		flags |= RTCF_NAT;
-		break;
-	case RTP_MASQUERADE:
-		/* Packet is from masqueraded source; remember it */
-		flags |= RTCF_MASQ;
-		break;
-	default:
-	}
-	log = res.fr->cl_flags&RTRF_LOG;
+#ifdef CONFIG_IP_ROUTE_NAT
+	/* Policy is applied before mapping the destination,
+	   but rerouting after the mapping should use the old source.
+	 */
 
-	if (!(flags & RTF_LOCAL)) {
-		if (!IS_ROUTER || flags&RTF_NOFORWARD)
-			return -EINVAL;
-	} else {
-		fi = NULL;
-		devout = &loopback_dev;
-		if (flags&RTF_BROADCAST)
-		    goto mc_input;
+	if (1) {
+		u32 src_map = saddr;
+		if (res.r)
+			src_map = fib_rules_policy(saddr, &res, &flags);
+
+		if (res.type == RTN_NAT) {
+			key.dst = fib_rules_map_destination(daddr, &res);
+			if (fib_lookup(&key, &res) || res.type != RTN_UNICAST)
+				return -EINVAL;
+			flags |= RTCF_DNAT;
+		}
+		key.src = src_map;
 	}
-
-#ifndef CONFIG_IP_LOCAL_RT_POLICY
-	if (flags&RTF_LOCAL)
-		src_fi = fib_lookup_info(src_key, 0, tos, &loopback_dev, NULL);
-	else
 #endif
-	if (fib_lookup(&res, src_key, daddr, tos, net_alias_main_dev(devout), NULL) == 0) {
-		src_fi = res.f->fib_info;
-		/* Destination is on masqueraded network:
-		 * if it is real incoming frame, ip_forward will drop it.
-		 */
-		if (res.fr->cl_flags&RTRF_VALVE)
-			flags |= RTCF_VALVE;
-	}
 
-        if (src_fi) {
-		if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT))
+	if (res.type == RTN_BROADCAST)
+		goto brd_input;
+
+	if (res.type == RTN_LOCAL) {
+		spec_dst = daddr;
+		if (inet_addr_type(saddr) != RTN_UNICAST)
 			goto martian_source;
+		goto local_input;
+	}
 
-		if (!(src_fi->fib_flags&RTF_GATEWAY))
-			flags |= RTCF_DIRECTSRC;
+	if (!IN_DEV_FORWARD(in_dev))
+		return -EINVAL;
+	if (res.type != RTN_UNICAST)
+		goto martian_destination;
 
-		if (net_alias_main_dev(src_fi->fib_dev) == pdev)
-			skb->dev = dev = src_fi->fib_dev;
-		else {
-			/* Route to packet source goes via
-			   different interface; rfc1812 proposes
-			   to drop them.
-			   It is dangerous on not-stub/transit networks
-			   because of path asymmetry.
-			 */
-			if (ipv4_config.rfc1812_filter >= 2)
-				goto martian_source;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (res.fi->fib_nhs > 1 && key.oif == 0)
+		fib_select_multipath(&key, &res);
+#endif
+	out_dev = FIB_RES_DEV(res)->ip_ptr;
 
-			/* Weaker form of rfc1812 filtering.
-			   If source is on directly connected network,
-			   it can mean either local network configuration error
-			   (the most probable case) or real IP spoofing attempt.
-			 */
-			if (ipv4_config.rfc1812_filter >= 1 && !(flags&RTCF_DIRECTSRC))
-				goto martian_source;
-		}
-	} else if (ipv4_config.rfc1812_filter >= 1)
+	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst);
+	if (err < 0)
 		goto martian_source;
 
-make_route:
+	if (err)
+		flags |= RTCF_DIRECTSRC;
+
+	if (out_dev == in_dev && err && !(flags&RTCF_NAT) &&
+	    (IN_DEV_SHARED_MEDIA(out_dev)
+	     || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
+		flags |= RTCF_DOREDIRECT;
+
 	if (skb->protocol != __constant_htons(ETH_P_IP)) {
-		/* ARP request. Do not make route for invalid destination or
-		 * if it is redirected.
+		/* Not IP (i.e. ARP). Do not make route for invalid
+		 * destination or if it is redirected.
 		 */
-		if (flags&(RTF_REJECT|RTF_BROADCAST|RTF_MULTICAST) ||
-		    skb->pkt_type == PACKET_OTHERHOST ||
-		    (devout == dev && !(flags&(RTF_LOCAL|RTCF_NAT))))
+		if (out_dev == in_dev && flags&RTCF_DOREDIRECT)
 			return -EINVAL;
 	}
 
@@ -948,147 +975,105 @@
 	if (!rth)
 		return -ENOBUFS;
 
-	rth->u.dst.output= ip_rt_bug;
-
 	atomic_set(&rth->u.dst.use, 1);
-	rth->key.dst	= dst_key;
-	rth->rt_dst	= dst_key;
-	rth->rt_dst_map	= daddr;
+	rth->key.dst	= daddr;
+	rth->rt_dst	= daddr;
 	rth->key.tos	= tos;
-	rth->key.src	= src_key;
-	rth->rt_src	= src_key;
-	rth->rt_src_map	= saddr;
-	rth->rt_src_dev = dev;
-	rth->key.src_dev= pdev;
-	rth->u.dst.dev	= devout;
-	rth->key.dst_dev= NULL;
+	rth->key.src	= saddr;
+	rth->rt_src	= saddr;
 	rth->rt_gateway	= daddr;
-	rth->rt_spec_dst= daddr;
-
-	if (!(flags&RTF_REJECT)) {
-		if (flags&RTF_LOCAL)
-			rth->u.dst.input= ip_local_deliver;
-		if (!(flags&(RTF_NOFORWARD|RTF_BROADCAST))) {
-			if (flags&RTF_MULTICAST) {
-#ifdef CONFIG_IP_MROUTE
-				if (!LOCAL_MCAST(daddr) && ipv4_config.multicast_route) {
-					rth->u.dst.input = ip_mr_input;
-					rth->u.dst.output = ip_output;
-				}
+#ifdef CONFIG_IP_ROUTE_NAT
+	rth->rt_src_map	= key.src;
+	rth->rt_dst_map	= key.dst;
+	if (flags&RTCF_DNAT)
+		rth->rt_gateway	= key.dst;
 #endif
-			} else if (!(flags&RTF_LOCAL)) {
-				rth->u.dst.input = ip_forward;
-				rth->u.dst.output = ip_output;
-			}
-		}
-	} else if (IS_ROUTER && !(flags&(RTF_MULTICAST|RTF_BROADCAST))) {
-		rth->u.dst.input= ip_error;
-		rth->u.dst.error= -err;
-	}
-
-	if ((flags&(RTF_BROADCAST|RTF_MULTICAST)) || !(flags&RTF_LOCAL))
-		rth->rt_spec_dst= dev->pa_addr;
-
-	if (fi) {
-		rth->u.dst.pmtu	= fi->fib_mtu;
-		rth->u.dst.window=fi->fib_window;
-		rth->u.dst.rtt	= fi->fib_irtt;
-		if (flags & RTF_GATEWAY)
-			rth->rt_gateway	= fi->fib_gateway;
-	} else {
-		rth->u.dst.pmtu	= devout->mtu;
-		rth->u.dst.window=0;
-		rth->u.dst.rtt	= TCP_TIMEOUT_INIT;
-	}
-
-	if (!(flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTCF_NAT)) &&
-	    flags&RTCF_DIRECTSRC &&
-	    (devout == dev || (ipv4_config.rfc1620_redirects &&
-			       net_alias_main_dev(devout) == pdev)))
-		flags |= RTCF_DOREDIRECT;
+	rth->rt_iif 	=
+	rth->key.iif	= dev->ifindex;
+	rth->u.dst.dev	= out_dev->dev;
+	rth->key.oif 	= 0;
+	rth->rt_spec_dst= spec_dst;
+
+	rth->u.dst.input = ip_forward;
+	rth->u.dst.output = ip_output;
+
+	rth->u.dst.pmtu	= res.fi->fib_mtu ? : out_dev->dev->mtu;
+	rth->u.dst.window=res.fi->fib_window ? : 0;
+	rth->u.dst.rtt	= res.fi->fib_rtt ? : TCP_TIMEOUT_INIT;
+	if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK)
+		rth->rt_gateway	= FIB_RES_GW(res);
 
 	rth->rt_flags = flags;
+	rth->rt_type = res.type;
 
-	if (log)
-		printk(KERN_INFO "installing route %08lX -> %08lX\n", ntohl(rth->rt_src), ntohl(rth->rt_dst));
-
-	if (flags&(RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST|RTF_REJECT)) {
-		skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0);
-		return 0;
-	}
-	skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, __constant_ntohs(skb->protocol));
+	skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, ntohs(skb->protocol));
 	return 0;
 
-mc_input:
+brd_input:
 	if (skb->protocol != __constant_htons(ETH_P_IP))
 		return -EINVAL;
 
 	if (ZERONET(saddr)) {
-		if (!ipv4_config.bootp_agent)
-			goto martian_source;
-		flags |= RTF_NOFORWARD|RTF_LOCAL;
+		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
 	} else {
-		src_fi = fib_lookup_info(saddr, 0, tos, &loopback_dev, NULL);
-		if (!src_fi)
+		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst);
+		if (err < 0)
 			goto martian_source;
-
-		if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT))
-			goto martian_source;
-
-		if (!(src_fi->fib_flags&RTF_GATEWAY))
+		if (err)
 			flags |= RTCF_DIRECTSRC;
-
-		if (!MULTICAST(daddr) || !ipv4_config.multicast_route ||
-		    LOCAL_MCAST(daddr)) {
-			if (net_alias_main_dev(src_fi->fib_dev) == pdev) {
-				skb->dev = dev = src_fi->fib_dev;
-			} else {
-				/* Fascist not-unicast filtering 8) */
-				goto martian_source;
-			}
-		}
 	}
+	flags |= RTCF_BROADCAST;
 
-	if (!MULTICAST(daddr)) {
-		flags |= RTF_LOCAL|RTF_BROADCAST|RTF_NOFORWARD;
-		devout = dev;
-		goto make_route;
-	}
-
-	flags |= RTF_MULTICAST|RTF_LOCAL;
+local_input:
+	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+	if (!rth)
+		return -ENOBUFS;
 
-	if (ip_check_mc(dev, daddr) == 0) {
-		flags &= ~RTF_LOCAL;
+	rth->u.dst.output= ip_rt_bug;
 
-		if (!ipv4_config.multicast_route || !(dev->flags&IFF_ALLMULTI))
-			goto no_route;
+	atomic_set(&rth->u.dst.use, 1);
+	rth->key.dst	= daddr;
+	rth->rt_dst	= daddr;
+	rth->key.tos	= tos;
+	rth->key.src	= saddr;
+	rth->rt_src	= saddr;
+#ifdef CONFIG_IP_ROUTE_NAT
+	rth->rt_dst_map	= key.dst;
+	rth->rt_src_map	= key.src;
+#endif
+	rth->rt_iif	=
+	rth->key.iif	= dev->ifindex;
+	rth->u.dst.dev	= &loopback_dev;
+	rth->key.oif 	= 0;
+	rth->rt_gateway	= daddr;
+	rth->rt_spec_dst= spec_dst;
+	rth->u.dst.input= ip_local_deliver;
+	if (res.type == RTN_UNREACHABLE) {
+		rth->u.dst.input= ip_error;
+		rth->u.dst.error= -err;	/* store positive errno, as the code this replaces did */
+	}
-	devout = dev;
-	goto make_route;
-
-promisc_ip:
-	flags |= RTF_LOCAL|RTF_NOFORWARD;
-	if (MULTICAST(daddr))
-		flags |= RTF_MULTICAST;
-	else
-		flags |= RTF_BROADCAST;
-	devout = dev;
-	goto make_route;
+	rth->rt_flags 	= flags|RTCF_LOCAL;
+	rth->rt_type	= res.type;
+	skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0);
+	return 0;
 
 no_route:
-	flags |= RTF_REJECT;
-	devout = dev;
-	goto make_route;
+	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	res.type = RTN_UNREACHABLE;
+	goto local_input;
 
 	/*
 	 *	Do not cache martian addresses: they should be logged (RFC1812)
 	 */
 martian_destination:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
 	if (ipv4_config.log_martians && net_ratelimit())
 		printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
+#endif
 	return -EINVAL;
 
 martian_source:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
 	if (ipv4_config.log_martians && net_ratelimit()) {
 		/*
 		 *	RFC1812 recommenadtion, if source is martian,
@@ -1104,6 +1089,7 @@
 			printk("\n");
 		}
 	}
+#endif
 	return -EINVAL;
 }
 
@@ -1112,224 +1098,298 @@
 {
 	struct rtable * rth;
 	unsigned	hash;
-
-	if (skb->dst)
-		return 0;
-
-#if RT_CACHE_DEBUG >= 1
-	if (dev->flags & IFF_LOOPBACK) {
-		printk(KERN_DEBUG "ip_route_input: bug: packet is looped back\n");
-		return -EINVAL;
-	}
-	if (net_alias_main_dev(dev) != dev)
-		printk(KERN_DEBUG "ip_route_input: bug: packet is received on alias %s\n", dev->name);
-#endif
+	int iif = dev->ifindex;
 
 	tos &= IPTOS_TOS_MASK;
-	hash = rt_hash_code(daddr, saddr^(unsigned long)dev, tos);
-	skb->dev = dev;
+	hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
 
 	for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
 		if (rth->key.dst == daddr &&
 		    rth->key.src == saddr &&
-		    rth->key.src_dev == dev &&
-		    rth->key.dst_dev == NULL &&
+		    rth->key.iif == iif &&
+		    rth->key.oif == 0 &&
 		    rth->key.tos == tos) {
 			rth->u.dst.lastuse = jiffies;
 			atomic_inc(&rth->u.dst.use);
 			atomic_inc(&rth->u.dst.refcnt);
 			skb->dst = (struct dst_entry*)rth;
-			skb->dev = rth->rt_src_dev;
 			return 0;
 		}
 	}
+
+	/* Multicast recognition logic was moved from the route cache to here.
+	   The problem was that too many ethernet cards have broken/missing
+	   hardware multicast filters :-( As a result, a host on a multicast
+	   network acquires a lot of useless route cache entries, such as for
+	   SDR messages from all over the world. Now we try to get rid of them.
+	   Really, provided the software IP multicast filter is organized
+	   reasonably (at least, hashed), it does not result in a slowdown
+	   compared with route cache reject entries.
+	   Note that multicast routers are not affected, because
+	   a route cache entry is created eventually.
+	 */
+	if (MULTICAST(daddr)) {
+		int our = ip_check_mc(dev, daddr);
+		if (!our
+#ifdef CONFIG_IP_MROUTE
+		    && (LOCAL_MCAST(daddr) || !dev->ip_ptr ||
+			!IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr))
+#endif
+		    ) return -EINVAL;
+		return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
+	}
 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
 }
 
-
 /*
  * Major route resolver routine.
  */
 
-int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos,
-			 struct device *dev_out)
+int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif)
 {
-	u32 src_key = saddr;
-	u32 dst_key = daddr;
-	u32 dst_map;
-	struct device *dst_dev_key = dev_out;
+	struct rt_key key;
+	struct fib_result res;
 	unsigned flags = 0;
-	struct fib_info *fi = NULL;
 	struct rtable *rth;
-#ifdef CONFIG_IP_LOCAL_RT_POLICY
-	struct fib_result res;
-#endif
+	struct device *dev_out = NULL;
 	unsigned hash;
 
 	tos &= IPTOS_TOS_MASK|1;
+	key.dst = daddr;
+	key.src = saddr;
+	key.tos = tos&IPTOS_TOS_MASK;
+	key.iif = loopback_dev.ifindex;
+	key.oif = oif;
+	key.scope = (tos&1) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
+	res.fi = NULL;
 
 	if (saddr) {
-		if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr) ||
-		    __ip_chk_addr(saddr) != IS_MYADDR)
+		if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
 			return -EINVAL;
-		if (dev_out == NULL && (MULTICAST(daddr) || daddr == 0xFFFFFFFF))
-			dev_out = ip_dev_find(saddr, NULL);
+
+		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+		dev_out = ip_dev_find(saddr);
+		if (dev_out == NULL)
+			return -EINVAL;
+
+		/* I removed the check for oif == dev_out->ifindex here.
+		   It was wrong for two reasons:
+		   1. ip_dev_find(saddr) can return the wrong iface, if saddr is
+		      assigned to multiple interfaces.
+		   2. Moreover, we are allowed to send packets with the saddr
+		      of another iface. --ANK
+		 */
+
+		if (oif == 0 && (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
+			/* Special hack: user can direct multicasts
+			   and limited broadcast via necessary interface
+			   without fiddling with IP_MULTICAST_IF or IP_TXINFO.
+			   This hack is not just for fun, it allows
+			   vic,vat and friends to work.
+			   They bind socket to loopback, set ttl to zero
+			   and expect that it will work.
+			   From the viewpoint of routing cache they are broken,
+			   because we are not allowed to build multicast path
+			   with loopback source addr (look, routing cache
+			   cannot know, that ttl is zero, so that packet
+			   will not leave this host and route is valid).
+			   Luckily, this hack is good workaround.
+			 */
+
+			key.oif = dev_out->ifindex;
+			goto make_route;
+		}
+		dev_out = NULL;
 	}
-	if (!daddr)
-		daddr = saddr;
+	if (oif) {
+		dev_out = dev_get_by_index(oif);
+		if (dev_out == NULL)
+			return -ENODEV;
+		if (dev_out->ip_ptr == NULL)
+			return -ENODEV;	/* Wrong error code */
 
-	if (dev_out) {
-		if (!saddr) {
-			saddr = dev_out->pa_addr;
-			if (!daddr)
-				daddr = saddr;
+		if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
+			key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
+			goto make_route;
 		}
-		dst_map = daddr;
-		if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
+		if (MULTICAST(daddr)) {
+			key.src = inet_select_addr(dev_out, 0, key.scope);
 			goto make_route;
+		}
+		if (!daddr)
+			key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
+	}
+
+	if (!key.dst) {
+		key.dst = key.src;
+		if (!key.dst)
+			key.dst = key.src = htonl(INADDR_LOOPBACK);
+		dev_out = &loopback_dev;
+		key.oif = loopback_dev.ifindex;
+		flags |= RTCF_LOCAL;
+		goto make_route;
 	}
 
-	if (!daddr)
-		daddr = htonl(INADDR_LOOPBACK);
+	if (fib_lookup(&key, &res)) {
+		res.fi = NULL;
+		if (oif) {
+			/* Apparently, routing tables are wrong. Assume,
+			   that the destination is on link.
+
+			   WHY? DW.
+			   Because we are allowed to send to iface
+			   even if it has NO routes and NO assigned
+			   addresses. When oif is specified, routing
+			   tables are looked up with only one purpose:
+			   to catch if destination is gatewayed, rather than
+			   direct. Moreover, if MSG_DONTROUTE is set,
+			   we send the packet regardless of routing tables
+			   or ifaddr state. --ANK
 
-#ifdef CONFIG_IP_LOCAL_RT_POLICY
-	if (fib_lookup(&res, daddr, saddr, tos, &loopback_dev, dev_out))
+
+			   We could make it even if oif is unknown,
+			   likely IPv6, but we do not.
+			 */
+
+			printk(KERN_DEBUG "Dest not on link. Forcing...\n");
+			if (key.src == 0)
+				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
+			goto make_route;
+		}
 		return -ENETUNREACH;
-	fi = res.f->fib_info;
-	dst_map = daddr;
+	}
 
-	if (fi->fib_flags&RTF_NAT)
+	if (res.type == RTN_NAT)
 		return -EINVAL;
 
-	if (!saddr) {
-		saddr = fi->fib_dev->pa_addr;
 
+	if (!key.src) {
+		key.src = FIB_RES_PREFSRC(res);
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
 		/*
 		 * "Stabilization" of route.
 		 * This step is necessary, if locally originated packets
-		 * are subjected to source routing, else we could get
+		 * are subjected to policy routing, otherwise we could get
 		 * route flapping.
 		 */
-		fi = fib_lookup_info(dst_map, saddr, tos, &loopback_dev, dev_out);
-		if (!fi)
+		if (fib_lookup(&key, &res))
 			return -ENETUNREACH;
+#endif
 	}
-#else
-	fi = fib_lookup_info(daddr, 0, tos, &loopback_dev, dev_out);
-	if (!fi)
-		return -ENETUNREACH;
-
-	if (fi->fib_flags&RTF_NAT)
-		return -EINVAL;
 
-	dst_map = daddr;
-	if (!saddr)
-		saddr = fi->fib_dev->pa_addr;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (res.fi->fib_nhs > 1 && key.oif == 0)
+		fib_select_multipath(&key, &res);
 #endif
 
-	flags |= fi->fib_flags;
-	dev_out = fi->fib_dev;
+	dev_out = FIB_RES_DEV(res);
 
-	if (RT_LOCALADDR(flags)) {
+	if (res.type == RTN_LOCAL) {
 		dev_out = &loopback_dev;
-		fi = NULL;
+		key.oif = dev_out->ifindex;
+		res.fi = NULL;
+		flags |= RTCF_LOCAL;
 	}
 
-	if (dst_dev_key && dev_out != dst_dev_key)
-		return -EINVAL;
+	key.oif = dev_out->ifindex;
 
 make_route:
-	if (LOOPBACK(saddr) && !(dev_out->flags&IFF_LOOPBACK)) {
-		printk(KERN_DEBUG "this guy talks to %08x from loopback\n", daddr);
+	if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) {
+		printk(KERN_DEBUG "this guy talks to %08x from loopback\n", key.dst);
 		return -EINVAL;
 	}
 
-	if (daddr == 0xFFFFFFFF)
-		flags |= RTF_BROADCAST;
-	else if (MULTICAST(daddr))
-		flags |= RTF_MULTICAST;
-	else if (BADCLASS(daddr) || ZERONET(daddr))
+	if (key.dst == 0xFFFFFFFF)
+		res.type = RTN_BROADCAST;
+	else if (MULTICAST(key.dst))
+		res.type = RTN_MULTICAST;
+	else if (BADCLASS(key.dst) || ZERONET(key.dst))
 		return -EINVAL;
 
-	if (flags&RTF_BROADCAST && (dev_out->flags&IFF_LOOPBACK ||
-	    !(dev_out->flags&IFF_BROADCAST)))
-		flags &= ~RTF_LOCAL;
-	else if (flags&RTF_MULTICAST) {
+	if (res.type == RTN_BROADCAST) {
+		flags |= RTCF_BROADCAST;
+		if (!(dev_out->flags&IFF_LOOPBACK) && dev_out->flags&IFF_BROADCAST)
+			flags |= RTCF_LOCAL;
+	} else if (res.type == RTN_MULTICAST) {
+		flags |= RTCF_MULTICAST;
 		if (ip_check_mc(dev_out, daddr))
-			flags |= RTF_LOCAL;
+			flags |= RTCF_LOCAL;
 	}
-	
+
 	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
 	if (!rth)
 		return -ENOBUFS;
 
 	atomic_set(&rth->u.dst.use, 1);
-	rth->key.dst	= dst_key;
+	rth->key.dst	= daddr;
 	rth->key.tos	= tos;
-	rth->key.src	= src_key;
-	rth->key.src_dev= NULL;
-	rth->key.dst_dev= dst_dev_key;
-	rth->rt_dst	= daddr;
-	rth->rt_dst_map	= dst_map;
-	rth->rt_src	= saddr;
-	rth->rt_src_map	= saddr;
-	rth->rt_src_dev = dev_out;
+	rth->key.src	= saddr;
+	rth->key.iif	= 0;
+	rth->key.oif	= oif;
+	rth->rt_dst	= key.dst;
+	rth->rt_src	= key.src;
+#ifdef CONFIG_IP_ROUTE_NAT
+	rth->rt_dst_map	= key.dst;
+	rth->rt_src_map	= key.src;
+#endif
+	rth->rt_iif	= dev_out->ifindex;
 	rth->u.dst.dev	= dev_out;
-	rth->rt_gateway = dst_map;
-	rth->rt_spec_dst= dev_out->pa_addr;
+	rth->rt_gateway = key.dst;
+	rth->rt_spec_dst= key.src;
 
 	rth->u.dst.output=ip_output;
 
-	if (flags&RTF_LOCAL) {
+	if (flags&RTCF_LOCAL) {
 		rth->u.dst.input = ip_local_deliver;
-		rth->rt_spec_dst = daddr;
+		rth->rt_spec_dst = key.dst;
 	}
-	if (flags&(RTF_BROADCAST|RTF_MULTICAST)) {
-		rth->rt_spec_dst = dev_out->pa_addr;
-		flags &= ~RTF_GATEWAY;
-		if (flags&RTF_LOCAL)
+	if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
+		rth->rt_spec_dst = key.src;
+		if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
 			rth->u.dst.output = ip_mc_output;
-		if (flags&RTF_MULTICAST) {
-			if (dev_out->flags&IFF_ALLMULTI)
-				rth->u.dst.output = ip_mc_output;
 #ifdef CONFIG_IP_MROUTE
-			if (ipv4_config.multicast_route && !LOCAL_MCAST(daddr))
+		if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
+			struct in_device *in_dev = dev_out->ip_ptr;
+			if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
 				rth->u.dst.input = ip_mr_input;
-#endif
+				rth->u.dst.output = ip_mc_output;
+			}
 		}
+#endif
 	}
 
-	if (fi) {
-		if (flags&RTF_GATEWAY)
-			rth->rt_gateway = fi->fib_gateway;
-		rth->u.dst.pmtu	= fi->fib_mtu;
-		rth->u.dst.window=fi->fib_window;
-		rth->u.dst.rtt	= fi->fib_irtt;
+	if (res.fi) {
+		if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK)
+			rth->rt_gateway = FIB_RES_GW(res);
+		rth->u.dst.pmtu	= res.fi->fib_mtu ? : dev_out->mtu;
+		rth->u.dst.window=res.fi->fib_window ? : 0;
+		rth->u.dst.rtt	= res.fi->fib_rtt ? : TCP_TIMEOUT_INIT;
 	} else {
 		rth->u.dst.pmtu	= dev_out->mtu;
 		rth->u.dst.window=0;
 		rth->u.dst.rtt	= TCP_TIMEOUT_INIT;
 	}
 	rth->rt_flags = flags;
-	hash = rt_hash_code(dst_key, dst_dev_key ? src_key^(dst_dev_key->ifindex<<5) : src_key, tos);
+        rth->rt_type = res.type;
+	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
 	*rp = rt_intern_hash(hash, rth, ETH_P_IP);
 	return 0;
 }
 
-int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, struct device *dev_out)
+int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif)
 {
 	unsigned hash;
 	struct rtable *rth;
 
-	hash = rt_hash_code(daddr, dev_out ? saddr^(dev_out->ifindex<<5)
-			                   : saddr, tos);
+	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
 
 	start_bh_atomic();
 	for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
 		if (rth->key.dst == daddr &&
 		    rth->key.src == saddr &&
-		    rth->key.src_dev == NULL &&
-		    rth->key.dst_dev == dev_out &&
+		    rth->key.iif == 0 &&
+		    rth->key.oif == oif &&
 		    rth->key.tos == tos) {
 			rth->u.dst.lastuse = jiffies;
 			atomic_inc(&rth->u.dst.use);
@@ -1341,48 +1401,126 @@
 	}
 	end_bh_atomic();
 
-	return ip_route_output_slow(rp, daddr, saddr, tos, dev_out);
+	return ip_route_output_slow(rp, daddr, saddr, tos, oif);
 }
 
-int ip_route_output_dev(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int ifindex)
+#ifdef CONFIG_RTNETLINK
+
+int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 {
-	unsigned hash;
-	struct rtable *rth;
-	struct device *dev_out;
+	struct kern_rta *rta = arg;
+	struct rtmsg *rtm = NLMSG_DATA(nlh);
+	struct rtable *rt = NULL;
+	u32 dst = 0;
+	u32 src = 0;
+	int err;
+	struct sk_buff *skb;
+	u8  *o;
 
-	hash = rt_hash_code(daddr, saddr^(ifindex<<5), tos);
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		return -ENOBUFS;
 
-	start_bh_atomic();
-	for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
-		if (rth->key.dst == daddr &&
-		    rth->key.src == saddr &&
-		    rth->key.src_dev == NULL &&
-		    rth->key.tos == tos &&
-		    rth->key.dst_dev &&
-		    rth->key.dst_dev->ifindex == ifindex) {
-			rth->u.dst.lastuse = jiffies;
-			atomic_inc(&rth->u.dst.use);
-			atomic_inc(&rth->u.dst.refcnt);
-			end_bh_atomic();
-			*rp = rth;
-			return 0;
+	/* Reserve room for dummy headers; this skb can pass
+	   through a good chunk of the routing engine.
+	 */
+	skb->mac.raw = skb->data;
+	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+
+	if (rta->rta_dst)
+		memcpy(&dst, rta->rta_dst, 4);
+	if (rta->rta_src)
+		memcpy(&src, rta->rta_src, 4);
+
+	if (rta->rta_iif) {
+		struct device *dev;
+		dev = dev_get_by_index(*rta->rta_iif);
+		if (!dev)
+			return -ENODEV;
+		skb->protocol = __constant_htons(ETH_P_IP);
+		skb->dev = dev;
+		start_bh_atomic();
+		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+		end_bh_atomic();
+		rt = (struct rtable*)skb->dst;
+		if (!err && rt->u.dst.error)
+			err = rt->u.dst.error;
+	} else {
+		err = ip_route_output(&rt, dst, src, rtm->rtm_tos,
+				      rta->rta_oif ? *rta->rta_oif : 0);
+	}
+	if (err) {
+		kfree_skb(skb, FREE_WRITE);
+		return err;
+	}
+
+	skb->dst = &rt->u.dst;
+	if (rtm->rtm_flags & RTM_F_NOTIFY)
+		rt->rt_flags |= RTCF_NOTIFY;
+
+	nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+			RTM_NEWROUTE, sizeof(*rtm));
+	rtm = NLMSG_DATA(nlh);
+	nlh->nlmsg_flags = 0;
+	rtm->rtm_family = AF_INET;
+	rtm->rtm_dst_len = 32;
+	rtm->rtm_src_len = 32;
+	rtm->rtm_tos = rt->key.tos;
+	rtm->rtm_table = RT_TABLE_MAIN;
+	rtm->rtm_type = rt->rt_type;
+	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+	rtm->rtm_protocol = RTPROT_UNSPEC;
+	rtm->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
+	rtm->rtm_nhs = 0;
+
+	o = skb->tail;
+	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
+	RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src);
+	if (rt->u.dst.dev)
+		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+	if (rt->rt_dst != rt->rt_gateway)
+		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
+	RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
+	RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window);
+	RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt);
+	RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
+	rtm->rtm_optlen = skb->tail - o;
+	if (rta->rta_iif) {
+#ifdef CONFIG_IP_MROUTE
+		if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_config.multicast_route) {
+			NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid;
+			err = ipmr_get_route(skb, rtm);
+			if (err <= 0)
+				return err;
+		} else
+#endif
+		{
+			RTA_PUT(skb, RTA_IIF, 4, rta->rta_iif);
+			rtm->rtm_optlen = skb->tail - o;
 		}
 	}
-	end_bh_atomic();
+	nlh->nlmsg_len = skb->tail - (u8*)nlh;
+	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
+	if (err < 0)
+		return err;
+	return 0;
 
-	dev_out = dev_get_by_index(ifindex);
-	if (!dev_out)
-		return -ENODEV;
-	return ip_route_output_slow(rp, daddr, saddr, tos, dev_out);
+nlmsg_failure:
+rtattr_failure:
+	kfree_skb(skb, FREE_WRITE);
+	return -EMSGSIZE;
 }
 
-void ip_rt_multicast_event(struct device *dev)
+#endif /* CONFIG_RTNETLINK */
+
+void ip_rt_multicast_event(struct in_device *in_dev)
 {
-	rt_cache_flush(0);
+	rt_cache_flush(1*HZ);
 }
 
 __initfunc(void ip_rt_init(void))
 {
+	devinet_init();
 	ip_fib_init();
 
 #ifdef CONFIG_PROC_FS

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov