xref: /OK3568_Linux_fs/kernel/net/ipv4/route.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113 
114 #include "fib_lookup.h"
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly	= 256;
130 
131 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
132 
133 /*
134  *	Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void		 ipv4_link_failure(struct sk_buff *skb);
142 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 					   struct sk_buff *skb, u32 mtu,
144 					   bool confirm_neigh);
145 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 					struct sk_buff *skb);
147 static void		ipv4_dst_destroy(struct dst_entry *dst);
148 
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 	WARN_ON(1);
152 	return NULL;
153 }
154 
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 					   struct sk_buff *skb,
157 					   const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159 
160 static struct dst_ops ipv4_dst_ops = {
161 	.family =		AF_INET,
162 	.check =		ipv4_dst_check,
163 	.default_advmss =	ipv4_default_advmss,
164 	.mtu =			ipv4_mtu,
165 	.cow_metrics =		ipv4_cow_metrics,
166 	.destroy =		ipv4_dst_destroy,
167 	.negative_advice =	ipv4_negative_advice,
168 	.link_failure =		ipv4_link_failure,
169 	.update_pmtu =		ip_rt_update_pmtu,
170 	.redirect =		ip_do_redirect,
171 	.local_out =		__ip_local_out,
172 	.neigh_lookup =		ipv4_neigh_lookup,
173 	.confirm_neigh =	ipv4_confirm_neigh,
174 };
175 
176 #define ECN_OR_COST(class)	TC_PRIO_##class
177 
178 const __u8 ip_tos2prio[16] = {
179 	TC_PRIO_BESTEFFORT,
180 	ECN_OR_COST(BESTEFFORT),
181 	TC_PRIO_BESTEFFORT,
182 	ECN_OR_COST(BESTEFFORT),
183 	TC_PRIO_BULK,
184 	ECN_OR_COST(BULK),
185 	TC_PRIO_BULK,
186 	ECN_OR_COST(BULK),
187 	TC_PRIO_INTERACTIVE,
188 	ECN_OR_COST(INTERACTIVE),
189 	TC_PRIO_INTERACTIVE,
190 	ECN_OR_COST(INTERACTIVE),
191 	TC_PRIO_INTERACTIVE_BULK,
192 	ECN_OR_COST(INTERACTIVE_BULK),
193 	TC_PRIO_INTERACTIVE_BULK,
194 	ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
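
/* Illustrative note (added comment, not in the upstream file): users of this
 * table index it with the four TOS bits shifted down by one, roughly
 *
 *	prio = ip_tos2prio[IPTOS_TOS(iph->tos) >> 1];
 *
 * (this is what the rt_tos2priority() helper in the route headers is assumed
 * to do), so each TOS value maps to one of the TC_PRIO_* bands above.
 */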
197 
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200 
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 	if (*pos)
205 		return NULL;
206 	return SEQ_START_TOKEN;
207 }
208 
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 	++*pos;
212 	return NULL;
213 }
214 
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218 
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 	if (v == SEQ_START_TOKEN)
222 		seq_printf(seq, "%-127s\n",
223 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 			   "HHUptod\tSpecDst");
226 	return 0;
227 }
228 
229 static const struct seq_operations rt_cache_seq_ops = {
230 	.start  = rt_cache_seq_start,
231 	.next   = rt_cache_seq_next,
232 	.stop   = rt_cache_seq_stop,
233 	.show   = rt_cache_seq_show,
234 };
235 
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 	return seq_open(file, &rt_cache_seq_ops);
239 }
240 
241 static const struct proc_ops rt_cache_proc_ops = {
242 	.proc_open	= rt_cache_seq_open,
243 	.proc_read	= seq_read,
244 	.proc_lseek	= seq_lseek,
245 	.proc_release	= seq_release,
246 };
247 
248 
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251 	int cpu;
252 
253 	if (*pos == 0)
254 		return SEQ_START_TOKEN;
255 
256 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 		if (!cpu_possible(cpu))
258 			continue;
259 		*pos = cpu+1;
260 		return &per_cpu(rt_cache_stat, cpu);
261 	}
262 	return NULL;
263 }
264 
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267 	int cpu;
268 
269 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 		if (!cpu_possible(cpu))
271 			continue;
272 		*pos = cpu+1;
273 		return &per_cpu(rt_cache_stat, cpu);
274 	}
275 	(*pos)++;
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   0, /* st->in_hit */
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   0, /* st->out_hit */
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   0, /* st->gc_total */
310 		   0, /* st->gc_ignored */
311 		   0, /* st->gc_goal_miss */
312 		   0, /* st->gc_dst_overflow */
313 		   0, /* st->in_hlist_search */
314 		   0  /* st->out_hlist_search */
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct proc_ops rt_cpu_proc_ops = {
333 	.proc_open	= rt_cpu_seq_open,
334 	.proc_read	= seq_read,
335 	.proc_lseek	= seq_lseek,
336 	.proc_release	= seq_release,
337 };
338 
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 	struct ip_rt_acct *dst, *src;
343 	unsigned int i, j;
344 
345 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 	if (!dst)
347 		return -ENOMEM;
348 
349 	for_each_possible_cpu(i) {
350 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 		for (j = 0; j < 256; j++) {
352 			dst[j].o_bytes   += src[j].o_bytes;
353 			dst[j].o_packets += src[j].o_packets;
354 			dst[j].i_bytes   += src[j].i_bytes;
355 			dst[j].i_packets += src[j].i_packets;
356 		}
357 	}
358 
359 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 	kfree(dst);
361 	return 0;
362 }
363 #endif
364 
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 	struct proc_dir_entry *pde;
368 
369 	pde = proc_create("rt_cache", 0444, net->proc_net,
370 			  &rt_cache_proc_ops);
371 	if (!pde)
372 		goto err1;
373 
374 	pde = proc_create("rt_cache", 0444,
375 			  net->proc_net_stat, &rt_cpu_proc_ops);
376 	if (!pde)
377 		goto err2;
378 
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 	pde = proc_create_single("rt_acct", 0, net->proc_net,
381 			rt_acct_proc_show);
382 	if (!pde)
383 		goto err3;
384 #endif
385 	return 0;
386 
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 	remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 	remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 	return -ENOMEM;
395 }
396 
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 	remove_proc_entry("rt_cache", net->proc_net_stat);
400 	remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 	remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405 
406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
407 	.init = ip_rt_do_proc_init,
408 	.exit = ip_rt_do_proc_exit,
409 };
410 
411 static int __init ip_rt_proc_init(void)
412 {
413 	return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415 
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 	return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422 
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427 
428 void rt_cache_flush(struct net *net)
429 {
430 	rt_genid_bump_ipv4(net);
431 }
432 
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 					   struct sk_buff *skb,
435 					   const void *daddr)
436 {
437 	const struct rtable *rt = container_of(dst, struct rtable, dst);
438 	struct net_device *dev = dst->dev;
439 	struct neighbour *n;
440 
441 	rcu_read_lock_bh();
442 
443 	if (likely(rt->rt_gw_family == AF_INET)) {
444 		n = ip_neigh_gw4(dev, rt->rt_gw4);
445 	} else if (rt->rt_gw_family == AF_INET6) {
446 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
447 	} else {
448 		__be32 pkey;
449 
450 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 		n = ip_neigh_gw4(dev, pkey);
452 	}
453 
454 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 		n = NULL;
456 
457 	rcu_read_unlock_bh();
458 
459 	return n;
460 }
461 
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464 	const struct rtable *rt = container_of(dst, struct rtable, dst);
465 	struct net_device *dev = dst->dev;
466 	const __be32 *pkey = daddr;
467 
468 	if (rt->rt_gw_family == AF_INET) {
469 		pkey = (const __be32 *)&rt->rt_gw4;
470 	} else if (rt->rt_gw_family == AF_INET6) {
471 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 	} else if (!daddr ||
473 		 (rt->rt_flags &
474 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475 		return;
476 	}
477 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479 
480 /* Hash tables of size 2048..262144 depending on RAM size.
481  * Each bucket uses 8 bytes.
482  */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486 
487 /* In order to protect privacy, we add a perturbation to identifiers
488  * if one generator is seldom used. This makes it hard for an attacker
489  * to infer how many packets were sent between two points in time.
490  */
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493 	u32 bucket, old, now = (u32)jiffies;
494 	atomic_t *p_id;
495 	u32 *p_tstamp;
496 	u32 delta = 0;
497 
498 	bucket = hash & ip_idents_mask;
499 	p_tstamp = ip_tstamps + bucket;
500 	p_id = ip_idents + bucket;
501 	old = READ_ONCE(*p_tstamp);
502 
503 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 		delta = prandom_u32_max(now - old);
505 
506 	/* If UBSAN reports an error here, please make sure your compiler
507 	 * supports -fno-strict-overflow before reporting it; that was a bug
508 	 * in UBSAN which has been fixed in GCC-8.
509 	 */
510 	return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
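
/* Usage sketch (added comment, not in the upstream file): a caller that needs
 * IP IDs for a train of 'segs' datagrams does roughly
 *
 *	id = ip_idents_reserve(hash, segs);
 *	// datagram k (0 <= k < segs) then carries IP ID (id + k) & 0xffff
 *
 * The random 'delta' folded in when a bucket was idle for 'now - old' jiffies
 * makes consecutive reservations on a rarely used bucket non-contiguous,
 * which is the privacy property described above.
 */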
513 
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516 	u32 hash, id;
517 
518 	/* Note the following code is not safe, but this is okay. */
519 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 		get_random_bytes(&net->ipv4.ip_id_key,
521 				 sizeof(net->ipv4.ip_id_key));
522 
523 	hash = siphash_3u32((__force u32)iph->daddr,
524 			    (__force u32)iph->saddr,
525 			    iph->protocol,
526 			    &net->ipv4.ip_id_key);
527 	id = ip_idents_reserve(hash, segs);
528 	iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
531 
532 static void ip_rt_fix_tos(struct flowi4 *fl4)
533 {
534 	__u8 tos = RT_FL_TOS(fl4);
535 
536 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
537 	fl4->flowi4_scope = tos & RTO_ONLINK ?
538 			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
539 }
540 
541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
542 			     const struct sock *sk,
543 			     const struct iphdr *iph,
544 			     int oif, u8 tos,
545 			     u8 prot, u32 mark, int flow_flags)
546 {
547 	if (sk) {
548 		const struct inet_sock *inet = inet_sk(sk);
549 
550 		oif = sk->sk_bound_dev_if;
551 		mark = sk->sk_mark;
552 		tos = RT_CONN_FLAGS(sk);
553 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
554 	}
555 	flowi4_init_output(fl4, oif, mark, tos,
556 			   RT_SCOPE_UNIVERSE, prot,
557 			   flow_flags,
558 			   iph->daddr, iph->saddr, 0, 0,
559 			   sock_net_uid(net, sk));
560 }
561 
562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
563 			       const struct sock *sk)
564 {
565 	const struct net *net = dev_net(skb->dev);
566 	const struct iphdr *iph = ip_hdr(skb);
567 	int oif = skb->dev->ifindex;
568 	u8 tos = RT_TOS(iph->tos);
569 	u8 prot = iph->protocol;
570 	u32 mark = skb->mark;
571 
572 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
573 }
574 
575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
576 {
577 	const struct inet_sock *inet = inet_sk(sk);
578 	const struct ip_options_rcu *inet_opt;
579 	__be32 daddr = inet->inet_daddr;
580 
581 	rcu_read_lock();
582 	inet_opt = rcu_dereference(inet->inet_opt);
583 	if (inet_opt && inet_opt->opt.srr)
584 		daddr = inet_opt->opt.faddr;
585 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
586 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
587 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
588 			   inet_sk_flowi_flags(sk),
589 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
590 	rcu_read_unlock();
591 }
592 
593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
594 				 const struct sk_buff *skb)
595 {
596 	if (skb)
597 		build_skb_flow_key(fl4, skb, sk);
598 	else
599 		build_sk_flow_key(fl4, sk);
600 }
601 
602 static DEFINE_SPINLOCK(fnhe_lock);
603 
604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
605 {
606 	struct rtable *rt;
607 
608 	rt = rcu_dereference(fnhe->fnhe_rth_input);
609 	if (rt) {
610 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
611 		dst_dev_put(&rt->dst);
612 		dst_release(&rt->dst);
613 	}
614 	rt = rcu_dereference(fnhe->fnhe_rth_output);
615 	if (rt) {
616 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
617 		dst_dev_put(&rt->dst);
618 		dst_release(&rt->dst);
619 	}
620 }
621 
622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
623 {
624 	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
625 	struct fib_nh_exception *fnhe, *oldest = NULL;
626 
627 	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
628 		fnhe = rcu_dereference_protected(*fnhe_p,
629 						 lockdep_is_held(&fnhe_lock));
630 		if (!fnhe)
631 			break;
632 		if (!oldest ||
633 		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
634 			oldest = fnhe;
635 			oldest_p = fnhe_p;
636 		}
637 	}
638 	fnhe_flush_routes(oldest);
639 	*oldest_p = oldest->fnhe_next;
640 	kfree_rcu(oldest, rcu);
641 }
642 
643 static u32 fnhe_hashfun(__be32 daddr)
644 {
645 	static siphash_key_t fnhe_hash_key __read_mostly;
646 	u64 hval;
647 
648 	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
649 	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
650 	return hash_64(hval, FNHE_HASH_SHIFT);
651 }
652 
653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
654 {
655 	rt->rt_pmtu = fnhe->fnhe_pmtu;
656 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
657 	rt->dst.expires = fnhe->fnhe_expires;
658 
659 	if (fnhe->fnhe_gw) {
660 		rt->rt_flags |= RTCF_REDIRECTED;
661 		rt->rt_uses_gateway = 1;
662 		rt->rt_gw_family = AF_INET;
663 		rt->rt_gw4 = fnhe->fnhe_gw;
664 	}
665 }
666 
667 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
668 				  __be32 gw, u32 pmtu, bool lock,
669 				  unsigned long expires)
670 {
671 	struct fnhe_hash_bucket *hash;
672 	struct fib_nh_exception *fnhe;
673 	struct rtable *rt;
674 	u32 genid, hval;
675 	unsigned int i;
676 	int depth;
677 
678 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
679 	hval = fnhe_hashfun(daddr);
680 
681 	spin_lock_bh(&fnhe_lock);
682 
683 	hash = rcu_dereference(nhc->nhc_exceptions);
684 	if (!hash) {
685 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
686 		if (!hash)
687 			goto out_unlock;
688 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
689 	}
690 
691 	hash += hval;
692 
693 	depth = 0;
694 	for (fnhe = rcu_dereference(hash->chain); fnhe;
695 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
696 		if (fnhe->fnhe_daddr == daddr)
697 			break;
698 		depth++;
699 	}
700 
701 	if (fnhe) {
702 		if (fnhe->fnhe_genid != genid)
703 			fnhe->fnhe_genid = genid;
704 		if (gw)
705 			fnhe->fnhe_gw = gw;
706 		if (pmtu) {
707 			fnhe->fnhe_pmtu = pmtu;
708 			fnhe->fnhe_mtu_locked = lock;
709 		}
710 		fnhe->fnhe_expires = max(1UL, expires);
711 		/* Update all cached dsts too */
712 		rt = rcu_dereference(fnhe->fnhe_rth_input);
713 		if (rt)
714 			fill_route_from_fnhe(rt, fnhe);
715 		rt = rcu_dereference(fnhe->fnhe_rth_output);
716 		if (rt)
717 			fill_route_from_fnhe(rt, fnhe);
718 	} else {
719 		/* Randomize max depth to avoid some side channels attacks. */
720 		int max_depth = FNHE_RECLAIM_DEPTH +
721 				prandom_u32_max(FNHE_RECLAIM_DEPTH);
722 
723 		while (depth > max_depth) {
724 			fnhe_remove_oldest(hash);
725 			depth--;
726 		}
727 
728 		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
729 		if (!fnhe)
730 			goto out_unlock;
731 
732 		fnhe->fnhe_next = hash->chain;
733 
734 		fnhe->fnhe_genid = genid;
735 		fnhe->fnhe_daddr = daddr;
736 		fnhe->fnhe_gw = gw;
737 		fnhe->fnhe_pmtu = pmtu;
738 		fnhe->fnhe_mtu_locked = lock;
739 		fnhe->fnhe_expires = max(1UL, expires);
740 
741 		rcu_assign_pointer(hash->chain, fnhe);
742 
743 		/* Exception created; mark the cached routes for the nexthop
744 		 * stale, so anyone caching it rechecks if this exception
745 		 * applies to them.
746 		 */
747 		rt = rcu_dereference(nhc->nhc_rth_input);
748 		if (rt)
749 			rt->dst.obsolete = DST_OBSOLETE_KILL;
750 
751 		for_each_possible_cpu(i) {
752 			struct rtable __rcu **prt;
753 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
754 			rt = rcu_dereference(*prt);
755 			if (rt)
756 				rt->dst.obsolete = DST_OBSOLETE_KILL;
757 		}
758 	}
759 
760 	fnhe->fnhe_stamp = jiffies;
761 
762 out_unlock:
763 	spin_unlock_bh(&fnhe_lock);
764 }
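
/* Summary sketch (added comment, not in the upstream file): per-nexthop
 * exceptions live in a hash of FNHE_HASH_SIZE buckets keyed by
 * fnhe_hashfun(daddr).  Chains are kept short: once a chain grows beyond
 * FNHE_RECLAIM_DEPTH plus a random slack, fnhe_remove_oldest() evicts the
 * entry with the oldest fnhe_stamp.  Learned redirect gateways and PMTU
 * values are stored here and copied into cached rtables by
 * fill_route_from_fnhe().
 */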
765 
766 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
767 			     bool kill_route)
768 {
769 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
770 	__be32 old_gw = ip_hdr(skb)->saddr;
771 	struct net_device *dev = skb->dev;
772 	struct in_device *in_dev;
773 	struct fib_result res;
774 	struct neighbour *n;
775 	struct net *net;
776 
777 	switch (icmp_hdr(skb)->code & 7) {
778 	case ICMP_REDIR_NET:
779 	case ICMP_REDIR_NETTOS:
780 	case ICMP_REDIR_HOST:
781 	case ICMP_REDIR_HOSTTOS:
782 		break;
783 
784 	default:
785 		return;
786 	}
787 
788 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
789 		return;
790 
791 	in_dev = __in_dev_get_rcu(dev);
792 	if (!in_dev)
793 		return;
794 
795 	net = dev_net(dev);
796 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
797 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
798 	    ipv4_is_zeronet(new_gw))
799 		goto reject_redirect;
800 
801 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
802 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
803 			goto reject_redirect;
804 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
805 			goto reject_redirect;
806 	} else {
807 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
808 			goto reject_redirect;
809 	}
810 
811 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
812 	if (!n)
813 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
814 	if (!IS_ERR(n)) {
815 		if (!(n->nud_state & NUD_VALID)) {
816 			neigh_event_send(n, NULL);
817 		} else {
818 			if (fib_lookup(net, fl4, &res, 0) == 0) {
819 				struct fib_nh_common *nhc;
820 
821 				fib_select_path(net, &res, fl4, skb);
822 				nhc = FIB_RES_NHC(res);
823 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
824 						0, false,
825 						jiffies + ip_rt_gc_timeout);
826 			}
827 			if (kill_route)
828 				rt->dst.obsolete = DST_OBSOLETE_KILL;
829 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
830 		}
831 		neigh_release(n);
832 	}
833 	return;
834 
835 reject_redirect:
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
838 		const struct iphdr *iph = (const struct iphdr *) skb->data;
839 		__be32 daddr = iph->daddr;
840 		__be32 saddr = iph->saddr;
841 
842 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
843 				     "  Advised path = %pI4 -> %pI4\n",
844 				     &old_gw, dev->name, &new_gw,
845 				     &saddr, &daddr);
846 	}
847 #endif
848 	;
849 }
850 
851 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
852 {
853 	struct rtable *rt;
854 	struct flowi4 fl4;
855 	const struct iphdr *iph = (const struct iphdr *) skb->data;
856 	struct net *net = dev_net(skb->dev);
857 	int oif = skb->dev->ifindex;
858 	u8 tos = RT_TOS(iph->tos);
859 	u8 prot = iph->protocol;
860 	u32 mark = skb->mark;
861 
862 	rt = (struct rtable *) dst;
863 
864 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
865 	ip_rt_fix_tos(&fl4);
866 	__ip_do_redirect(rt, skb, &fl4, true);
867 }
868 
869 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
870 {
871 	struct rtable *rt = (struct rtable *)dst;
872 	struct dst_entry *ret = dst;
873 
874 	if (rt) {
875 		if (dst->obsolete > 0) {
876 			ip_rt_put(rt);
877 			ret = NULL;
878 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
879 			   rt->dst.expires) {
880 			ip_rt_put(rt);
881 			ret = NULL;
882 		}
883 	}
884 	return ret;
885 }
886 
887 /*
888  * Algorithm:
889  *	1. The first ip_rt_redirect_number redirects are sent
890  *	   with exponential backoff, then we stop sending them at all,
891  *	   assuming that the host ignores our redirects.
892  *	2. If we did not see packets requiring redirects
893  *	   during ip_rt_redirect_silence, we assume that the host
894  *	   forgot the redirected route and start sending redirects again.
895  *
896  * This algorithm is much cheaper and more intelligent than dumb load limiting
897  * in icmp.c.
898  *
899  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
900  * and "frag. need" (breaks PMTU discovery) in icmp.c.
901  */
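
/* Back-of-the-envelope illustration (added comment, based on the default
 * values near the top of this file): with ip_rt_redirect_load = HZ/50, once
 * k redirects have been sent to a peer, the next one is not sent before
 *
 *	rate_last + (HZ/50 << k) jiffies
 *
 * so the gap roughly doubles each time (~40ms, ~80ms, ...).  After
 * ip_rt_redirect_number (9) redirects have been ignored, nothing more is
 * sent until ip_rt_redirect_silence (about (HZ/50) << 10, i.e. ~20s) passes
 * without redirected packets.
 */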
902 
903 void ip_rt_send_redirect(struct sk_buff *skb)
904 {
905 	struct rtable *rt = skb_rtable(skb);
906 	struct in_device *in_dev;
907 	struct inet_peer *peer;
908 	struct net *net;
909 	int log_martians;
910 	int vif;
911 
912 	rcu_read_lock();
913 	in_dev = __in_dev_get_rcu(rt->dst.dev);
914 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
915 		rcu_read_unlock();
916 		return;
917 	}
918 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
919 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
920 	rcu_read_unlock();
921 
922 	net = dev_net(rt->dst.dev);
923 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
924 	if (!peer) {
925 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
926 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
927 		return;
928 	}
929 
930 	/* No redirected packets during ip_rt_redirect_silence;
931 	 * reset the algorithm.
932 	 */
933 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
934 		peer->rate_tokens = 0;
935 		peer->n_redirects = 0;
936 	}
937 
938 	/* Too many ignored redirects; do not send anything,
939 	 * just record the time of the last redirected packet in rate_last.
940 	 */
941 	if (peer->n_redirects >= ip_rt_redirect_number) {
942 		peer->rate_last = jiffies;
943 		goto out_put_peer;
944 	}
945 
946 	/* Check for load limit; set rate_last to the latest sent
947 	 * redirect.
948 	 */
949 	if (peer->n_redirects == 0 ||
950 	    time_after(jiffies,
951 		       (peer->rate_last +
952 			(ip_rt_redirect_load << peer->n_redirects)))) {
953 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
954 
955 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
956 		peer->rate_last = jiffies;
957 		++peer->n_redirects;
958 #ifdef CONFIG_IP_ROUTE_VERBOSE
959 		if (log_martians &&
960 		    peer->n_redirects == ip_rt_redirect_number)
961 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
962 					     &ip_hdr(skb)->saddr, inet_iif(skb),
963 					     &ip_hdr(skb)->daddr, &gw);
964 #endif
965 	}
966 out_put_peer:
967 	inet_putpeer(peer);
968 }
969 
970 static int ip_error(struct sk_buff *skb)
971 {
972 	struct rtable *rt = skb_rtable(skb);
973 	struct net_device *dev = skb->dev;
974 	struct in_device *in_dev;
975 	struct inet_peer *peer;
976 	unsigned long now;
977 	struct net *net;
978 	bool send;
979 	int code;
980 
981 	if (netif_is_l3_master(skb->dev)) {
982 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
983 		if (!dev)
984 			goto out;
985 	}
986 
987 	in_dev = __in_dev_get_rcu(dev);
988 
989 	/* IP on this device is disabled. */
990 	if (!in_dev)
991 		goto out;
992 
993 	net = dev_net(rt->dst.dev);
994 	if (!IN_DEV_FORWARD(in_dev)) {
995 		switch (rt->dst.error) {
996 		case EHOSTUNREACH:
997 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
998 			break;
999 
1000 		case ENETUNREACH:
1001 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1002 			break;
1003 		}
1004 		goto out;
1005 	}
1006 
1007 	switch (rt->dst.error) {
1008 	case EINVAL:
1009 	default:
1010 		goto out;
1011 	case EHOSTUNREACH:
1012 		code = ICMP_HOST_UNREACH;
1013 		break;
1014 	case ENETUNREACH:
1015 		code = ICMP_NET_UNREACH;
1016 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1017 		break;
1018 	case EACCES:
1019 		code = ICMP_PKT_FILTERED;
1020 		break;
1021 	}
1022 
1023 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1024 			       l3mdev_master_ifindex(skb->dev), 1);
1025 
1026 	send = true;
1027 	if (peer) {
1028 		now = jiffies;
1029 		peer->rate_tokens += now - peer->rate_last;
1030 		if (peer->rate_tokens > ip_rt_error_burst)
1031 			peer->rate_tokens = ip_rt_error_burst;
1032 		peer->rate_last = now;
1033 		if (peer->rate_tokens >= ip_rt_error_cost)
1034 			peer->rate_tokens -= ip_rt_error_cost;
1035 		else
1036 			send = false;
1037 		inet_putpeer(peer);
1038 	}
1039 	if (send)
1040 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1041 
1042 out:	kfree_skb(skb);
1043 	return 0;
1044 }
1045 
1046 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1047 {
1048 	struct dst_entry *dst = &rt->dst;
1049 	struct net *net = dev_net(dst->dev);
1050 	struct fib_result res;
1051 	bool lock = false;
1052 	u32 old_mtu;
1053 
1054 	if (ip_mtu_locked(dst))
1055 		return;
1056 
1057 	old_mtu = ipv4_mtu(dst);
1058 	if (old_mtu < mtu)
1059 		return;
1060 
1061 	if (mtu < ip_rt_min_pmtu) {
1062 		lock = true;
1063 		mtu = min(old_mtu, ip_rt_min_pmtu);
1064 	}
1065 
1066 	if (rt->rt_pmtu == mtu && !lock &&
1067 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1068 		return;
1069 
1070 	rcu_read_lock();
1071 	if (fib_lookup(net, fl4, &res, 0) == 0) {
1072 		struct fib_nh_common *nhc;
1073 
1074 		fib_select_path(net, &res, fl4, NULL);
1075 		nhc = FIB_RES_NHC(res);
1076 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1077 				      jiffies + ip_rt_mtu_expires);
1078 	}
1079 	rcu_read_unlock();
1080 }
1081 
1082 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1083 			      struct sk_buff *skb, u32 mtu,
1084 			      bool confirm_neigh)
1085 {
1086 	struct rtable *rt = (struct rtable *) dst;
1087 	struct flowi4 fl4;
1088 
1089 	ip_rt_build_flow_key(&fl4, sk, skb);
1090 	ip_rt_fix_tos(&fl4);
1091 
1092 	/* Don't make lookup fail for bridged encapsulations */
1093 	if (skb && netif_is_any_bridge_port(skb->dev))
1094 		fl4.flowi4_oif = 0;
1095 
1096 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1097 }
1098 
1099 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1100 		      int oif, u8 protocol)
1101 {
1102 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1103 	struct flowi4 fl4;
1104 	struct rtable *rt;
1105 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1106 
1107 	__build_flow_key(net, &fl4, NULL, iph, oif,
1108 			 RT_TOS(iph->tos), protocol, mark, 0);
1109 	rt = __ip_route_output_key(net, &fl4);
1110 	if (!IS_ERR(rt)) {
1111 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1112 		ip_rt_put(rt);
1113 	}
1114 }
1115 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1116 
1117 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1118 {
1119 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1120 	struct flowi4 fl4;
1121 	struct rtable *rt;
1122 
1123 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1124 
1125 	if (!fl4.flowi4_mark)
1126 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1127 
1128 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1129 	if (!IS_ERR(rt)) {
1130 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1131 		ip_rt_put(rt);
1132 	}
1133 }
1134 
1135 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1136 {
1137 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1138 	struct flowi4 fl4;
1139 	struct rtable *rt;
1140 	struct dst_entry *odst = NULL;
1141 	bool new = false;
1142 	struct net *net = sock_net(sk);
1143 
1144 	bh_lock_sock(sk);
1145 
1146 	if (!ip_sk_accept_pmtu(sk))
1147 		goto out;
1148 
1149 	odst = sk_dst_get(sk);
1150 
1151 	if (sock_owned_by_user(sk) || !odst) {
1152 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1153 		goto out;
1154 	}
1155 
1156 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1157 
1158 	rt = (struct rtable *)odst;
1159 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1160 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1161 		if (IS_ERR(rt))
1162 			goto out;
1163 
1164 		new = true;
1165 	} else {
1166 		ip_rt_fix_tos(&fl4);
1167 	}
1168 
1169 	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1170 
1171 	if (!dst_check(&rt->dst, 0)) {
1172 		if (new)
1173 			dst_release(&rt->dst);
1174 
1175 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1176 		if (IS_ERR(rt))
1177 			goto out;
1178 
1179 		new = true;
1180 	}
1181 
1182 	if (new)
1183 		sk_dst_set(sk, &rt->dst);
1184 
1185 out:
1186 	bh_unlock_sock(sk);
1187 	dst_release(odst);
1188 }
1189 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1190 
1191 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1192 		   int oif, u8 protocol)
1193 {
1194 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1195 	struct flowi4 fl4;
1196 	struct rtable *rt;
1197 
1198 	__build_flow_key(net, &fl4, NULL, iph, oif,
1199 			 RT_TOS(iph->tos), protocol, 0, 0);
1200 	rt = __ip_route_output_key(net, &fl4);
1201 	if (!IS_ERR(rt)) {
1202 		__ip_do_redirect(rt, skb, &fl4, false);
1203 		ip_rt_put(rt);
1204 	}
1205 }
1206 EXPORT_SYMBOL_GPL(ipv4_redirect);
1207 
1208 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1209 {
1210 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1211 	struct flowi4 fl4;
1212 	struct rtable *rt;
1213 	struct net *net = sock_net(sk);
1214 
1215 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1216 	rt = __ip_route_output_key(net, &fl4);
1217 	if (!IS_ERR(rt)) {
1218 		__ip_do_redirect(rt, skb, &fl4, false);
1219 		ip_rt_put(rt);
1220 	}
1221 }
1222 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1223 
1224 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1225 {
1226 	struct rtable *rt = (struct rtable *) dst;
1227 
1228 	/* All IPV4 dsts are created with ->obsolete set to the value
1229 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1230 	 * into this function always.
1231 	 *
1232 	 * When a PMTU/redirect information update invalidates a route,
1233 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1234 	 * DST_OBSOLETE_DEAD.
1235 	 */
1236 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1237 		return NULL;
1238 	return dst;
1239 }
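
/* Illustrative note (added comment, not in the upstream file): cached routes
 * are revalidated through dst_check(), which ends up here; a caller typically
 * does something like
 *
 *	if (!dst_check(&rt->dst, 0))
 *		;	// route was invalidated, redo the lookup
 *
 * as ipv4_sk_update_pmtu() does further down in this file.
 */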
1240 
1241 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1242 {
1243 	struct ip_options opt;
1244 	int res;
1245 
1246 	/* Recompile ip options since IPCB may not be valid anymore.
1247 	 * Also check we have a reasonable ipv4 header.
1248 	 */
1249 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1250 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1251 		return;
1252 
1253 	memset(&opt, 0, sizeof(opt));
1254 	if (ip_hdr(skb)->ihl > 5) {
1255 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1256 			return;
1257 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1258 
1259 		rcu_read_lock();
1260 		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1261 		rcu_read_unlock();
1262 
1263 		if (res)
1264 			return;
1265 	}
1266 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1267 }
1268 
1269 static void ipv4_link_failure(struct sk_buff *skb)
1270 {
1271 	struct rtable *rt;
1272 
1273 	ipv4_send_dest_unreach(skb);
1274 
1275 	rt = skb_rtable(skb);
1276 	if (rt)
1277 		dst_set_expires(&rt->dst, 0);
1278 }
1279 
1280 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1281 {
1282 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1283 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1284 		 skb->dev ? skb->dev->name : "?");
1285 	kfree_skb(skb);
1286 	WARN_ON(1);
1287 	return 0;
1288 }
1289 
1290 /*
1291    We do not cache the source address of the outgoing interface,
1292    because it is used only by the IP RR, TS and SRR options,
1293    so it stays out of the fast path.
1294 
1295    BTW remember: "addr" is allowed to be unaligned
1296    in IP options!
1297  */
1298 
1299 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1300 {
1301 	__be32 src;
1302 
1303 	if (rt_is_output_route(rt))
1304 		src = ip_hdr(skb)->saddr;
1305 	else {
1306 		struct fib_result res;
1307 		struct iphdr *iph = ip_hdr(skb);
1308 		struct flowi4 fl4 = {
1309 			.daddr = iph->daddr,
1310 			.saddr = iph->saddr,
1311 			.flowi4_tos = RT_TOS(iph->tos),
1312 			.flowi4_oif = rt->dst.dev->ifindex,
1313 			.flowi4_iif = skb->dev->ifindex,
1314 			.flowi4_mark = skb->mark,
1315 		};
1316 
1317 		rcu_read_lock();
1318 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1319 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1320 		else
1321 			src = inet_select_addr(rt->dst.dev,
1322 					       rt_nexthop(rt, iph->daddr),
1323 					       RT_SCOPE_UNIVERSE);
1324 		rcu_read_unlock();
1325 	}
1326 	memcpy(addr, &src, 4);
1327 }
1328 
1329 #ifdef CONFIG_IP_ROUTE_CLASSID
1330 static void set_class_tag(struct rtable *rt, u32 tag)
1331 {
1332 	if (!(rt->dst.tclassid & 0xFFFF))
1333 		rt->dst.tclassid |= tag & 0xFFFF;
1334 	if (!(rt->dst.tclassid & 0xFFFF0000))
1335 		rt->dst.tclassid |= tag & 0xFFFF0000;
1336 }
1337 #endif
1338 
1339 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1340 {
1341 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1342 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1343 				    ip_rt_min_advmss);
1344 
1345 	return min(advmss, IPV4_MAX_PMTU - header_size);
1346 }
1347 
1348 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1349 {
1350 	const struct rtable *rt = (const struct rtable *)dst;
1351 	unsigned int mtu = rt->rt_pmtu;
1352 
1353 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1354 		mtu = dst_metric_raw(dst, RTAX_MTU);
1355 
1356 	if (mtu)
1357 		goto out;
1358 
1359 	mtu = READ_ONCE(dst->dev->mtu);
1360 
1361 	if (unlikely(ip_mtu_locked(dst))) {
1362 		if (rt->rt_uses_gateway && mtu > 576)
1363 			mtu = 576;
1364 	}
1365 
1366 out:
1367 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1368 
1369 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1370 }
1371 
1372 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1373 {
1374 	struct fnhe_hash_bucket *hash;
1375 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1376 	u32 hval = fnhe_hashfun(daddr);
1377 
1378 	spin_lock_bh(&fnhe_lock);
1379 
1380 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1381 					 lockdep_is_held(&fnhe_lock));
1382 	hash += hval;
1383 
1384 	fnhe_p = &hash->chain;
1385 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1386 	while (fnhe) {
1387 		if (fnhe->fnhe_daddr == daddr) {
1388 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1389 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1390 			/* set fnhe_daddr to 0 to ensure it won't bind with
1391 			 * new dsts in rt_bind_exception().
1392 			 */
1393 			fnhe->fnhe_daddr = 0;
1394 			fnhe_flush_routes(fnhe);
1395 			kfree_rcu(fnhe, rcu);
1396 			break;
1397 		}
1398 		fnhe_p = &fnhe->fnhe_next;
1399 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1400 						 lockdep_is_held(&fnhe_lock));
1401 	}
1402 
1403 	spin_unlock_bh(&fnhe_lock);
1404 }
1405 
1406 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1407 					       __be32 daddr)
1408 {
1409 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1410 	struct fib_nh_exception *fnhe;
1411 	u32 hval;
1412 
1413 	if (!hash)
1414 		return NULL;
1415 
1416 	hval = fnhe_hashfun(daddr);
1417 
1418 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1419 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1420 		if (fnhe->fnhe_daddr == daddr) {
1421 			if (fnhe->fnhe_expires &&
1422 			    time_after(jiffies, fnhe->fnhe_expires)) {
1423 				ip_del_fnhe(nhc, daddr);
1424 				break;
1425 			}
1426 			return fnhe;
1427 		}
1428 	}
1429 	return NULL;
1430 }
1431 
1432 /* MTU selection:
1433  * 1. mtu on route is locked - use it
1434  * 2. mtu from nexthop exception
1435  * 3. mtu from egress device
1436  */
1437 
1438 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1439 {
1440 	struct fib_nh_common *nhc = res->nhc;
1441 	struct net_device *dev = nhc->nhc_dev;
1442 	struct fib_info *fi = res->fi;
1443 	u32 mtu = 0;
1444 
1445 	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1446 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1447 		mtu = fi->fib_mtu;
1448 
1449 	if (likely(!mtu)) {
1450 		struct fib_nh_exception *fnhe;
1451 
1452 		fnhe = find_exception(nhc, daddr);
1453 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1454 			mtu = fnhe->fnhe_pmtu;
1455 	}
1456 
1457 	if (likely(!mtu))
1458 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1459 
1460 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1461 }
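
/* Worked example (added comment, not in the upstream file): following the
 * precedence above, fi->fib_mtu is used when the ip_fwd_use_pmtu sysctl is
 * enabled or the route's MTU metric is locked (RTAX_MTU bit in
 * fib_metrics[RTAX_LOCK - 1]); failing that, a not-yet-expired nexthop
 * exception (e.g. learned from an ICMP fragmentation-needed) supplies the
 * mtu; only then does the egress device mtu, clamped to IP_MAX_MTU, apply.
 * In all cases any lwtunnel encapsulation headroom is subtracted.
 */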
1462 
1463 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1464 			      __be32 daddr, const bool do_cache)
1465 {
1466 	bool ret = false;
1467 
1468 	spin_lock_bh(&fnhe_lock);
1469 
1470 	if (daddr == fnhe->fnhe_daddr) {
1471 		struct rtable __rcu **porig;
1472 		struct rtable *orig;
1473 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1474 
1475 		if (rt_is_input_route(rt))
1476 			porig = &fnhe->fnhe_rth_input;
1477 		else
1478 			porig = &fnhe->fnhe_rth_output;
1479 		orig = rcu_dereference(*porig);
1480 
1481 		if (fnhe->fnhe_genid != genid) {
1482 			fnhe->fnhe_genid = genid;
1483 			fnhe->fnhe_gw = 0;
1484 			fnhe->fnhe_pmtu = 0;
1485 			fnhe->fnhe_expires = 0;
1486 			fnhe->fnhe_mtu_locked = false;
1487 			fnhe_flush_routes(fnhe);
1488 			orig = NULL;
1489 		}
1490 		fill_route_from_fnhe(rt, fnhe);
1491 		if (!rt->rt_gw4) {
1492 			rt->rt_gw4 = daddr;
1493 			rt->rt_gw_family = AF_INET;
1494 		}
1495 
1496 		if (do_cache) {
1497 			dst_hold(&rt->dst);
1498 			rcu_assign_pointer(*porig, rt);
1499 			if (orig) {
1500 				dst_dev_put(&orig->dst);
1501 				dst_release(&orig->dst);
1502 			}
1503 			ret = true;
1504 		}
1505 
1506 		fnhe->fnhe_stamp = jiffies;
1507 	}
1508 	spin_unlock_bh(&fnhe_lock);
1509 
1510 	return ret;
1511 }
1512 
1513 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1514 {
1515 	struct rtable *orig, *prev, **p;
1516 	bool ret = true;
1517 
1518 	if (rt_is_input_route(rt)) {
1519 		p = (struct rtable **)&nhc->nhc_rth_input;
1520 	} else {
1521 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1522 	}
1523 	orig = *p;
1524 
1525 	/* hold dst before doing cmpxchg() to avoid race condition
1526 	 * on this dst
1527 	 */
1528 	dst_hold(&rt->dst);
1529 	prev = cmpxchg(p, orig, rt);
1530 	if (prev == orig) {
1531 		if (orig) {
1532 			rt_add_uncached_list(orig);
1533 			dst_release(&orig->dst);
1534 		}
1535 	} else {
1536 		dst_release(&rt->dst);
1537 		ret = false;
1538 	}
1539 
1540 	return ret;
1541 }
1542 
1543 struct uncached_list {
1544 	spinlock_t		lock;
1545 	struct list_head	head;
1546 };
1547 
1548 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1549 
1550 void rt_add_uncached_list(struct rtable *rt)
1551 {
1552 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1553 
1554 	rt->rt_uncached_list = ul;
1555 
1556 	spin_lock_bh(&ul->lock);
1557 	list_add_tail(&rt->rt_uncached, &ul->head);
1558 	spin_unlock_bh(&ul->lock);
1559 }
1560 
1561 void rt_del_uncached_list(struct rtable *rt)
1562 {
1563 	if (!list_empty(&rt->rt_uncached)) {
1564 		struct uncached_list *ul = rt->rt_uncached_list;
1565 
1566 		spin_lock_bh(&ul->lock);
1567 		list_del(&rt->rt_uncached);
1568 		spin_unlock_bh(&ul->lock);
1569 	}
1570 }
1571 
1572 static void ipv4_dst_destroy(struct dst_entry *dst)
1573 {
1574 	struct rtable *rt = (struct rtable *)dst;
1575 
1576 	ip_dst_metrics_put(dst);
1577 	rt_del_uncached_list(rt);
1578 }
1579 
1580 void rt_flush_dev(struct net_device *dev)
1581 {
1582 	struct rtable *rt;
1583 	int cpu;
1584 
1585 	for_each_possible_cpu(cpu) {
1586 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1587 
1588 		spin_lock_bh(&ul->lock);
1589 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1590 			if (rt->dst.dev != dev)
1591 				continue;
1592 			rt->dst.dev = blackhole_netdev;
1593 			dev_hold(rt->dst.dev);
1594 			dev_put(dev);
1595 		}
1596 		spin_unlock_bh(&ul->lock);
1597 	}
1598 }
1599 
1600 static bool rt_cache_valid(const struct rtable *rt)
1601 {
1602 	return	rt &&
1603 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1604 		!rt_is_expired(rt);
1605 }
1606 
1607 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1608 			   const struct fib_result *res,
1609 			   struct fib_nh_exception *fnhe,
1610 			   struct fib_info *fi, u16 type, u32 itag,
1611 			   const bool do_cache)
1612 {
1613 	bool cached = false;
1614 
1615 	if (fi) {
1616 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1617 
1618 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1619 			rt->rt_uses_gateway = 1;
1620 			rt->rt_gw_family = nhc->nhc_gw_family;
1621 			/* only INET and INET6 are supported */
1622 			if (likely(nhc->nhc_gw_family == AF_INET))
1623 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1624 			else
1625 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1626 		}
1627 
1628 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1629 
1630 #ifdef CONFIG_IP_ROUTE_CLASSID
1631 		if (nhc->nhc_family == AF_INET) {
1632 			struct fib_nh *nh;
1633 
1634 			nh = container_of(nhc, struct fib_nh, nh_common);
1635 			rt->dst.tclassid = nh->nh_tclassid;
1636 		}
1637 #endif
1638 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1639 		if (unlikely(fnhe))
1640 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1641 		else if (do_cache)
1642 			cached = rt_cache_route(nhc, rt);
1643 		if (unlikely(!cached)) {
1644 			/* Routes we intend to cache in nexthop exception or
1645 			 * FIB nexthop have the DST_NOCACHE bit clear.
1646 			 * However, if we are unsuccessful at storing this
1647 			 * route into the cache we really need to set it.
1648 			 */
1649 			if (!rt->rt_gw4) {
1650 				rt->rt_gw_family = AF_INET;
1651 				rt->rt_gw4 = daddr;
1652 			}
1653 			rt_add_uncached_list(rt);
1654 		}
1655 	} else
1656 		rt_add_uncached_list(rt);
1657 
1658 #ifdef CONFIG_IP_ROUTE_CLASSID
1659 #ifdef CONFIG_IP_MULTIPLE_TABLES
1660 	set_class_tag(rt, res->tclassid);
1661 #endif
1662 	set_class_tag(rt, itag);
1663 #endif
1664 }
1665 
1666 struct rtable *rt_dst_alloc(struct net_device *dev,
1667 			    unsigned int flags, u16 type,
1668 			    bool nopolicy, bool noxfrm)
1669 {
1670 	struct rtable *rt;
1671 
1672 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1673 		       (nopolicy ? DST_NOPOLICY : 0) |
1674 		       (noxfrm ? DST_NOXFRM : 0));
1675 
1676 	if (rt) {
1677 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1678 		rt->rt_flags = flags;
1679 		rt->rt_type = type;
1680 		rt->rt_is_input = 0;
1681 		rt->rt_iif = 0;
1682 		rt->rt_pmtu = 0;
1683 		rt->rt_mtu_locked = 0;
1684 		rt->rt_uses_gateway = 0;
1685 		rt->rt_gw_family = 0;
1686 		rt->rt_gw4 = 0;
1687 		INIT_LIST_HEAD(&rt->rt_uncached);
1688 
1689 		rt->dst.output = ip_output;
1690 		if (flags & RTCF_LOCAL)
1691 			rt->dst.input = ip_local_deliver;
1692 	}
1693 
1694 	return rt;
1695 }
1696 EXPORT_SYMBOL(rt_dst_alloc);
1697 
1698 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1699 {
1700 	struct rtable *new_rt;
1701 
1702 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1703 			   rt->dst.flags);
1704 
1705 	if (new_rt) {
1706 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1707 		new_rt->rt_flags = rt->rt_flags;
1708 		new_rt->rt_type = rt->rt_type;
1709 		new_rt->rt_is_input = rt->rt_is_input;
1710 		new_rt->rt_iif = rt->rt_iif;
1711 		new_rt->rt_pmtu = rt->rt_pmtu;
1712 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1713 		new_rt->rt_gw_family = rt->rt_gw_family;
1714 		if (rt->rt_gw_family == AF_INET)
1715 			new_rt->rt_gw4 = rt->rt_gw4;
1716 		else if (rt->rt_gw_family == AF_INET6)
1717 			new_rt->rt_gw6 = rt->rt_gw6;
1718 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1719 
1720 		new_rt->dst.input = rt->dst.input;
1721 		new_rt->dst.output = rt->dst.output;
1722 		new_rt->dst.error = rt->dst.error;
1723 		new_rt->dst.lastuse = jiffies;
1724 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1725 	}
1726 	return new_rt;
1727 }
1728 EXPORT_SYMBOL(rt_dst_clone);
1729 
1730 /* called in rcu_read_lock() section */
1731 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1732 			  u8 tos, struct net_device *dev,
1733 			  struct in_device *in_dev, u32 *itag)
1734 {
1735 	int err;
1736 
1737 	/* Primary sanity checks. */
1738 	if (!in_dev)
1739 		return -EINVAL;
1740 
1741 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1742 	    skb->protocol != htons(ETH_P_IP))
1743 		return -EINVAL;
1744 
1745 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1746 		return -EINVAL;
1747 
1748 	if (ipv4_is_zeronet(saddr)) {
1749 		if (!ipv4_is_local_multicast(daddr) &&
1750 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1751 			return -EINVAL;
1752 	} else {
1753 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1754 					  in_dev, itag);
1755 		if (err < 0)
1756 			return err;
1757 	}
1758 	return 0;
1759 }
1760 
1761 /* called in rcu_read_lock() section */
1762 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1763 			     u8 tos, struct net_device *dev, int our)
1764 {
1765 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1766 	unsigned int flags = RTCF_MULTICAST;
1767 	struct rtable *rth;
1768 	u32 itag = 0;
1769 	int err;
1770 
1771 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1772 	if (err)
1773 		return err;
1774 
1775 	if (our)
1776 		flags |= RTCF_LOCAL;
1777 
1778 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1779 			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1780 	if (!rth)
1781 		return -ENOBUFS;
1782 
1783 #ifdef CONFIG_IP_ROUTE_CLASSID
1784 	rth->dst.tclassid = itag;
1785 #endif
1786 	rth->dst.output = ip_rt_bug;
1787 	rth->rt_is_input = 1;
1788 
1789 #ifdef CONFIG_IP_MROUTE
1790 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1791 		rth->dst.input = ip_mr_input;
1792 #endif
1793 	RT_CACHE_STAT_INC(in_slow_mc);
1794 
1795 	skb_dst_drop(skb);
1796 	skb_dst_set(skb, &rth->dst);
1797 	return 0;
1798 }
1799 
1800 
1801 static void ip_handle_martian_source(struct net_device *dev,
1802 				     struct in_device *in_dev,
1803 				     struct sk_buff *skb,
1804 				     __be32 daddr,
1805 				     __be32 saddr)
1806 {
1807 	RT_CACHE_STAT_INC(in_martian_src);
1808 #ifdef CONFIG_IP_ROUTE_VERBOSE
1809 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1810 		/*
1811 		 *	RFC1812 recommendation: if the source is martian,
1812 		 *	the only hint is the MAC header.
1813 		 */
1814 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1815 			&daddr, &saddr, dev->name);
1816 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1817 			print_hex_dump(KERN_WARNING, "ll header: ",
1818 				       DUMP_PREFIX_OFFSET, 16, 1,
1819 				       skb_mac_header(skb),
1820 				       dev->hard_header_len, false);
1821 		}
1822 	}
1823 #endif
1824 }
1825 
1826 /* called in rcu_read_lock() section */
1827 static int __mkroute_input(struct sk_buff *skb,
1828 			   const struct fib_result *res,
1829 			   struct in_device *in_dev,
1830 			   __be32 daddr, __be32 saddr, u32 tos)
1831 {
1832 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1833 	struct net_device *dev = nhc->nhc_dev;
1834 	struct fib_nh_exception *fnhe;
1835 	struct rtable *rth;
1836 	int err;
1837 	struct in_device *out_dev;
1838 	bool do_cache;
1839 	u32 itag = 0;
1840 
1841 	/* get a working reference to the output device */
1842 	out_dev = __in_dev_get_rcu(dev);
1843 	if (!out_dev) {
1844 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1845 		return -EINVAL;
1846 	}
1847 
1848 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1849 				  in_dev->dev, in_dev, &itag);
1850 	if (err < 0) {
1851 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1852 					 saddr);
1853 
1854 		goto cleanup;
1855 	}
1856 
1857 	do_cache = res->fi && !itag;
1858 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1859 	    skb->protocol == htons(ETH_P_IP)) {
1860 		__be32 gw;
1861 
1862 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1863 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1864 		    inet_addr_onlink(out_dev, saddr, gw))
1865 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1866 	}
1867 
1868 	if (skb->protocol != htons(ETH_P_IP)) {
1869 		/* Not IP (i.e. ARP). Do not create a route if it is
1870 		 * invalid for proxy arp. DNAT routes are always valid.
1871 		 *
1872 		 * The proxy arp feature has been extended to allow ARP
1873 		 * replies back on the same interface, to support
1874 		 * Private VLAN switch technologies. See arp.c.
1875 		 */
1876 		if (out_dev == in_dev &&
1877 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1878 			err = -EINVAL;
1879 			goto cleanup;
1880 		}
1881 	}
1882 
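	/* A fib_nh_exception stores per-destination state learned at run time
	 * (e.g. a PMTU value or an ICMP redirect) along with its own cached
	 * route, so look it up before falling back to the per-nexthop cached
	 * input route.
	 */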
1883 	fnhe = find_exception(nhc, daddr);
1884 	if (do_cache) {
1885 		if (fnhe)
1886 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1887 		else
1888 			rth = rcu_dereference(nhc->nhc_rth_input);
1889 		if (rt_cache_valid(rth)) {
1890 			skb_dst_set_noref(skb, &rth->dst);
1891 			goto out;
1892 		}
1893 	}
1894 
1895 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1896 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
1897 			   IN_DEV_ORCONF(out_dev, NOXFRM));
1898 	if (!rth) {
1899 		err = -ENOBUFS;
1900 		goto cleanup;
1901 	}
1902 
1903 	rth->rt_is_input = 1;
1904 	RT_CACHE_STAT_INC(in_slow_tot);
1905 
1906 	rth->dst.input = ip_forward;
1907 
1908 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1909 		       do_cache);
1910 	lwtunnel_set_redirect(&rth->dst);
1911 	skb_dst_set(skb, &rth->dst);
1912 out:
1913 	err = 0;
1914  cleanup:
1915 	return err;
1916 }
1917 
1918 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1919 /* To make ICMP packets follow the right flow, the multipath hash is
1920  * calculated from the inner IP addresses.
1921  */
1922 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1923 				 struct flow_keys *hash_keys)
1924 {
1925 	const struct iphdr *outer_iph = ip_hdr(skb);
1926 	const struct iphdr *key_iph = outer_iph;
1927 	const struct iphdr *inner_iph;
1928 	const struct icmphdr *icmph;
1929 	struct iphdr _inner_iph;
1930 	struct icmphdr _icmph;
1931 
1932 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1933 		goto out;
1934 
1935 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1936 		goto out;
1937 
1938 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1939 				   &_icmph);
1940 	if (!icmph)
1941 		goto out;
1942 
1943 	if (!icmp_is_err(icmph->type))
1944 		goto out;
1945 
1946 	inner_iph = skb_header_pointer(skb,
1947 				       outer_iph->ihl * 4 + sizeof(_icmph),
1948 				       sizeof(_inner_iph), &_inner_iph);
1949 	if (!inner_iph)
1950 		goto out;
1951 
1952 	key_iph = inner_iph;
1953 out:
1954 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1955 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1956 }
1957 
1958 /* if skb is set it will be used and fl4 can be NULL */
1959 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1960 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1961 {
1962 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1963 	struct flow_keys hash_keys;
1964 	u32 mhash;
1965 
1966 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
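	/* fib_multipath_hash_policy selects which fields feed the hash:
	 *   0 - layer 3 (source and destination addresses)
	 *   1 - layer 4 (the 5-tuple)
	 *   2 - layer 3, or the inner layer 3 headers of encapsulated packets
	 */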
1967 	case 0:
1968 		memset(&hash_keys, 0, sizeof(hash_keys));
1969 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1970 		if (skb) {
1971 			ip_multipath_l3_keys(skb, &hash_keys);
1972 		} else {
1973 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1974 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1975 		}
1976 		break;
1977 	case 1:
1978 		/* skb is currently provided only when forwarding */
1979 		if (skb) {
1980 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1981 			struct flow_keys keys;
1982 
1983 			/* short-circuit if we already have L4 hash present */
1984 			if (skb->l4_hash)
1985 				return skb_get_hash_raw(skb) >> 1;
1986 
1987 			memset(&hash_keys, 0, sizeof(hash_keys));
1988 
1989 			if (!flkeys) {
1990 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1991 				flkeys = &keys;
1992 			}
1993 
1994 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1995 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1996 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1997 			hash_keys.ports.src = flkeys->ports.src;
1998 			hash_keys.ports.dst = flkeys->ports.dst;
1999 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2000 		} else {
2001 			memset(&hash_keys, 0, sizeof(hash_keys));
2002 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2003 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2004 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2005 			hash_keys.ports.src = fl4->fl4_sport;
2006 			hash_keys.ports.dst = fl4->fl4_dport;
2007 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2008 		}
2009 		break;
2010 	case 2:
2011 		memset(&hash_keys, 0, sizeof(hash_keys));
2012 		/* skb is currently provided only when forwarding */
2013 		if (skb) {
2014 			struct flow_keys keys;
2015 
2016 			skb_flow_dissect_flow_keys(skb, &keys, 0);
2017 			/* Inner can be v4 or v6 */
2018 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2019 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2020 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2021 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2022 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2023 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2024 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2025 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2026 				hash_keys.tags.flow_label = keys.tags.flow_label;
2027 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2028 			} else {
2029 				/* Same as case 0 */
2030 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2031 				ip_multipath_l3_keys(skb, &hash_keys);
2032 			}
2033 		} else {
2034 			/* Same as case 0 */
2035 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2036 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2037 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2038 		}
2039 		break;
2040 	}
2041 	mhash = flow_hash_from_keys(&hash_keys);
2042 
2043 	if (multipath_hash)
2044 		mhash = jhash_2words(mhash, multipath_hash, 0);
2045 
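	/* Drop the top bit: callers keep the multipath hash in a signed int
	 * and compare it against per-nexthop upper bounds, so the result must
	 * stay non-negative (see fib_select_multipath()).
	 */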
2046 	return mhash >> 1;
2047 }
2048 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2049 
2050 static int ip_mkroute_input(struct sk_buff *skb,
2051 			    struct fib_result *res,
2052 			    struct in_device *in_dev,
2053 			    __be32 daddr, __be32 saddr, u32 tos,
2054 			    struct flow_keys *hkeys)
2055 {
2056 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2057 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2058 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2059 
2060 		fib_select_multipath(res, h);
2061 	}
2062 #endif
2063 
2064 	/* create a routing cache entry */
2065 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2066 }
2067 
2068 /* Implements the same saddr-related checks as ip_route_input_slow(),
2069  * assuming daddr is valid and the destination is not a local broadcast one.
2070  * Uses the provided hint instead of performing a route lookup.
2071  */
2072 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2073 		      u8 tos, struct net_device *dev,
2074 		      const struct sk_buff *hint)
2075 {
2076 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2077 	struct rtable *rt = skb_rtable(hint);
2078 	struct net *net = dev_net(dev);
2079 	int err = -EINVAL;
2080 	u32 tag = 0;
2081 
2082 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2083 		goto martian_source;
2084 
2085 	if (ipv4_is_zeronet(saddr))
2086 		goto martian_source;
2087 
2088 	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2089 		goto martian_source;
2090 
2091 	if (rt->rt_type != RTN_LOCAL)
2092 		goto skip_validate_source;
2093 
2094 	tos &= IPTOS_RT_MASK;
2095 	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2096 	if (err < 0)
2097 		goto martian_source;
2098 
2099 skip_validate_source:
2100 	skb_dst_copy(skb, hint);
2101 	return 0;
2102 
2103 martian_source:
2104 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2105 	return err;
2106 }
2107 
2108 /* get device for dst_alloc with local routes */
2109 static struct net_device *ip_rt_get_dev(struct net *net,
2110 					const struct fib_result *res)
2111 {
2112 	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2113 	struct net_device *dev = NULL;
2114 
2115 	if (nhc)
2116 		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2117 
2118 	return dev ? : net->loopback_dev;
2119 }
2120 
2121 /*
2122  *	NOTE. We drop all packets that have a local source
2123  *	address, because every properly looped-back packet
2124  *	must already have the correct destination attached by the output routine.
2125  *	Changes in the enforced policies must also be applied to
2126  *	ip_route_use_hint().
2127  *
2128  *	This approach solves two big problems:
2129  *	1. Non-simplex devices are handled properly.
2130  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2131  *	called with rcu_read_lock()
2132  */
2133 
2134 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2135 			       u8 tos, struct net_device *dev,
2136 			       struct fib_result *res)
2137 {
2138 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2139 	struct flow_keys *flkeys = NULL, _flkeys;
2140 	struct net    *net = dev_net(dev);
2141 	struct ip_tunnel_info *tun_info;
2142 	int		err = -EINVAL;
2143 	unsigned int	flags = 0;
2144 	u32		itag = 0;
2145 	struct rtable	*rth;
2146 	struct flowi4	fl4;
2147 	bool do_cache = true;
2148 
2149 	/* IP on this device is disabled. */
2150 
2151 	if (!in_dev)
2152 		goto out;
2153 
2154 	/* Check for the most weird martians, which cannot be detected
2155 	   by fib_lookup.
2156 	 */
2157 
2158 	tun_info = skb_tunnel_info(skb);
2159 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2160 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2161 	else
2162 		fl4.flowi4_tun_key.tun_id = 0;
2163 	skb_dst_drop(skb);
2164 
2165 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2166 		goto martian_source;
2167 
2168 	res->fi = NULL;
2169 	res->table = NULL;
2170 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2171 		goto brd_input;
2172 
2173 	/* Accept zero addresses only for limited broadcast;
2174 	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
2175 	 */
2176 	if (ipv4_is_zeronet(saddr))
2177 		goto martian_source;
2178 
2179 	if (ipv4_is_zeronet(daddr))
2180 		goto martian_destination;
2181 
2182 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2183 	 * and calls it only once if daddr and/or saddr are loopback addresses
2184 	 */
2185 	if (ipv4_is_loopback(daddr)) {
2186 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2187 			goto martian_destination;
2188 	} else if (ipv4_is_loopback(saddr)) {
2189 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2190 			goto martian_source;
2191 	}
2192 
2193 	/*
2194 	 *	Now we are ready to route the packet.
2195 	 */
2196 	fl4.flowi4_oif = 0;
2197 	fl4.flowi4_iif = dev->ifindex;
2198 	fl4.flowi4_mark = skb->mark;
2199 	fl4.flowi4_tos = tos;
2200 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2201 	fl4.flowi4_flags = 0;
2202 	fl4.daddr = daddr;
2203 	fl4.saddr = saddr;
2204 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2205 	fl4.flowi4_multipath_hash = 0;
2206 
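	/* Dissect the L4 keys up front only when FIB rules actually match on
	 * them; otherwise leave the protocol and ports zeroed in the flow key.
	 */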
2207 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2208 		flkeys = &_flkeys;
2209 	} else {
2210 		fl4.flowi4_proto = 0;
2211 		fl4.fl4_sport = 0;
2212 		fl4.fl4_dport = 0;
2213 	}
2214 
2215 	err = fib_lookup(net, &fl4, res, 0);
2216 	if (err != 0) {
2217 		if (!IN_DEV_FORWARD(in_dev))
2218 			err = -EHOSTUNREACH;
2219 		goto no_route;
2220 	}
2221 
2222 	if (res->type == RTN_BROADCAST) {
2223 		if (IN_DEV_BFORWARD(in_dev))
2224 			goto make_route;
2225 		/* do not cache if bc_forwarding is enabled */
2226 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2227 			do_cache = false;
2228 		goto brd_input;
2229 	}
2230 
2231 	if (res->type == RTN_LOCAL) {
2232 		err = fib_validate_source(skb, saddr, daddr, tos,
2233 					  0, dev, in_dev, &itag);
2234 		if (err < 0)
2235 			goto martian_source;
2236 		goto local_input;
2237 	}
2238 
2239 	if (!IN_DEV_FORWARD(in_dev)) {
2240 		err = -EHOSTUNREACH;
2241 		goto no_route;
2242 	}
2243 	if (res->type != RTN_UNICAST)
2244 		goto martian_destination;
2245 
2246 make_route:
2247 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2248 out:	return err;
2249 
2250 brd_input:
2251 	if (skb->protocol != htons(ETH_P_IP))
2252 		goto e_inval;
2253 
2254 	if (!ipv4_is_zeronet(saddr)) {
2255 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2256 					  in_dev, &itag);
2257 		if (err < 0)
2258 			goto martian_source;
2259 	}
2260 	flags |= RTCF_BROADCAST;
2261 	res->type = RTN_BROADCAST;
2262 	RT_CACHE_STAT_INC(in_brd);
2263 
2264 local_input:
2265 	do_cache &= res->fi && !itag;
2266 	if (do_cache) {
2267 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2268 
2269 		rth = rcu_dereference(nhc->nhc_rth_input);
2270 		if (rt_cache_valid(rth)) {
2271 			skb_dst_set_noref(skb, &rth->dst);
2272 			err = 0;
2273 			goto out;
2274 		}
2275 	}
2276 
2277 	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2278 			   flags | RTCF_LOCAL, res->type,
2279 			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2280 	if (!rth)
2281 		goto e_nobufs;
2282 
2283 	rth->dst.output = ip_rt_bug;
2284 #ifdef CONFIG_IP_ROUTE_CLASSID
2285 	rth->dst.tclassid = itag;
2286 #endif
2287 	rth->rt_is_input = 1;
2288 
2289 	RT_CACHE_STAT_INC(in_slow_tot);
2290 	if (res->type == RTN_UNREACHABLE) {
2291 		rth->dst.input = ip_error;
2292 		rth->dst.error = -err;
2293 		rth->rt_flags &= ~RTCF_LOCAL;
2294 	}
2295 
2296 	if (do_cache) {
2297 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2298 
2299 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2300 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2301 			WARN_ON(rth->dst.input == lwtunnel_input);
2302 			rth->dst.lwtstate->orig_input = rth->dst.input;
2303 			rth->dst.input = lwtunnel_input;
2304 		}
2305 
2306 		if (unlikely(!rt_cache_route(nhc, rth)))
2307 			rt_add_uncached_list(rth);
2308 	}
2309 	skb_dst_set(skb, &rth->dst);
2310 	err = 0;
2311 	goto out;
2312 
2313 no_route:
2314 	RT_CACHE_STAT_INC(in_no_route);
2315 	res->type = RTN_UNREACHABLE;
2316 	res->fi = NULL;
2317 	res->table = NULL;
2318 	goto local_input;
2319 
2320 	/*
2321 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2322 	 */
2323 martian_destination:
2324 	RT_CACHE_STAT_INC(in_martian_dst);
2325 #ifdef CONFIG_IP_ROUTE_VERBOSE
2326 	if (IN_DEV_LOG_MARTIANS(in_dev))
2327 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2328 				     &daddr, &saddr, dev->name);
2329 #endif
2330 
2331 e_inval:
2332 	err = -EINVAL;
2333 	goto out;
2334 
2335 e_nobufs:
2336 	err = -ENOBUFS;
2337 	goto out;
2338 
2339 martian_source:
2340 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2341 	goto out;
2342 }
2343 
2344 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2345 			 u8 tos, struct net_device *dev)
2346 {
2347 	struct fib_result res;
2348 	int err;
2349 
2350 	tos &= IPTOS_RT_MASK;
2351 	rcu_read_lock();
2352 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2353 	rcu_read_unlock();
2354 
2355 	return err;
2356 }
2357 EXPORT_SYMBOL(ip_route_input_noref);
2358 
2359 /* called with rcu_read_lock held */
2360 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2361 		       u8 tos, struct net_device *dev, struct fib_result *res)
2362 {
2363 	/* Multicast recognition logic was moved from the route cache to here.
2364 	   The problem was that too many Ethernet cards have broken/missing
2365 	   hardware multicast filters :-( As a result, a host on a multicast
2366 	   network acquires a lot of useless route cache entries, e.g. for
2367 	   SDR messages from all over the world. Now we try to get rid of them.
2368 	   Really, provided the software IP multicast filter is organized
2369 	   reasonably (at least, hashed), this does not cause a slowdown
2370 	   compared with route cache reject entries.
2371 	   Note that multicast routers are not affected, because
2372 	   a route cache entry is created eventually.
2373 	 */
2374 	if (ipv4_is_multicast(daddr)) {
2375 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2376 		int our = 0;
2377 		int err = -EINVAL;
2378 
2379 		if (!in_dev)
2380 			return err;
2381 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2382 				      ip_hdr(skb)->protocol);
2383 
2384 		/* check l3 master if no match yet */
2385 		if (!our && netif_is_l3_slave(dev)) {
2386 			struct in_device *l3_in_dev;
2387 
2388 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2389 			if (l3_in_dev)
2390 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2391 						      ip_hdr(skb)->protocol);
2392 		}
2393 
2394 		if (our
2395 #ifdef CONFIG_IP_MROUTE
2396 			||
2397 		    (!ipv4_is_local_multicast(daddr) &&
2398 		     IN_DEV_MFORWARD(in_dev))
2399 #endif
2400 		   ) {
2401 			err = ip_route_input_mc(skb, daddr, saddr,
2402 						tos, dev, our);
2403 		}
2404 		return err;
2405 	}
2406 
2407 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2408 }
2409 
2410 /* called with rcu_read_lock() */
2411 static struct rtable *__mkroute_output(const struct fib_result *res,
2412 				       const struct flowi4 *fl4, int orig_oif,
2413 				       struct net_device *dev_out,
2414 				       unsigned int flags)
2415 {
2416 	struct fib_info *fi = res->fi;
2417 	struct fib_nh_exception *fnhe;
2418 	struct in_device *in_dev;
2419 	u16 type = res->type;
2420 	struct rtable *rth;
2421 	bool do_cache;
2422 
2423 	in_dev = __in_dev_get_rcu(dev_out);
2424 	if (!in_dev)
2425 		return ERR_PTR(-EINVAL);
2426 
2427 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2428 		if (ipv4_is_loopback(fl4->saddr) &&
2429 		    !(dev_out->flags & IFF_LOOPBACK) &&
2430 		    !netif_is_l3_master(dev_out))
2431 			return ERR_PTR(-EINVAL);
2432 
2433 	if (ipv4_is_lbcast(fl4->daddr))
2434 		type = RTN_BROADCAST;
2435 	else if (ipv4_is_multicast(fl4->daddr))
2436 		type = RTN_MULTICAST;
2437 	else if (ipv4_is_zeronet(fl4->daddr))
2438 		return ERR_PTR(-EINVAL);
2439 
2440 	if (dev_out->flags & IFF_LOOPBACK)
2441 		flags |= RTCF_LOCAL;
2442 
2443 	do_cache = true;
2444 	if (type == RTN_BROADCAST) {
2445 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2446 		fi = NULL;
2447 	} else if (type == RTN_MULTICAST) {
2448 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2449 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2450 				     fl4->flowi4_proto))
2451 			flags &= ~RTCF_LOCAL;
2452 		else
2453 			do_cache = false;
2454 		/* If a multicast route does not exist, use
2455 		 * the default one, but do not gateway in this case.
2456 		 * Yes, it is a hack.
2457 		 */
2458 		if (fi && res->prefixlen < 4)
2459 			fi = NULL;
2460 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2461 		   (orig_oif != dev_out->ifindex)) {
2462 		/* For local routes that require a particular output interface
2463 		 * we do not want to cache the result.  Caching the result
2464 		 * causes incorrect behaviour when there are multiple source
2465 		 * addresses on the interface, the end result being that if the
2466 		 * intended recipient is waiting on that interface for the
2467 		 * packet he won't receive it because it will be delivered on
2468 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2469 		 * be set to the loopback interface as well.
2470 		 */
2471 		do_cache = false;
2472 	}
2473 
2474 	fnhe = NULL;
2475 	do_cache &= fi != NULL;
2476 	if (fi) {
2477 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2478 		struct rtable __rcu **prth;
2479 
2480 		fnhe = find_exception(nhc, fl4->daddr);
2481 		if (!do_cache)
2482 			goto add;
2483 		if (fnhe) {
2484 			prth = &fnhe->fnhe_rth_output;
2485 		} else {
2486 			if (unlikely(fl4->flowi4_flags &
2487 				     FLOWI_FLAG_KNOWN_NH &&
2488 				     !(nhc->nhc_gw_family &&
2489 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2490 				do_cache = false;
2491 				goto add;
2492 			}
2493 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2494 		}
2495 		rth = rcu_dereference(*prth);
2496 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2497 			return rth;
2498 	}
2499 
2500 add:
2501 	rth = rt_dst_alloc(dev_out, flags, type,
2502 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
2503 			   IN_DEV_ORCONF(in_dev, NOXFRM));
2504 	if (!rth)
2505 		return ERR_PTR(-ENOBUFS);
2506 
2507 	rth->rt_iif = orig_oif;
2508 
2509 	RT_CACHE_STAT_INC(out_slow_tot);
2510 
2511 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2512 		if (flags & RTCF_LOCAL &&
2513 		    !(dev_out->flags & IFF_LOOPBACK)) {
2514 			rth->dst.output = ip_mc_output;
2515 			RT_CACHE_STAT_INC(out_slow_mc);
2516 		}
2517 #ifdef CONFIG_IP_MROUTE
2518 		if (type == RTN_MULTICAST) {
2519 			if (IN_DEV_MFORWARD(in_dev) &&
2520 			    !ipv4_is_local_multicast(fl4->daddr)) {
2521 				rth->dst.input = ip_mr_input;
2522 				rth->dst.output = ip_mc_output;
2523 			}
2524 		}
2525 #endif
2526 	}
2527 
2528 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2529 	lwtunnel_set_redirect(&rth->dst);
2530 
2531 	return rth;
2532 }
2533 
2534 /*
2535  * Major route resolver routine.
2536  */
2537 
2538 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2539 					const struct sk_buff *skb)
2540 {
2541 	struct fib_result res = {
2542 		.type		= RTN_UNSPEC,
2543 		.fi		= NULL,
2544 		.table		= NULL,
2545 		.tclassid	= 0,
2546 	};
2547 	struct rtable *rth;
2548 
2549 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2550 	ip_rt_fix_tos(fl4);
2551 
2552 	rcu_read_lock();
2553 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2554 	rcu_read_unlock();
2555 
2556 	return rth;
2557 }
2558 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2559 
2560 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2561 					    struct fib_result *res,
2562 					    const struct sk_buff *skb)
2563 {
2564 	struct net_device *dev_out = NULL;
2565 	int orig_oif = fl4->flowi4_oif;
2566 	unsigned int flags = 0;
2567 	struct rtable *rth;
2568 	int err;
2569 
2570 	if (fl4->saddr) {
2571 		if (ipv4_is_multicast(fl4->saddr) ||
2572 		    ipv4_is_lbcast(fl4->saddr) ||
2573 		    ipv4_is_zeronet(fl4->saddr)) {
2574 			rth = ERR_PTR(-EINVAL);
2575 			goto out;
2576 		}
2577 
2578 		rth = ERR_PTR(-ENETUNREACH);
2579 
2580 		/* I removed check for oif == dev_out->oif here.
2581 		   It was wrong for two reasons:
2582 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2583 		      is assigned to multiple interfaces.
2584 		   2. Moreover, we are allowed to send packets with saddr
2585 		      of another iface. --ANK
2586 		 */
2587 
2588 		if (fl4->flowi4_oif == 0 &&
2589 		    (ipv4_is_multicast(fl4->daddr) ||
2590 		     ipv4_is_lbcast(fl4->daddr))) {
2591 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2592 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2593 			if (!dev_out)
2594 				goto out;
2595 
2596 			/* Special hack: the user can direct multicasts
2597 			   and limited broadcast via the necessary interface
2598 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2599 			   This hack is not just for fun, it allows
2600 			   vic, vat and friends to work.
2601 			   They bind a socket to loopback, set ttl to zero
2602 			   and expect that it will work.
2603 			   From the viewpoint of the routing cache they are broken,
2604 			   because we are not allowed to build a multicast path
2605 			   with a loopback source addr (look, the routing cache
2606 			   cannot know that ttl is zero, so that the packet
2607 			   will not leave this host and the route is valid).
2608 			   Luckily, this hack is a good workaround.
2609 			 */
2610 
2611 			fl4->flowi4_oif = dev_out->ifindex;
2612 			goto make_route;
2613 		}
2614 
2615 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2616 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2617 			if (!__ip_dev_find(net, fl4->saddr, false))
2618 				goto out;
2619 		}
2620 	}
2621 
2622 
2623 	if (fl4->flowi4_oif) {
2624 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2625 		rth = ERR_PTR(-ENODEV);
2626 		if (!dev_out)
2627 			goto out;
2628 
2629 		/* RACE: Check return value of inet_select_addr instead. */
2630 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2631 			rth = ERR_PTR(-ENETUNREACH);
2632 			goto out;
2633 		}
2634 		if (ipv4_is_local_multicast(fl4->daddr) ||
2635 		    ipv4_is_lbcast(fl4->daddr) ||
2636 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2637 			if (!fl4->saddr)
2638 				fl4->saddr = inet_select_addr(dev_out, 0,
2639 							      RT_SCOPE_LINK);
2640 			goto make_route;
2641 		}
2642 		if (!fl4->saddr) {
2643 			if (ipv4_is_multicast(fl4->daddr))
2644 				fl4->saddr = inet_select_addr(dev_out, 0,
2645 							      fl4->flowi4_scope);
2646 			else if (!fl4->daddr)
2647 				fl4->saddr = inet_select_addr(dev_out, 0,
2648 							      RT_SCOPE_HOST);
2649 		}
2650 	}
2651 
2652 	if (!fl4->daddr) {
2653 		fl4->daddr = fl4->saddr;
2654 		if (!fl4->daddr)
2655 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2656 		dev_out = net->loopback_dev;
2657 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2658 		res->type = RTN_LOCAL;
2659 		flags |= RTCF_LOCAL;
2660 		goto make_route;
2661 	}
2662 
2663 	err = fib_lookup(net, fl4, res, 0);
2664 	if (err) {
2665 		res->fi = NULL;
2666 		res->table = NULL;
2667 		if (fl4->flowi4_oif &&
2668 		    (ipv4_is_multicast(fl4->daddr) ||
2669 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2670 			/* Apparently, the routing tables are wrong. Assume
2671 			   that the destination is on link.
2672 
2673 			   WHY? DW.
2674 			   Because we are allowed to send to an iface
2675 			   even if it has NO routes and NO assigned
2676 			   addresses. When oif is specified, the routing
2677 			   tables are looked up with only one purpose:
2678 			   to check whether the destination is gatewayed rather than
2679 			   direct. Moreover, if MSG_DONTROUTE is set,
2680 			   we send the packet, ignoring both routing tables
2681 			   and ifaddr state. --ANK
2682 
2683 
2684 			   We could do this even if oif is unknown,
2685 			   likely IPv6, but we do not.
2686 			 */
2687 
2688 			if (fl4->saddr == 0)
2689 				fl4->saddr = inet_select_addr(dev_out, 0,
2690 							      RT_SCOPE_LINK);
2691 			res->type = RTN_UNICAST;
2692 			goto make_route;
2693 		}
2694 		rth = ERR_PTR(err);
2695 		goto out;
2696 	}
2697 
2698 	if (res->type == RTN_LOCAL) {
2699 		if (!fl4->saddr) {
2700 			if (res->fi->fib_prefsrc)
2701 				fl4->saddr = res->fi->fib_prefsrc;
2702 			else
2703 				fl4->saddr = fl4->daddr;
2704 		}
2705 
2706 		/* L3 master device is the loopback for that domain */
2707 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2708 			net->loopback_dev;
2709 
2710 		/* make sure orig_oif points to fib result device even
2711 		 * though packet rx/tx happens over loopback or l3mdev
2712 		 */
2713 		orig_oif = FIB_RES_OIF(*res);
2714 
2715 		fl4->flowi4_oif = dev_out->ifindex;
2716 		flags |= RTCF_LOCAL;
2717 		goto make_route;
2718 	}
2719 
2720 	fib_select_path(net, res, fl4, skb);
2721 
2722 	dev_out = FIB_RES_DEV(*res);
2723 
2724 make_route:
2725 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2726 
2727 out:
2728 	return rth;
2729 }
2730 
2731 static struct dst_ops ipv4_dst_blackhole_ops = {
2732 	.family			= AF_INET,
2733 	.default_advmss		= ipv4_default_advmss,
2734 	.neigh_lookup		= ipv4_neigh_lookup,
2735 	.check			= dst_blackhole_check,
2736 	.cow_metrics		= dst_blackhole_cow_metrics,
2737 	.update_pmtu		= dst_blackhole_update_pmtu,
2738 	.redirect		= dst_blackhole_redirect,
2739 	.mtu			= dst_blackhole_mtu,
2740 };
2741 
2742 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2743 {
2744 	struct rtable *ort = (struct rtable *) dst_orig;
2745 	struct rtable *rt;
2746 
2747 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2748 	if (rt) {
2749 		struct dst_entry *new = &rt->dst;
2750 
2751 		new->__use = 1;
2752 		new->input = dst_discard;
2753 		new->output = dst_discard_out;
2754 
2755 		new->dev = net->loopback_dev;
2756 		if (new->dev)
2757 			dev_hold(new->dev);
2758 
2759 		rt->rt_is_input = ort->rt_is_input;
2760 		rt->rt_iif = ort->rt_iif;
2761 		rt->rt_pmtu = ort->rt_pmtu;
2762 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2763 
2764 		rt->rt_genid = rt_genid_ipv4(net);
2765 		rt->rt_flags = ort->rt_flags;
2766 		rt->rt_type = ort->rt_type;
2767 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2768 		rt->rt_gw_family = ort->rt_gw_family;
2769 		if (rt->rt_gw_family == AF_INET)
2770 			rt->rt_gw4 = ort->rt_gw4;
2771 		else if (rt->rt_gw_family == AF_INET6)
2772 			rt->rt_gw6 = ort->rt_gw6;
2773 
2774 		INIT_LIST_HEAD(&rt->rt_uncached);
2775 	}
2776 
2777 	dst_release(dst_orig);
2778 
2779 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2780 }
2781 
2782 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2783 				    const struct sock *sk)
2784 {
2785 	struct rtable *rt = __ip_route_output_key(net, flp4);
2786 
2787 	if (IS_ERR(rt))
2788 		return rt;
2789 
2790 	if (flp4->flowi4_proto) {
2791 		flp4->flowi4_oif = rt->dst.dev->ifindex;
2792 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2793 							flowi4_to_flowi(flp4),
2794 							sk, 0);
2795 	}
2796 
2797 	return rt;
2798 }
2799 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2800 
2801 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2802 				      struct net_device *dev,
2803 				      struct net *net, __be32 *saddr,
2804 				      const struct ip_tunnel_info *info,
2805 				      u8 protocol, bool use_cache)
2806 {
2807 #ifdef CONFIG_DST_CACHE
2808 	struct dst_cache *dst_cache;
2809 #endif
2810 	struct rtable *rt = NULL;
2811 	struct flowi4 fl4;
2812 	__u8 tos;
2813 
2814 #ifdef CONFIG_DST_CACHE
2815 	dst_cache = (struct dst_cache *)&info->dst_cache;
2816 	if (use_cache) {
2817 		rt = dst_cache_get_ip4(dst_cache, saddr);
2818 		if (rt)
2819 			return rt;
2820 	}
2821 #endif
2822 	memset(&fl4, 0, sizeof(fl4));
2823 	fl4.flowi4_mark = skb->mark;
2824 	fl4.flowi4_proto = protocol;
2825 	fl4.daddr = info->key.u.ipv4.dst;
2826 	fl4.saddr = info->key.u.ipv4.src;
2827 	tos = info->key.tos;
2828 	fl4.flowi4_tos = RT_TOS(tos);
2829 
2830 	rt = ip_route_output_key(net, &fl4);
2831 	if (IS_ERR(rt)) {
2832 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2833 		return ERR_PTR(-ENETUNREACH);
2834 	}
2835 	if (rt->dst.dev == dev) { /* is this necessary? */
2836 		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2837 		ip_rt_put(rt);
2838 		return ERR_PTR(-ELOOP);
2839 	}
2840 #ifdef CONFIG_DST_CACHE
2841 	if (use_cache)
2842 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2843 #endif
2844 	*saddr = fl4.saddr;
2845 	return rt;
2846 }
2847 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2848 
2849 /* called with rcu_read_lock held */
2850 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2851 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2852 			struct sk_buff *skb, u32 portid, u32 seq,
2853 			unsigned int flags)
2854 {
2855 	struct rtmsg *r;
2856 	struct nlmsghdr *nlh;
2857 	unsigned long expires = 0;
2858 	u32 error;
2859 	u32 metrics[RTAX_MAX];
2860 
2861 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2862 	if (!nlh)
2863 		return -EMSGSIZE;
2864 
2865 	r = nlmsg_data(nlh);
2866 	r->rtm_family	 = AF_INET;
2867 	r->rtm_dst_len	= 32;
2868 	r->rtm_src_len	= 0;
2869 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2870 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2871 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2872 		goto nla_put_failure;
2873 	r->rtm_type	= rt->rt_type;
2874 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2875 	r->rtm_protocol = RTPROT_UNSPEC;
2876 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2877 	if (rt->rt_flags & RTCF_NOTIFY)
2878 		r->rtm_flags |= RTM_F_NOTIFY;
2879 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2880 		r->rtm_flags |= RTCF_DOREDIRECT;
2881 
2882 	if (nla_put_in_addr(skb, RTA_DST, dst))
2883 		goto nla_put_failure;
2884 	if (src) {
2885 		r->rtm_src_len = 32;
2886 		if (nla_put_in_addr(skb, RTA_SRC, src))
2887 			goto nla_put_failure;
2888 	}
2889 	if (rt->dst.dev &&
2890 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2891 		goto nla_put_failure;
2892 #ifdef CONFIG_IP_ROUTE_CLASSID
2893 	if (rt->dst.tclassid &&
2894 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2895 		goto nla_put_failure;
2896 #endif
2897 	if (fl4 && !rt_is_input_route(rt) &&
2898 	    fl4->saddr != src) {
2899 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2900 			goto nla_put_failure;
2901 	}
2902 	if (rt->rt_uses_gateway) {
2903 		if (rt->rt_gw_family == AF_INET &&
2904 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2905 			goto nla_put_failure;
2906 		} else if (rt->rt_gw_family == AF_INET6) {
2907 			int alen = sizeof(struct in6_addr);
2908 			struct nlattr *nla;
2909 			struct rtvia *via;
2910 
2911 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2912 			if (!nla)
2913 				goto nla_put_failure;
2914 
2915 			via = nla_data(nla);
2916 			via->rtvia_family = AF_INET6;
2917 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2918 		}
2919 	}
2920 
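	/* dst.expires is an absolute time in jiffies; report the remaining
	 * lifetime to userspace.
	 */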
2921 	expires = rt->dst.expires;
2922 	if (expires) {
2923 		unsigned long now = jiffies;
2924 
2925 		if (time_before(now, expires))
2926 			expires -= now;
2927 		else
2928 			expires = 0;
2929 	}
2930 
2931 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2932 	if (rt->rt_pmtu && expires)
2933 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2934 	if (rt->rt_mtu_locked && expires)
2935 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2936 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2937 		goto nla_put_failure;
2938 
2939 	if (fl4) {
2940 		if (fl4->flowi4_mark &&
2941 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2942 			goto nla_put_failure;
2943 
2944 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2945 		    nla_put_u32(skb, RTA_UID,
2946 				from_kuid_munged(current_user_ns(),
2947 						 fl4->flowi4_uid)))
2948 			goto nla_put_failure;
2949 
2950 		if (rt_is_input_route(rt)) {
2951 #ifdef CONFIG_IP_MROUTE
2952 			if (ipv4_is_multicast(dst) &&
2953 			    !ipv4_is_local_multicast(dst) &&
2954 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2955 				int err = ipmr_get_route(net, skb,
2956 							 fl4->saddr, fl4->daddr,
2957 							 r, portid);
2958 
2959 				if (err <= 0) {
2960 					if (err == 0)
2961 						return 0;
2962 					goto nla_put_failure;
2963 				}
2964 			} else
2965 #endif
2966 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2967 					goto nla_put_failure;
2968 		}
2969 	}
2970 
2971 	error = rt->dst.error;
2972 
2973 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2974 		goto nla_put_failure;
2975 
2976 	nlmsg_end(skb, nlh);
2977 	return 0;
2978 
2979 nla_put_failure:
2980 	nlmsg_cancel(skb, nlh);
2981 	return -EMSGSIZE;
2982 }
2983 
2984 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2985 			    struct netlink_callback *cb, u32 table_id,
2986 			    struct fnhe_hash_bucket *bucket, int genid,
2987 			    int *fa_index, int fa_start, unsigned int flags)
2988 {
2989 	int i;
2990 
2991 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2992 		struct fib_nh_exception *fnhe;
2993 
2994 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2995 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2996 			struct rtable *rt;
2997 			int err;
2998 
2999 			if (*fa_index < fa_start)
3000 				goto next;
3001 
3002 			if (fnhe->fnhe_genid != genid)
3003 				goto next;
3004 
3005 			if (fnhe->fnhe_expires &&
3006 			    time_after(jiffies, fnhe->fnhe_expires))
3007 				goto next;
3008 
3009 			rt = rcu_dereference(fnhe->fnhe_rth_input);
3010 			if (!rt)
3011 				rt = rcu_dereference(fnhe->fnhe_rth_output);
3012 			if (!rt)
3013 				goto next;
3014 
3015 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3016 					   table_id, NULL, skb,
3017 					   NETLINK_CB(cb->skb).portid,
3018 					   cb->nlh->nlmsg_seq, flags);
3019 			if (err)
3020 				return err;
3021 next:
3022 			(*fa_index)++;
3023 		}
3024 	}
3025 
3026 	return 0;
3027 }
3028 
3029 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3030 		       u32 table_id, struct fib_info *fi,
3031 		       int *fa_index, int fa_start, unsigned int flags)
3032 {
3033 	struct net *net = sock_net(cb->skb->sk);
3034 	int nhsel, genid = fnhe_genid(net);
3035 
3036 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3037 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3038 		struct fnhe_hash_bucket *bucket;
3039 		int err;
3040 
3041 		if (nhc->nhc_flags & RTNH_F_DEAD)
3042 			continue;
3043 
3044 		rcu_read_lock();
3045 		bucket = rcu_dereference(nhc->nhc_exceptions);
3046 		err = 0;
3047 		if (bucket)
3048 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3049 					       genid, fa_index, fa_start,
3050 					       flags);
3051 		rcu_read_unlock();
3052 		if (err)
3053 			return err;
3054 	}
3055 
3056 	return 0;
3057 }
3058 
3059 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3060 						   u8 ip_proto, __be16 sport,
3061 						   __be16 dport)
3062 {
3063 	struct sk_buff *skb;
3064 	struct iphdr *iph;
3065 
3066 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3067 	if (!skb)
3068 		return NULL;
3069 
3070 	/* Reserve room for dummy headers; this skb can pass
3071 	 * through a good chunk of the routing engine.
3072 	 */
3073 	skb_reset_mac_header(skb);
3074 	skb_reset_network_header(skb);
3075 	skb->protocol = htons(ETH_P_IP);
3076 	iph = skb_put(skb, sizeof(struct iphdr));
3077 	iph->protocol = ip_proto;
3078 	iph->saddr = src;
3079 	iph->daddr = dst;
3080 	iph->version = 0x4;
3081 	iph->frag_off = 0;
3082 	iph->ihl = 0x5;
3083 	skb_set_transport_header(skb, skb->len);
3084 
3085 	switch (iph->protocol) {
3086 	case IPPROTO_UDP: {
3087 		struct udphdr *udph;
3088 
3089 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3090 		udph->source = sport;
3091 		udph->dest = dport;
3092 		udph->len = htons(sizeof(struct udphdr));
3093 		udph->check = 0;
3094 		break;
3095 	}
3096 	case IPPROTO_TCP: {
3097 		struct tcphdr *tcph;
3098 
3099 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3100 		tcph->source	= sport;
3101 		tcph->dest	= dport;
3102 		tcph->doff	= sizeof(struct tcphdr) / 4;
3103 		tcph->rst = 1;
3104 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3105 					    src, dst, 0);
3106 		break;
3107 	}
3108 	case IPPROTO_ICMP: {
3109 		struct icmphdr *icmph;
3110 
3111 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3112 		icmph->type = ICMP_ECHO;
3113 		icmph->code = 0;
3114 	}
3115 	}
3116 
3117 	return skb;
3118 }
3119 
3120 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3121 				       const struct nlmsghdr *nlh,
3122 				       struct nlattr **tb,
3123 				       struct netlink_ext_ack *extack)
3124 {
3125 	struct rtmsg *rtm;
3126 	int i, err;
3127 
3128 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3129 		NL_SET_ERR_MSG(extack,
3130 			       "ipv4: Invalid header for route get request");
3131 		return -EINVAL;
3132 	}
3133 
3134 	if (!netlink_strict_get_check(skb))
3135 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3136 					      rtm_ipv4_policy, extack);
3137 
3138 	rtm = nlmsg_data(nlh);
3139 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3140 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3141 	    rtm->rtm_table || rtm->rtm_protocol ||
3142 	    rtm->rtm_scope || rtm->rtm_type) {
3143 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3144 		return -EINVAL;
3145 	}
3146 
3147 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3148 			       RTM_F_LOOKUP_TABLE |
3149 			       RTM_F_FIB_MATCH)) {
3150 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3151 		return -EINVAL;
3152 	}
3153 
3154 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3155 					    rtm_ipv4_policy, extack);
3156 	if (err)
3157 		return err;
3158 
3159 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3160 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3161 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3162 		return -EINVAL;
3163 	}
3164 
3165 	for (i = 0; i <= RTA_MAX; i++) {
3166 		if (!tb[i])
3167 			continue;
3168 
3169 		switch (i) {
3170 		case RTA_IIF:
3171 		case RTA_OIF:
3172 		case RTA_SRC:
3173 		case RTA_DST:
3174 		case RTA_IP_PROTO:
3175 		case RTA_SPORT:
3176 		case RTA_DPORT:
3177 		case RTA_MARK:
3178 		case RTA_UID:
3179 			break;
3180 		default:
3181 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3182 			return -EINVAL;
3183 		}
3184 	}
3185 
3186 	return 0;
3187 }
3188 
3189 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3190 			     struct netlink_ext_ack *extack)
3191 {
3192 	struct net *net = sock_net(in_skb->sk);
3193 	struct nlattr *tb[RTA_MAX+1];
3194 	u32 table_id = RT_TABLE_MAIN;
3195 	__be16 sport = 0, dport = 0;
3196 	struct fib_result res = {};
3197 	u8 ip_proto = IPPROTO_UDP;
3198 	struct rtable *rt = NULL;
3199 	struct sk_buff *skb;
3200 	struct rtmsg *rtm;
3201 	struct flowi4 fl4 = {};
3202 	__be32 dst = 0;
3203 	__be32 src = 0;
3204 	kuid_t uid;
3205 	u32 iif;
3206 	int err;
3207 	int mark;
3208 
3209 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3210 	if (err < 0)
3211 		return err;
3212 
3213 	rtm = nlmsg_data(nlh);
3214 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3215 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3216 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3217 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3218 	if (tb[RTA_UID])
3219 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3220 	else
3221 		uid = (iif ? INVALID_UID : current_uid());
3222 
3223 	if (tb[RTA_IP_PROTO]) {
3224 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3225 						  &ip_proto, AF_INET, extack);
3226 		if (err)
3227 			return err;
3228 	}
3229 
3230 	if (tb[RTA_SPORT])
3231 		sport = nla_get_be16(tb[RTA_SPORT]);
3232 
3233 	if (tb[RTA_DPORT])
3234 		dport = nla_get_be16(tb[RTA_DPORT]);
3235 
3236 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3237 	if (!skb)
3238 		return -ENOBUFS;
3239 
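	/* The dummy skb carries the requested addresses, protocol and ports so
	 * that the lookups below see the same keys a real packet would, which
	 * matters for FIB rules and multipath hashing that use L4 fields.
	 */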
3240 	fl4.daddr = dst;
3241 	fl4.saddr = src;
3242 	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3243 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3244 	fl4.flowi4_mark = mark;
3245 	fl4.flowi4_uid = uid;
3246 	if (sport)
3247 		fl4.fl4_sport = sport;
3248 	if (dport)
3249 		fl4.fl4_dport = dport;
3250 	fl4.flowi4_proto = ip_proto;
3251 
3252 	rcu_read_lock();
3253 
3254 	if (iif) {
3255 		struct net_device *dev;
3256 
3257 		dev = dev_get_by_index_rcu(net, iif);
3258 		if (!dev) {
3259 			err = -ENODEV;
3260 			goto errout_rcu;
3261 		}
3262 
3263 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3264 		skb->dev	= dev;
3265 		skb->mark	= mark;
3266 		err = ip_route_input_rcu(skb, dst, src,
3267 					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3268 					 &res);
3269 
3270 		rt = skb_rtable(skb);
3271 		if (err == 0 && rt->dst.error)
3272 			err = -rt->dst.error;
3273 	} else {
3274 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3275 		skb->dev = net->loopback_dev;
3276 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3277 		err = 0;
3278 		if (IS_ERR(rt))
3279 			err = PTR_ERR(rt);
3280 		else
3281 			skb_dst_set(skb, &rt->dst);
3282 	}
3283 
3284 	if (err)
3285 		goto errout_rcu;
3286 
3287 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3288 		rt->rt_flags |= RTCF_NOTIFY;
3289 
3290 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3291 		table_id = res.table ? res.table->tb_id : 0;
3292 
3293 	/* reset skb for netlink reply msg */
3294 	skb_trim(skb, 0);
3295 	skb_reset_network_header(skb);
3296 	skb_reset_transport_header(skb);
3297 	skb_reset_mac_header(skb);
3298 
3299 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3300 		struct fib_rt_info fri;
3301 
3302 		if (!res.fi) {
3303 			err = fib_props[res.type].error;
3304 			if (!err)
3305 				err = -EHOSTUNREACH;
3306 			goto errout_rcu;
3307 		}
3308 		fri.fi = res.fi;
3309 		fri.tb_id = table_id;
3310 		fri.dst = res.prefix;
3311 		fri.dst_len = res.prefixlen;
3312 		fri.tos = fl4.flowi4_tos;
3313 		fri.type = rt->rt_type;
3314 		fri.offload = 0;
3315 		fri.trap = 0;
3316 		if (res.fa_head) {
3317 			struct fib_alias *fa;
3318 
3319 			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3320 				u8 slen = 32 - fri.dst_len;
3321 
3322 				if (fa->fa_slen == slen &&
3323 				    fa->tb_id == fri.tb_id &&
3324 				    fa->fa_tos == fri.tos &&
3325 				    fa->fa_info == res.fi &&
3326 				    fa->fa_type == fri.type) {
3327 					fri.offload = fa->offload;
3328 					fri.trap = fa->trap;
3329 					break;
3330 				}
3331 			}
3332 		}
3333 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3334 				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3335 	} else {
3336 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3337 				   NETLINK_CB(in_skb).portid,
3338 				   nlh->nlmsg_seq, 0);
3339 	}
3340 	if (err < 0)
3341 		goto errout_rcu;
3342 
3343 	rcu_read_unlock();
3344 
3345 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3346 
3347 errout_free:
3348 	return err;
3349 errout_rcu:
3350 	rcu_read_unlock();
3351 	kfree_skb(skb);
3352 	goto errout_free;
3353 }
3354 
3355 void ip_rt_multicast_event(struct in_device *in_dev)
3356 {
3357 	rt_cache_flush(dev_net(in_dev->dev));
3358 }
3359 
3360 #ifdef CONFIG_SYSCTL
3361 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3362 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3363 static int ip_rt_gc_elasticity __read_mostly	= 8;
3364 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3365 
3366 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3367 		void *buffer, size_t *lenp, loff_t *ppos)
3368 {
3369 	struct net *net = (struct net *)__ctl->extra1;
3370 
3371 	if (write) {
3372 		rt_cache_flush(net);
3373 		fnhe_genid_bump(net);
3374 		return 0;
3375 	}
3376 
3377 	return -EINVAL;
3378 }
3379 
3380 static struct ctl_table ipv4_route_table[] = {
3381 	{
3382 		.procname	= "gc_thresh",
3383 		.data		= &ipv4_dst_ops.gc_thresh,
3384 		.maxlen		= sizeof(int),
3385 		.mode		= 0644,
3386 		.proc_handler	= proc_dointvec,
3387 	},
3388 	{
3389 		.procname	= "max_size",
3390 		.data		= &ip_rt_max_size,
3391 		.maxlen		= sizeof(int),
3392 		.mode		= 0644,
3393 		.proc_handler	= proc_dointvec,
3394 	},
3395 	{
3396 		/*  Deprecated. Use gc_min_interval_ms */
3397 
3398 		.procname	= "gc_min_interval",
3399 		.data		= &ip_rt_gc_min_interval,
3400 		.maxlen		= sizeof(int),
3401 		.mode		= 0644,
3402 		.proc_handler	= proc_dointvec_jiffies,
3403 	},
3404 	{
3405 		.procname	= "gc_min_interval_ms",
3406 		.data		= &ip_rt_gc_min_interval,
3407 		.maxlen		= sizeof(int),
3408 		.mode		= 0644,
3409 		.proc_handler	= proc_dointvec_ms_jiffies,
3410 	},
3411 	{
3412 		.procname	= "gc_timeout",
3413 		.data		= &ip_rt_gc_timeout,
3414 		.maxlen		= sizeof(int),
3415 		.mode		= 0644,
3416 		.proc_handler	= proc_dointvec_jiffies,
3417 	},
3418 	{
3419 		.procname	= "gc_interval",
3420 		.data		= &ip_rt_gc_interval,
3421 		.maxlen		= sizeof(int),
3422 		.mode		= 0644,
3423 		.proc_handler	= proc_dointvec_jiffies,
3424 	},
3425 	{
3426 		.procname	= "redirect_load",
3427 		.data		= &ip_rt_redirect_load,
3428 		.maxlen		= sizeof(int),
3429 		.mode		= 0644,
3430 		.proc_handler	= proc_dointvec,
3431 	},
3432 	{
3433 		.procname	= "redirect_number",
3434 		.data		= &ip_rt_redirect_number,
3435 		.maxlen		= sizeof(int),
3436 		.mode		= 0644,
3437 		.proc_handler	= proc_dointvec,
3438 	},
3439 	{
3440 		.procname	= "redirect_silence",
3441 		.data		= &ip_rt_redirect_silence,
3442 		.maxlen		= sizeof(int),
3443 		.mode		= 0644,
3444 		.proc_handler	= proc_dointvec,
3445 	},
3446 	{
3447 		.procname	= "error_cost",
3448 		.data		= &ip_rt_error_cost,
3449 		.maxlen		= sizeof(int),
3450 		.mode		= 0644,
3451 		.proc_handler	= proc_dointvec,
3452 	},
3453 	{
3454 		.procname	= "error_burst",
3455 		.data		= &ip_rt_error_burst,
3456 		.maxlen		= sizeof(int),
3457 		.mode		= 0644,
3458 		.proc_handler	= proc_dointvec,
3459 	},
3460 	{
3461 		.procname	= "gc_elasticity",
3462 		.data		= &ip_rt_gc_elasticity,
3463 		.maxlen		= sizeof(int),
3464 		.mode		= 0644,
3465 		.proc_handler	= proc_dointvec,
3466 	},
3467 	{
3468 		.procname	= "mtu_expires",
3469 		.data		= &ip_rt_mtu_expires,
3470 		.maxlen		= sizeof(int),
3471 		.mode		= 0644,
3472 		.proc_handler	= proc_dointvec_jiffies,
3473 	},
3474 	{
3475 		.procname	= "min_pmtu",
3476 		.data		= &ip_rt_min_pmtu,
3477 		.maxlen		= sizeof(int),
3478 		.mode		= 0644,
3479 		.proc_handler	= proc_dointvec_minmax,
3480 		.extra1		= &ip_min_valid_pmtu,
3481 	},
3482 	{
3483 		.procname	= "min_adv_mss",
3484 		.data		= &ip_rt_min_advmss,
3485 		.maxlen		= sizeof(int),
3486 		.mode		= 0644,
3487 		.proc_handler	= proc_dointvec,
3488 	},
3489 	{ }
3490 };
3491 
3492 static const char ipv4_route_flush_procname[] = "flush";
3493 
3494 static struct ctl_table ipv4_route_flush_table[] = {
3495 	{
3496 		.procname	= ipv4_route_flush_procname,
3497 		.maxlen		= sizeof(int),
3498 		.mode		= 0200,
3499 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3500 	},
3501 	{ },
3502 };
3503 
3504 static __net_init int sysctl_route_net_init(struct net *net)
3505 {
3506 	struct ctl_table *tbl;
3507 
3508 	tbl = ipv4_route_flush_table;
3509 	if (!net_eq(net, &init_net)) {
3510 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3511 		if (!tbl)
3512 			goto err_dup;
3513 
3514 		/* Don't export non-whitelisted sysctls to unprivileged users */
3515 		if (net->user_ns != &init_user_ns) {
3516 			if (tbl[0].procname != ipv4_route_flush_procname)
3517 				tbl[0].procname = NULL;
3518 		}
3519 	}
3520 	tbl[0].extra1 = net;
3521 
3522 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3523 	if (!net->ipv4.route_hdr)
3524 		goto err_reg;
3525 	return 0;
3526 
3527 err_reg:
3528 	if (tbl != ipv4_route_flush_table)
3529 		kfree(tbl);
3530 err_dup:
3531 	return -ENOMEM;
3532 }
3533 
3534 static __net_exit void sysctl_route_net_exit(struct net *net)
3535 {
3536 	struct ctl_table *tbl;
3537 
3538 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3539 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3540 	BUG_ON(tbl == ipv4_route_flush_table);
3541 	kfree(tbl);
3542 }
3543 
3544 static __net_initdata struct pernet_operations sysctl_route_ops = {
3545 	.init = sysctl_route_net_init,
3546 	.exit = sysctl_route_net_exit,
3547 };
3548 #endif
3549 
3550 static __net_init int rt_genid_init(struct net *net)
3551 {
3552 	atomic_set(&net->ipv4.rt_genid, 0);
3553 	atomic_set(&net->fnhe_genid, 0);
3554 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3555 	return 0;
3556 }
3557 
3558 static __net_initdata struct pernet_operations rt_genid_ops = {
3559 	.init = rt_genid_init,
3560 };
3561 
3562 static int __net_init ipv4_inetpeer_init(struct net *net)
3563 {
3564 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3565 
3566 	if (!bp)
3567 		return -ENOMEM;
3568 	inet_peer_base_init(bp);
3569 	net->ipv4.peers = bp;
3570 	return 0;
3571 }
3572 
3573 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3574 {
3575 	struct inet_peer_base *bp = net->ipv4.peers;
3576 
3577 	net->ipv4.peers = NULL;
3578 	inetpeer_invalidate_tree(bp);
3579 	kfree(bp);
3580 }
3581 
3582 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3583 	.init	=	ipv4_inetpeer_init,
3584 	.exit	=	ipv4_inetpeer_exit,
3585 };
3586 
3587 #ifdef CONFIG_IP_ROUTE_CLASSID
3588 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3589 #endif /* CONFIG_IP_ROUTE_CLASSID */
3590 
3591 int __init ip_rt_init(void)
3592 {
3593 	void *idents_hash;
3594 	int cpu;
3595 
3596 	/* For modern hosts, this will use 2 MB of memory */
3597 	idents_hash = alloc_large_system_hash("IP idents",
3598 					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3599 					      0,
3600 					      16, /* one bucket per 64 KB */
3601 					      HASH_ZERO,
3602 					      NULL,
3603 					      &ip_idents_mask,
3604 					      2048,
3605 					      256*1024);
3606 
3607 	ip_idents = idents_hash;
3608 
3609 	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3610 
3611 	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3612 
3613 	for_each_possible_cpu(cpu) {
3614 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3615 
3616 		INIT_LIST_HEAD(&ul->head);
3617 		spin_lock_init(&ul->lock);
3618 	}
3619 #ifdef CONFIG_IP_ROUTE_CLASSID
3620 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3621 	if (!ip_rt_acct)
3622 		panic("IP: failed to allocate ip_rt_acct\n");
3623 #endif
3624 
3625 	ipv4_dst_ops.kmem_cachep =
3626 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3627 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3628 
3629 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3630 
3631 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3632 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3633 
3634 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3635 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3636 
3637 	ipv4_dst_ops.gc_thresh = ~0;
3638 	ip_rt_max_size = INT_MAX;
3639 
3640 	devinet_init();
3641 	ip_fib_init();
3642 
3643 	if (ip_rt_proc_init())
3644 		pr_err("Unable to create route proc files\n");
3645 #ifdef CONFIG_XFRM
3646 	xfrm_init();
3647 	xfrm4_init();
3648 #endif
3649 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3650 		      RTNL_FLAG_DOIT_UNLOCKED);
3651 
3652 #ifdef CONFIG_SYSCTL
3653 	register_pernet_subsys(&sysctl_route_ops);
3654 #endif
3655 	register_pernet_subsys(&rt_genid_ops);
3656 	register_pernet_subsys(&ipv4_inetpeer_ops);
3657 	return 0;
3658 }
3659 
3660 #ifdef CONFIG_SYSCTL
3661 /*
3662  * We really need to sanitize the damn ipv4 init order, then all
3663  * this nonsense will go away.
3664  */
3665 void __init ip_static_sysctl_init(void)
3666 {
3667 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3668 }
3669 #endif
3670