1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
59 */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113
114 #include "fib_lookup.h"
115
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
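/* Tunables controlling ICMP redirect and error rate limiting, PMTU expiry
 * and route GC. In mainline kernels of this vintage these defaults are also
 * exposed through the net.ipv4.route.* sysctls; time-based values are in
 * jiffies, hence the HZ factors.
 */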
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly = 256;
130
131 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132
133 /*
134 * Interface to generic destination cache.
135 */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu,
144 bool confirm_neigh);
145 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 struct sk_buff *skb);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 WARN_ON(1);
152 return NULL;
153 }
154
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 struct sk_buff *skb,
157 const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159
160 static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .check = ipv4_dst_check,
163 .default_advmss = ipv4_default_advmss,
164 .mtu = ipv4_mtu,
165 .cow_metrics = ipv4_cow_metrics,
166 .destroy = ipv4_dst_destroy,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
170 .redirect = ip_do_redirect,
171 .local_out = __ip_local_out,
172 .neigh_lookup = ipv4_neigh_lookup,
173 .confirm_neigh = ipv4_confirm_neigh,
174 };
175
176 #define ECN_OR_COST(class) TC_PRIO_##class
177
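/* Map of the four IPv4 TOS bits to a packet priority / traffic class.
 * Callers typically index it as ip_tos2prio[IPTOS_TOS(tos) >> 1] (see
 * rt_tos2priority() in <net/route.h>), which is why it has 16 entries.
 */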
178 const __u8 ip_tos2prio[16] = {
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BESTEFFORT,
182 ECN_OR_COST(BESTEFFORT),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_BULK,
186 ECN_OR_COST(BULK),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE,
190 ECN_OR_COST(INTERACTIVE),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK),
193 TC_PRIO_INTERACTIVE_BULK,
194 ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
197
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 if (*pos)
205 return NULL;
206 return SEQ_START_TOKEN;
207 }
208
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 ++*pos;
212 return NULL;
213 }
214
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 if (v == SEQ_START_TOKEN)
222 seq_printf(seq, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 "HHUptod\tSpecDst");
226 return 0;
227 }
228
229 static const struct seq_operations rt_cache_seq_ops = {
230 .start = rt_cache_seq_start,
231 .next = rt_cache_seq_next,
232 .stop = rt_cache_seq_stop,
233 .show = rt_cache_seq_show,
234 };
235
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 return seq_open(file, &rt_cache_seq_ops);
239 }
240
241 static const struct proc_ops rt_cache_proc_ops = {
242 .proc_open = rt_cache_seq_open,
243 .proc_read = seq_read,
244 .proc_lseek = seq_lseek,
245 .proc_release = seq_release,
246 };
247
248
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251 int cpu;
252
253 if (*pos == 0)
254 return SEQ_START_TOKEN;
255
256 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 if (!cpu_possible(cpu))
258 continue;
259 *pos = cpu+1;
260 return &per_cpu(rt_cache_stat, cpu);
261 }
262 return NULL;
263 }
264
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267 int cpu;
268
269 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 if (!cpu_possible(cpu))
271 continue;
272 *pos = cpu+1;
273 return &per_cpu(rt_cache_stat, cpu);
274 }
275 (*pos)++;
276 return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 struct rt_cache_stat *st = v;
288
289 if (v == SEQ_START_TOKEN) {
290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 return 0;
292 }
293
294 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 dst_entries_get_slow(&ipv4_dst_ops),
297 0, /* st->in_hit */
298 st->in_slow_tot,
299 st->in_slow_mc,
300 st->in_no_route,
301 st->in_brd,
302 st->in_martian_dst,
303 st->in_martian_src,
304
305 0, /* st->out_hit */
306 st->out_slow_tot,
307 st->out_slow_mc,
308
309 0, /* st->gc_total */
310 0, /* st->gc_ignored */
311 0, /* st->gc_goal_miss */
312 0, /* st->gc_dst_overflow */
313 0, /* st->in_hlist_search */
314 0 /* st->out_hlist_search */
315 );
316 return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct proc_ops rt_cpu_proc_ops = {
333 .proc_open = rt_cpu_seq_open,
334 .proc_read = seq_read,
335 .proc_lseek = seq_lseek,
336 .proc_release = seq_release,
337 };
338
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 struct ip_rt_acct *dst, *src;
343 unsigned int i, j;
344
345 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 if (!dst)
347 return -ENOMEM;
348
349 for_each_possible_cpu(i) {
350 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 for (j = 0; j < 256; j++) {
352 dst[j].o_bytes += src[j].o_bytes;
353 dst[j].o_packets += src[j].o_packets;
354 dst[j].i_bytes += src[j].i_bytes;
355 dst[j].i_packets += src[j].i_packets;
356 }
357 }
358
359 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 kfree(dst);
361 return 0;
362 }
363 #endif
364
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 struct proc_dir_entry *pde;
368
369 pde = proc_create("rt_cache", 0444, net->proc_net,
370 &rt_cache_proc_ops);
371 if (!pde)
372 goto err1;
373
374 pde = proc_create("rt_cache", 0444,
375 net->proc_net_stat, &rt_cpu_proc_ops);
376 if (!pde)
377 goto err2;
378
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 pde = proc_create_single("rt_acct", 0, net->proc_net,
381 rt_acct_proc_show);
382 if (!pde)
383 goto err3;
384 #endif
385 return 0;
386
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 return -ENOMEM;
395 }
396
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 remove_proc_entry("rt_cache", net->proc_net_stat);
400 remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405
406 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
407 .init = ip_rt_do_proc_init,
408 .exit = ip_rt_do_proc_exit,
409 };
410
411 static int __init ip_rt_proc_init(void)
412 {
413 return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427
428 void rt_cache_flush(struct net *net)
429 {
430 rt_genid_bump_ipv4(net);
431 }
432
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 struct sk_buff *skb,
435 const void *daddr)
436 {
437 const struct rtable *rt = container_of(dst, struct rtable, dst);
438 struct net_device *dev = dst->dev;
439 struct neighbour *n;
440
441 rcu_read_lock_bh();
442
443 if (likely(rt->rt_gw_family == AF_INET)) {
444 n = ip_neigh_gw4(dev, rt->rt_gw4);
445 } else if (rt->rt_gw_family == AF_INET6) {
446 n = ip_neigh_gw6(dev, &rt->rt_gw6);
447 } else {
448 __be32 pkey;
449
450 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 n = ip_neigh_gw4(dev, pkey);
452 }
453
454 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 n = NULL;
456
457 rcu_read_unlock_bh();
458
459 return n;
460 }
461
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464 const struct rtable *rt = container_of(dst, struct rtable, dst);
465 struct net_device *dev = dst->dev;
466 const __be32 *pkey = daddr;
467
468 if (rt->rt_gw_family == AF_INET) {
469 pkey = (const __be32 *)&rt->rt_gw4;
470 } else if (rt->rt_gw_family == AF_INET6) {
471 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 } else if (!daddr ||
473 (rt->rt_flags &
474 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475 return;
476 }
477 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479
480 /* Hash tables of size 2048..262144 depending on RAM size.
481 * Each bucket uses 8 bytes.
482 */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486
487 /* In order to protect privacy, we add a perturbation to identifiers
488 * if one generator is seldom used. This makes it hard for an attacker
489 * to infer how many packets were sent between two points in time.
490 */
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493 u32 bucket, old, now = (u32)jiffies;
494 atomic_t *p_id;
495 u32 *p_tstamp;
496 u32 delta = 0;
497
498 bucket = hash & ip_idents_mask;
499 p_tstamp = ip_tstamps + bucket;
500 p_id = ip_idents + bucket;
501 old = READ_ONCE(*p_tstamp);
502
503 if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 delta = prandom_u32_max(now - old);
505
506 /* If UBSAN reports an error there, please make sure your compiler
507 * supports -fno-strict-overflow before reporting it; that was a bug
508 * in UBSAN, and it has been fixed in GCC-8.
509 */
510 return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
513
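/* Pick an IP ID for iph (covering 'segs' GSO segments). This is normally
 * reached via ip_select_ident()/ip_select_ident_segs() only when an ID
 * actually has to be generated; the siphash over (daddr, saddr, protocol)
 * spreads flows over the ip_idents buckets above.
 */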
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516 u32 hash, id;
517
518 /* Note the following code is not safe, but this is okay. */
519 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 get_random_bytes(&net->ipv4.ip_id_key,
521 sizeof(net->ipv4.ip_id_key));
522
523 hash = siphash_3u32((__force u32)iph->daddr,
524 (__force u32)iph->saddr,
525 iph->protocol,
526 &net->ipv4.ip_id_key);
527 id = ip_idents_reserve(hash, segs);
528 iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
531
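/* The legacy RTO_ONLINK flag travels piggy-backed in the flow's tos field;
 * split it out here: keep only the real TOS bits in flowi4_tos and turn
 * RTO_ONLINK into link scope for the FIB lookup.
 */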
532 static void ip_rt_fix_tos(struct flowi4 *fl4)
533 {
534 __u8 tos = RT_FL_TOS(fl4);
535
536 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
537 fl4->flowi4_scope = tos & RTO_ONLINK ?
538 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
539 }
540
541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
542 const struct sock *sk,
543 const struct iphdr *iph,
544 int oif, u8 tos,
545 u8 prot, u32 mark, int flow_flags)
546 {
547 if (sk) {
548 const struct inet_sock *inet = inet_sk(sk);
549
550 oif = sk->sk_bound_dev_if;
551 mark = sk->sk_mark;
552 tos = RT_CONN_FLAGS(sk);
553 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
554 }
555 flowi4_init_output(fl4, oif, mark, tos,
556 RT_SCOPE_UNIVERSE, prot,
557 flow_flags,
558 iph->daddr, iph->saddr, 0, 0,
559 sock_net_uid(net, sk));
560 }
561
562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
563 const struct sock *sk)
564 {
565 const struct net *net = dev_net(skb->dev);
566 const struct iphdr *iph = ip_hdr(skb);
567 int oif = skb->dev->ifindex;
568 u8 tos = RT_TOS(iph->tos);
569 u8 prot = iph->protocol;
570 u32 mark = skb->mark;
571
572 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
573 }
574
575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
576 {
577 const struct inet_sock *inet = inet_sk(sk);
578 const struct ip_options_rcu *inet_opt;
579 __be32 daddr = inet->inet_daddr;
580
581 rcu_read_lock();
582 inet_opt = rcu_dereference(inet->inet_opt);
583 if (inet_opt && inet_opt->opt.srr)
584 daddr = inet_opt->opt.faddr;
585 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
586 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
587 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
588 inet_sk_flowi_flags(sk),
589 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
590 rcu_read_unlock();
591 }
592
593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
594 const struct sk_buff *skb)
595 {
596 if (skb)
597 build_skb_flow_key(fl4, skb, sk);
598 else
599 build_sk_flow_key(fl4, sk);
600 }
601
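/* fnhe_lock serializes all writers of the per-nexthop exception hash
 * (update_or_create_fnhe(), ip_del_fnhe(), rt_bind_exception()); readers
 * such as find_exception() walk the chains locklessly under RCU.
 */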
602 static DEFINE_SPINLOCK(fnhe_lock);
603
604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
605 {
606 struct rtable *rt;
607
608 rt = rcu_dereference(fnhe->fnhe_rth_input);
609 if (rt) {
610 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
611 dst_dev_put(&rt->dst);
612 dst_release(&rt->dst);
613 }
614 rt = rcu_dereference(fnhe->fnhe_rth_output);
615 if (rt) {
616 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
617 dst_dev_put(&rt->dst);
618 dst_release(&rt->dst);
619 }
620 }
621
622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
623 {
624 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
625 struct fib_nh_exception *fnhe, *oldest = NULL;
626
627 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
628 fnhe = rcu_dereference_protected(*fnhe_p,
629 lockdep_is_held(&fnhe_lock));
630 if (!fnhe)
631 break;
632 if (!oldest ||
633 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
634 oldest = fnhe;
635 oldest_p = fnhe_p;
636 }
637 }
638 fnhe_flush_routes(oldest);
639 *oldest_p = oldest->fnhe_next;
640 kfree_rcu(oldest, rcu);
641 }
642
643 static u32 fnhe_hashfun(__be32 daddr)
644 {
645 static siphash_key_t fnhe_hash_key __read_mostly;
646 u64 hval;
647
648 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
649 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
650 return hash_64(hval, FNHE_HASH_SHIFT);
651 }
652
653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
654 {
655 rt->rt_pmtu = fnhe->fnhe_pmtu;
656 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
657 rt->dst.expires = fnhe->fnhe_expires;
658
659 if (fnhe->fnhe_gw) {
660 rt->rt_flags |= RTCF_REDIRECTED;
661 rt->rt_uses_gateway = 1;
662 rt->rt_gw_family = AF_INET;
663 rt->rt_gw4 = fnhe->fnhe_gw;
664 }
665 }
666
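/* Record (or refresh) a per-destination exception on a nexthop: a gateway
 * learned from an ICMP redirect and/or a path MTU from PMTU discovery,
 * with an expiry. Cached routes hanging off the exception are updated, and
 * the nexthop's cached routes are marked DST_OBSOLETE_KILL so users
 * re-validate and pick up the new exception.
 */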
667 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
668 __be32 gw, u32 pmtu, bool lock,
669 unsigned long expires)
670 {
671 struct fnhe_hash_bucket *hash;
672 struct fib_nh_exception *fnhe;
673 struct rtable *rt;
674 u32 genid, hval;
675 unsigned int i;
676 int depth;
677
678 genid = fnhe_genid(dev_net(nhc->nhc_dev));
679 hval = fnhe_hashfun(daddr);
680
681 spin_lock_bh(&fnhe_lock);
682
683 hash = rcu_dereference(nhc->nhc_exceptions);
684 if (!hash) {
685 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
686 if (!hash)
687 goto out_unlock;
688 rcu_assign_pointer(nhc->nhc_exceptions, hash);
689 }
690
691 hash += hval;
692
693 depth = 0;
694 for (fnhe = rcu_dereference(hash->chain); fnhe;
695 fnhe = rcu_dereference(fnhe->fnhe_next)) {
696 if (fnhe->fnhe_daddr == daddr)
697 break;
698 depth++;
699 }
700
701 if (fnhe) {
702 if (fnhe->fnhe_genid != genid)
703 fnhe->fnhe_genid = genid;
704 if (gw)
705 fnhe->fnhe_gw = gw;
706 if (pmtu) {
707 fnhe->fnhe_pmtu = pmtu;
708 fnhe->fnhe_mtu_locked = lock;
709 }
710 fnhe->fnhe_expires = max(1UL, expires);
711 /* Update all cached dsts too */
712 rt = rcu_dereference(fnhe->fnhe_rth_input);
713 if (rt)
714 fill_route_from_fnhe(rt, fnhe);
715 rt = rcu_dereference(fnhe->fnhe_rth_output);
716 if (rt)
717 fill_route_from_fnhe(rt, fnhe);
718 } else {
719 /* Randomize max depth to avoid some side-channel attacks. */
720 int max_depth = FNHE_RECLAIM_DEPTH +
721 prandom_u32_max(FNHE_RECLAIM_DEPTH);
722
723 while (depth > max_depth) {
724 fnhe_remove_oldest(hash);
725 depth--;
726 }
727
728 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
729 if (!fnhe)
730 goto out_unlock;
731
732 fnhe->fnhe_next = hash->chain;
733
734 fnhe->fnhe_genid = genid;
735 fnhe->fnhe_daddr = daddr;
736 fnhe->fnhe_gw = gw;
737 fnhe->fnhe_pmtu = pmtu;
738 fnhe->fnhe_mtu_locked = lock;
739 fnhe->fnhe_expires = max(1UL, expires);
740
741 rcu_assign_pointer(hash->chain, fnhe);
742
743 /* Exception created; mark the cached routes for the nexthop
744 * stale, so anyone caching it rechecks if this exception
745 * applies to them.
746 */
747 rt = rcu_dereference(nhc->nhc_rth_input);
748 if (rt)
749 rt->dst.obsolete = DST_OBSOLETE_KILL;
750
751 for_each_possible_cpu(i) {
752 struct rtable __rcu **prt;
753 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
754 rt = rcu_dereference(*prt);
755 if (rt)
756 rt->dst.obsolete = DST_OBSOLETE_KILL;
757 }
758 }
759
760 fnhe->fnhe_stamp = jiffies;
761
762 out_unlock:
763 spin_unlock_bh(&fnhe_lock);
764 }
765
766 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
767 bool kill_route)
768 {
769 __be32 new_gw = icmp_hdr(skb)->un.gateway;
770 __be32 old_gw = ip_hdr(skb)->saddr;
771 struct net_device *dev = skb->dev;
772 struct in_device *in_dev;
773 struct fib_result res;
774 struct neighbour *n;
775 struct net *net;
776
777 switch (icmp_hdr(skb)->code & 7) {
778 case ICMP_REDIR_NET:
779 case ICMP_REDIR_NETTOS:
780 case ICMP_REDIR_HOST:
781 case ICMP_REDIR_HOSTTOS:
782 break;
783
784 default:
785 return;
786 }
787
788 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
789 return;
790
791 in_dev = __in_dev_get_rcu(dev);
792 if (!in_dev)
793 return;
794
795 net = dev_net(dev);
796 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
797 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
798 ipv4_is_zeronet(new_gw))
799 goto reject_redirect;
800
801 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
802 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
803 goto reject_redirect;
804 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
805 goto reject_redirect;
806 } else {
807 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
808 goto reject_redirect;
809 }
810
811 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
812 if (!n)
813 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
814 if (!IS_ERR(n)) {
815 if (!(n->nud_state & NUD_VALID)) {
816 neigh_event_send(n, NULL);
817 } else {
818 if (fib_lookup(net, fl4, &res, 0) == 0) {
819 struct fib_nh_common *nhc;
820
821 fib_select_path(net, &res, fl4, skb);
822 nhc = FIB_RES_NHC(res);
823 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
824 0, false,
825 jiffies + ip_rt_gc_timeout);
826 }
827 if (kill_route)
828 rt->dst.obsolete = DST_OBSOLETE_KILL;
829 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
830 }
831 neigh_release(n);
832 }
833 return;
834
835 reject_redirect:
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837 if (IN_DEV_LOG_MARTIANS(in_dev)) {
838 const struct iphdr *iph = (const struct iphdr *) skb->data;
839 __be32 daddr = iph->daddr;
840 __be32 saddr = iph->saddr;
841
842 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
843 " Advised path = %pI4 -> %pI4\n",
844 &old_gw, dev->name, &new_gw,
845 &saddr, &daddr);
846 }
847 #endif
848 ;
849 }
850
851 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
852 {
853 struct rtable *rt;
854 struct flowi4 fl4;
855 const struct iphdr *iph = (const struct iphdr *) skb->data;
856 struct net *net = dev_net(skb->dev);
857 int oif = skb->dev->ifindex;
858 u8 tos = RT_TOS(iph->tos);
859 u8 prot = iph->protocol;
860 u32 mark = skb->mark;
861
862 rt = (struct rtable *) dst;
863
864 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
865 ip_rt_fix_tos(&fl4);
866 __ip_do_redirect(rt, skb, &fl4, true);
867 }
868
869 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
870 {
871 struct rtable *rt = (struct rtable *)dst;
872 struct dst_entry *ret = dst;
873
874 if (rt) {
875 if (dst->obsolete > 0) {
876 ip_rt_put(rt);
877 ret = NULL;
878 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
879 rt->dst.expires) {
880 ip_rt_put(rt);
881 ret = NULL;
882 }
883 }
884 return ret;
885 }
886
887 /*
888 * Algorithm:
889 * 1. The first ip_rt_redirect_number redirects are sent
890 * with exponential backoff, then we stop sending them at all,
891 * assuming that the host ignores our redirects.
892 * 2. If we did not see packets requiring redirects
893 * during ip_rt_redirect_silence, we assume that the host
894 * forgot redirected route and start to send redirects again.
895 *
896 * This algorithm is much cheaper and more intelligent than dumb load limiting
897 * in icmp.c.
898 *
899 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
900 * and "frag. need" (breaks PMTU discovery) in icmp.c.
901 */
902
903 void ip_rt_send_redirect(struct sk_buff *skb)
904 {
905 struct rtable *rt = skb_rtable(skb);
906 struct in_device *in_dev;
907 struct inet_peer *peer;
908 struct net *net;
909 int log_martians;
910 int vif;
911
912 rcu_read_lock();
913 in_dev = __in_dev_get_rcu(rt->dst.dev);
914 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
915 rcu_read_unlock();
916 return;
917 }
918 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
919 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
920 rcu_read_unlock();
921
922 net = dev_net(rt->dst.dev);
923 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
924 if (!peer) {
925 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
926 rt_nexthop(rt, ip_hdr(skb)->daddr));
927 return;
928 }
929
930 /* No redirected packets during ip_rt_redirect_silence;
931 * reset the algorithm.
932 */
933 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
934 peer->rate_tokens = 0;
935 peer->n_redirects = 0;
936 }
937
938 /* Too many ignored redirects; do not send anything and
939 * set dst.rate_last to the last seen redirected packet.
940 */
941 if (peer->n_redirects >= ip_rt_redirect_number) {
942 peer->rate_last = jiffies;
943 goto out_put_peer;
944 }
945
946 /* Check for load limit; set rate_last to the latest sent
947 * redirect.
948 */
949 if (peer->n_redirects == 0 ||
950 time_after(jiffies,
951 (peer->rate_last +
952 (ip_rt_redirect_load << peer->n_redirects)))) {
953 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
954
955 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
956 peer->rate_last = jiffies;
957 ++peer->n_redirects;
958 #ifdef CONFIG_IP_ROUTE_VERBOSE
959 if (log_martians &&
960 peer->n_redirects == ip_rt_redirect_number)
961 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
962 &ip_hdr(skb)->saddr, inet_iif(skb),
963 &ip_hdr(skb)->daddr, &gw);
964 #endif
965 }
966 out_put_peer:
967 inet_putpeer(peer);
968 }
969
970 static int ip_error(struct sk_buff *skb)
971 {
972 struct rtable *rt = skb_rtable(skb);
973 struct net_device *dev = skb->dev;
974 struct in_device *in_dev;
975 struct inet_peer *peer;
976 unsigned long now;
977 struct net *net;
978 bool send;
979 int code;
980
981 if (netif_is_l3_master(skb->dev)) {
982 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
983 if (!dev)
984 goto out;
985 }
986
987 in_dev = __in_dev_get_rcu(dev);
988
989 /* IP on this device is disabled. */
990 if (!in_dev)
991 goto out;
992
993 net = dev_net(rt->dst.dev);
994 if (!IN_DEV_FORWARD(in_dev)) {
995 switch (rt->dst.error) {
996 case EHOSTUNREACH:
997 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
998 break;
999
1000 case ENETUNREACH:
1001 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1002 break;
1003 }
1004 goto out;
1005 }
1006
1007 switch (rt->dst.error) {
1008 case EINVAL:
1009 default:
1010 goto out;
1011 case EHOSTUNREACH:
1012 code = ICMP_HOST_UNREACH;
1013 break;
1014 case ENETUNREACH:
1015 code = ICMP_NET_UNREACH;
1016 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1017 break;
1018 case EACCES:
1019 code = ICMP_PKT_FILTERED;
1020 break;
1021 }
1022
1023 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1024 l3mdev_master_ifindex(skb->dev), 1);
1025
1026 send = true;
1027 if (peer) {
1028 now = jiffies;
1029 peer->rate_tokens += now - peer->rate_last;
1030 if (peer->rate_tokens > ip_rt_error_burst)
1031 peer->rate_tokens = ip_rt_error_burst;
1032 peer->rate_last = now;
1033 if (peer->rate_tokens >= ip_rt_error_cost)
1034 peer->rate_tokens -= ip_rt_error_cost;
1035 else
1036 send = false;
1037 inet_putpeer(peer);
1038 }
1039 if (send)
1040 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1041
1042 out: kfree_skb(skb);
1043 return 0;
1044 }
1045
1046 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1047 {
1048 struct dst_entry *dst = &rt->dst;
1049 struct net *net = dev_net(dst->dev);
1050 struct fib_result res;
1051 bool lock = false;
1052 u32 old_mtu;
1053
1054 if (ip_mtu_locked(dst))
1055 return;
1056
1057 old_mtu = ipv4_mtu(dst);
1058 if (old_mtu < mtu)
1059 return;
1060
1061 if (mtu < ip_rt_min_pmtu) {
1062 lock = true;
1063 mtu = min(old_mtu, ip_rt_min_pmtu);
1064 }
1065
1066 if (rt->rt_pmtu == mtu && !lock &&
1067 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1068 return;
1069
1070 rcu_read_lock();
1071 if (fib_lookup(net, fl4, &res, 0) == 0) {
1072 struct fib_nh_common *nhc;
1073
1074 fib_select_path(net, &res, fl4, NULL);
1075 nhc = FIB_RES_NHC(res);
1076 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1077 jiffies + ip_rt_mtu_expires);
1078 }
1079 rcu_read_unlock();
1080 }
1081
1082 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1083 struct sk_buff *skb, u32 mtu,
1084 bool confirm_neigh)
1085 {
1086 struct rtable *rt = (struct rtable *) dst;
1087 struct flowi4 fl4;
1088
1089 ip_rt_build_flow_key(&fl4, sk, skb);
1090 ip_rt_fix_tos(&fl4);
1091
1092 /* Don't make lookup fail for bridged encapsulations */
1093 if (skb && netif_is_any_bridge_port(skb->dev))
1094 fl4.flowi4_oif = 0;
1095
1096 __ip_rt_update_pmtu(rt, &fl4, mtu);
1097 }
1098
1099 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1100 int oif, u8 protocol)
1101 {
1102 const struct iphdr *iph = (const struct iphdr *)skb->data;
1103 struct flowi4 fl4;
1104 struct rtable *rt;
1105 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1106
1107 __build_flow_key(net, &fl4, NULL, iph, oif,
1108 RT_TOS(iph->tos), protocol, mark, 0);
1109 rt = __ip_route_output_key(net, &fl4);
1110 if (!IS_ERR(rt)) {
1111 __ip_rt_update_pmtu(rt, &fl4, mtu);
1112 ip_rt_put(rt);
1113 }
1114 }
1115 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1116
1117 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1118 {
1119 const struct iphdr *iph = (const struct iphdr *)skb->data;
1120 struct flowi4 fl4;
1121 struct rtable *rt;
1122
1123 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1124
1125 if (!fl4.flowi4_mark)
1126 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1127
1128 rt = __ip_route_output_key(sock_net(sk), &fl4);
1129 if (!IS_ERR(rt)) {
1130 __ip_rt_update_pmtu(rt, &fl4, mtu);
1131 ip_rt_put(rt);
1132 }
1133 }
1134
1135 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1136 {
1137 const struct iphdr *iph = (const struct iphdr *)skb->data;
1138 struct flowi4 fl4;
1139 struct rtable *rt;
1140 struct dst_entry *odst = NULL;
1141 bool new = false;
1142 struct net *net = sock_net(sk);
1143
1144 bh_lock_sock(sk);
1145
1146 if (!ip_sk_accept_pmtu(sk))
1147 goto out;
1148
1149 odst = sk_dst_get(sk);
1150
1151 if (sock_owned_by_user(sk) || !odst) {
1152 __ipv4_sk_update_pmtu(skb, sk, mtu);
1153 goto out;
1154 }
1155
1156 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1157
1158 rt = (struct rtable *)odst;
1159 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1160 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1161 if (IS_ERR(rt))
1162 goto out;
1163
1164 new = true;
1165 } else {
1166 ip_rt_fix_tos(&fl4);
1167 }
1168
1169 __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1170
1171 if (!dst_check(&rt->dst, 0)) {
1172 if (new)
1173 dst_release(&rt->dst);
1174
1175 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1176 if (IS_ERR(rt))
1177 goto out;
1178
1179 new = true;
1180 }
1181
1182 if (new)
1183 sk_dst_set(sk, &rt->dst);
1184
1185 out:
1186 bh_unlock_sock(sk);
1187 dst_release(odst);
1188 }
1189 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1190
1191 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1192 int oif, u8 protocol)
1193 {
1194 const struct iphdr *iph = (const struct iphdr *)skb->data;
1195 struct flowi4 fl4;
1196 struct rtable *rt;
1197
1198 __build_flow_key(net, &fl4, NULL, iph, oif,
1199 RT_TOS(iph->tos), protocol, 0, 0);
1200 rt = __ip_route_output_key(net, &fl4);
1201 if (!IS_ERR(rt)) {
1202 __ip_do_redirect(rt, skb, &fl4, false);
1203 ip_rt_put(rt);
1204 }
1205 }
1206 EXPORT_SYMBOL_GPL(ipv4_redirect);
1207
1208 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1209 {
1210 const struct iphdr *iph = (const struct iphdr *)skb->data;
1211 struct flowi4 fl4;
1212 struct rtable *rt;
1213 struct net *net = sock_net(sk);
1214
1215 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1216 rt = __ip_route_output_key(net, &fl4);
1217 if (!IS_ERR(rt)) {
1218 __ip_do_redirect(rt, skb, &fl4, false);
1219 ip_rt_put(rt);
1220 }
1221 }
1222 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1223
1224 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1225 {
1226 struct rtable *rt = (struct rtable *) dst;
1227
1228 /* All IPV4 dsts are created with ->obsolete set to the value
1229 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1230 * into this function always.
1231 *
1232 * When a PMTU/redirect information update invalidates a route,
1233 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1234 * DST_OBSOLETE_DEAD.
1235 */
1236 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1237 return NULL;
1238 return dst;
1239 }
1240
1241 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1242 {
1243 struct ip_options opt;
1244 int res;
1245
1246 /* Recompile ip options since IPCB may not be valid anymore.
1247 * Also check we have a reasonable ipv4 header.
1248 */
1249 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1250 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1251 return;
1252
1253 memset(&opt, 0, sizeof(opt));
1254 if (ip_hdr(skb)->ihl > 5) {
1255 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1256 return;
1257 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1258
1259 rcu_read_lock();
1260 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1261 rcu_read_unlock();
1262
1263 if (res)
1264 return;
1265 }
1266 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1267 }
1268
1269 static void ipv4_link_failure(struct sk_buff *skb)
1270 {
1271 struct rtable *rt;
1272
1273 ipv4_send_dest_unreach(skb);
1274
1275 rt = skb_rtable(skb);
1276 if (rt)
1277 dst_set_expires(&rt->dst, 0);
1278 }
1279
1280 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1281 {
1282 pr_debug("%s: %pI4 -> %pI4, %s\n",
1283 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1284 skb->dev ? skb->dev->name : "?");
1285 kfree_skb(skb);
1286 WARN_ON(1);
1287 return 0;
1288 }
1289
1290 /*
1291 We do not cache source address of outgoing interface,
1292 because it is used only by IP RR, TS and SRR options,
1293 so that it is out of the fast path.
1294
1295 BTW remember: "addr" is allowed to be not aligned
1296 in IP options!
1297 */
1298
1299 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1300 {
1301 __be32 src;
1302
1303 if (rt_is_output_route(rt))
1304 src = ip_hdr(skb)->saddr;
1305 else {
1306 struct fib_result res;
1307 struct iphdr *iph = ip_hdr(skb);
1308 struct flowi4 fl4 = {
1309 .daddr = iph->daddr,
1310 .saddr = iph->saddr,
1311 .flowi4_tos = RT_TOS(iph->tos),
1312 .flowi4_oif = rt->dst.dev->ifindex,
1313 .flowi4_iif = skb->dev->ifindex,
1314 .flowi4_mark = skb->mark,
1315 };
1316
1317 rcu_read_lock();
1318 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1319 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1320 else
1321 src = inet_select_addr(rt->dst.dev,
1322 rt_nexthop(rt, iph->daddr),
1323 RT_SCOPE_UNIVERSE);
1324 rcu_read_unlock();
1325 }
1326 memcpy(addr, &src, 4);
1327 }
1328
1329 #ifdef CONFIG_IP_ROUTE_CLASSID
1330 static void set_class_tag(struct rtable *rt, u32 tag)
1331 {
1332 if (!(rt->dst.tclassid & 0xFFFF))
1333 rt->dst.tclassid |= tag & 0xFFFF;
1334 if (!(rt->dst.tclassid & 0xFFFF0000))
1335 rt->dst.tclassid |= tag & 0xFFFF0000;
1336 }
1337 #endif
1338
1339 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1340 {
1341 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1342 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1343 ip_rt_min_advmss);
1344
1345 return min(advmss, IPV4_MAX_PMTU - header_size);
1346 }
1347
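/* Effective MTU of a route: a still-valid learned PMTU wins, then the
 * RTAX_MTU metric, then the device MTU (forced down to 576 for locked
 * routes through a gateway), all capped at IP_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */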
1348 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1349 {
1350 const struct rtable *rt = (const struct rtable *)dst;
1351 unsigned int mtu = rt->rt_pmtu;
1352
1353 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1354 mtu = dst_metric_raw(dst, RTAX_MTU);
1355
1356 if (mtu)
1357 goto out;
1358
1359 mtu = READ_ONCE(dst->dev->mtu);
1360
1361 if (unlikely(ip_mtu_locked(dst))) {
1362 if (rt->rt_uses_gateway && mtu > 576)
1363 mtu = 576;
1364 }
1365
1366 out:
1367 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1368
1369 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1370 }
1371
1372 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1373 {
1374 struct fnhe_hash_bucket *hash;
1375 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1376 u32 hval = fnhe_hashfun(daddr);
1377
1378 spin_lock_bh(&fnhe_lock);
1379
1380 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1381 lockdep_is_held(&fnhe_lock));
1382 hash += hval;
1383
1384 fnhe_p = &hash->chain;
1385 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1386 while (fnhe) {
1387 if (fnhe->fnhe_daddr == daddr) {
1388 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1389 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1390 /* set fnhe_daddr to 0 to ensure it won't bind with
1391 * new dsts in rt_bind_exception().
1392 */
1393 fnhe->fnhe_daddr = 0;
1394 fnhe_flush_routes(fnhe);
1395 kfree_rcu(fnhe, rcu);
1396 break;
1397 }
1398 fnhe_p = &fnhe->fnhe_next;
1399 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1400 lockdep_is_held(&fnhe_lock));
1401 }
1402
1403 spin_unlock_bh(&fnhe_lock);
1404 }
1405
1406 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1407 __be32 daddr)
1408 {
1409 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1410 struct fib_nh_exception *fnhe;
1411 u32 hval;
1412
1413 if (!hash)
1414 return NULL;
1415
1416 hval = fnhe_hashfun(daddr);
1417
1418 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1419 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1420 if (fnhe->fnhe_daddr == daddr) {
1421 if (fnhe->fnhe_expires &&
1422 time_after(jiffies, fnhe->fnhe_expires)) {
1423 ip_del_fnhe(nhc, daddr);
1424 break;
1425 }
1426 return fnhe;
1427 }
1428 }
1429 return NULL;
1430 }
1431
1432 /* MTU selection:
1433 * 1. mtu on route is locked - use it
1434 * 2. mtu from nexthop exception
1435 * 3. mtu from egress device
1436 */
1437
1438 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1439 {
1440 struct fib_nh_common *nhc = res->nhc;
1441 struct net_device *dev = nhc->nhc_dev;
1442 struct fib_info *fi = res->fi;
1443 u32 mtu = 0;
1444
1445 if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1446 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1447 mtu = fi->fib_mtu;
1448
1449 if (likely(!mtu)) {
1450 struct fib_nh_exception *fnhe;
1451
1452 fnhe = find_exception(nhc, daddr);
1453 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1454 mtu = fnhe->fnhe_pmtu;
1455 }
1456
1457 if (likely(!mtu))
1458 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1459
1460 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1461 }
1462
1463 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1464 __be32 daddr, const bool do_cache)
1465 {
1466 bool ret = false;
1467
1468 spin_lock_bh(&fnhe_lock);
1469
1470 if (daddr == fnhe->fnhe_daddr) {
1471 struct rtable __rcu **porig;
1472 struct rtable *orig;
1473 int genid = fnhe_genid(dev_net(rt->dst.dev));
1474
1475 if (rt_is_input_route(rt))
1476 porig = &fnhe->fnhe_rth_input;
1477 else
1478 porig = &fnhe->fnhe_rth_output;
1479 orig = rcu_dereference(*porig);
1480
1481 if (fnhe->fnhe_genid != genid) {
1482 fnhe->fnhe_genid = genid;
1483 fnhe->fnhe_gw = 0;
1484 fnhe->fnhe_pmtu = 0;
1485 fnhe->fnhe_expires = 0;
1486 fnhe->fnhe_mtu_locked = false;
1487 fnhe_flush_routes(fnhe);
1488 orig = NULL;
1489 }
1490 fill_route_from_fnhe(rt, fnhe);
1491 if (!rt->rt_gw4) {
1492 rt->rt_gw4 = daddr;
1493 rt->rt_gw_family = AF_INET;
1494 }
1495
1496 if (do_cache) {
1497 dst_hold(&rt->dst);
1498 rcu_assign_pointer(*porig, rt);
1499 if (orig) {
1500 dst_dev_put(&orig->dst);
1501 dst_release(&orig->dst);
1502 }
1503 ret = true;
1504 }
1505
1506 fnhe->fnhe_stamp = jiffies;
1507 }
1508 spin_unlock_bh(&fnhe_lock);
1509
1510 return ret;
1511 }
1512
1513 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1514 {
1515 struct rtable *orig, *prev, **p;
1516 bool ret = true;
1517
1518 if (rt_is_input_route(rt)) {
1519 p = (struct rtable **)&nhc->nhc_rth_input;
1520 } else {
1521 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1522 }
1523 orig = *p;
1524
1525 /* hold dst before doing cmpxchg() to avoid race condition
1526 * on this dst
1527 */
1528 dst_hold(&rt->dst);
1529 prev = cmpxchg(p, orig, rt);
1530 if (prev == orig) {
1531 if (orig) {
1532 rt_add_uncached_list(orig);
1533 dst_release(&orig->dst);
1534 }
1535 } else {
1536 dst_release(&rt->dst);
1537 ret = false;
1538 }
1539
1540 return ret;
1541 }
1542
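/* Routes that could not be cached on a nexthop are kept on per-CPU
 * "uncached" lists so that rt_flush_dev() can still find them and detach
 * them from a device that is going away.
 */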
1543 struct uncached_list {
1544 spinlock_t lock;
1545 struct list_head head;
1546 };
1547
1548 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1549
1550 void rt_add_uncached_list(struct rtable *rt)
1551 {
1552 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1553
1554 rt->rt_uncached_list = ul;
1555
1556 spin_lock_bh(&ul->lock);
1557 list_add_tail(&rt->rt_uncached, &ul->head);
1558 spin_unlock_bh(&ul->lock);
1559 }
1560
1561 void rt_del_uncached_list(struct rtable *rt)
1562 {
1563 if (!list_empty(&rt->rt_uncached)) {
1564 struct uncached_list *ul = rt->rt_uncached_list;
1565
1566 spin_lock_bh(&ul->lock);
1567 list_del(&rt->rt_uncached);
1568 spin_unlock_bh(&ul->lock);
1569 }
1570 }
1571
1572 static void ipv4_dst_destroy(struct dst_entry *dst)
1573 {
1574 struct rtable *rt = (struct rtable *)dst;
1575
1576 ip_dst_metrics_put(dst);
1577 rt_del_uncached_list(rt);
1578 }
1579
1580 void rt_flush_dev(struct net_device *dev)
1581 {
1582 struct rtable *rt;
1583 int cpu;
1584
1585 for_each_possible_cpu(cpu) {
1586 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1587
1588 spin_lock_bh(&ul->lock);
1589 list_for_each_entry(rt, &ul->head, rt_uncached) {
1590 if (rt->dst.dev != dev)
1591 continue;
1592 rt->dst.dev = blackhole_netdev;
1593 dev_hold(rt->dst.dev);
1594 dev_put(dev);
1595 }
1596 spin_unlock_bh(&ul->lock);
1597 }
1598 }
1599
1600 static bool rt_cache_valid(const struct rtable *rt)
1601 {
1602 return rt &&
1603 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1604 !rt_is_expired(rt);
1605 }
1606
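/* Copy nexthop state from a FIB result into a freshly built rtable:
 * gateway, metrics, classid and lwtunnel state. The route is then either
 * bound to a matching exception or cached on the nexthop; if neither
 * succeeds it is put on the uncached list instead.
 */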
1607 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1608 const struct fib_result *res,
1609 struct fib_nh_exception *fnhe,
1610 struct fib_info *fi, u16 type, u32 itag,
1611 const bool do_cache)
1612 {
1613 bool cached = false;
1614
1615 if (fi) {
1616 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1617
1618 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1619 rt->rt_uses_gateway = 1;
1620 rt->rt_gw_family = nhc->nhc_gw_family;
1621 /* only INET and INET6 are supported */
1622 if (likely(nhc->nhc_gw_family == AF_INET))
1623 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1624 else
1625 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1626 }
1627
1628 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1629
1630 #ifdef CONFIG_IP_ROUTE_CLASSID
1631 if (nhc->nhc_family == AF_INET) {
1632 struct fib_nh *nh;
1633
1634 nh = container_of(nhc, struct fib_nh, nh_common);
1635 rt->dst.tclassid = nh->nh_tclassid;
1636 }
1637 #endif
1638 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1639 if (unlikely(fnhe))
1640 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1641 else if (do_cache)
1642 cached = rt_cache_route(nhc, rt);
1643 if (unlikely(!cached)) {
1644 /* Routes we intend to cache in nexthop exception or
1645 * FIB nexthop have the DST_NOCACHE bit clear.
1646 * However, if we are unsuccessful at storing this
1647 * route into the cache we really need to set it.
1648 */
1649 if (!rt->rt_gw4) {
1650 rt->rt_gw_family = AF_INET;
1651 rt->rt_gw4 = daddr;
1652 }
1653 rt_add_uncached_list(rt);
1654 }
1655 } else
1656 rt_add_uncached_list(rt);
1657
1658 #ifdef CONFIG_IP_ROUTE_CLASSID
1659 #ifdef CONFIG_IP_MULTIPLE_TABLES
1660 set_class_tag(rt, res->tclassid);
1661 #endif
1662 set_class_tag(rt, itag);
1663 #endif
1664 }
1665
1666 struct rtable *rt_dst_alloc(struct net_device *dev,
1667 unsigned int flags, u16 type,
1668 bool nopolicy, bool noxfrm)
1669 {
1670 struct rtable *rt;
1671
1672 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1673 (nopolicy ? DST_NOPOLICY : 0) |
1674 (noxfrm ? DST_NOXFRM : 0));
1675
1676 if (rt) {
1677 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1678 rt->rt_flags = flags;
1679 rt->rt_type = type;
1680 rt->rt_is_input = 0;
1681 rt->rt_iif = 0;
1682 rt->rt_pmtu = 0;
1683 rt->rt_mtu_locked = 0;
1684 rt->rt_uses_gateway = 0;
1685 rt->rt_gw_family = 0;
1686 rt->rt_gw4 = 0;
1687 INIT_LIST_HEAD(&rt->rt_uncached);
1688
1689 rt->dst.output = ip_output;
1690 if (flags & RTCF_LOCAL)
1691 rt->dst.input = ip_local_deliver;
1692 }
1693
1694 return rt;
1695 }
1696 EXPORT_SYMBOL(rt_dst_alloc);
1697
1698 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1699 {
1700 struct rtable *new_rt;
1701
1702 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1703 rt->dst.flags);
1704
1705 if (new_rt) {
1706 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1707 new_rt->rt_flags = rt->rt_flags;
1708 new_rt->rt_type = rt->rt_type;
1709 new_rt->rt_is_input = rt->rt_is_input;
1710 new_rt->rt_iif = rt->rt_iif;
1711 new_rt->rt_pmtu = rt->rt_pmtu;
1712 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1713 new_rt->rt_gw_family = rt->rt_gw_family;
1714 if (rt->rt_gw_family == AF_INET)
1715 new_rt->rt_gw4 = rt->rt_gw4;
1716 else if (rt->rt_gw_family == AF_INET6)
1717 new_rt->rt_gw6 = rt->rt_gw6;
1718 INIT_LIST_HEAD(&new_rt->rt_uncached);
1719
1720 new_rt->dst.input = rt->dst.input;
1721 new_rt->dst.output = rt->dst.output;
1722 new_rt->dst.error = rt->dst.error;
1723 new_rt->dst.lastuse = jiffies;
1724 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1725 }
1726 return new_rt;
1727 }
1728 EXPORT_SYMBOL(rt_dst_clone);
1729
1730 /* called in rcu_read_lock() section */
1731 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1732 u8 tos, struct net_device *dev,
1733 struct in_device *in_dev, u32 *itag)
1734 {
1735 int err;
1736
1737 /* Primary sanity checks. */
1738 if (!in_dev)
1739 return -EINVAL;
1740
1741 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1742 skb->protocol != htons(ETH_P_IP))
1743 return -EINVAL;
1744
1745 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1746 return -EINVAL;
1747
1748 if (ipv4_is_zeronet(saddr)) {
1749 if (!ipv4_is_local_multicast(daddr) &&
1750 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1751 return -EINVAL;
1752 } else {
1753 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1754 in_dev, itag);
1755 if (err < 0)
1756 return err;
1757 }
1758 return 0;
1759 }
1760
1761 /* called in rcu_read_lock() section */
1762 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1763 u8 tos, struct net_device *dev, int our)
1764 {
1765 struct in_device *in_dev = __in_dev_get_rcu(dev);
1766 unsigned int flags = RTCF_MULTICAST;
1767 struct rtable *rth;
1768 u32 itag = 0;
1769 int err;
1770
1771 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1772 if (err)
1773 return err;
1774
1775 if (our)
1776 flags |= RTCF_LOCAL;
1777
1778 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1779 IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1780 if (!rth)
1781 return -ENOBUFS;
1782
1783 #ifdef CONFIG_IP_ROUTE_CLASSID
1784 rth->dst.tclassid = itag;
1785 #endif
1786 rth->dst.output = ip_rt_bug;
1787 rth->rt_is_input = 1;
1788
1789 #ifdef CONFIG_IP_MROUTE
1790 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1791 rth->dst.input = ip_mr_input;
1792 #endif
1793 RT_CACHE_STAT_INC(in_slow_mc);
1794
1795 skb_dst_drop(skb);
1796 skb_dst_set(skb, &rth->dst);
1797 return 0;
1798 }
1799
1800
1801 static void ip_handle_martian_source(struct net_device *dev,
1802 struct in_device *in_dev,
1803 struct sk_buff *skb,
1804 __be32 daddr,
1805 __be32 saddr)
1806 {
1807 RT_CACHE_STAT_INC(in_martian_src);
1808 #ifdef CONFIG_IP_ROUTE_VERBOSE
1809 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1810 /*
1811 * Per the RFC1812 recommendation: if the source is martian,
1812 * the only hint is the MAC header.
1813 */
1814 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1815 &daddr, &saddr, dev->name);
1816 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1817 print_hex_dump(KERN_WARNING, "ll header: ",
1818 DUMP_PREFIX_OFFSET, 16, 1,
1819 skb_mac_header(skb),
1820 dev->hard_header_len, false);
1821 }
1822 }
1823 #endif
1824 }
1825
1826 /* called in rcu_read_lock() section */
1827 static int __mkroute_input(struct sk_buff *skb,
1828 const struct fib_result *res,
1829 struct in_device *in_dev,
1830 __be32 daddr, __be32 saddr, u32 tos)
1831 {
1832 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1833 struct net_device *dev = nhc->nhc_dev;
1834 struct fib_nh_exception *fnhe;
1835 struct rtable *rth;
1836 int err;
1837 struct in_device *out_dev;
1838 bool do_cache;
1839 u32 itag = 0;
1840
1841 /* get a working reference to the output device */
1842 out_dev = __in_dev_get_rcu(dev);
1843 if (!out_dev) {
1844 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1845 return -EINVAL;
1846 }
1847
1848 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1849 in_dev->dev, in_dev, &itag);
1850 if (err < 0) {
1851 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1852 saddr);
1853
1854 goto cleanup;
1855 }
1856
1857 do_cache = res->fi && !itag;
1858 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1859 skb->protocol == htons(ETH_P_IP)) {
1860 __be32 gw;
1861
1862 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1863 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1864 inet_addr_onlink(out_dev, saddr, gw))
1865 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1866 }
1867
1868 if (skb->protocol != htons(ETH_P_IP)) {
1869 /* Not IP (i.e. ARP). Do not create route, if it is
1870 * invalid for proxy arp. DNAT routes are always valid.
1871 *
1872 * The proxy arp feature has been extended to allow ARP
1873 * replies back to the same interface, to support
1874 * Private VLAN switch technologies. See arp.c.
1875 */
1876 if (out_dev == in_dev &&
1877 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1878 err = -EINVAL;
1879 goto cleanup;
1880 }
1881 }
1882
1883 fnhe = find_exception(nhc, daddr);
1884 if (do_cache) {
1885 if (fnhe)
1886 rth = rcu_dereference(fnhe->fnhe_rth_input);
1887 else
1888 rth = rcu_dereference(nhc->nhc_rth_input);
1889 if (rt_cache_valid(rth)) {
1890 skb_dst_set_noref(skb, &rth->dst);
1891 goto out;
1892 }
1893 }
1894
1895 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1896 IN_DEV_ORCONF(in_dev, NOPOLICY),
1897 IN_DEV_ORCONF(out_dev, NOXFRM));
1898 if (!rth) {
1899 err = -ENOBUFS;
1900 goto cleanup;
1901 }
1902
1903 rth->rt_is_input = 1;
1904 RT_CACHE_STAT_INC(in_slow_tot);
1905
1906 rth->dst.input = ip_forward;
1907
1908 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1909 do_cache);
1910 lwtunnel_set_redirect(&rth->dst);
1911 skb_dst_set(skb, &rth->dst);
1912 out:
1913 err = 0;
1914 cleanup:
1915 return err;
1916 }
1917
1918 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1919 /* To make ICMP packets follow the right flow, the multipath hash is
1920 * calculated from the inner IP addresses.
1921 */
1922 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1923 struct flow_keys *hash_keys)
1924 {
1925 const struct iphdr *outer_iph = ip_hdr(skb);
1926 const struct iphdr *key_iph = outer_iph;
1927 const struct iphdr *inner_iph;
1928 const struct icmphdr *icmph;
1929 struct iphdr _inner_iph;
1930 struct icmphdr _icmph;
1931
1932 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1933 goto out;
1934
1935 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1936 goto out;
1937
1938 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1939 &_icmph);
1940 if (!icmph)
1941 goto out;
1942
1943 if (!icmp_is_err(icmph->type))
1944 goto out;
1945
1946 inner_iph = skb_header_pointer(skb,
1947 outer_iph->ihl * 4 + sizeof(_icmph),
1948 sizeof(_inner_iph), &_inner_iph);
1949 if (!inner_iph)
1950 goto out;
1951
1952 key_iph = inner_iph;
1953 out:
1954 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1955 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1956 }
1957
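/* The sysctl behind the switch below is net.ipv4.fib_multipath_hash_policy:
 * 0 hashes on the outer (or inner-ICMP) L3 addresses, 1 hashes on the L4
 * five-tuple, and 2 hashes on the L3 addresses of the innermost header the
 * flow dissector can reach.  A sketch of switching a host to L4 hashing:
 *
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */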
1958 /* if skb is set it will be used and fl4 can be NULL */
1959 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1960 const struct sk_buff *skb, struct flow_keys *flkeys)
1961 {
1962 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1963 struct flow_keys hash_keys;
1964 u32 mhash;
1965
1966 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1967 case 0:
1968 memset(&hash_keys, 0, sizeof(hash_keys));
1969 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1970 if (skb) {
1971 ip_multipath_l3_keys(skb, &hash_keys);
1972 } else {
1973 hash_keys.addrs.v4addrs.src = fl4->saddr;
1974 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1975 }
1976 break;
1977 case 1:
1978 /* skb is currently provided only when forwarding */
1979 if (skb) {
1980 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1981 struct flow_keys keys;
1982
1983 /* short-circuit if we already have L4 hash present */
1984 if (skb->l4_hash)
1985 return skb_get_hash_raw(skb) >> 1;
1986
1987 memset(&hash_keys, 0, sizeof(hash_keys));
1988
1989 if (!flkeys) {
1990 skb_flow_dissect_flow_keys(skb, &keys, flag);
1991 flkeys = &keys;
1992 }
1993
1994 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1995 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1996 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1997 hash_keys.ports.src = flkeys->ports.src;
1998 hash_keys.ports.dst = flkeys->ports.dst;
1999 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2000 } else {
2001 memset(&hash_keys, 0, sizeof(hash_keys));
2002 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2003 hash_keys.addrs.v4addrs.src = fl4->saddr;
2004 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2005 hash_keys.ports.src = fl4->fl4_sport;
2006 hash_keys.ports.dst = fl4->fl4_dport;
2007 hash_keys.basic.ip_proto = fl4->flowi4_proto;
2008 }
2009 break;
2010 case 2:
2011 memset(&hash_keys, 0, sizeof(hash_keys));
2012 /* skb is currently provided only when forwarding */
2013 if (skb) {
2014 struct flow_keys keys;
2015
2016 skb_flow_dissect_flow_keys(skb, &keys, 0);
2017 /* Inner can be v4 or v6 */
2018 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2019 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2020 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2021 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2022 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2023 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2024 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2025 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2026 hash_keys.tags.flow_label = keys.tags.flow_label;
2027 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2028 } else {
2029 /* Same as case 0 */
2030 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2031 ip_multipath_l3_keys(skb, &hash_keys);
2032 }
2033 } else {
2034 /* Same as case 0 */
2035 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2036 hash_keys.addrs.v4addrs.src = fl4->saddr;
2037 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2038 }
2039 break;
2040 }
2041 mhash = flow_hash_from_keys(&hash_keys);
2042
2043 if (multipath_hash)
2044 mhash = jhash_2words(mhash, multipath_hash, 0);
2045
2046 return mhash >> 1;
2047 }
2048 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2049
2050 static int ip_mkroute_input(struct sk_buff *skb,
2051 struct fib_result *res,
2052 struct in_device *in_dev,
2053 __be32 daddr, __be32 saddr, u32 tos,
2054 struct flow_keys *hkeys)
2055 {
2056 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2057 if (res->fi && fib_info_num_path(res->fi) > 1) {
2058 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2059
2060 fib_select_multipath(res, h);
2061 }
2062 #endif
2063
2064 /* create a routing cache entry */
2065 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2066 }
2067
2068 /* Implements the same saddr-related checks as ip_route_input_slow(),
2069 * assuming daddr is valid and the destination is not a local broadcast address.
2070 * Uses the provided hint instead of performing a full route lookup.
2071 */
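/* (Informational note) The hint is expected to be the dst already attached
 * to a previously routed skb of the same receive batch, e.g. by the IPv4
 * list-receive path; it is a best-effort shortcut, not a replacement for a
 * full lookup.
 */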
2072 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2073 u8 tos, struct net_device *dev,
2074 const struct sk_buff *hint)
2075 {
2076 struct in_device *in_dev = __in_dev_get_rcu(dev);
2077 struct rtable *rt = skb_rtable(hint);
2078 struct net *net = dev_net(dev);
2079 int err = -EINVAL;
2080 u32 tag = 0;
2081
2082 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2083 goto martian_source;
2084
2085 if (ipv4_is_zeronet(saddr))
2086 goto martian_source;
2087
2088 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2089 goto martian_source;
2090
2091 if (rt->rt_type != RTN_LOCAL)
2092 goto skip_validate_source;
2093
2094 tos &= IPTOS_RT_MASK;
2095 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2096 if (err < 0)
2097 goto martian_source;
2098
2099 skip_validate_source:
2100 skb_dst_copy(skb, hint);
2101 return 0;
2102
2103 martian_source:
2104 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2105 return err;
2106 }
2107
2108 /* get device for dst_alloc with local routes */
2109 static struct net_device *ip_rt_get_dev(struct net *net,
2110 const struct fib_result *res)
2111 {
2112 struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2113 struct net_device *dev = NULL;
2114
2115 if (nhc)
2116 dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2117
2118 return dev ? : net->loopback_dev;
2119 }
2120
2121 /*
2122 * NOTE. We drop all packets that have local source
2123 * addresses, because every properly looped-back packet
2124 * must have the correct destination already attached by the output routine.
2125 * Changes in the enforced policies must also be applied to
2126 * ip_route_use_hint().
2127 *
2128 * This approach solves two big problems:
2129 * 1. Non-simplex devices are handled properly.
2130 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2131 * Called with rcu_read_lock().
2132 */
2133
2134 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2135 u8 tos, struct net_device *dev,
2136 struct fib_result *res)
2137 {
2138 struct in_device *in_dev = __in_dev_get_rcu(dev);
2139 struct flow_keys *flkeys = NULL, _flkeys;
2140 struct net *net = dev_net(dev);
2141 struct ip_tunnel_info *tun_info;
2142 int err = -EINVAL;
2143 unsigned int flags = 0;
2144 u32 itag = 0;
2145 struct rtable *rth;
2146 struct flowi4 fl4;
2147 bool do_cache = true;
2148
2149 /* IP on this device is disabled. */
2150
2151 if (!in_dev)
2152 goto out;
2153
2154 /* Check for the most weird martians, which cannot be detected
2155 by fib_lookup.
2156 */
2157
2158 tun_info = skb_tunnel_info(skb);
2159 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2160 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2161 else
2162 fl4.flowi4_tun_key.tun_id = 0;
2163 skb_dst_drop(skb);
2164
2165 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2166 goto martian_source;
2167
2168 res->fi = NULL;
2169 res->table = NULL;
2170 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2171 goto brd_input;
2172
2173 /* Accept zero addresses only for limited broadcast;
2174 * I am not even sure whether to fix this or not. Waiting for complaints :-)
2175 */
2176 if (ipv4_is_zeronet(saddr))
2177 goto martian_source;
2178
2179 if (ipv4_is_zeronet(daddr))
2180 goto martian_destination;
2181
2182 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2183 * and calls it at most once if daddr and/or saddr are loopback addresses
2184 */
2185 if (ipv4_is_loopback(daddr)) {
2186 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2187 goto martian_destination;
2188 } else if (ipv4_is_loopback(saddr)) {
2189 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2190 goto martian_source;
2191 }
2192
2193 /*
2194 * Now we are ready to route packet.
2195 */
2196 fl4.flowi4_oif = 0;
2197 fl4.flowi4_iif = dev->ifindex;
2198 fl4.flowi4_mark = skb->mark;
2199 fl4.flowi4_tos = tos;
2200 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2201 fl4.flowi4_flags = 0;
2202 fl4.daddr = daddr;
2203 fl4.saddr = saddr;
2204 fl4.flowi4_uid = sock_net_uid(net, NULL);
2205 fl4.flowi4_multipath_hash = 0;
2206
2207 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2208 flkeys = &_flkeys;
2209 } else {
2210 fl4.flowi4_proto = 0;
2211 fl4.fl4_sport = 0;
2212 fl4.fl4_dport = 0;
2213 }
2214
2215 err = fib_lookup(net, &fl4, res, 0);
2216 if (err != 0) {
2217 if (!IN_DEV_FORWARD(in_dev))
2218 err = -EHOSTUNREACH;
2219 goto no_route;
2220 }
2221
2222 if (res->type == RTN_BROADCAST) {
2223 if (IN_DEV_BFORWARD(in_dev))
2224 goto make_route;
2225 		/* do not cache if bc_forwarding is enabled */
2226 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2227 do_cache = false;
2228 goto brd_input;
2229 }
2230
2231 if (res->type == RTN_LOCAL) {
2232 err = fib_validate_source(skb, saddr, daddr, tos,
2233 0, dev, in_dev, &itag);
2234 if (err < 0)
2235 goto martian_source;
2236 goto local_input;
2237 }
2238
2239 if (!IN_DEV_FORWARD(in_dev)) {
2240 err = -EHOSTUNREACH;
2241 goto no_route;
2242 }
2243 if (res->type != RTN_UNICAST)
2244 goto martian_destination;
2245
2246 make_route:
2247 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2248 out: return err;
2249
2250 brd_input:
2251 if (skb->protocol != htons(ETH_P_IP))
2252 goto e_inval;
2253
2254 if (!ipv4_is_zeronet(saddr)) {
2255 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2256 in_dev, &itag);
2257 if (err < 0)
2258 goto martian_source;
2259 }
2260 flags |= RTCF_BROADCAST;
2261 res->type = RTN_BROADCAST;
2262 RT_CACHE_STAT_INC(in_brd);
2263
2264 local_input:
2265 do_cache &= res->fi && !itag;
2266 if (do_cache) {
2267 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2268
2269 rth = rcu_dereference(nhc->nhc_rth_input);
2270 if (rt_cache_valid(rth)) {
2271 skb_dst_set_noref(skb, &rth->dst);
2272 err = 0;
2273 goto out;
2274 }
2275 }
2276
2277 rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2278 flags | RTCF_LOCAL, res->type,
2279 IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2280 if (!rth)
2281 goto e_nobufs;
2282
2283 	rth->dst.output = ip_rt_bug;
2284 #ifdef CONFIG_IP_ROUTE_CLASSID
2285 rth->dst.tclassid = itag;
2286 #endif
2287 rth->rt_is_input = 1;
2288
2289 RT_CACHE_STAT_INC(in_slow_tot);
2290 if (res->type == RTN_UNREACHABLE) {
2291 		rth->dst.input = ip_error;
2292 		rth->dst.error = -err;
2293 rth->rt_flags &= ~RTCF_LOCAL;
2294 }
2295
2296 if (do_cache) {
2297 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2298
2299 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2300 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2301 WARN_ON(rth->dst.input == lwtunnel_input);
2302 rth->dst.lwtstate->orig_input = rth->dst.input;
2303 rth->dst.input = lwtunnel_input;
2304 }
2305
2306 if (unlikely(!rt_cache_route(nhc, rth)))
2307 rt_add_uncached_list(rth);
2308 }
2309 skb_dst_set(skb, &rth->dst);
2310 err = 0;
2311 goto out;
2312
2313 no_route:
2314 RT_CACHE_STAT_INC(in_no_route);
2315 res->type = RTN_UNREACHABLE;
2316 res->fi = NULL;
2317 res->table = NULL;
2318 goto local_input;
2319
2320 /*
2321 * Do not cache martian addresses: they should be logged (RFC1812)
2322 */
2323 martian_destination:
2324 RT_CACHE_STAT_INC(in_martian_dst);
2325 #ifdef CONFIG_IP_ROUTE_VERBOSE
2326 if (IN_DEV_LOG_MARTIANS(in_dev))
2327 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2328 &daddr, &saddr, dev->name);
2329 #endif
2330
2331 e_inval:
2332 err = -EINVAL;
2333 goto out;
2334
2335 e_nobufs:
2336 err = -ENOBUFS;
2337 goto out;
2338
2339 martian_source:
2340 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2341 goto out;
2342 }
2343
2344 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2345 u8 tos, struct net_device *dev)
2346 {
2347 struct fib_result res;
2348 int err;
2349
2350 tos &= IPTOS_RT_MASK;
2351 rcu_read_lock();
2352 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2353 rcu_read_unlock();
2354
2355 return err;
2356 }
2357 EXPORT_SYMBOL(ip_route_input_noref);
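
/* A minimal usage sketch (illustrative only; the caller shown is assumed,
 * e.g. somewhere in the local receive path, and names outside this file
 * are not guaranteed):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);
 *
 * On success skb_dst(skb) points at the input route, whose ->input hook
 * delivers locally or forwards depending on the route type.
 */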
2358
2359 /* called with rcu_read_lock held */
2360 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2361 u8 tos, struct net_device *dev, struct fib_result *res)
2362 {
2363 /* Multicast recognition logic was moved from the route cache to here.
2364 The problem was that too many Ethernet cards have broken/missing
2365 hardware multicast filters :-( As a result, a host on a multicast
2366 network acquired a lot of useless route cache entries, sort of
2367 SDR messages from all over the world. Now we try to get rid of them.
2368 Really, provided the software IP multicast filter is organized
2369 reasonably (at least, hashed), it does not result in a slowdown
2370 compared with route cache reject entries.
2371 Note that multicast routers are not affected, because
2372 a route cache entry is created for them eventually.
2373 */
2374 if (ipv4_is_multicast(daddr)) {
2375 struct in_device *in_dev = __in_dev_get_rcu(dev);
2376 int our = 0;
2377 int err = -EINVAL;
2378
2379 if (!in_dev)
2380 return err;
2381 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2382 ip_hdr(skb)->protocol);
2383
2384 /* check l3 master if no match yet */
2385 if (!our && netif_is_l3_slave(dev)) {
2386 struct in_device *l3_in_dev;
2387
2388 l3_in_dev = __in_dev_get_rcu(skb->dev);
2389 if (l3_in_dev)
2390 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2391 ip_hdr(skb)->protocol);
2392 }
2393
2394 if (our
2395 #ifdef CONFIG_IP_MROUTE
2396 ||
2397 (!ipv4_is_local_multicast(daddr) &&
2398 IN_DEV_MFORWARD(in_dev))
2399 #endif
2400 ) {
2401 err = ip_route_input_mc(skb, daddr, saddr,
2402 tos, dev, our);
2403 }
2404 return err;
2405 }
2406
2407 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2408 }
2409
2410 /* called with rcu_read_lock() */
2411 static struct rtable *__mkroute_output(const struct fib_result *res,
2412 const struct flowi4 *fl4, int orig_oif,
2413 struct net_device *dev_out,
2414 unsigned int flags)
2415 {
2416 struct fib_info *fi = res->fi;
2417 struct fib_nh_exception *fnhe;
2418 struct in_device *in_dev;
2419 u16 type = res->type;
2420 struct rtable *rth;
2421 bool do_cache;
2422
2423 in_dev = __in_dev_get_rcu(dev_out);
2424 if (!in_dev)
2425 return ERR_PTR(-EINVAL);
2426
2427 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2428 if (ipv4_is_loopback(fl4->saddr) &&
2429 !(dev_out->flags & IFF_LOOPBACK) &&
2430 !netif_is_l3_master(dev_out))
2431 return ERR_PTR(-EINVAL);
2432
2433 if (ipv4_is_lbcast(fl4->daddr))
2434 type = RTN_BROADCAST;
2435 else if (ipv4_is_multicast(fl4->daddr))
2436 type = RTN_MULTICAST;
2437 else if (ipv4_is_zeronet(fl4->daddr))
2438 return ERR_PTR(-EINVAL);
2439
2440 if (dev_out->flags & IFF_LOOPBACK)
2441 flags |= RTCF_LOCAL;
2442
2443 do_cache = true;
2444 if (type == RTN_BROADCAST) {
2445 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2446 fi = NULL;
2447 } else if (type == RTN_MULTICAST) {
2448 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2449 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2450 fl4->flowi4_proto))
2451 flags &= ~RTCF_LOCAL;
2452 else
2453 do_cache = false;
2454 		/* If a multicast route does not exist, use
2455 		 * the default one, but do not use a gateway in this case.
2456 		 * Yes, it is a hack.
2457 */
2458 if (fi && res->prefixlen < 4)
2459 fi = NULL;
2460 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2461 (orig_oif != dev_out->ifindex)) {
2462 /* For local routes that require a particular output interface
2463 * we do not want to cache the result. Caching the result
2464 * causes incorrect behaviour when there are multiple source
2465 * addresses on the interface, the end result being that if the
2466 * intended recipient is waiting on that interface for the
2467 * packet, it won't receive it because it will be delivered on
2468 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2469 * be set to the loopback interface as well.
2470 */
2471 do_cache = false;
2472 }
2473
2474 fnhe = NULL;
2475 do_cache &= fi != NULL;
2476 if (fi) {
2477 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2478 struct rtable __rcu **prth;
2479
2480 fnhe = find_exception(nhc, fl4->daddr);
2481 if (!do_cache)
2482 goto add;
2483 if (fnhe) {
2484 prth = &fnhe->fnhe_rth_output;
2485 } else {
2486 if (unlikely(fl4->flowi4_flags &
2487 FLOWI_FLAG_KNOWN_NH &&
2488 !(nhc->nhc_gw_family &&
2489 nhc->nhc_scope == RT_SCOPE_LINK))) {
2490 do_cache = false;
2491 goto add;
2492 }
2493 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2494 }
2495 rth = rcu_dereference(*prth);
2496 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2497 return rth;
2498 }
2499
2500 add:
2501 rth = rt_dst_alloc(dev_out, flags, type,
2502 IN_DEV_ORCONF(in_dev, NOPOLICY),
2503 IN_DEV_ORCONF(in_dev, NOXFRM));
2504 if (!rth)
2505 return ERR_PTR(-ENOBUFS);
2506
2507 rth->rt_iif = orig_oif;
2508
2509 RT_CACHE_STAT_INC(out_slow_tot);
2510
2511 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2512 if (flags & RTCF_LOCAL &&
2513 !(dev_out->flags & IFF_LOOPBACK)) {
2514 rth->dst.output = ip_mc_output;
2515 RT_CACHE_STAT_INC(out_slow_mc);
2516 }
2517 #ifdef CONFIG_IP_MROUTE
2518 if (type == RTN_MULTICAST) {
2519 if (IN_DEV_MFORWARD(in_dev) &&
2520 !ipv4_is_local_multicast(fl4->daddr)) {
2521 rth->dst.input = ip_mr_input;
2522 rth->dst.output = ip_mc_output;
2523 }
2524 }
2525 #endif
2526 }
2527
2528 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2529 lwtunnel_set_redirect(&rth->dst);
2530
2531 return rth;
2532 }
2533
2534 /*
2535 * Major route resolver routine.
2536 */
2537
2538 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2539 const struct sk_buff *skb)
2540 {
2541 struct fib_result res = {
2542 .type = RTN_UNSPEC,
2543 .fi = NULL,
2544 .table = NULL,
2545 .tclassid = 0,
2546 };
2547 struct rtable *rth;
2548
2549 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2550 ip_rt_fix_tos(fl4);
2551
2552 rcu_read_lock();
2553 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2554 rcu_read_unlock();
2555
2556 return rth;
2557 }
2558 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
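
/* A minimal output-lookup sketch (illustrative only; daddr/saddr are
 * assumed caller variables, and ip_route_output_key() is the inline
 * wrapper from <net/route.h> that eventually reaches the resolver above):
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */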
2559
2560 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2561 struct fib_result *res,
2562 const struct sk_buff *skb)
2563 {
2564 struct net_device *dev_out = NULL;
2565 int orig_oif = fl4->flowi4_oif;
2566 unsigned int flags = 0;
2567 struct rtable *rth;
2568 int err;
2569
2570 if (fl4->saddr) {
2571 if (ipv4_is_multicast(fl4->saddr) ||
2572 ipv4_is_lbcast(fl4->saddr) ||
2573 ipv4_is_zeronet(fl4->saddr)) {
2574 rth = ERR_PTR(-EINVAL);
2575 goto out;
2576 }
2577
2578 rth = ERR_PTR(-ENETUNREACH);
2579
2580 /* I removed check for oif == dev_out->oif here.
2581 It was wrong for two reasons:
2582 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2583 is assigned to multiple interfaces.
2584 2. Moreover, we are allowed to send packets with saddr
2585 of another iface. --ANK
2586 */
2587
2588 if (fl4->flowi4_oif == 0 &&
2589 (ipv4_is_multicast(fl4->daddr) ||
2590 ipv4_is_lbcast(fl4->daddr))) {
2591 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2592 dev_out = __ip_dev_find(net, fl4->saddr, false);
2593 if (!dev_out)
2594 goto out;
2595
2596 /* Special hack: the user can direct multicasts
2597 and limited broadcast via the necessary interface
2598 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2599 This hack is not just for fun, it allows
2600 vic, vat and friends to work.
2601 They bind a socket to loopback, set the ttl to zero
2602 and expect that it will work.
2603 From the viewpoint of the routing cache they are broken,
2604 because we are not allowed to build a multicast path
2605 with a loopback source addr (look, the routing cache
2606 cannot know that the ttl is zero, so that the packet
2607 will not leave this host and the route is valid).
2608 Luckily, this hack is a good workaround.
2609 */
2610
2611 fl4->flowi4_oif = dev_out->ifindex;
2612 goto make_route;
2613 }
2614
2615 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2616 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2617 if (!__ip_dev_find(net, fl4->saddr, false))
2618 goto out;
2619 }
2620 }
2621
2622
2623 if (fl4->flowi4_oif) {
2624 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2625 rth = ERR_PTR(-ENODEV);
2626 if (!dev_out)
2627 goto out;
2628
2629 /* RACE: Check return value of inet_select_addr instead. */
2630 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2631 rth = ERR_PTR(-ENETUNREACH);
2632 goto out;
2633 }
2634 if (ipv4_is_local_multicast(fl4->daddr) ||
2635 ipv4_is_lbcast(fl4->daddr) ||
2636 fl4->flowi4_proto == IPPROTO_IGMP) {
2637 if (!fl4->saddr)
2638 fl4->saddr = inet_select_addr(dev_out, 0,
2639 RT_SCOPE_LINK);
2640 goto make_route;
2641 }
2642 if (!fl4->saddr) {
2643 if (ipv4_is_multicast(fl4->daddr))
2644 fl4->saddr = inet_select_addr(dev_out, 0,
2645 fl4->flowi4_scope);
2646 else if (!fl4->daddr)
2647 fl4->saddr = inet_select_addr(dev_out, 0,
2648 RT_SCOPE_HOST);
2649 }
2650 }
2651
2652 if (!fl4->daddr) {
2653 fl4->daddr = fl4->saddr;
2654 if (!fl4->daddr)
2655 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2656 dev_out = net->loopback_dev;
2657 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2658 res->type = RTN_LOCAL;
2659 flags |= RTCF_LOCAL;
2660 goto make_route;
2661 }
2662
2663 err = fib_lookup(net, fl4, res, 0);
2664 if (err) {
2665 res->fi = NULL;
2666 res->table = NULL;
2667 if (fl4->flowi4_oif &&
2668 (ipv4_is_multicast(fl4->daddr) ||
2669 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2670 /* Apparently, the routing tables are wrong. Assume
2671 that the destination is on link.
2672
2673 WHY? DW.
2674 Because we are allowed to send to an iface
2675 even if it has NO routes and NO assigned
2676 addresses. When oif is specified, routing
2677 tables are looked up with only one purpose:
2678 to catch whether the destination is gatewayed, rather than
2679 direct. Moreover, if MSG_DONTROUTE is set,
2680 we send the packet, ignoring both the routing tables
2681 and the ifaddr state. --ANK
2682
2683
2684 We could make it even if oif is unknown,
2685 likely IPv6, but we do not.
2686 */
2687
2688 if (fl4->saddr == 0)
2689 fl4->saddr = inet_select_addr(dev_out, 0,
2690 RT_SCOPE_LINK);
2691 res->type = RTN_UNICAST;
2692 goto make_route;
2693 }
2694 rth = ERR_PTR(err);
2695 goto out;
2696 }
2697
2698 if (res->type == RTN_LOCAL) {
2699 if (!fl4->saddr) {
2700 if (res->fi->fib_prefsrc)
2701 fl4->saddr = res->fi->fib_prefsrc;
2702 else
2703 fl4->saddr = fl4->daddr;
2704 }
2705
2706 /* L3 master device is the loopback for that domain */
2707 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2708 net->loopback_dev;
2709
2710 /* make sure orig_oif points to fib result device even
2711 * though packet rx/tx happens over loopback or l3mdev
2712 */
2713 orig_oif = FIB_RES_OIF(*res);
2714
2715 fl4->flowi4_oif = dev_out->ifindex;
2716 flags |= RTCF_LOCAL;
2717 goto make_route;
2718 }
2719
2720 fib_select_path(net, res, fl4, skb);
2721
2722 dev_out = FIB_RES_DEV(*res);
2723
2724 make_route:
2725 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2726
2727 out:
2728 return rth;
2729 }
2730
2731 static struct dst_ops ipv4_dst_blackhole_ops = {
2732 .family = AF_INET,
2733 .default_advmss = ipv4_default_advmss,
2734 .neigh_lookup = ipv4_neigh_lookup,
2735 .check = dst_blackhole_check,
2736 .cow_metrics = dst_blackhole_cow_metrics,
2737 .update_pmtu = dst_blackhole_update_pmtu,
2738 .redirect = dst_blackhole_redirect,
2739 .mtu = dst_blackhole_mtu,
2740 };
2741
2742 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2743 {
2744 struct rtable *ort = (struct rtable *) dst_orig;
2745 struct rtable *rt;
2746
2747 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2748 if (rt) {
2749 struct dst_entry *new = &rt->dst;
2750
2751 new->__use = 1;
2752 new->input = dst_discard;
2753 new->output = dst_discard_out;
2754
2755 new->dev = net->loopback_dev;
2756 if (new->dev)
2757 dev_hold(new->dev);
2758
2759 rt->rt_is_input = ort->rt_is_input;
2760 rt->rt_iif = ort->rt_iif;
2761 rt->rt_pmtu = ort->rt_pmtu;
2762 rt->rt_mtu_locked = ort->rt_mtu_locked;
2763
2764 rt->rt_genid = rt_genid_ipv4(net);
2765 rt->rt_flags = ort->rt_flags;
2766 rt->rt_type = ort->rt_type;
2767 rt->rt_uses_gateway = ort->rt_uses_gateway;
2768 rt->rt_gw_family = ort->rt_gw_family;
2769 if (rt->rt_gw_family == AF_INET)
2770 rt->rt_gw4 = ort->rt_gw4;
2771 else if (rt->rt_gw_family == AF_INET6)
2772 rt->rt_gw6 = ort->rt_gw6;
2773
2774 INIT_LIST_HEAD(&rt->rt_uncached);
2775 }
2776
2777 dst_release(dst_orig);
2778
2779 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2780 }
2781
2782 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2783 const struct sock *sk)
2784 {
2785 struct rtable *rt = __ip_route_output_key(net, flp4);
2786
2787 if (IS_ERR(rt))
2788 return rt;
2789
2790 if (flp4->flowi4_proto) {
2791 flp4->flowi4_oif = rt->dst.dev->ifindex;
2792 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2793 flowi4_to_flowi(flp4),
2794 sk, 0);
2795 }
2796
2797 return rt;
2798 }
2799 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2800
2801 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2802 struct net_device *dev,
2803 struct net *net, __be32 *saddr,
2804 const struct ip_tunnel_info *info,
2805 u8 protocol, bool use_cache)
2806 {
2807 #ifdef CONFIG_DST_CACHE
2808 struct dst_cache *dst_cache;
2809 #endif
2810 struct rtable *rt = NULL;
2811 struct flowi4 fl4;
2812 __u8 tos;
2813
2814 #ifdef CONFIG_DST_CACHE
2815 dst_cache = (struct dst_cache *)&info->dst_cache;
2816 if (use_cache) {
2817 rt = dst_cache_get_ip4(dst_cache, saddr);
2818 if (rt)
2819 return rt;
2820 }
2821 #endif
2822 memset(&fl4, 0, sizeof(fl4));
2823 fl4.flowi4_mark = skb->mark;
2824 fl4.flowi4_proto = protocol;
2825 fl4.daddr = info->key.u.ipv4.dst;
2826 fl4.saddr = info->key.u.ipv4.src;
2827 tos = info->key.tos;
2828 fl4.flowi4_tos = RT_TOS(tos);
2829
2830 rt = ip_route_output_key(net, &fl4);
2831 if (IS_ERR(rt)) {
2832 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2833 return ERR_PTR(-ENETUNREACH);
2834 }
2835 if (rt->dst.dev == dev) { /* is this necessary? */
2836 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2837 ip_rt_put(rt);
2838 return ERR_PTR(-ELOOP);
2839 }
2840 #ifdef CONFIG_DST_CACHE
2841 if (use_cache)
2842 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2843 #endif
2844 *saddr = fl4.saddr;
2845 return rt;
2846 }
2847 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2848
2849 /* called with rcu_read_lock held */
2850 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2851 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2852 struct sk_buff *skb, u32 portid, u32 seq,
2853 unsigned int flags)
2854 {
2855 struct rtmsg *r;
2856 struct nlmsghdr *nlh;
2857 unsigned long expires = 0;
2858 u32 error;
2859 u32 metrics[RTAX_MAX];
2860
2861 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2862 if (!nlh)
2863 return -EMSGSIZE;
2864
2865 r = nlmsg_data(nlh);
2866 r->rtm_family = AF_INET;
2867 r->rtm_dst_len = 32;
2868 r->rtm_src_len = 0;
2869 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2870 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2871 if (nla_put_u32(skb, RTA_TABLE, table_id))
2872 goto nla_put_failure;
2873 r->rtm_type = rt->rt_type;
2874 r->rtm_scope = RT_SCOPE_UNIVERSE;
2875 r->rtm_protocol = RTPROT_UNSPEC;
2876 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2877 if (rt->rt_flags & RTCF_NOTIFY)
2878 r->rtm_flags |= RTM_F_NOTIFY;
2879 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2880 r->rtm_flags |= RTCF_DOREDIRECT;
2881
2882 if (nla_put_in_addr(skb, RTA_DST, dst))
2883 goto nla_put_failure;
2884 if (src) {
2885 r->rtm_src_len = 32;
2886 if (nla_put_in_addr(skb, RTA_SRC, src))
2887 goto nla_put_failure;
2888 }
2889 if (rt->dst.dev &&
2890 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2891 goto nla_put_failure;
2892 #ifdef CONFIG_IP_ROUTE_CLASSID
2893 if (rt->dst.tclassid &&
2894 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2895 goto nla_put_failure;
2896 #endif
2897 if (fl4 && !rt_is_input_route(rt) &&
2898 fl4->saddr != src) {
2899 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2900 goto nla_put_failure;
2901 }
2902 if (rt->rt_uses_gateway) {
2903 if (rt->rt_gw_family == AF_INET &&
2904 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2905 goto nla_put_failure;
2906 } else if (rt->rt_gw_family == AF_INET6) {
2907 int alen = sizeof(struct in6_addr);
2908 struct nlattr *nla;
2909 struct rtvia *via;
2910
2911 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2912 if (!nla)
2913 goto nla_put_failure;
2914
2915 via = nla_data(nla);
2916 via->rtvia_family = AF_INET6;
2917 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2918 }
2919 }
2920
2921 expires = rt->dst.expires;
2922 if (expires) {
2923 unsigned long now = jiffies;
2924
2925 if (time_before(now, expires))
2926 expires -= now;
2927 else
2928 expires = 0;
2929 }
2930
2931 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2932 if (rt->rt_pmtu && expires)
2933 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2934 if (rt->rt_mtu_locked && expires)
2935 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2936 if (rtnetlink_put_metrics(skb, metrics) < 0)
2937 goto nla_put_failure;
2938
2939 if (fl4) {
2940 if (fl4->flowi4_mark &&
2941 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2942 goto nla_put_failure;
2943
2944 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2945 nla_put_u32(skb, RTA_UID,
2946 from_kuid_munged(current_user_ns(),
2947 fl4->flowi4_uid)))
2948 goto nla_put_failure;
2949
2950 if (rt_is_input_route(rt)) {
2951 #ifdef CONFIG_IP_MROUTE
2952 if (ipv4_is_multicast(dst) &&
2953 !ipv4_is_local_multicast(dst) &&
2954 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2955 int err = ipmr_get_route(net, skb,
2956 fl4->saddr, fl4->daddr,
2957 r, portid);
2958
2959 if (err <= 0) {
2960 if (err == 0)
2961 return 0;
2962 goto nla_put_failure;
2963 }
2964 } else
2965 #endif
2966 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2967 goto nla_put_failure;
2968 }
2969 }
2970
2971 error = rt->dst.error;
2972
2973 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2974 goto nla_put_failure;
2975
2976 nlmsg_end(skb, nlh);
2977 return 0;
2978
2979 nla_put_failure:
2980 nlmsg_cancel(skb, nlh);
2981 return -EMSGSIZE;
2982 }
2983
2984 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2985 struct netlink_callback *cb, u32 table_id,
2986 struct fnhe_hash_bucket *bucket, int genid,
2987 int *fa_index, int fa_start, unsigned int flags)
2988 {
2989 int i;
2990
2991 for (i = 0; i < FNHE_HASH_SIZE; i++) {
2992 struct fib_nh_exception *fnhe;
2993
2994 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2995 fnhe = rcu_dereference(fnhe->fnhe_next)) {
2996 struct rtable *rt;
2997 int err;
2998
2999 if (*fa_index < fa_start)
3000 goto next;
3001
3002 if (fnhe->fnhe_genid != genid)
3003 goto next;
3004
3005 if (fnhe->fnhe_expires &&
3006 time_after(jiffies, fnhe->fnhe_expires))
3007 goto next;
3008
3009 rt = rcu_dereference(fnhe->fnhe_rth_input);
3010 if (!rt)
3011 rt = rcu_dereference(fnhe->fnhe_rth_output);
3012 if (!rt)
3013 goto next;
3014
3015 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3016 table_id, NULL, skb,
3017 NETLINK_CB(cb->skb).portid,
3018 cb->nlh->nlmsg_seq, flags);
3019 if (err)
3020 return err;
3021 next:
3022 (*fa_index)++;
3023 }
3024 }
3025
3026 return 0;
3027 }
3028
3029 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3030 u32 table_id, struct fib_info *fi,
3031 int *fa_index, int fa_start, unsigned int flags)
3032 {
3033 struct net *net = sock_net(cb->skb->sk);
3034 int nhsel, genid = fnhe_genid(net);
3035
3036 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3037 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3038 struct fnhe_hash_bucket *bucket;
3039 int err;
3040
3041 if (nhc->nhc_flags & RTNH_F_DEAD)
3042 continue;
3043
3044 rcu_read_lock();
3045 bucket = rcu_dereference(nhc->nhc_exceptions);
3046 err = 0;
3047 if (bucket)
3048 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3049 genid, fa_index, fa_start,
3050 flags);
3051 rcu_read_unlock();
3052 if (err)
3053 return err;
3054 }
3055
3056 return 0;
3057 }
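
/* (Informational note, descriptive rather than normative) The exceptions
 * dumped above are the per-nexthop entries created elsewhere in this file
 * on PMTU updates and ICMP redirects; with the old routing cache gone,
 * they are what a cached-route dump such as "ip route show cache" reports.
 */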
3058
3059 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3060 u8 ip_proto, __be16 sport,
3061 __be16 dport)
3062 {
3063 struct sk_buff *skb;
3064 struct iphdr *iph;
3065
3066 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3067 if (!skb)
3068 return NULL;
3069
3070 /* Reserve room for dummy headers; this skb can pass
3071 * through a good chunk of the routing engine.
3072 */
3073 skb_reset_mac_header(skb);
3074 skb_reset_network_header(skb);
3075 skb->protocol = htons(ETH_P_IP);
3076 iph = skb_put(skb, sizeof(struct iphdr));
3077 iph->protocol = ip_proto;
3078 iph->saddr = src;
3079 iph->daddr = dst;
3080 iph->version = 0x4;
3081 iph->frag_off = 0;
3082 iph->ihl = 0x5;
3083 skb_set_transport_header(skb, skb->len);
3084
3085 switch (iph->protocol) {
3086 case IPPROTO_UDP: {
3087 struct udphdr *udph;
3088
3089 udph = skb_put_zero(skb, sizeof(struct udphdr));
3090 udph->source = sport;
3091 udph->dest = dport;
3092 udph->len = htons(sizeof(struct udphdr));
3093 udph->check = 0;
3094 break;
3095 }
3096 case IPPROTO_TCP: {
3097 struct tcphdr *tcph;
3098
3099 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3100 tcph->source = sport;
3101 tcph->dest = dport;
3102 tcph->doff = sizeof(struct tcphdr) / 4;
3103 tcph->rst = 1;
3104 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3105 src, dst, 0);
3106 break;
3107 }
3108 case IPPROTO_ICMP: {
3109 struct icmphdr *icmph;
3110
3111 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3112 icmph->type = ICMP_ECHO;
3113 icmph->code = 0;
3114 }
3115 }
3116
3117 return skb;
3118 }
3119
3120 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3121 const struct nlmsghdr *nlh,
3122 struct nlattr **tb,
3123 struct netlink_ext_ack *extack)
3124 {
3125 struct rtmsg *rtm;
3126 int i, err;
3127
3128 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3129 NL_SET_ERR_MSG(extack,
3130 "ipv4: Invalid header for route get request");
3131 return -EINVAL;
3132 }
3133
3134 if (!netlink_strict_get_check(skb))
3135 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3136 rtm_ipv4_policy, extack);
3137
3138 rtm = nlmsg_data(nlh);
3139 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3140 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3141 rtm->rtm_table || rtm->rtm_protocol ||
3142 rtm->rtm_scope || rtm->rtm_type) {
3143 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3144 return -EINVAL;
3145 }
3146
3147 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3148 RTM_F_LOOKUP_TABLE |
3149 RTM_F_FIB_MATCH)) {
3150 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3151 return -EINVAL;
3152 }
3153
3154 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3155 rtm_ipv4_policy, extack);
3156 if (err)
3157 return err;
3158
3159 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3160 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3161 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3162 return -EINVAL;
3163 }
3164
3165 for (i = 0; i <= RTA_MAX; i++) {
3166 if (!tb[i])
3167 continue;
3168
3169 switch (i) {
3170 case RTA_IIF:
3171 case RTA_OIF:
3172 case RTA_SRC:
3173 case RTA_DST:
3174 case RTA_IP_PROTO:
3175 case RTA_SPORT:
3176 case RTA_DPORT:
3177 case RTA_MARK:
3178 case RTA_UID:
3179 break;
3180 default:
3181 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3182 return -EINVAL;
3183 }
3184 }
3185
3186 return 0;
3187 }
3188
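/* Example (sketch): "ip route get 203.0.113.1 from 198.51.100.5 iif eth0"
 * from iproute2 arrives here as an RTM_GETROUTE request carrying RTA_DST,
 * RTA_SRC and RTA_IIF, validated by inet_rtm_valid_getroute_req() above;
 * the addresses and device name are purely illustrative.
 */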
3189 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3190 struct netlink_ext_ack *extack)
3191 {
3192 struct net *net = sock_net(in_skb->sk);
3193 struct nlattr *tb[RTA_MAX+1];
3194 u32 table_id = RT_TABLE_MAIN;
3195 __be16 sport = 0, dport = 0;
3196 struct fib_result res = {};
3197 u8 ip_proto = IPPROTO_UDP;
3198 struct rtable *rt = NULL;
3199 struct sk_buff *skb;
3200 struct rtmsg *rtm;
3201 struct flowi4 fl4 = {};
3202 __be32 dst = 0;
3203 __be32 src = 0;
3204 kuid_t uid;
3205 u32 iif;
3206 int err;
3207 int mark;
3208
3209 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3210 if (err < 0)
3211 return err;
3212
3213 rtm = nlmsg_data(nlh);
3214 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3215 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3216 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3217 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3218 if (tb[RTA_UID])
3219 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3220 else
3221 uid = (iif ? INVALID_UID : current_uid());
3222
3223 if (tb[RTA_IP_PROTO]) {
3224 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3225 &ip_proto, AF_INET, extack);
3226 if (err)
3227 return err;
3228 }
3229
3230 if (tb[RTA_SPORT])
3231 sport = nla_get_be16(tb[RTA_SPORT]);
3232
3233 if (tb[RTA_DPORT])
3234 dport = nla_get_be16(tb[RTA_DPORT]);
3235
3236 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3237 if (!skb)
3238 return -ENOBUFS;
3239
3240 fl4.daddr = dst;
3241 fl4.saddr = src;
3242 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3243 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3244 fl4.flowi4_mark = mark;
3245 fl4.flowi4_uid = uid;
3246 if (sport)
3247 fl4.fl4_sport = sport;
3248 if (dport)
3249 fl4.fl4_dport = dport;
3250 fl4.flowi4_proto = ip_proto;
3251
3252 rcu_read_lock();
3253
3254 if (iif) {
3255 struct net_device *dev;
3256
3257 dev = dev_get_by_index_rcu(net, iif);
3258 if (!dev) {
3259 err = -ENODEV;
3260 goto errout_rcu;
3261 }
3262
3263 fl4.flowi4_iif = iif; /* for rt_fill_info */
3264 skb->dev = dev;
3265 skb->mark = mark;
3266 err = ip_route_input_rcu(skb, dst, src,
3267 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3268 &res);
3269
3270 rt = skb_rtable(skb);
3271 if (err == 0 && rt->dst.error)
3272 err = -rt->dst.error;
3273 } else {
3274 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3275 skb->dev = net->loopback_dev;
3276 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3277 err = 0;
3278 if (IS_ERR(rt))
3279 err = PTR_ERR(rt);
3280 else
3281 skb_dst_set(skb, &rt->dst);
3282 }
3283
3284 if (err)
3285 goto errout_rcu;
3286
3287 if (rtm->rtm_flags & RTM_F_NOTIFY)
3288 rt->rt_flags |= RTCF_NOTIFY;
3289
3290 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3291 table_id = res.table ? res.table->tb_id : 0;
3292
3293 /* reset skb for netlink reply msg */
3294 skb_trim(skb, 0);
3295 skb_reset_network_header(skb);
3296 skb_reset_transport_header(skb);
3297 skb_reset_mac_header(skb);
3298
3299 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3300 struct fib_rt_info fri;
3301
3302 if (!res.fi) {
3303 err = fib_props[res.type].error;
3304 if (!err)
3305 err = -EHOSTUNREACH;
3306 goto errout_rcu;
3307 }
3308 fri.fi = res.fi;
3309 fri.tb_id = table_id;
3310 fri.dst = res.prefix;
3311 fri.dst_len = res.prefixlen;
3312 fri.tos = fl4.flowi4_tos;
3313 fri.type = rt->rt_type;
3314 fri.offload = 0;
3315 fri.trap = 0;
3316 if (res.fa_head) {
3317 struct fib_alias *fa;
3318
3319 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3320 u8 slen = 32 - fri.dst_len;
3321
3322 if (fa->fa_slen == slen &&
3323 fa->tb_id == fri.tb_id &&
3324 fa->fa_tos == fri.tos &&
3325 fa->fa_info == res.fi &&
3326 fa->fa_type == fri.type) {
3327 fri.offload = fa->offload;
3328 fri.trap = fa->trap;
3329 break;
3330 }
3331 }
3332 }
3333 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3334 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3335 } else {
3336 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3337 NETLINK_CB(in_skb).portid,
3338 nlh->nlmsg_seq, 0);
3339 }
3340 if (err < 0)
3341 goto errout_rcu;
3342
3343 rcu_read_unlock();
3344
3345 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3346
3347 errout_free:
3348 return err;
3349 errout_rcu:
3350 rcu_read_unlock();
3351 kfree_skb(skb);
3352 goto errout_free;
3353 }
3354
3355 void ip_rt_multicast_event(struct in_device *in_dev)
3356 {
3357 rt_cache_flush(dev_net(in_dev->dev));
3358 }
3359
3360 #ifdef CONFIG_SYSCTL
3361 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3362 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3363 static int ip_rt_gc_elasticity __read_mostly = 8;
3364 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3365
3366 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3367 void *buffer, size_t *lenp, loff_t *ppos)
3368 {
3369 struct net *net = (struct net *)__ctl->extra1;
3370
3371 if (write) {
3372 rt_cache_flush(net);
3373 fnhe_genid_bump(net);
3374 return 0;
3375 }
3376
3377 return -EINVAL;
3378 }
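
/* Example: any write to the (write-only) flush entry triggers the handler
 * above, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * which flushes the cached routes for the netns and bumps the fnhe genid.
 */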
3379
3380 static struct ctl_table ipv4_route_table[] = {
3381 {
3382 .procname = "gc_thresh",
3383 .data = &ipv4_dst_ops.gc_thresh,
3384 .maxlen = sizeof(int),
3385 .mode = 0644,
3386 .proc_handler = proc_dointvec,
3387 },
3388 {
3389 .procname = "max_size",
3390 .data = &ip_rt_max_size,
3391 .maxlen = sizeof(int),
3392 .mode = 0644,
3393 .proc_handler = proc_dointvec,
3394 },
3395 {
3396 /* Deprecated. Use gc_min_interval_ms */
3397
3398 .procname = "gc_min_interval",
3399 .data = &ip_rt_gc_min_interval,
3400 .maxlen = sizeof(int),
3401 .mode = 0644,
3402 .proc_handler = proc_dointvec_jiffies,
3403 },
3404 {
3405 .procname = "gc_min_interval_ms",
3406 .data = &ip_rt_gc_min_interval,
3407 .maxlen = sizeof(int),
3408 .mode = 0644,
3409 .proc_handler = proc_dointvec_ms_jiffies,
3410 },
3411 {
3412 .procname = "gc_timeout",
3413 .data = &ip_rt_gc_timeout,
3414 .maxlen = sizeof(int),
3415 .mode = 0644,
3416 .proc_handler = proc_dointvec_jiffies,
3417 },
3418 {
3419 .procname = "gc_interval",
3420 .data = &ip_rt_gc_interval,
3421 .maxlen = sizeof(int),
3422 .mode = 0644,
3423 .proc_handler = proc_dointvec_jiffies,
3424 },
3425 {
3426 .procname = "redirect_load",
3427 .data = &ip_rt_redirect_load,
3428 .maxlen = sizeof(int),
3429 .mode = 0644,
3430 .proc_handler = proc_dointvec,
3431 },
3432 {
3433 .procname = "redirect_number",
3434 .data = &ip_rt_redirect_number,
3435 .maxlen = sizeof(int),
3436 .mode = 0644,
3437 .proc_handler = proc_dointvec,
3438 },
3439 {
3440 .procname = "redirect_silence",
3441 .data = &ip_rt_redirect_silence,
3442 .maxlen = sizeof(int),
3443 .mode = 0644,
3444 .proc_handler = proc_dointvec,
3445 },
3446 {
3447 .procname = "error_cost",
3448 .data = &ip_rt_error_cost,
3449 .maxlen = sizeof(int),
3450 .mode = 0644,
3451 .proc_handler = proc_dointvec,
3452 },
3453 {
3454 .procname = "error_burst",
3455 .data = &ip_rt_error_burst,
3456 .maxlen = sizeof(int),
3457 .mode = 0644,
3458 .proc_handler = proc_dointvec,
3459 },
3460 {
3461 .procname = "gc_elasticity",
3462 .data = &ip_rt_gc_elasticity,
3463 .maxlen = sizeof(int),
3464 .mode = 0644,
3465 .proc_handler = proc_dointvec,
3466 },
3467 {
3468 .procname = "mtu_expires",
3469 .data = &ip_rt_mtu_expires,
3470 .maxlen = sizeof(int),
3471 .mode = 0644,
3472 .proc_handler = proc_dointvec_jiffies,
3473 },
3474 {
3475 .procname = "min_pmtu",
3476 .data = &ip_rt_min_pmtu,
3477 .maxlen = sizeof(int),
3478 .mode = 0644,
3479 .proc_handler = proc_dointvec_minmax,
3480 .extra1 = &ip_min_valid_pmtu,
3481 },
3482 {
3483 .procname = "min_adv_mss",
3484 .data = &ip_rt_min_advmss,
3485 .maxlen = sizeof(int),
3486 .mode = 0644,
3487 .proc_handler = proc_dointvec,
3488 },
3489 { }
3490 };
3491
3492 static const char ipv4_route_flush_procname[] = "flush";
3493
3494 static struct ctl_table ipv4_route_flush_table[] = {
3495 {
3496 .procname = ipv4_route_flush_procname,
3497 .maxlen = sizeof(int),
3498 .mode = 0200,
3499 .proc_handler = ipv4_sysctl_rtcache_flush,
3500 },
3501 { },
3502 };
3503
3504 static __net_init int sysctl_route_net_init(struct net *net)
3505 {
3506 struct ctl_table *tbl;
3507
3508 tbl = ipv4_route_flush_table;
3509 if (!net_eq(net, &init_net)) {
3510 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3511 if (!tbl)
3512 goto err_dup;
3513
3514 /* Don't export non-whitelisted sysctls to unprivileged users */
3515 if (net->user_ns != &init_user_ns) {
3516 if (tbl[0].procname != ipv4_route_flush_procname)
3517 tbl[0].procname = NULL;
3518 }
3519 }
3520 tbl[0].extra1 = net;
3521
3522 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3523 if (!net->ipv4.route_hdr)
3524 goto err_reg;
3525 return 0;
3526
3527 err_reg:
3528 if (tbl != ipv4_route_flush_table)
3529 kfree(tbl);
3530 err_dup:
3531 return -ENOMEM;
3532 }
3533
3534 static __net_exit void sysctl_route_net_exit(struct net *net)
3535 {
3536 struct ctl_table *tbl;
3537
3538 tbl = net->ipv4.route_hdr->ctl_table_arg;
3539 unregister_net_sysctl_table(net->ipv4.route_hdr);
3540 BUG_ON(tbl == ipv4_route_flush_table);
3541 kfree(tbl);
3542 }
3543
3544 static __net_initdata struct pernet_operations sysctl_route_ops = {
3545 .init = sysctl_route_net_init,
3546 .exit = sysctl_route_net_exit,
3547 };
3548 #endif
3549
3550 static __net_init int rt_genid_init(struct net *net)
3551 {
3552 atomic_set(&net->ipv4.rt_genid, 0);
3553 atomic_set(&net->fnhe_genid, 0);
3554 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3555 return 0;
3556 }
3557
3558 static __net_initdata struct pernet_operations rt_genid_ops = {
3559 .init = rt_genid_init,
3560 };
3561
3562 static int __net_init ipv4_inetpeer_init(struct net *net)
3563 {
3564 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3565
3566 if (!bp)
3567 return -ENOMEM;
3568 inet_peer_base_init(bp);
3569 net->ipv4.peers = bp;
3570 return 0;
3571 }
3572
3573 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3574 {
3575 struct inet_peer_base *bp = net->ipv4.peers;
3576
3577 net->ipv4.peers = NULL;
3578 inetpeer_invalidate_tree(bp);
3579 kfree(bp);
3580 }
3581
3582 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3583 .init = ipv4_inetpeer_init,
3584 .exit = ipv4_inetpeer_exit,
3585 };
3586
3587 #ifdef CONFIG_IP_ROUTE_CLASSID
3588 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3589 #endif /* CONFIG_IP_ROUTE_CLASSID */
3590
3591 int __init ip_rt_init(void)
3592 {
3593 void *idents_hash;
3594 int cpu;
3595
3596 /* For modern hosts, this will use 2 MB of memory */
3597 idents_hash = alloc_large_system_hash("IP idents",
3598 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3599 0,
3600 16, /* one bucket per 64 KB */
3601 HASH_ZERO,
3602 NULL,
3603 &ip_idents_mask,
3604 2048,
3605 256*1024);
3606
3607 ip_idents = idents_hash;
3608
3609 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3610
3611 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3612
3613 for_each_possible_cpu(cpu) {
3614 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3615
3616 INIT_LIST_HEAD(&ul->head);
3617 spin_lock_init(&ul->lock);
3618 }
3619 #ifdef CONFIG_IP_ROUTE_CLASSID
3620 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3621 if (!ip_rt_acct)
3622 panic("IP: failed to allocate ip_rt_acct\n");
3623 #endif
3624
3625 ipv4_dst_ops.kmem_cachep =
3626 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3627 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3628
3629 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3630
3631 if (dst_entries_init(&ipv4_dst_ops) < 0)
3632 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3633
3634 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3635 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3636
3637 ipv4_dst_ops.gc_thresh = ~0;
3638 ip_rt_max_size = INT_MAX;
3639
3640 devinet_init();
3641 ip_fib_init();
3642
3643 if (ip_rt_proc_init())
3644 pr_err("Unable to create route proc files\n");
3645 #ifdef CONFIG_XFRM
3646 xfrm_init();
3647 xfrm4_init();
3648 #endif
3649 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3650 RTNL_FLAG_DOIT_UNLOCKED);
3651
3652 #ifdef CONFIG_SYSCTL
3653 register_pernet_subsys(&sysctl_route_ops);
3654 #endif
3655 register_pernet_subsys(&rt_genid_ops);
3656 register_pernet_subsys(&ipv4_inetpeer_ops);
3657 return 0;
3658 }
3659
3660 #ifdef CONFIG_SYSCTL
3661 /*
3662 * We really need to sanitize the damn ipv4 init order, then all
3663 * this nonsense will go away.
3664 */
3665 void __init ip_static_sysctl_init(void)
3666 {
3667 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3668 }
3669 #endif
3670