1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-or-later
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun * INET An implementation of the TCP/IP protocol suite for the LINUX
4*4882a593Smuzhiyun * operating system. INET is implemented using the BSD Socket
5*4882a593Smuzhiyun * interface as the means of communication with the user level.
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * ROUTE - implementation of the IP router.
8*4882a593Smuzhiyun *
9*4882a593Smuzhiyun * Authors: Ross Biro
10*4882a593Smuzhiyun * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11*4882a593Smuzhiyun * Alan Cox, <gw4pts@gw4pts.ampr.org>
12*4882a593Smuzhiyun * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13*4882a593Smuzhiyun * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14*4882a593Smuzhiyun *
15*4882a593Smuzhiyun * Fixes:
16*4882a593Smuzhiyun * Alan Cox : Verify area fixes.
17*4882a593Smuzhiyun * Alan Cox : cli() protects routing changes
18*4882a593Smuzhiyun * Rui Oliveira : ICMP routing table updates
19*4882a593Smuzhiyun * (rco@di.uminho.pt) Routing table insertion and update
20*4882a593Smuzhiyun * Linus Torvalds : Rewrote bits to be sensible
21*4882a593Smuzhiyun * Alan Cox : Added BSD route gw semantics
22*4882a593Smuzhiyun * Alan Cox : Super /proc >4K
23*4882a593Smuzhiyun * Alan Cox : MTU in route table
24*4882a593Smuzhiyun * Alan Cox : MSS actually. Also added the window
25*4882a593Smuzhiyun * clamper.
26*4882a593Smuzhiyun * Sam Lantinga : Fixed route matching in rt_del()
27*4882a593Smuzhiyun * Alan Cox : Routing cache support.
28*4882a593Smuzhiyun * Alan Cox : Removed compatibility cruft.
29*4882a593Smuzhiyun * Alan Cox : RTF_REJECT support.
30*4882a593Smuzhiyun * Alan Cox : TCP irtt support.
31*4882a593Smuzhiyun * Jonathan Naylor : Added Metric support.
32*4882a593Smuzhiyun * Miquel van Smoorenburg : BSD API fixes.
33*4882a593Smuzhiyun * Miquel van Smoorenburg : Metrics.
34*4882a593Smuzhiyun * Alan Cox : Use __u32 properly
35*4882a593Smuzhiyun * Alan Cox : Aligned routing errors more closely with BSD
36*4882a593Smuzhiyun * our system is still very different.
37*4882a593Smuzhiyun * Alan Cox : Faster /proc handling
38*4882a593Smuzhiyun * Alexey Kuznetsov : Massive rework to support tree based routing,
39*4882a593Smuzhiyun * routing caches and better behaviour.
40*4882a593Smuzhiyun *
41*4882a593Smuzhiyun * Olaf Erb : irtt wasn't being copied right.
42*4882a593Smuzhiyun * Bjorn Ekwall : Kerneld route support.
43*4882a593Smuzhiyun * Alan Cox : Multicast fixed (I hope)
44*4882a593Smuzhiyun * Pavel Krauz : Limited broadcast fixed
45*4882a593Smuzhiyun * Mike McLagan : Routing by source
46*4882a593Smuzhiyun * Alexey Kuznetsov : End of old history. Split to fib.c and
47*4882a593Smuzhiyun * route.c and rewritten from scratch.
48*4882a593Smuzhiyun * Andi Kleen : Load-limit warning messages.
49*4882a593Smuzhiyun * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50*4882a593Smuzhiyun * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51*4882a593Smuzhiyun * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52*4882a593Smuzhiyun * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53*4882a593Smuzhiyun * Marc Boucher : routing by fwmark
54*4882a593Smuzhiyun * Robert Olsson : Added rt_cache statistics
55*4882a593Smuzhiyun * Arnaldo C. Melo : Convert proc stuff to seq_file
56*4882a593Smuzhiyun * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57*4882a593Smuzhiyun * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58*4882a593Smuzhiyun * Ilia Sotnikov : Removed TOS from hash calculations
59*4882a593Smuzhiyun */
60*4882a593Smuzhiyun
61*4882a593Smuzhiyun #define pr_fmt(fmt) "IPv4: " fmt
62*4882a593Smuzhiyun
63*4882a593Smuzhiyun #include <linux/module.h>
64*4882a593Smuzhiyun #include <linux/uaccess.h>
65*4882a593Smuzhiyun #include <linux/bitops.h>
66*4882a593Smuzhiyun #include <linux/types.h>
67*4882a593Smuzhiyun #include <linux/kernel.h>
68*4882a593Smuzhiyun #include <linux/mm.h>
69*4882a593Smuzhiyun #include <linux/memblock.h>
70*4882a593Smuzhiyun #include <linux/string.h>
71*4882a593Smuzhiyun #include <linux/socket.h>
72*4882a593Smuzhiyun #include <linux/sockios.h>
73*4882a593Smuzhiyun #include <linux/errno.h>
74*4882a593Smuzhiyun #include <linux/in.h>
75*4882a593Smuzhiyun #include <linux/inet.h>
76*4882a593Smuzhiyun #include <linux/netdevice.h>
77*4882a593Smuzhiyun #include <linux/proc_fs.h>
78*4882a593Smuzhiyun #include <linux/init.h>
79*4882a593Smuzhiyun #include <linux/skbuff.h>
80*4882a593Smuzhiyun #include <linux/inetdevice.h>
81*4882a593Smuzhiyun #include <linux/igmp.h>
82*4882a593Smuzhiyun #include <linux/pkt_sched.h>
83*4882a593Smuzhiyun #include <linux/mroute.h>
84*4882a593Smuzhiyun #include <linux/netfilter_ipv4.h>
85*4882a593Smuzhiyun #include <linux/random.h>
86*4882a593Smuzhiyun #include <linux/rcupdate.h>
87*4882a593Smuzhiyun #include <linux/times.h>
88*4882a593Smuzhiyun #include <linux/slab.h>
89*4882a593Smuzhiyun #include <linux/jhash.h>
90*4882a593Smuzhiyun #include <net/dst.h>
91*4882a593Smuzhiyun #include <net/dst_metadata.h>
92*4882a593Smuzhiyun #include <net/net_namespace.h>
93*4882a593Smuzhiyun #include <net/protocol.h>
94*4882a593Smuzhiyun #include <net/ip.h>
95*4882a593Smuzhiyun #include <net/route.h>
96*4882a593Smuzhiyun #include <net/inetpeer.h>
97*4882a593Smuzhiyun #include <net/sock.h>
98*4882a593Smuzhiyun #include <net/ip_fib.h>
99*4882a593Smuzhiyun #include <net/nexthop.h>
100*4882a593Smuzhiyun #include <net/arp.h>
101*4882a593Smuzhiyun #include <net/tcp.h>
102*4882a593Smuzhiyun #include <net/icmp.h>
103*4882a593Smuzhiyun #include <net/xfrm.h>
104*4882a593Smuzhiyun #include <net/lwtunnel.h>
105*4882a593Smuzhiyun #include <net/netevent.h>
106*4882a593Smuzhiyun #include <net/rtnetlink.h>
107*4882a593Smuzhiyun #ifdef CONFIG_SYSCTL
108*4882a593Smuzhiyun #include <linux/sysctl.h>
109*4882a593Smuzhiyun #endif
110*4882a593Smuzhiyun #include <net/secure_seq.h>
111*4882a593Smuzhiyun #include <net/ip_tunnels.h>
112*4882a593Smuzhiyun #include <net/l3mdev.h>
113*4882a593Smuzhiyun
114*4882a593Smuzhiyun #include "fib_lookup.h"
115*4882a593Smuzhiyun
116*4882a593Smuzhiyun #define RT_FL_TOS(oldflp4) \
117*4882a593Smuzhiyun ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118*4882a593Smuzhiyun
119*4882a593Smuzhiyun #define RT_GC_TIMEOUT (300*HZ)
120*4882a593Smuzhiyun
121*4882a593Smuzhiyun static int ip_rt_max_size;
122*4882a593Smuzhiyun static int ip_rt_redirect_number __read_mostly = 9;
123*4882a593Smuzhiyun static int ip_rt_redirect_load __read_mostly = HZ / 50;
124*4882a593Smuzhiyun static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125*4882a593Smuzhiyun static int ip_rt_error_cost __read_mostly = HZ;
126*4882a593Smuzhiyun static int ip_rt_error_burst __read_mostly = 5 * HZ;
127*4882a593Smuzhiyun static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128*4882a593Smuzhiyun static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
129*4882a593Smuzhiyun static int ip_rt_min_advmss __read_mostly = 256;
130*4882a593Smuzhiyun
131*4882a593Smuzhiyun static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132*4882a593Smuzhiyun
133*4882a593Smuzhiyun /*
134*4882a593Smuzhiyun * Interface to generic destination cache.
135*4882a593Smuzhiyun */
136*4882a593Smuzhiyun
137*4882a593Smuzhiyun static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138*4882a593Smuzhiyun static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139*4882a593Smuzhiyun static unsigned int ipv4_mtu(const struct dst_entry *dst);
140*4882a593Smuzhiyun static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141*4882a593Smuzhiyun static void ipv4_link_failure(struct sk_buff *skb);
142*4882a593Smuzhiyun static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143*4882a593Smuzhiyun struct sk_buff *skb, u32 mtu,
144*4882a593Smuzhiyun bool confirm_neigh);
145*4882a593Smuzhiyun static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146*4882a593Smuzhiyun struct sk_buff *skb);
147*4882a593Smuzhiyun static void ipv4_dst_destroy(struct dst_entry *dst);
148*4882a593Smuzhiyun
/* dst_ops->cow_metrics hook.  IPv4 routes never copy-on-write their
 * metrics: they either share the FIB entry's metric block or get a
 * private one when the route is created, so this must be unreachable.
 * Warn loudly if it is ever called.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
154*4882a593Smuzhiyun
155*4882a593Smuzhiyun static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156*4882a593Smuzhiyun struct sk_buff *skb,
157*4882a593Smuzhiyun const void *daddr);
158*4882a593Smuzhiyun static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159*4882a593Smuzhiyun
/* Destination-cache operations wired into every IPv4 struct rtable. */
static struct dst_ops ipv4_dst_ops = {
	.family = AF_INET,
	.check = ipv4_dst_check,
	.default_advmss = ipv4_default_advmss,
	.mtu = ipv4_mtu,
	.cow_metrics = ipv4_cow_metrics,
	.destroy = ipv4_dst_destroy,
	.negative_advice = ipv4_negative_advice,
	.link_failure = ipv4_link_failure,
	.update_pmtu = ip_rt_update_pmtu,
	.redirect = ip_do_redirect,
	.local_out = __ip_local_out,
	.neigh_lookup = ipv4_neigh_lookup,
	.confirm_neigh = ipv4_confirm_neigh,
};
175*4882a593Smuzhiyun
/* ECN bit set does not change the priority class here. */
#define ECN_OR_COST(class) TC_PRIO_##class

/* Map the 4 IP TOS precedence/type-of-service bits (table index) to a
 * traffic-control priority band.  Entries alternate plain / ECN_OR_COST
 * because the lowest TOS bit is the ECN-capable bit.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
197*4882a593Smuzhiyun
198*4882a593Smuzhiyun static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199*4882a593Smuzhiyun #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200*4882a593Smuzhiyun
201*4882a593Smuzhiyun #ifdef CONFIG_PROC_FS
/* The legacy route cache no longer exists; the seq iteration produces
 * only the header record, at position zero.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? NULL : SEQ_START_TOKEN;
}
208*4882a593Smuzhiyun
/* Nothing follows the header record. */
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;
	return NULL;
}
214*4882a593Smuzhiyun
/* No locks or resources are held across the iteration; nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
218*4882a593Smuzhiyun
rt_cache_seq_show(struct seq_file * seq,void * v)219*4882a593Smuzhiyun static int rt_cache_seq_show(struct seq_file *seq, void *v)
220*4882a593Smuzhiyun {
221*4882a593Smuzhiyun if (v == SEQ_START_TOKEN)
222*4882a593Smuzhiyun seq_printf(seq, "%-127s\n",
223*4882a593Smuzhiyun "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224*4882a593Smuzhiyun "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225*4882a593Smuzhiyun "HHUptod\tSpecDst");
226*4882a593Smuzhiyun return 0;
227*4882a593Smuzhiyun }
228*4882a593Smuzhiyun
/* seq_file iterator for /proc/net/rt_cache (header-only, see above). */
static const struct seq_operations rt_cache_seq_ops = {
	.start = rt_cache_seq_start,
	.next = rt_cache_seq_next,
	.stop = rt_cache_seq_stop,
	.show = rt_cache_seq_show,
};
235*4882a593Smuzhiyun
/* open() handler for /proc/net/rt_cache: plain seq_file, no private state. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}
240*4882a593Smuzhiyun
/* proc file operations for /proc/net/rt_cache. */
static const struct proc_ops rt_cache_proc_ops = {
	.proc_open = rt_cache_seq_open,
	.proc_read = seq_read,
	.proc_lseek = seq_lseek,
	.proc_release = seq_release,
};
247*4882a593Smuzhiyun
248*4882a593Smuzhiyun
/* Start iterating the per-CPU rt_cache statistics.  Position 0 is the
 * header; position N+1 maps to CPU N so that ->next can resume from *pos.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	cpu = *pos - 1;
	while (cpu < nr_cpu_ids) {
		if (cpu_possible(cpu)) {
			*pos = cpu + 1;
			return &per_cpu(rt_cache_stat, cpu);
		}
		cpu++;
	}
	return NULL;
}
264*4882a593Smuzhiyun
/* Advance to the next possible CPU's statistics, or end the sequence. */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu = *pos;

	while (cpu < nr_cpu_ids) {
		if (cpu_possible(cpu)) {
			*pos = cpu + 1;
			return &per_cpu(rt_cache_stat, cpu);
		}
		cpu++;
	}
	(*pos)++;
	return NULL;
}
279*4882a593Smuzhiyun
/* Per-CPU stats need no cleanup after iteration. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
284*4882a593Smuzhiyun
/* Print one /proc/net/stat/rt_cache record: the header at SEQ_START_TOKEN,
 * otherwise one CPU's counters.  Several columns are emitted as literal 0
 * because their counters were removed with the old route cache, but the
 * column layout is kept for userspace compatibility.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	/* "entries" is global (dst count), not per-CPU. */
	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}
318*4882a593Smuzhiyun
/* seq_file iterator for /proc/net/stat/rt_cache (per-CPU statistics). */
static const struct seq_operations rt_cpu_seq_ops = {
	.start = rt_cpu_seq_start,
	.next = rt_cpu_seq_next,
	.stop = rt_cpu_seq_stop,
	.show = rt_cpu_seq_show,
};
325*4882a593Smuzhiyun
326*4882a593Smuzhiyun
/* open() handler for /proc/net/stat/rt_cache. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
331*4882a593Smuzhiyun
/* proc file operations for /proc/net/stat/rt_cache. */
static const struct proc_ops rt_cpu_proc_ops = {
	.proc_open = rt_cpu_seq_open,
	.proc_read = seq_read,
	.proc_lseek = seq_lseek,
	.proc_release = seq_release,
};
338*4882a593Smuzhiyun
339*4882a593Smuzhiyun #ifdef CONFIG_IP_ROUTE_CLASSID
rt_acct_proc_show(struct seq_file * m,void * v)340*4882a593Smuzhiyun static int rt_acct_proc_show(struct seq_file *m, void *v)
341*4882a593Smuzhiyun {
342*4882a593Smuzhiyun struct ip_rt_acct *dst, *src;
343*4882a593Smuzhiyun unsigned int i, j;
344*4882a593Smuzhiyun
345*4882a593Smuzhiyun dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346*4882a593Smuzhiyun if (!dst)
347*4882a593Smuzhiyun return -ENOMEM;
348*4882a593Smuzhiyun
349*4882a593Smuzhiyun for_each_possible_cpu(i) {
350*4882a593Smuzhiyun src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351*4882a593Smuzhiyun for (j = 0; j < 256; j++) {
352*4882a593Smuzhiyun dst[j].o_bytes += src[j].o_bytes;
353*4882a593Smuzhiyun dst[j].o_packets += src[j].o_packets;
354*4882a593Smuzhiyun dst[j].i_bytes += src[j].i_bytes;
355*4882a593Smuzhiyun dst[j].i_packets += src[j].i_packets;
356*4882a593Smuzhiyun }
357*4882a593Smuzhiyun }
358*4882a593Smuzhiyun
359*4882a593Smuzhiyun seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360*4882a593Smuzhiyun kfree(dst);
361*4882a593Smuzhiyun return 0;
362*4882a593Smuzhiyun }
363*4882a593Smuzhiyun #endif
364*4882a593Smuzhiyun
/* Per-namespace setup of the routing proc entries:
 *   /proc/net/rt_cache, /proc/net/stat/rt_cache and (with
 *   CONFIG_IP_ROUTE_CLASSID) /proc/net/rt_acct.
 * Uses the classic goto ladder so a failure tears down exactly the
 * entries created so far, in reverse order.
 * Returns 0 on success, -ENOMEM if any entry could not be created.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_proc_ops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_proc_ops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
396*4882a593Smuzhiyun
/* Per-namespace teardown: remove every proc entry created by
 * ip_rt_do_proc_init().
 */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
405*4882a593Smuzhiyun
/* Hook the proc setup/teardown into network-namespace lifetime. */
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
410*4882a593Smuzhiyun
/* Register the per-namespace proc entries at boot. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
415*4882a593Smuzhiyun
416*4882a593Smuzhiyun #else
/* No-op stub when procfs support is compiled out. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
421*4882a593Smuzhiyun #endif /* CONFIG_PROC_FS */
422*4882a593Smuzhiyun
/* A cached route is stale once its generation id no longer matches the
 * namespace's current IPv4 route generation (bumped by rt_cache_flush()).
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}
427*4882a593Smuzhiyun
/* Invalidate all cached IPv4 routes in @net at once by bumping the
 * generation counter; stale entries are then detected lazily via
 * rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
432*4882a593Smuzhiyun
/* dst_ops->neigh_lookup hook: find (or create) the neighbour entry for
 * this route's next hop.  The next hop is the configured gateway when the
 * route has one (IPv4 or IPv6 family), otherwise the destination address
 * taken from @skb's IP header or from @daddr.
 * Returns a referenced neighbour, NULL if the entry is being torn down,
 * or an ERR_PTR from the creation path.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock_bh();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	/* Pin the entry before leaving the RCU-bh section; if its refcount
	 * already dropped to zero it is going away, so report no neighbour.
	 */
	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock_bh();

	return n;
}
461*4882a593Smuzhiyun
/* dst_ops->confirm_neigh hook: mark the next-hop neighbour as recently
 * confirmed reachable.  Confirms the gateway when the route has one;
 * otherwise confirms @daddr itself, except for multicast/broadcast/local
 * routes (or a missing @daddr), where there is no unicast neighbour to
 * confirm.
 */
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
479*4882a593Smuzhiyun
480*4882a593Smuzhiyun /* Hash tables of size 2048..262144 depending on RAM size.
481*4882a593Smuzhiyun * Each bucket uses 8 bytes.
482*4882a593Smuzhiyun */
483*4882a593Smuzhiyun static u32 ip_idents_mask __read_mostly;
484*4882a593Smuzhiyun static atomic_t *ip_idents __read_mostly;
485*4882a593Smuzhiyun static u32 *ip_tstamps __read_mostly;
486*4882a593Smuzhiyun
487*4882a593Smuzhiyun /* In order to protect privacy, we add a perturbation to identifiers
488*4882a593Smuzhiyun * if one generator is seldom used. This makes hard for an attacker
489*4882a593Smuzhiyun * to infer how many packets were sent between two points in time.
490*4882a593Smuzhiyun */
/* Reserve @segs IP identification values from the generator bucket
 * selected by @hash, returning the first reserved id.
 * When the bucket has been idle, a random perturbation bounded by the
 * idle time (in jiffies) is added so an observer cannot count packets
 * sent between two samples; the cmpxchg ensures only one CPU applies
 * the perturbation per timestamp update.  Lock-free by design.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 bucket, old, now = (u32)jiffies;
	atomic_t *p_id;
	u32 *p_tstamp;
	u32 delta = 0;

	bucket = hash & ip_idents_mask;
	p_tstamp = ip_tstamps + bucket;
	p_id = ip_idents + bucket;
	old = READ_ONCE(*p_tstamp);

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* If UBSAN reports an error there, please make sure your compiler
	 * supports -fno-strict-overflow before reporting it that was a bug
	 * in UBSAN, and it has been fixed in GCC-8.
	 */
	return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
512*4882a593Smuzhiyun EXPORT_SYMBOL(ip_idents_reserve);
513*4882a593Smuzhiyun
/* Choose the IP header identification for a packet (or a GSO train of
 * @segs segments).  The generator bucket is picked by a keyed siphash of
 * (daddr, saddr, protocol) so flows map to stable buckets while the
 * mapping stays unpredictable to outsiders.
 */
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
530*4882a593Smuzhiyun EXPORT_SYMBOL(__ip_select_ident);
531*4882a593Smuzhiyun
ip_rt_fix_tos(struct flowi4 * fl4)532*4882a593Smuzhiyun static void ip_rt_fix_tos(struct flowi4 *fl4)
533*4882a593Smuzhiyun {
534*4882a593Smuzhiyun __u8 tos = RT_FL_TOS(fl4);
535*4882a593Smuzhiyun
536*4882a593Smuzhiyun fl4->flowi4_tos = tos & IPTOS_RT_MASK;
537*4882a593Smuzhiyun fl4->flowi4_scope = tos & RTO_ONLINK ?
538*4882a593Smuzhiyun RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
539*4882a593Smuzhiyun }
540*4882a593Smuzhiyun
/* Build an output flow key from an IP header plus caller-supplied oif/
 * tos/prot/mark.  When a socket is given, its bound device, mark, TOS
 * and protocol override the packet-derived values so the lookup matches
 * what the socket itself would generate.
 */
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}
561*4882a593Smuzhiyun
build_skb_flow_key(struct flowi4 * fl4,const struct sk_buff * skb,const struct sock * sk)562*4882a593Smuzhiyun static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
563*4882a593Smuzhiyun const struct sock *sk)
564*4882a593Smuzhiyun {
565*4882a593Smuzhiyun const struct net *net = dev_net(skb->dev);
566*4882a593Smuzhiyun const struct iphdr *iph = ip_hdr(skb);
567*4882a593Smuzhiyun int oif = skb->dev->ifindex;
568*4882a593Smuzhiyun u8 tos = RT_TOS(iph->tos);
569*4882a593Smuzhiyun u8 prot = iph->protocol;
570*4882a593Smuzhiyun u32 mark = skb->mark;
571*4882a593Smuzhiyun
572*4882a593Smuzhiyun __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
573*4882a593Smuzhiyun }
574*4882a593Smuzhiyun
/* Derive a flow key purely from a connected socket's state.  If the
 * socket carries a strict/loose source-route option, the first hop
 * (faddr) replaces the destination; the options are read under RCU
 * because they can be swapped concurrently via setsockopt().
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}
592*4882a593Smuzhiyun
/* Build a flow key from whichever source is available: the packet when
 * there is one, otherwise the socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb)
		build_sk_flow_key(fl4, sk);
	else
		build_skb_flow_key(fl4, skb, sk);
}
601*4882a593Smuzhiyun
602*4882a593Smuzhiyun static DEFINE_SPINLOCK(fnhe_lock);
603*4882a593Smuzhiyun
fnhe_flush_routes(struct fib_nh_exception * fnhe)604*4882a593Smuzhiyun static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
605*4882a593Smuzhiyun {
606*4882a593Smuzhiyun struct rtable *rt;
607*4882a593Smuzhiyun
608*4882a593Smuzhiyun rt = rcu_dereference(fnhe->fnhe_rth_input);
609*4882a593Smuzhiyun if (rt) {
610*4882a593Smuzhiyun RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
611*4882a593Smuzhiyun dst_dev_put(&rt->dst);
612*4882a593Smuzhiyun dst_release(&rt->dst);
613*4882a593Smuzhiyun }
614*4882a593Smuzhiyun rt = rcu_dereference(fnhe->fnhe_rth_output);
615*4882a593Smuzhiyun if (rt) {
616*4882a593Smuzhiyun RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
617*4882a593Smuzhiyun dst_dev_put(&rt->dst);
618*4882a593Smuzhiyun dst_release(&rt->dst);
619*4882a593Smuzhiyun }
620*4882a593Smuzhiyun }
621*4882a593Smuzhiyun
/* Evict the least-recently-stamped exception from a hash bucket chain to
 * make room for a new one.  Caller holds fnhe_lock.
 * NOTE(review): assumes the chain is non-empty (callers invoke this only
 * when the chain depth hit its reclaim limit) — if it were empty, oldest
 * would be NULL and oldest_p uninitialized; confirm against callers.
 */
static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
	struct fib_nh_exception *fnhe, *oldest = NULL;

	/* Walk the chain keeping a pointer to the link slot of the oldest
	 * entry so it can be unlinked without re-walking.
	 */
	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
		fnhe = rcu_dereference_protected(*fnhe_p,
						 lockdep_is_held(&fnhe_lock));
		if (!fnhe)
			break;
		if (!oldest ||
		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
			oldest = fnhe;
			oldest_p = fnhe_p;
		}
	}
	/* Drop the cached routes first, then unlink and free after a grace
	 * period so concurrent RCU readers stay safe.
	 */
	fnhe_flush_routes(oldest);
	*oldest_p = oldest->fnhe_next;
	kfree_rcu(oldest, rcu);
}
642*4882a593Smuzhiyun
/* Hash a destination address into an fnhe bucket index.  Keyed with a
 * boot-time random siphash key so remote peers cannot force collisions.
 */
static u32 fnhe_hashfun(__be32 daddr)
{
	static siphash_key_t fnhe_hash_key __read_mostly;
	u64 hval;

	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
	return hash_64(hval, FNHE_HASH_SHIFT);
}
652*4882a593Smuzhiyun
fill_route_from_fnhe(struct rtable * rt,struct fib_nh_exception * fnhe)653*4882a593Smuzhiyun static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
654*4882a593Smuzhiyun {
655*4882a593Smuzhiyun rt->rt_pmtu = fnhe->fnhe_pmtu;
656*4882a593Smuzhiyun rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
657*4882a593Smuzhiyun rt->dst.expires = fnhe->fnhe_expires;
658*4882a593Smuzhiyun
659*4882a593Smuzhiyun if (fnhe->fnhe_gw) {
660*4882a593Smuzhiyun rt->rt_flags |= RTCF_REDIRECTED;
661*4882a593Smuzhiyun rt->rt_uses_gateway = 1;
662*4882a593Smuzhiyun rt->rt_gw_family = AF_INET;
663*4882a593Smuzhiyun rt->rt_gw4 = fnhe->fnhe_gw;
664*4882a593Smuzhiyun }
665*4882a593Smuzhiyun }
666*4882a593Smuzhiyun
/* Create a nexthop exception for @daddr, or refresh an existing one.
 *
 * An exception records redirect gateway (@gw) and/or PMTU (@pmtu with
 * @lock) state learned from ICMP, with @expires bounding its lifetime.
 * All writers serialize on fnhe_lock; readers traverse the chains under
 * RCU, which dictates the rcu_assign_pointer() publication order below.
 */
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	/* Allocate the per-nexthop hash table on first use. */
	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	/* Look for an existing exception for this destination, counting
	 * chain length as we go so overlong chains can be reclaimed.
	 */
	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		/* Refresh in place; gw/pmtu of 0 mean "leave unchanged". */
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		/* max(1UL, ...): presumably keeps a real expiry distinct
		 * from the 0 "unset" value — TODO confirm against readers.
		 */
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		/* Randomize max depth to avoid some side channels attacks. */
		int max_depth = FNHE_RECLAIM_DEPTH +
				prandom_u32_max(FNHE_RECLAIM_DEPTH);

		while (depth > max_depth) {
			fnhe_remove_oldest(hash);
			depth--;
		}

		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
		if (!fnhe)
			goto out_unlock;

		fnhe->fnhe_next = hash->chain;

		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Publish only after the entry is fully initialized. */
		rcu_assign_pointer(hash->chain, fnhe);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;

			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
765*4882a593Smuzhiyun
/* Core ICMP-redirect processing for route @rt.
 *
 * Validates the redirect (ICMP code, that it came from our current
 * gateway, per-device policy and the advertised new gateway), resolves
 * the new gateway in the ARP table and, if all checks pass, records it
 * as a nexthop exception.  With @kill_route the current cached route is
 * additionally marked obsolete so it gets revalidated.
 *
 * Uses __in_dev_get_rcu(), so callers must hold rcu_read_lock().
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	/* Only the four defined redirect codes are acted upon. */
	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* A redirect is only credible if it was sent by the gateway this
	 * route currently uses.
	 */
	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Drop no-op redirects, policy-disabled devices and gateways that
	 * cannot be unicast nexthops.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* New gateway must be on-link relative to the old one ... */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		/* ... and, with secure_redirects, be a known default gw. */
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			/* Gateway not resolved yet: kick resolution and
			 * ignore this redirect for now.
			 */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc;

				fib_select_path(net, &res, fl4, skb);
				nhc = FIB_RES_NHC(res);
				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						      0, false,
						      jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
850*4882a593Smuzhiyun
ip_do_redirect(struct dst_entry * dst,struct sock * sk,struct sk_buff * skb)851*4882a593Smuzhiyun static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
852*4882a593Smuzhiyun {
853*4882a593Smuzhiyun struct rtable *rt;
854*4882a593Smuzhiyun struct flowi4 fl4;
855*4882a593Smuzhiyun const struct iphdr *iph = (const struct iphdr *) skb->data;
856*4882a593Smuzhiyun struct net *net = dev_net(skb->dev);
857*4882a593Smuzhiyun int oif = skb->dev->ifindex;
858*4882a593Smuzhiyun u8 tos = RT_TOS(iph->tos);
859*4882a593Smuzhiyun u8 prot = iph->protocol;
860*4882a593Smuzhiyun u32 mark = skb->mark;
861*4882a593Smuzhiyun
862*4882a593Smuzhiyun rt = (struct rtable *) dst;
863*4882a593Smuzhiyun
864*4882a593Smuzhiyun __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
865*4882a593Smuzhiyun ip_rt_fix_tos(&fl4);
866*4882a593Smuzhiyun __ip_do_redirect(rt, skb, &fl4, true);
867*4882a593Smuzhiyun }
868*4882a593Smuzhiyun
ipv4_negative_advice(struct dst_entry * dst)869*4882a593Smuzhiyun static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
870*4882a593Smuzhiyun {
871*4882a593Smuzhiyun struct rtable *rt = (struct rtable *)dst;
872*4882a593Smuzhiyun struct dst_entry *ret = dst;
873*4882a593Smuzhiyun
874*4882a593Smuzhiyun if (rt) {
875*4882a593Smuzhiyun if (dst->obsolete > 0) {
876*4882a593Smuzhiyun ip_rt_put(rt);
877*4882a593Smuzhiyun ret = NULL;
878*4882a593Smuzhiyun } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
879*4882a593Smuzhiyun rt->dst.expires) {
880*4882a593Smuzhiyun ip_rt_put(rt);
881*4882a593Smuzhiyun ret = NULL;
882*4882a593Smuzhiyun }
883*4882a593Smuzhiyun }
884*4882a593Smuzhiyun return ret;
885*4882a593Smuzhiyun }
886*4882a593Smuzhiyun
887*4882a593Smuzhiyun /*
888*4882a593Smuzhiyun * Algorithm:
889*4882a593Smuzhiyun * 1. The first ip_rt_redirect_number redirects are sent
890*4882a593Smuzhiyun * with exponential backoff, then we stop sending them at all,
891*4882a593Smuzhiyun * assuming that the host ignores our redirects.
892*4882a593Smuzhiyun * 2. If we did not see packets requiring redirects
893*4882a593Smuzhiyun * during ip_rt_redirect_silence, we assume that the host
894*4882a593Smuzhiyun * forgot redirected route and start to send redirects again.
895*4882a593Smuzhiyun *
896*4882a593Smuzhiyun * This algorithm is much cheaper and more intelligent than dumb load limiting
897*4882a593Smuzhiyun * in icmp.c.
898*4882a593Smuzhiyun *
899*4882a593Smuzhiyun * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
900*4882a593Smuzhiyun * and "frag. need" (breaks PMTU discovery) in icmp.c.
901*4882a593Smuzhiyun */
902*4882a593Smuzhiyun
/* Send an ICMP redirect back towards the source of @skb, applying the
 * per-peer exponential backoff described in the comment above this
 * function.  Peer state (rate_last, n_redirects) lives in the inet_peer
 * cache keyed on the sender's address.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	/* Snapshot what we need, then leave the RCU section. */
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		/* No peer state available: send without rate limiting. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log exactly once, when the give-up threshold is hit. */
		if (log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
969*4882a593Smuzhiyun
/* Input handler for error routes: translate the route's error code into
 * an ICMP destination-unreachable (rate-limited per source via the
 * inet_peer cache), update SNMP counters, and consume the skb.
 * Always returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* On an L3 master (VRF) device, attribute the error to the
	 * original ingress device recorded in the control block.
	 */
	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		/* Not forwarding: only bump counters, never send ICMP. */
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	/* Map the route error to an ICMP code; unknown errors are
	 * silently dropped.
	 */
	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	/* Token bucket: tokens accrue with elapsed jiffies, capped at
	 * ip_rt_error_burst; each ICMP sent costs ip_rt_error_cost.
	 * Without peer state we send unconditionally.
	 */
	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
1045*4882a593Smuzhiyun
/* Lower (never raise) the learned path MTU for @fl4's destination and
 * record it as a nexthop exception valid for ip_rt_mtu_expires jiffies.
 *
 * An MTU below ip_rt_min_pmtu is clamped up to that minimum and the
 * exception is locked, so later ICMP hints cannot shrink it further.
 *
 * Fix: ip_rt_min_pmtu and ip_rt_mtu_expires are sysctls that can be
 * rewritten at any time; read each exactly once with READ_ONCE() to
 * avoid load tearing and inconsistent double reads (data-race fix).
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct net *net = dev_net(dst->dev);
	unsigned long mtu_expires;
	struct fib_result res;
	bool lock = false;
	u32 min_pmtu;
	u32 old_mtu;

	/* A route with a locked MTU ignores ICMP hints entirely. */
	if (ip_mtu_locked(dst))
		return;

	/* PMTU discovery may only shrink the path MTU. */
	old_mtu = ipv4_mtu(dst);
	if (old_mtu < mtu)
		return;

	min_pmtu = READ_ONCE(ip_rt_min_pmtu);
	if (mtu < min_pmtu) {
		lock = true;
		mtu = min(old_mtu, min_pmtu);
	}

	mtu_expires = READ_ONCE(ip_rt_mtu_expires);

	/* Nothing new to record and at least half the lifetime left:
	 * skip the FIB lookup and exception refresh.
	 */
	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(net, fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc;

		fib_select_path(net, &res, fl4, NULL);
		nhc = FIB_RES_NHC(res);
		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + mtu_expires);
	}
	rcu_read_unlock();
}
1081*4882a593Smuzhiyun
/* dst_ops->update_pmtu handler: derive the flow key from the socket or
 * packet and apply the PMTU hint via __ip_rt_update_pmtu().
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu,
			      bool confirm_neigh)
{
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	ip_rt_fix_tos(&fl4);

	/* Don't make lookup fail for bridged encapsulations */
	if (skb && netif_is_any_bridge_port(skb->dev))
		fl4.flowi4_oif = 0;

	__ip_rt_update_pmtu((struct rtable *)dst, &fl4, mtu);
}
1098*4882a593Smuzhiyun
/* Socketless PMTU update: route the embedded header of @skb and, if a
 * route is found, record the new @mtu for its destination.
 */
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos),
			 protocol, IP4_REPLY_MARK(net, skb->mark), 0);
	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;

	__ip_rt_update_pmtu(rt, &fl4, mtu);
	ip_rt_put(rt);
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1116*4882a593Smuzhiyun
/* PMTU update for a socket without (or without touching) a cached
 * route: build the flow from the socket, route it, apply the hint.
 */
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct net *net = sock_net(sk);
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	/* Unmarked flows fall back to the reply mark policy. */
	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(net, skb->mark);

	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;

	__ip_rt_update_pmtu(rt, &fl4, mtu);
	ip_rt_put(rt);
}
1134*4882a593Smuzhiyun
/* Per-socket PMTU update: like __ipv4_sk_update_pmtu(), but operate on
 * the socket's cached route directly, replacing it when the update
 * invalidates it.
 *
 * The socket is bh-locked throughout.  If user context owns the socket
 * (or no route is cached) we must not touch its dst and fall back to a
 * plain routed update.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;		/* true once rt holds a fresh route we own */
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	/* Socket has opted out of PMTU discovery. */
	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);		/* takes a reference; released below */

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		/* Cached route is stale: resolve a fresh one to update. */
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	} else {
		ip_rt_fix_tos(&fl4);
	}

	/* Apply the hint to the ultimate (non-xfrm) path route. */
	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);

	/* The update itself may have invalidated the route; if so,
	 * re-resolve (dropping a fresh route we already own).
	 */
	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* Install the replacement route on the socket. */
	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);	/* balances sk_dst_get(); odst may be NULL */
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1190*4882a593Smuzhiyun
/* Socketless redirect processing: route the embedded header of @skb and
 * feed the redirect to __ip_do_redirect() without killing the route.
 */
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos),
			 protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;

	__ip_do_redirect(rt, skb, &fl4, false);
	ip_rt_put(rt);
}
EXPORT_SYMBOL_GPL(ipv4_redirect);
1207*4882a593Smuzhiyun
ipv4_sk_redirect(struct sk_buff * skb,struct sock * sk)1208*4882a593Smuzhiyun void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1209*4882a593Smuzhiyun {
1210*4882a593Smuzhiyun const struct iphdr *iph = (const struct iphdr *)skb->data;
1211*4882a593Smuzhiyun struct flowi4 fl4;
1212*4882a593Smuzhiyun struct rtable *rt;
1213*4882a593Smuzhiyun struct net *net = sock_net(sk);
1214*4882a593Smuzhiyun
1215*4882a593Smuzhiyun __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1216*4882a593Smuzhiyun rt = __ip_route_output_key(net, &fl4);
1217*4882a593Smuzhiyun if (!IS_ERR(rt)) {
1218*4882a593Smuzhiyun __ip_do_redirect(rt, skb, &fl4, false);
1219*4882a593Smuzhiyun ip_rt_put(rt);
1220*4882a593Smuzhiyun }
1221*4882a593Smuzhiyun }
1222*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1223*4882a593Smuzhiyun
/* dst_ops->check handler: report whether a cached IPv4 route is still
 * usable.
 *
 * IPv4 dsts are created with ->obsolete == DST_OBSOLETE_FORCE_CHK so
 * validation always funnels through here.  A PMTU/redirect update
 * invalidates a route by switching obsolete to DST_OBSOLETE_KILL or
 * DST_OBSOLETE_DEAD; a generation bump makes rt_is_expired() true.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *)dst;

	if (rt_is_expired(rt) || dst->obsolete != DST_OBSOLETE_FORCE_CHK)
		return NULL;

	return dst;
}
1240*4882a593Smuzhiyun
/* Send an ICMP host-unreachable for @skb from the link-failure path.
 *
 * IPCB(skb) may no longer describe the packet's IP options here, so the
 * options are re-parsed from the packet itself before __icmp_send()
 * (which uses them when building the reply).  Malformed headers or
 * options cause a silent return rather than echoing garbage.
 */
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		/* Options present: ensure the full header is linear,
		 * then parse the option bytes.
		 */
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}
1268*4882a593Smuzhiyun
/* dst_ops->link_failure handler: tell the sender the destination is
 * unreachable and expire the route immediately so the next lookup
 * revalidates it.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		/* expires = 0 here means "expire now", not "never". */
		dst_set_expires(&rt->dst, 0);
}
1279*4882a593Smuzhiyun
/* Placeholder output handler that must never actually run: reaching it
 * means a route was wired up without a real output function.  Log the
 * addresses involved, drop the packet and warn loudly.
 */
static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
1289*4882a593Smuzhiyun
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is kept out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1298*4882a593Smuzhiyun
/* Write (into @addr, possibly unaligned) the source address this host
 * would use towards the origin of @skb on route @rt.  Used only by the
 * RR, TS and SRR IP options — see the comment above.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (!rt_is_output_route(rt)) {
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};
		struct fib_result res;

		/* Prefer the FIB's preferred source; fall back to any
		 * suitable address on the output device.
		 */
		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	} else {
		src = ip_hdr(skb)->saddr;
	}

	/* memcpy: @addr may live inside IP options and be unaligned. */
	memcpy(addr, &src, 4);
}
1328*4882a593Smuzhiyun
1329*4882a593Smuzhiyun #ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge a routing-realm/classid tag into the dst's tclassid.  Each
 * 16-bit half of @tag is taken only if the corresponding half of the
 * stored tclassid is still zero, i.e. an already-set half wins.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 id = rt->dst.tclassid;

	if (!(id & 0xFFFF))
		id |= tag & 0xFFFF;
	if (!(id & 0xFFFF0000))
		id |= tag & 0xFFFF0000;

	rt->dst.tclassid = id;
}
1337*4882a593Smuzhiyun #endif
1338*4882a593Smuzhiyun
ipv4_default_advmss(const struct dst_entry * dst)1339*4882a593Smuzhiyun static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1340*4882a593Smuzhiyun {
1341*4882a593Smuzhiyun unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1342*4882a593Smuzhiyun unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1343*4882a593Smuzhiyun ip_rt_min_advmss);
1344*4882a593Smuzhiyun
1345*4882a593Smuzhiyun return min(advmss, IPV4_MAX_PMTU - header_size);
1346*4882a593Smuzhiyun }
1347*4882a593Smuzhiyun
/* Effective path MTU for an IPv4 dst.
 *
 * Precedence: a still-valid learned PMTU (rt->rt_pmtu), then the
 * route's MTU metric, then the device MTU.  The result is capped at
 * IP_MAX_MTU and reduced by any lightweight-tunnel headroom.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *)dst;
	unsigned int mtu = rt->rt_pmtu;

	/* A learned PMTU is honoured only until it expires. */
	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		goto out;

	/* No PMTU and no metric: fall back to the device MTU. */
	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		/* Locked MTU on a gatewayed route: clamp to 576, the
		 * minimum datagram size every IPv4 host must accept.
		 */
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

out:
	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
1371*4882a593Smuzhiyun
/* Remove the nexthop exception for @daddr (if present) from @nhc's
 * exception hash.  The entry is unlinked under fnhe_lock, its cached
 * routes are flushed, and the entry itself is freed after an RCU
 * grace period.
 */
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	/* Walk the chain via the link pointers so the matching entry can
	 * be unlinked in place.
	 */
	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
1405*4882a593Smuzhiyun
/* Look up the cached nexthop exception for @daddr.  Called under RCU.
 *
 * Expiry is handled lazily: an entry whose fnhe_expires has passed is
 * deleted here and NULL is returned.  Returns the live entry or NULL.
 */
static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
					       __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				/* Stale entry: drop it instead of using it. */
				ip_del_fnhe(nhc, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}
1431*4882a593Smuzhiyun
1432*4882a593Smuzhiyun /* MTU selection:
1433*4882a593Smuzhiyun * 1. mtu on route is locked - use it
1434*4882a593Smuzhiyun * 2. mtu from nexthop exception
1435*4882a593Smuzhiyun * 3. mtu from egress device
1436*4882a593Smuzhiyun */
1437*4882a593Smuzhiyun
/* Compute the MTU towards @daddr from a FIB lookup result; see the
 * "MTU selection" comment above for the precedence rules.
 */
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	/* Honour the route metric when forwarding is configured to use
	 * PMTU data, or when the route's MTU metric is locked.
	 */
	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		/* A non-expired nexthop exception carries a learned PMTU. */
		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
1462*4882a593Smuzhiyun
/* Bind @rt to the nexthop exception @fnhe for destination @daddr.
 *
 * Under fnhe_lock: if the namespace fnhe genid changed, the stale
 * exception data and cached routes are cleared first.  The exception's
 * PMTU/gateway data is copied into @rt, and when @do_cache the route
 * is stored in the exception's input or output slot (taking a dst
 * reference and releasing the displaced one).
 *
 * Returns true if the route was cached in the exception.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	/* fnhe_daddr is zeroed by ip_del_fnhe() on deletion, so this
	 * check also refuses to bind to a dying exception.
	 */
	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			/* Namespace generation changed: invalidate all
			 * learned state on this exception.
			 */
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			/* Hold a reference for the pointer stored in the
			 * exception; release the one we displaced.
			 */
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1512*4882a593Smuzhiyun
/* Try to store @rt in the nexthop's route cache slot: the shared input
 * slot for input routes, this CPU's output slot otherwise.
 *
 * The slot is swapped with cmpxchg(); if we lose the race the extra
 * dst reference is dropped and false is returned.  A displaced old
 * route is moved to the uncached list so it is still reachable by
 * rt_flush_dev(), and its slot reference is released.
 */
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nhc->nhc_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			rt_add_uncached_list(orig);
			dst_release(&orig->dst);
		}
	} else {
		/* Lost the race: undo our hold and report failure. */
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}
1542*4882a593Smuzhiyun
/* Per-CPU list of routes that are not stored in a nexthop cache slot
 * (see rt_add_uncached_list()); rt_flush_dev() walks these lists to
 * detach such routes from a disappearing device.
 */
struct uncached_list {
	spinlock_t lock;	/* protects head */
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1549*4882a593Smuzhiyun
/* Put @rt on this CPU's uncached list, remembering which list it went
 * on so rt_del_uncached_list() can remove it from any CPU.
 */
void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
1560*4882a593Smuzhiyun
/* Remove @rt from the uncached list it was added to, if any.  Locks
 * the list recorded in rt->rt_uncached_list, which may belong to a
 * different CPU than the caller's.
 */
void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
1571*4882a593Smuzhiyun
/* dst_ops->destroy callback: drop the metrics reference and take the
 * route off the uncached list before the dst memory is freed.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}
1579*4882a593Smuzhiyun
/* Detach every uncached route that still references @dev, repointing
 * it at blackhole_netdev so @dev can be released.  Walks the uncached
 * list of every possible CPU.
 */
void rt_flush_dev(struct net_device *dev)
{
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			/* Transfer the device reference from @dev to the
			 * blackhole device.
			 */
			rt->dst.dev = blackhole_netdev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}
1599*4882a593Smuzhiyun
rt_cache_valid(const struct rtable * rt)1600*4882a593Smuzhiyun static bool rt_cache_valid(const struct rtable *rt)
1601*4882a593Smuzhiyun {
1602*4882a593Smuzhiyun return rt &&
1603*4882a593Smuzhiyun rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1604*4882a593Smuzhiyun !rt_is_expired(rt);
1605*4882a593Smuzhiyun }
1606*4882a593Smuzhiyun
/* Finish initializing @rt from a FIB lookup result: copy the gateway,
 * metrics, tclassid and lwtunnel state from the nexthop, then try to
 * cache the route — in @fnhe when one was found, otherwise in the
 * nexthop's cache slot when @do_cache.  A route that could not be
 * cached (or has no fib_info) goes on the uncached list so it is still
 * found when its device disappears.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
			rt->rt_uses_gateway = 1;
			rt->rt_gw_family = nhc->nhc_gw_family;
			/* only INET and INET6 are supported */
			if (likely(nhc->nhc_gw_family == AF_INET))
				rt->rt_gw4 = nhc->nhc_gw.ipv4;
			else
				rt->rt_gw6 = nhc->nhc_gw.ipv6;
		}

		ip_dst_metrics_init(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
		if (nhc->nhc_family == AF_INET) {
			struct fib_nh *nh;

			nh = container_of(nhc, struct fib_nh, nh_common);
			rt->dst.tclassid = nh->nh_tclassid;
		}
#endif
		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nhc, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gw4) {
				rt->rt_gw_family = AF_INET;
				rt->rt_gw4 = daddr;
			}
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1665*4882a593Smuzhiyun
/* Allocate and minimally initialize an IPv4 rtable on @dev.
 *
 * The dst starts with output = ip_output; RTCF_LOCAL routes also get
 * input = ip_local_deliver.  Callers override these hooks and fill in
 * the remaining route state as needed.  Returns NULL on allocation
 * failure.
 */
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_gw_family = 0;
		rt->rt_gw4 = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
1697*4882a593Smuzhiyun
/* Allocate a new rtable on @dev that duplicates @rt's route state
 * (flags, type, PMTU, gateway, dst hooks, lwtunnel state).  The
 * clone's generation id is taken fresh from @dev's netns.  Returns
 * NULL on allocation failure.
 */
struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
	struct rtable *new_rt;

	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			   rt->dst.flags);

	if (new_rt) {
		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		new_rt->rt_flags = rt->rt_flags;
		new_rt->rt_type = rt->rt_type;
		new_rt->rt_is_input = rt->rt_is_input;
		new_rt->rt_iif = rt->rt_iif;
		new_rt->rt_pmtu = rt->rt_pmtu;
		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
		new_rt->rt_gw_family = rt->rt_gw_family;
		/* Copy whichever member of the gateway union is valid. */
		if (rt->rt_gw_family == AF_INET)
			new_rt->rt_gw4 = rt->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			new_rt->rt_gw6 = rt->rt_gw6;
		INIT_LIST_HEAD(&new_rt->rt_uncached);

		new_rt->dst.input = rt->dst.input;
		new_rt->dst.output = rt->dst.output;
		new_rt->dst.error = rt->dst.error;
		new_rt->dst.lastuse = jiffies;
		/* lwtstate is refcounted; take our own reference. */
		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
	}
	return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);
1729*4882a593Smuzhiyun
/* Sanity-check the source address of a multicast packet received on
 * @dev.  Called in an rcu_read_lock() section.
 *
 * Returns 0 if acceptable, a negative errno otherwise; on success
 * *itag may be set by fib_validate_source().
 */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	/* Multicast/broadcast source addresses are never valid. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		/* A zero source is tolerated only for IGMP or
		 * link-local multicast destinations.
		 */
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}
1760*4882a593Smuzhiyun
/* Build and attach a multicast input route for @skb.  Called in an
 * rcu_read_lock() section.  @our is non-zero when the destination
 * group is joined locally (adds RTCF_LOCAL).
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast routes must never be used for output. */
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups may need multicast forwarding. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_drop(skb);
	skb_dst_set(skb, &rth->dst);
	return 0;
}
1799*4882a593Smuzhiyun
1800*4882a593Smuzhiyun
/* Account (and, with CONFIG_IP_ROUTE_VERBOSE, rate-limited-log) a
 * packet whose source address failed validation.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation, if source is martian,
		 * the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			/* Dump the link-layer header as the only clue to
			 * the offending sender.
			 */
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}
1825*4882a593Smuzhiyun
/* Create (or reuse a cached) forwarding route for an input packet,
 * and attach it to @skb.  Called in an rcu_read_lock() section.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* Only cache when there is fib info and no source-specific tag. */
	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		__be32 gw;

		/* Packet would leave the same way it came in: candidate
		 * for an ICMP redirect back to the sender.
		 */
		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Reuse a previously cached route for this nexthop if valid. */
	fnhe = find_exception(nhc, daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_ORCONF(in_dev, NOPOLICY),
			   IN_DEV_ORCONF(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}
1917*4882a593Smuzhiyun
1918*4882a593Smuzhiyun #ifdef CONFIG_IP_ROUTE_MULTIPATH
1919*4882a593Smuzhiyun /* To make ICMP packets follow the right flow, the multipath hash is
1920*4882a593Smuzhiyun * calculated from the inner IP addresses.
1921*4882a593Smuzhiyun */
/* Fill @hash_keys with the IPv4 addresses to hash for multipath
 * selection.  For ICMP error packets the addresses of the embedded
 * (inner) header are used, so errors follow the flow that triggered
 * them; otherwise the outer header's addresses are used.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	/* Only the first fragment carries the ICMP header. */
	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMP errors embed the original packet's header. */
	if (!icmp_is_err(icmph->type))
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}
1957*4882a593Smuzhiyun
1958*4882a593Smuzhiyun /* if skb is set it will be used and fl4 can be NULL */
fib_multipath_hash(const struct net * net,const struct flowi4 * fl4,const struct sk_buff * skb,struct flow_keys * flkeys)1959*4882a593Smuzhiyun int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1960*4882a593Smuzhiyun const struct sk_buff *skb, struct flow_keys *flkeys)
1961*4882a593Smuzhiyun {
1962*4882a593Smuzhiyun u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1963*4882a593Smuzhiyun struct flow_keys hash_keys;
1964*4882a593Smuzhiyun u32 mhash;
1965*4882a593Smuzhiyun
1966*4882a593Smuzhiyun switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1967*4882a593Smuzhiyun case 0:
1968*4882a593Smuzhiyun memset(&hash_keys, 0, sizeof(hash_keys));
1969*4882a593Smuzhiyun hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1970*4882a593Smuzhiyun if (skb) {
1971*4882a593Smuzhiyun ip_multipath_l3_keys(skb, &hash_keys);
1972*4882a593Smuzhiyun } else {
1973*4882a593Smuzhiyun hash_keys.addrs.v4addrs.src = fl4->saddr;
1974*4882a593Smuzhiyun hash_keys.addrs.v4addrs.dst = fl4->daddr;
1975*4882a593Smuzhiyun }
1976*4882a593Smuzhiyun break;
1977*4882a593Smuzhiyun case 1:
1978*4882a593Smuzhiyun /* skb is currently provided only when forwarding */
1979*4882a593Smuzhiyun if (skb) {
1980*4882a593Smuzhiyun unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1981*4882a593Smuzhiyun struct flow_keys keys;
1982*4882a593Smuzhiyun
1983*4882a593Smuzhiyun /* short-circuit if we already have L4 hash present */
1984*4882a593Smuzhiyun if (skb->l4_hash)
1985*4882a593Smuzhiyun return skb_get_hash_raw(skb) >> 1;
1986*4882a593Smuzhiyun
1987*4882a593Smuzhiyun memset(&hash_keys, 0, sizeof(hash_keys));
1988*4882a593Smuzhiyun
1989*4882a593Smuzhiyun if (!flkeys) {
1990*4882a593Smuzhiyun skb_flow_dissect_flow_keys(skb, &keys, flag);
1991*4882a593Smuzhiyun flkeys = &keys;
1992*4882a593Smuzhiyun }
1993*4882a593Smuzhiyun
1994*4882a593Smuzhiyun hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1995*4882a593Smuzhiyun hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1996*4882a593Smuzhiyun hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1997*4882a593Smuzhiyun hash_keys.ports.src = flkeys->ports.src;
1998*4882a593Smuzhiyun hash_keys.ports.dst = flkeys->ports.dst;
1999*4882a593Smuzhiyun hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2000*4882a593Smuzhiyun } else {
2001*4882a593Smuzhiyun memset(&hash_keys, 0, sizeof(hash_keys));
2002*4882a593Smuzhiyun hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2003*4882a593Smuzhiyun hash_keys.addrs.v4addrs.src = fl4->saddr;
2004*4882a593Smuzhiyun hash_keys.addrs.v4addrs.dst = fl4->daddr;
2005*4882a593Smuzhiyun hash_keys.ports.src = fl4->fl4_sport;
2006*4882a593Smuzhiyun hash_keys.ports.dst = fl4->fl4_dport;
2007*4882a593Smuzhiyun hash_keys.basic.ip_proto = fl4->flowi4_proto;
2008*4882a593Smuzhiyun }
2009*4882a593Smuzhiyun break;
2010*4882a593Smuzhiyun case 2:
2011*4882a593Smuzhiyun memset(&hash_keys, 0, sizeof(hash_keys));
2012*4882a593Smuzhiyun /* skb is currently provided only when forwarding */
2013*4882a593Smuzhiyun if (skb) {
2014*4882a593Smuzhiyun struct flow_keys keys;
2015*4882a593Smuzhiyun
2016*4882a593Smuzhiyun skb_flow_dissect_flow_keys(skb, &keys, 0);
2017*4882a593Smuzhiyun /* Inner can be v4 or v6 */
2018*4882a593Smuzhiyun if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2019*4882a593Smuzhiyun hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2020*4882a593Smuzhiyun hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2021*4882a593Smuzhiyun hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2022*4882a593Smuzhiyun } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2023*4882a593Smuzhiyun hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2024*4882a593Smuzhiyun hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2025*4882a593Smuzhiyun hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2026*4882a593Smuzhiyun hash_keys.tags.flow_label = keys.tags.flow_label;
2027*4882a593Smuzhiyun hash_keys.basic.ip_proto = keys.basic.ip_proto;
2028*4882a593Smuzhiyun } else {
2029*4882a593Smuzhiyun /* Same as case 0 */
2030*4882a593Smuzhiyun hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2031*4882a593Smuzhiyun ip_multipath_l3_keys(skb, &hash_keys);
2032*4882a593Smuzhiyun }
2033*4882a593Smuzhiyun } else {
2034*4882a593Smuzhiyun /* Same as case 0 */
2035*4882a593Smuzhiyun hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2036*4882a593Smuzhiyun hash_keys.addrs.v4addrs.src = fl4->saddr;
2037*4882a593Smuzhiyun hash_keys.addrs.v4addrs.dst = fl4->daddr;
2038*4882a593Smuzhiyun }
2039*4882a593Smuzhiyun break;
2040*4882a593Smuzhiyun }
2041*4882a593Smuzhiyun mhash = flow_hash_from_keys(&hash_keys);
2042*4882a593Smuzhiyun
2043*4882a593Smuzhiyun if (multipath_hash)
2044*4882a593Smuzhiyun mhash = jhash_2words(mhash, multipath_hash, 0);
2045*4882a593Smuzhiyun
2046*4882a593Smuzhiyun return mhash >> 1;
2047*4882a593Smuzhiyun }
2048*4882a593Smuzhiyun #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2049*4882a593Smuzhiyun
/* Select a multipath nexthop (when several exist) and build the
 * input route for a forwarded packet.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct fib_info *fi = res->fi;

	/* With more than one path available, pick one by flow hash. */
	if (fi && fib_info_num_path(fi) > 1) {
		int mp_hash = fib_multipath_hash(fi->fib_net, NULL, skb,
						 hkeys);

		fib_select_multipath(res, mp_hash);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
2067*4882a593Smuzhiyun
2068*4882a593Smuzhiyun /* Implements all the saddr-related checks as ip_route_input_slow(),
2069*4882a593Smuzhiyun * assuming daddr is valid and the destination is not a local broadcast one.
2070*4882a593Smuzhiyun * Uses the provided hint instead of performing a route lookup.
2071*4882a593Smuzhiyun */
int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		      u8 tos, struct net_device *dev,
		      const struct sk_buff *hint)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rt = skb_rtable(hint);
	struct net *net = dev_net(dev);
	int err = -EINVAL;
	u32 tag = 0;

	/* IP is disabled on this device: bail out before the
	 * IN_DEV_*() checks and ip_handle_martian_source() below
	 * dereference in_dev (it can be NULL here).
	 */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
		goto martian_source;

	/* only locally-destined traffic needs source validation */
	if (rt->rt_type != RTN_LOCAL)
		goto skip_validate_source;

	tos &= IPTOS_RT_MASK;
	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
	if (err < 0)
		goto martian_source;

skip_validate_source:
	/* reuse the dst already resolved for the hint skb */
	skb_dst_copy(skb, hint);
	return 0;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	return err;
}
2107*4882a593Smuzhiyun
2108*4882a593Smuzhiyun /* get device for dst_alloc with local routes */
ip_rt_get_dev(struct net * net,const struct fib_result * res)2109*4882a593Smuzhiyun static struct net_device *ip_rt_get_dev(struct net *net,
2110*4882a593Smuzhiyun const struct fib_result *res)
2111*4882a593Smuzhiyun {
2112*4882a593Smuzhiyun struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2113*4882a593Smuzhiyun struct net_device *dev = NULL;
2114*4882a593Smuzhiyun
2115*4882a593Smuzhiyun if (nhc)
2116*4882a593Smuzhiyun dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2117*4882a593Smuzhiyun
2118*4882a593Smuzhiyun return dev ? : net->loopback_dev;
2119*4882a593Smuzhiyun }
2120*4882a593Smuzhiyun
/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 * Changes in the enforced policies must also be applied to
 * ip_route_use_hint().
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered out with a 100% guarantee.
 * called with rcu_read_lock()
 */
2133*4882a593Smuzhiyun
/* Slow path of input route resolution; see the policy NOTE above.
 * Returns 0 with a dst attached to @skb on success, negative errno
 * otherwise.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache = true;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	/* carry any RX tunnel id in the flow key for the FIB lookup below */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and call it once if daddr or/and saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);
	fl4.flowi4_multipath_hash = 0;

	/* reuse the dissected keys later for multipath hashing if
	 * the early dissection succeeded
	 */
	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		/* not do cache if bc_forwarding is enabled */
		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
			do_cache = false;
		goto brd_input;
	}

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

	/* forwarding path: build the input route for the FIB nexthop */
make_route:
	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

	/* local delivery: also reached by broadcast fall-through and
	 * from no_route (with res->type == RTN_UNREACHABLE)
	 */
local_input:
	do_cache &= res->fi && !itag;
	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		/* fast path: reuse the cached input dst if still valid */
		rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			err = 0;
			goto out;
		}
	}

	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	/* a local dst should never be used for output */
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		/* if caching fails, keep the dst reachable for flushes */
		if (unlikely(!rt_cache_route(nhc, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif
	/* falls through: martian destinations return -EINVAL */

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2343*4882a593Smuzhiyun
/* Resolve the input route for @skb without holding a dst reference;
 * thin RCU wrapper around ip_route_input_rcu().
 */
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	struct fib_result res;
	int err;

	rcu_read_lock();
	/* only the routing-relevant TOS bits take part in the lookup */
	err = ip_route_input_rcu(skb, daddr, saddr, tos & IPTOS_RT_MASK,
				 dev, &res);
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(ip_route_input_noref);
2358*4882a593Smuzhiyun
2359*4882a593Smuzhiyun /* called with rcu_read_lock held */
/* Resolve the input route for @skb under RCU: multicast destinations
 * are handled here directly; everything else takes the slow path.
 * Returns 0 or a negative errno.
 */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		/* IP disabled on this device */
		if (!in_dev)
			return err;
		our = ip_check_mc_rcu(in_dev, daddr, saddr,
				      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if (!our && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		/* deliver if the group is joined locally, or (with
		 * CONFIG_IP_MROUTE) if multicast forwarding is enabled
		 * and the group is not link-local
		 */
		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
2409*4882a593Smuzhiyun
2410*4882a593Smuzhiyun /* called with rcu_read_lock() */
/* Build (or fetch from the per-nexthop cache) an output rtable for
 * the given FIB result.  Returns an ERR_PTR() on failure.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* a loopback source may only leave via loopback or an L3
	 * master device, unless route_localnet is enabled
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	/* reclassify the route type from the destination address */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct rtable __rcu **prth;

		/* per-destination exceptions (PMTU, redirects) take
		 * precedence over the per-cpu cached route
		 */
		fnhe = find_exception(nhc, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
		} else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nhc->nhc_gw_family &&
				       nhc->nhc_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		/* reuse the cached dst only if we can take a reference */
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_ORCONF(in_dev, NOPOLICY),
			   IN_DEV_ORCONF(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

	return rth;
}
2533*4882a593Smuzhiyun
2534*4882a593Smuzhiyun /*
2535*4882a593Smuzhiyun * Major route resolver routine.
2536*4882a593Smuzhiyun */
2537*4882a593Smuzhiyun
ip_route_output_key_hash(struct net * net,struct flowi4 * fl4,const struct sk_buff * skb)2538*4882a593Smuzhiyun struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2539*4882a593Smuzhiyun const struct sk_buff *skb)
2540*4882a593Smuzhiyun {
2541*4882a593Smuzhiyun struct fib_result res = {
2542*4882a593Smuzhiyun .type = RTN_UNSPEC,
2543*4882a593Smuzhiyun .fi = NULL,
2544*4882a593Smuzhiyun .table = NULL,
2545*4882a593Smuzhiyun .tclassid = 0,
2546*4882a593Smuzhiyun };
2547*4882a593Smuzhiyun struct rtable *rth;
2548*4882a593Smuzhiyun
2549*4882a593Smuzhiyun fl4->flowi4_iif = LOOPBACK_IFINDEX;
2550*4882a593Smuzhiyun ip_rt_fix_tos(fl4);
2551*4882a593Smuzhiyun
2552*4882a593Smuzhiyun rcu_read_lock();
2553*4882a593Smuzhiyun rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2554*4882a593Smuzhiyun rcu_read_unlock();
2555*4882a593Smuzhiyun
2556*4882a593Smuzhiyun return rth;
2557*4882a593Smuzhiyun }
2558*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2559*4882a593Smuzhiyun
ip_route_output_key_hash_rcu(struct net * net,struct flowi4 * fl4,struct fib_result * res,const struct sk_buff * skb)2560*4882a593Smuzhiyun struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2561*4882a593Smuzhiyun struct fib_result *res,
2562*4882a593Smuzhiyun const struct sk_buff *skb)
2563*4882a593Smuzhiyun {
2564*4882a593Smuzhiyun struct net_device *dev_out = NULL;
2565*4882a593Smuzhiyun int orig_oif = fl4->flowi4_oif;
2566*4882a593Smuzhiyun unsigned int flags = 0;
2567*4882a593Smuzhiyun struct rtable *rth;
2568*4882a593Smuzhiyun int err;
2569*4882a593Smuzhiyun
2570*4882a593Smuzhiyun if (fl4->saddr) {
2571*4882a593Smuzhiyun if (ipv4_is_multicast(fl4->saddr) ||
2572*4882a593Smuzhiyun ipv4_is_lbcast(fl4->saddr) ||
2573*4882a593Smuzhiyun ipv4_is_zeronet(fl4->saddr)) {
2574*4882a593Smuzhiyun rth = ERR_PTR(-EINVAL);
2575*4882a593Smuzhiyun goto out;
2576*4882a593Smuzhiyun }
2577*4882a593Smuzhiyun
2578*4882a593Smuzhiyun rth = ERR_PTR(-ENETUNREACH);
2579*4882a593Smuzhiyun
2580*4882a593Smuzhiyun /* I removed check for oif == dev_out->oif here.
2581*4882a593Smuzhiyun It was wrong for two reasons:
2582*4882a593Smuzhiyun 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2583*4882a593Smuzhiyun is assigned to multiple interfaces.
2584*4882a593Smuzhiyun 2. Moreover, we are allowed to send packets with saddr
2585*4882a593Smuzhiyun of another iface. --ANK
2586*4882a593Smuzhiyun */
2587*4882a593Smuzhiyun
2588*4882a593Smuzhiyun if (fl4->flowi4_oif == 0 &&
2589*4882a593Smuzhiyun (ipv4_is_multicast(fl4->daddr) ||
2590*4882a593Smuzhiyun ipv4_is_lbcast(fl4->daddr))) {
2591*4882a593Smuzhiyun /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2592*4882a593Smuzhiyun dev_out = __ip_dev_find(net, fl4->saddr, false);
2593*4882a593Smuzhiyun if (!dev_out)
2594*4882a593Smuzhiyun goto out;
2595*4882a593Smuzhiyun
2596*4882a593Smuzhiyun /* Special hack: user can direct multicasts
2597*4882a593Smuzhiyun and limited broadcast via necessary interface
2598*4882a593Smuzhiyun without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2599*4882a593Smuzhiyun This hack is not just for fun, it allows
2600*4882a593Smuzhiyun vic,vat and friends to work.
2601*4882a593Smuzhiyun They bind socket to loopback, set ttl to zero
2602*4882a593Smuzhiyun and expect that it will work.
2603*4882a593Smuzhiyun From the viewpoint of routing cache they are broken,
2604*4882a593Smuzhiyun because we are not allowed to build multicast path
2605*4882a593Smuzhiyun with loopback source addr (look, routing cache
2606*4882a593Smuzhiyun cannot know, that ttl is zero, so that packet
2607*4882a593Smuzhiyun will not leave this host and route is valid).
2608*4882a593Smuzhiyun Luckily, this hack is good workaround.
2609*4882a593Smuzhiyun */
2610*4882a593Smuzhiyun
2611*4882a593Smuzhiyun fl4->flowi4_oif = dev_out->ifindex;
2612*4882a593Smuzhiyun goto make_route;
2613*4882a593Smuzhiyun }
2614*4882a593Smuzhiyun
2615*4882a593Smuzhiyun if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2616*4882a593Smuzhiyun /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2617*4882a593Smuzhiyun if (!__ip_dev_find(net, fl4->saddr, false))
2618*4882a593Smuzhiyun goto out;
2619*4882a593Smuzhiyun }
2620*4882a593Smuzhiyun }
2621*4882a593Smuzhiyun
2622*4882a593Smuzhiyun
2623*4882a593Smuzhiyun if (fl4->flowi4_oif) {
2624*4882a593Smuzhiyun dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2625*4882a593Smuzhiyun rth = ERR_PTR(-ENODEV);
2626*4882a593Smuzhiyun if (!dev_out)
2627*4882a593Smuzhiyun goto out;
2628*4882a593Smuzhiyun
2629*4882a593Smuzhiyun /* RACE: Check return value of inet_select_addr instead. */
2630*4882a593Smuzhiyun if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2631*4882a593Smuzhiyun rth = ERR_PTR(-ENETUNREACH);
2632*4882a593Smuzhiyun goto out;
2633*4882a593Smuzhiyun }
2634*4882a593Smuzhiyun if (ipv4_is_local_multicast(fl4->daddr) ||
2635*4882a593Smuzhiyun ipv4_is_lbcast(fl4->daddr) ||
2636*4882a593Smuzhiyun fl4->flowi4_proto == IPPROTO_IGMP) {
2637*4882a593Smuzhiyun if (!fl4->saddr)
2638*4882a593Smuzhiyun fl4->saddr = inet_select_addr(dev_out, 0,
2639*4882a593Smuzhiyun RT_SCOPE_LINK);
2640*4882a593Smuzhiyun goto make_route;
2641*4882a593Smuzhiyun }
2642*4882a593Smuzhiyun if (!fl4->saddr) {
2643*4882a593Smuzhiyun if (ipv4_is_multicast(fl4->daddr))
2644*4882a593Smuzhiyun fl4->saddr = inet_select_addr(dev_out, 0,
2645*4882a593Smuzhiyun fl4->flowi4_scope);
2646*4882a593Smuzhiyun else if (!fl4->daddr)
2647*4882a593Smuzhiyun fl4->saddr = inet_select_addr(dev_out, 0,
2648*4882a593Smuzhiyun RT_SCOPE_HOST);
2649*4882a593Smuzhiyun }
2650*4882a593Smuzhiyun }
2651*4882a593Smuzhiyun
2652*4882a593Smuzhiyun if (!fl4->daddr) {
2653*4882a593Smuzhiyun fl4->daddr = fl4->saddr;
2654*4882a593Smuzhiyun if (!fl4->daddr)
2655*4882a593Smuzhiyun fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2656*4882a593Smuzhiyun dev_out = net->loopback_dev;
2657*4882a593Smuzhiyun fl4->flowi4_oif = LOOPBACK_IFINDEX;
2658*4882a593Smuzhiyun res->type = RTN_LOCAL;
2659*4882a593Smuzhiyun flags |= RTCF_LOCAL;
2660*4882a593Smuzhiyun goto make_route;
2661*4882a593Smuzhiyun }
2662*4882a593Smuzhiyun
2663*4882a593Smuzhiyun err = fib_lookup(net, fl4, res, 0);
2664*4882a593Smuzhiyun if (err) {
2665*4882a593Smuzhiyun res->fi = NULL;
2666*4882a593Smuzhiyun res->table = NULL;
2667*4882a593Smuzhiyun if (fl4->flowi4_oif &&
2668*4882a593Smuzhiyun (ipv4_is_multicast(fl4->daddr) ||
2669*4882a593Smuzhiyun !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2670*4882a593Smuzhiyun /* Apparently, routing tables are wrong. Assume,
2671*4882a593Smuzhiyun that the destination is on link.
2672*4882a593Smuzhiyun
2673*4882a593Smuzhiyun WHY? DW.
2674*4882a593Smuzhiyun Because we are allowed to send to iface
2675*4882a593Smuzhiyun even if it has NO routes and NO assigned
2676*4882a593Smuzhiyun addresses. When oif is specified, routing
2677*4882a593Smuzhiyun tables are looked up with only one purpose:
2678*4882a593Smuzhiyun to catch if destination is gatewayed, rather than
2679*4882a593Smuzhiyun direct. Moreover, if MSG_DONTROUTE is set,
2680*4882a593Smuzhiyun we send packet, ignoring both routing tables
2681*4882a593Smuzhiyun and ifaddr state. --ANK
2682*4882a593Smuzhiyun
2683*4882a593Smuzhiyun
2684*4882a593Smuzhiyun We could make it even if oif is unknown,
2685*4882a593Smuzhiyun likely IPv6, but we do not.
2686*4882a593Smuzhiyun */
2687*4882a593Smuzhiyun
2688*4882a593Smuzhiyun if (fl4->saddr == 0)
2689*4882a593Smuzhiyun fl4->saddr = inet_select_addr(dev_out, 0,
2690*4882a593Smuzhiyun RT_SCOPE_LINK);
2691*4882a593Smuzhiyun res->type = RTN_UNICAST;
2692*4882a593Smuzhiyun goto make_route;
2693*4882a593Smuzhiyun }
2694*4882a593Smuzhiyun rth = ERR_PTR(err);
2695*4882a593Smuzhiyun goto out;
2696*4882a593Smuzhiyun }
2697*4882a593Smuzhiyun
2698*4882a593Smuzhiyun if (res->type == RTN_LOCAL) {
2699*4882a593Smuzhiyun if (!fl4->saddr) {
2700*4882a593Smuzhiyun if (res->fi->fib_prefsrc)
2701*4882a593Smuzhiyun fl4->saddr = res->fi->fib_prefsrc;
2702*4882a593Smuzhiyun else
2703*4882a593Smuzhiyun fl4->saddr = fl4->daddr;
2704*4882a593Smuzhiyun }
2705*4882a593Smuzhiyun
2706*4882a593Smuzhiyun /* L3 master device is the loopback for that domain */
2707*4882a593Smuzhiyun dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2708*4882a593Smuzhiyun net->loopback_dev;
2709*4882a593Smuzhiyun
2710*4882a593Smuzhiyun /* make sure orig_oif points to fib result device even
2711*4882a593Smuzhiyun * though packet rx/tx happens over loopback or l3mdev
2712*4882a593Smuzhiyun */
2713*4882a593Smuzhiyun orig_oif = FIB_RES_OIF(*res);
2714*4882a593Smuzhiyun
2715*4882a593Smuzhiyun fl4->flowi4_oif = dev_out->ifindex;
2716*4882a593Smuzhiyun flags |= RTCF_LOCAL;
2717*4882a593Smuzhiyun goto make_route;
2718*4882a593Smuzhiyun }
2719*4882a593Smuzhiyun
2720*4882a593Smuzhiyun fib_select_path(net, res, fl4, skb);
2721*4882a593Smuzhiyun
2722*4882a593Smuzhiyun dev_out = FIB_RES_DEV(*res);
2723*4882a593Smuzhiyun
2724*4882a593Smuzhiyun make_route:
2725*4882a593Smuzhiyun rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2726*4882a593Smuzhiyun
2727*4882a593Smuzhiyun out:
2728*4882a593Smuzhiyun return rth;
2729*4882a593Smuzhiyun }
2730*4882a593Smuzhiyun
2731*4882a593Smuzhiyun static struct dst_ops ipv4_dst_blackhole_ops = {
2732*4882a593Smuzhiyun .family = AF_INET,
2733*4882a593Smuzhiyun .default_advmss = ipv4_default_advmss,
2734*4882a593Smuzhiyun .neigh_lookup = ipv4_neigh_lookup,
2735*4882a593Smuzhiyun .check = dst_blackhole_check,
2736*4882a593Smuzhiyun .cow_metrics = dst_blackhole_cow_metrics,
2737*4882a593Smuzhiyun .update_pmtu = dst_blackhole_update_pmtu,
2738*4882a593Smuzhiyun .redirect = dst_blackhole_redirect,
2739*4882a593Smuzhiyun .mtu = dst_blackhole_mtu,
2740*4882a593Smuzhiyun };
2741*4882a593Smuzhiyun
ipv4_blackhole_route(struct net * net,struct dst_entry * dst_orig)2742*4882a593Smuzhiyun struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2743*4882a593Smuzhiyun {
2744*4882a593Smuzhiyun struct rtable *ort = (struct rtable *) dst_orig;
2745*4882a593Smuzhiyun struct rtable *rt;
2746*4882a593Smuzhiyun
2747*4882a593Smuzhiyun rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2748*4882a593Smuzhiyun if (rt) {
2749*4882a593Smuzhiyun struct dst_entry *new = &rt->dst;
2750*4882a593Smuzhiyun
2751*4882a593Smuzhiyun new->__use = 1;
2752*4882a593Smuzhiyun new->input = dst_discard;
2753*4882a593Smuzhiyun new->output = dst_discard_out;
2754*4882a593Smuzhiyun
2755*4882a593Smuzhiyun new->dev = net->loopback_dev;
2756*4882a593Smuzhiyun if (new->dev)
2757*4882a593Smuzhiyun dev_hold(new->dev);
2758*4882a593Smuzhiyun
2759*4882a593Smuzhiyun rt->rt_is_input = ort->rt_is_input;
2760*4882a593Smuzhiyun rt->rt_iif = ort->rt_iif;
2761*4882a593Smuzhiyun rt->rt_pmtu = ort->rt_pmtu;
2762*4882a593Smuzhiyun rt->rt_mtu_locked = ort->rt_mtu_locked;
2763*4882a593Smuzhiyun
2764*4882a593Smuzhiyun rt->rt_genid = rt_genid_ipv4(net);
2765*4882a593Smuzhiyun rt->rt_flags = ort->rt_flags;
2766*4882a593Smuzhiyun rt->rt_type = ort->rt_type;
2767*4882a593Smuzhiyun rt->rt_uses_gateway = ort->rt_uses_gateway;
2768*4882a593Smuzhiyun rt->rt_gw_family = ort->rt_gw_family;
2769*4882a593Smuzhiyun if (rt->rt_gw_family == AF_INET)
2770*4882a593Smuzhiyun rt->rt_gw4 = ort->rt_gw4;
2771*4882a593Smuzhiyun else if (rt->rt_gw_family == AF_INET6)
2772*4882a593Smuzhiyun rt->rt_gw6 = ort->rt_gw6;
2773*4882a593Smuzhiyun
2774*4882a593Smuzhiyun INIT_LIST_HEAD(&rt->rt_uncached);
2775*4882a593Smuzhiyun }
2776*4882a593Smuzhiyun
2777*4882a593Smuzhiyun dst_release(dst_orig);
2778*4882a593Smuzhiyun
2779*4882a593Smuzhiyun return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2780*4882a593Smuzhiyun }
2781*4882a593Smuzhiyun
ip_route_output_flow(struct net * net,struct flowi4 * flp4,const struct sock * sk)2782*4882a593Smuzhiyun struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2783*4882a593Smuzhiyun const struct sock *sk)
2784*4882a593Smuzhiyun {
2785*4882a593Smuzhiyun struct rtable *rt = __ip_route_output_key(net, flp4);
2786*4882a593Smuzhiyun
2787*4882a593Smuzhiyun if (IS_ERR(rt))
2788*4882a593Smuzhiyun return rt;
2789*4882a593Smuzhiyun
2790*4882a593Smuzhiyun if (flp4->flowi4_proto) {
2791*4882a593Smuzhiyun flp4->flowi4_oif = rt->dst.dev->ifindex;
2792*4882a593Smuzhiyun rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2793*4882a593Smuzhiyun flowi4_to_flowi(flp4),
2794*4882a593Smuzhiyun sk, 0);
2795*4882a593Smuzhiyun }
2796*4882a593Smuzhiyun
2797*4882a593Smuzhiyun return rt;
2798*4882a593Smuzhiyun }
2799*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(ip_route_output_flow);
2800*4882a593Smuzhiyun
/* Resolve the IPv4 route for an encapsulated tunnel packet described by
 * @info, optionally consulting the per-tunnel dst cache first.
 *
 * On success the selected local source address is stored through @saddr
 * and the route is returned.  Returns ERR_PTR(-ENETUNREACH) when there
 * is no route to the tunnel endpoint and ERR_PTR(-ELOOP) when the route
 * would go back out of the tunnel device @dev itself (encap loop).
 */
struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
				      struct net_device *dev,
				      struct net *net, __be32 *saddr,
				      const struct ip_tunnel_info *info,
				      u8 protocol, bool use_cache)
{
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__u8 tos;

#ifdef CONFIG_DST_CACHE
	/* The cast drops const: the cache embedded in @info is mutable
	 * per-cpu state even though the tunnel info itself is not.
	 */
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		/* Cache hit also fills *saddr; nothing more to do. */
		rt = dst_cache_get_ip4(dst_cache, saddr);
		if (rt)
			return rt;
	}
#endif
	/* Build the flow key from the tunnel metadata. */
	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_proto = protocol;
	fl4.daddr = info->key.u.ipv4.dst;
	fl4.saddr = info->key.u.ipv4.src;
	tos = info->key.tos;
	fl4.flowi4_tos = RT_TOS(tos);

	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (rt->dst.dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
		ip_rt_put(rt);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
#endif
	/* fl4.saddr was filled in by the route lookup if it was 0. */
	*saddr = fl4.saddr;
	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2848*4882a593Smuzhiyun
2849*4882a593Smuzhiyun /* called with rcu_read_lock held */
/* Build one RTM_NEWROUTE netlink message into @skb describing the
 * cached route @rt for destination @dst (and optional source @src).
 *
 * @fl4 may be NULL (used that way by the fnhe dump path); when set it
 * supplies the TOS, preferred source, firewall mark, uid and input
 * interface attributes.  Caller holds rcu_read_lock (see comment at
 * the call sites above).
 *
 * Returns 0 on success (including when ipmr takes over the reply) and
 * -EMSGSIZE when the message does not fit in @skb.
 */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
			struct sk_buff *skb, u32 portid, u32 seq,
			unsigned int flags)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
	/* rtm_table is 8 bits; large table ids go in the RTA_TABLE attr. */
	r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Cached entries keep only the upper route flags; mark as clone. */
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* Report the preferred source only for output routes where it
	 * differs from the requested source.
	 */
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway) {
		/* IPv4 gateway goes in RTA_GATEWAY; an IPv6 gateway
		 * (e.g. from a nexthop object) is encoded as RTA_VIA.
		 */
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
		}
	}

	/* Convert the absolute expiry (jiffies) into remaining time. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* While a PMTU exception is still valid, report it as the MTU
	 * metric (locked if the exception locked it).
	 */
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			/* Forwarded multicast: let ipmr describe (or queue
			 * resolution of) the multicast forwarding entry.
			 * err == 0 means ipmr handled the reply itself.
			 */
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2983*4882a593Smuzhiyun
/* Walk one nexthop-exception hash table and emit an RTM_NEWROUTE
 * message for every live exception entry (cached PMTU/redirect state).
 *
 * @genid:    current fnhe generation for the netns; entries from older
 *            generations are skipped as stale.
 * @fa_index: running dump cursor, incremented for every entry visited
 *            (emitted or skipped) so interrupted netlink dumps can
 *            resume at @fa_start.
 *
 * Runs under RCU: both the bucket chains and the cached rtables are
 * accessed with rcu_dereference.  Returns 0, or the error from
 * rt_fill_info (typically -EMSGSIZE when @skb is full).
 */
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			/* Skip entries before the resume point; the
			 * cursor must still advance (see next: below).
			 */
			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			/* Expired exceptions are not reported. */
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			/* Either cached direction will do for the dump. */
			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}
3028*4882a593Smuzhiyun
/* Dump the cached nexthop exceptions (PMTU/redirect entries) attached
 * to every live nexthop of @fi as RTM_NEWROUTE messages on @skb.
 * @fa_index/@fa_start implement dump resumption across invocations.
 * Returns 0 or the first error from fnhe_dump_bucket.
 */
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int genid = fnhe_genid(net);
	int nhsel;

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err = 0;

		/* Dead nexthops carry nothing worth reporting. */
		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		/* The exception table is RCU-protected. */
		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}
3058*4882a593Smuzhiyun
/* Build a minimal dummy IPv4 packet — an IP header followed by an
 * empty UDP, TCP or ICMP header — so that an RTM_GETROUTE request can
 * be pushed through the real routing input/output paths.  Returns the
 * skb, or NULL on allocation failure.
 */
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers, this skb can pass
	 * through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);

	iph = skb_put(skb, sizeof(struct iphdr));
	iph->version = 0x4;
	iph->ihl = 0x5;
	iph->frag_off = 0;
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph = skb_put_zero(skb, sizeof(*udph));

		udph->source = sport;
		udph->dest = dport;
		udph->len = htons(sizeof(*udph));
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph = skb_put_zero(skb, sizeof(*tcph));

		tcph->source = sport;
		tcph->dest = dport;
		tcph->doff = sizeof(struct tcphdr) / 4;
		/* RST so that nothing ever mistakes this for live TCP. */
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph = skb_put_zero(skb, sizeof(*icmph));

		icmph->type = ICMP_ECHO;
		icmph->code = 0;
		break;
	}
	}

	return skb;
}
3119*4882a593Smuzhiyun
inet_rtm_valid_getroute_req(struct sk_buff * skb,const struct nlmsghdr * nlh,struct nlattr ** tb,struct netlink_ext_ack * extack)3120*4882a593Smuzhiyun static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3121*4882a593Smuzhiyun const struct nlmsghdr *nlh,
3122*4882a593Smuzhiyun struct nlattr **tb,
3123*4882a593Smuzhiyun struct netlink_ext_ack *extack)
3124*4882a593Smuzhiyun {
3125*4882a593Smuzhiyun struct rtmsg *rtm;
3126*4882a593Smuzhiyun int i, err;
3127*4882a593Smuzhiyun
3128*4882a593Smuzhiyun if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3129*4882a593Smuzhiyun NL_SET_ERR_MSG(extack,
3130*4882a593Smuzhiyun "ipv4: Invalid header for route get request");
3131*4882a593Smuzhiyun return -EINVAL;
3132*4882a593Smuzhiyun }
3133*4882a593Smuzhiyun
3134*4882a593Smuzhiyun if (!netlink_strict_get_check(skb))
3135*4882a593Smuzhiyun return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3136*4882a593Smuzhiyun rtm_ipv4_policy, extack);
3137*4882a593Smuzhiyun
3138*4882a593Smuzhiyun rtm = nlmsg_data(nlh);
3139*4882a593Smuzhiyun if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3140*4882a593Smuzhiyun (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3141*4882a593Smuzhiyun rtm->rtm_table || rtm->rtm_protocol ||
3142*4882a593Smuzhiyun rtm->rtm_scope || rtm->rtm_type) {
3143*4882a593Smuzhiyun NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3144*4882a593Smuzhiyun return -EINVAL;
3145*4882a593Smuzhiyun }
3146*4882a593Smuzhiyun
3147*4882a593Smuzhiyun if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3148*4882a593Smuzhiyun RTM_F_LOOKUP_TABLE |
3149*4882a593Smuzhiyun RTM_F_FIB_MATCH)) {
3150*4882a593Smuzhiyun NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3151*4882a593Smuzhiyun return -EINVAL;
3152*4882a593Smuzhiyun }
3153*4882a593Smuzhiyun
3154*4882a593Smuzhiyun err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3155*4882a593Smuzhiyun rtm_ipv4_policy, extack);
3156*4882a593Smuzhiyun if (err)
3157*4882a593Smuzhiyun return err;
3158*4882a593Smuzhiyun
3159*4882a593Smuzhiyun if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3160*4882a593Smuzhiyun (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3161*4882a593Smuzhiyun NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3162*4882a593Smuzhiyun return -EINVAL;
3163*4882a593Smuzhiyun }
3164*4882a593Smuzhiyun
3165*4882a593Smuzhiyun for (i = 0; i <= RTA_MAX; i++) {
3166*4882a593Smuzhiyun if (!tb[i])
3167*4882a593Smuzhiyun continue;
3168*4882a593Smuzhiyun
3169*4882a593Smuzhiyun switch (i) {
3170*4882a593Smuzhiyun case RTA_IIF:
3171*4882a593Smuzhiyun case RTA_OIF:
3172*4882a593Smuzhiyun case RTA_SRC:
3173*4882a593Smuzhiyun case RTA_DST:
3174*4882a593Smuzhiyun case RTA_IP_PROTO:
3175*4882a593Smuzhiyun case RTA_SPORT:
3176*4882a593Smuzhiyun case RTA_DPORT:
3177*4882a593Smuzhiyun case RTA_MARK:
3178*4882a593Smuzhiyun case RTA_UID:
3179*4882a593Smuzhiyun break;
3180*4882a593Smuzhiyun default:
3181*4882a593Smuzhiyun NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3182*4882a593Smuzhiyun return -EINVAL;
3183*4882a593Smuzhiyun }
3184*4882a593Smuzhiyun }
3185*4882a593Smuzhiyun
3186*4882a593Smuzhiyun return 0;
3187*4882a593Smuzhiyun }
3188*4882a593Smuzhiyun
/* RTM_GETROUTE handler: resolve the route described by @nlh and
 * unicast an RTM_NEWROUTE reply back to the requester.
 *
 * A dummy packet is built (inet_rtm_getroute_build_skb) so that the
 * request exercises the real forwarding code: the input path when
 * RTA_IIF is given, the output path otherwise.  With RTM_F_FIB_MATCH
 * the matched FIB entry is reported instead of the resulting dst.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;	/* default when RTA_IP_PROTO absent */
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	/* uid-based routing: explicit RTA_UID wins; otherwise use the
	 * caller's uid for output lookups, none for input lookups.
	 */
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	/* Dummy packet that will traverse the routing engine. */
	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		/* Input route: emulate reception on the given device. */
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev = dev;
		skb->mark = mark;
		err = ip_route_input_rcu(skb, dst, src,
					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
					 &res);

		rt = skb_rtable(skb);
		/* Lookup may "succeed" with an error dst (e.g. unreachable). */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		/* Output route: pretend a local send. */
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			/* No FIB entry: report the route-type error. */
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.tos = fl4.flowi4_tos;
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		if (res.fa_head) {
			struct fib_alias *fa;

			/* Find the matching alias to copy its hardware
			 * offload/trap indication into the reply.
			 */
			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_tos == fri.tos &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = fa->offload;
					fri.trap = fa->trap;
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	/* rtnl_unicast consumes skb regardless of outcome. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}
3354*4882a593Smuzhiyun
ip_rt_multicast_event(struct in_device * in_dev)3355*4882a593Smuzhiyun void ip_rt_multicast_event(struct in_device *in_dev)
3356*4882a593Smuzhiyun {
3357*4882a593Smuzhiyun rt_cache_flush(dev_net(in_dev->dev));
3358*4882a593Smuzhiyun }
3359*4882a593Smuzhiyun
3360*4882a593Smuzhiyun #ifdef CONFIG_SYSCTL
/* Defaults for the GC-related tunables exposed under
 * /proc/sys/net/ipv4/route/ (see ipv4_route_table below).
 */
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
/* Lower bound enforced on writes to the "min_pmtu" sysctl entry. */
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3365*4882a593Smuzhiyun
/* proc handler for the write-only "flush" sysctl entry: any write
 * flushes the owning namespace's route cache and bumps its fnhe
 * generation id (invalidating cached next-hop exceptions); reads
 * are rejected with -EINVAL.  The owning net is stashed in ->extra1
 * by sysctl_route_net_init().
 */
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = __ctl->extra1;

	if (!write)
		return -EINVAL;

	rt_cache_flush(net);
	fnhe_genid_bump(net);
	return 0;
}
3379*4882a593Smuzhiyun
/* Global (init_net only) tunables under /proc/sys/net/ipv4/route/.
 * NOTE(review): several gc_* knobs date back to the removed routing
 * cache; confirm which ones still have consumers before relying on
 * their effect.
 */
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, but in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* ICMP redirect rate limiting parameters. */
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* ICMP error (e.g. dest unreachable) rate limiting. */
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Clamped below by ip_min_valid_pmtu via extra1. */
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
3491*4882a593Smuzhiyun
/* Kept in a named array (rather than a string literal in the table)
 * so sysctl_route_net_init() can recognize the entry in a kmemdup'd
 * copy by pointer identity. */
static const char ipv4_route_flush_procname[] = "flush";

/* Per-namespace sysctl table: one write-only "flush" entry whose
 * ->extra1 is pointed at the owning struct net at registration time. */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3503*4882a593Smuzhiyun
sysctl_route_net_init(struct net * net)3504*4882a593Smuzhiyun static __net_init int sysctl_route_net_init(struct net *net)
3505*4882a593Smuzhiyun {
3506*4882a593Smuzhiyun struct ctl_table *tbl;
3507*4882a593Smuzhiyun
3508*4882a593Smuzhiyun tbl = ipv4_route_flush_table;
3509*4882a593Smuzhiyun if (!net_eq(net, &init_net)) {
3510*4882a593Smuzhiyun tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3511*4882a593Smuzhiyun if (!tbl)
3512*4882a593Smuzhiyun goto err_dup;
3513*4882a593Smuzhiyun
3514*4882a593Smuzhiyun /* Don't export non-whitelisted sysctls to unprivileged users */
3515*4882a593Smuzhiyun if (net->user_ns != &init_user_ns) {
3516*4882a593Smuzhiyun if (tbl[0].procname != ipv4_route_flush_procname)
3517*4882a593Smuzhiyun tbl[0].procname = NULL;
3518*4882a593Smuzhiyun }
3519*4882a593Smuzhiyun }
3520*4882a593Smuzhiyun tbl[0].extra1 = net;
3521*4882a593Smuzhiyun
3522*4882a593Smuzhiyun net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3523*4882a593Smuzhiyun if (!net->ipv4.route_hdr)
3524*4882a593Smuzhiyun goto err_reg;
3525*4882a593Smuzhiyun return 0;
3526*4882a593Smuzhiyun
3527*4882a593Smuzhiyun err_reg:
3528*4882a593Smuzhiyun if (tbl != ipv4_route_flush_table)
3529*4882a593Smuzhiyun kfree(tbl);
3530*4882a593Smuzhiyun err_dup:
3531*4882a593Smuzhiyun return -ENOMEM;
3532*4882a593Smuzhiyun }
3533*4882a593Smuzhiyun
/* Tear down the per-namespace route sysctl table.  Only non-init_net
 * namespaces are ever torn down, so the registered table must be the
 * kmemdup'd copy made in sysctl_route_net_init(); freeing the static
 * template would be a bug, hence the BUG_ON.
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	/* Unregister before freeing the backing table. */
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
3543*4882a593Smuzhiyun
/* Pernet hooks tying the route sysctl table to namespace lifetime. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
3548*4882a593Smuzhiyun #endif
3549*4882a593Smuzhiyun
/* Per-namespace init of the generation counters used to invalidate
 * cached routes (rt_genid) and cached next-hop exceptions
 * (fnhe_genid).  dev_addr_genid starts from a random value so
 * namespaces don't march in lock-step.  Always succeeds.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}
3557*4882a593Smuzhiyun
/* No .exit: the counters are plain atomics embedded in struct net. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3561*4882a593Smuzhiyun
ipv4_inetpeer_init(struct net * net)3562*4882a593Smuzhiyun static int __net_init ipv4_inetpeer_init(struct net *net)
3563*4882a593Smuzhiyun {
3564*4882a593Smuzhiyun struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3565*4882a593Smuzhiyun
3566*4882a593Smuzhiyun if (!bp)
3567*4882a593Smuzhiyun return -ENOMEM;
3568*4882a593Smuzhiyun inet_peer_base_init(bp);
3569*4882a593Smuzhiyun net->ipv4.peers = bp;
3570*4882a593Smuzhiyun return 0;
3571*4882a593Smuzhiyun }
3572*4882a593Smuzhiyun
/* Tear down this namespace's inetpeer tree.  The pointer is cleared
 * first so the base is unreachable via the net before its entries are
 * invalidated and the base itself is freed.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
3581*4882a593Smuzhiyun
/* Pernet hooks for the per-namespace inetpeer base. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
3586*4882a593Smuzhiyun
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU route-classid accounting; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3590*4882a593Smuzhiyun
/*
 * Boot-time initialization of the IPv4 routing layer: the IP-ID hash,
 * per-CPU uncached-route lists, dst slab caches and entry counters,
 * devinet/FIB setup, procfs files, xfrm hooks, the RTM_GETROUTE
 * netlink handler, and the per-namespace subsystems defined above.
 * Panics on allocation failures it cannot continue without.
 */
int __init ip_rt_init(void)
{
	void *idents_hash;
	int cpu;

	/* For modern hosts, this will use 2 MB of memory */
	idents_hash = alloc_large_system_hash("IP idents",
					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
					      0,
					      16, /* one bucket per 64 KB */
					      HASH_ZERO,
					      NULL,
					      &ip_idents_mask,
					      2048,
					      256*1024);

	ip_idents = idents_hash;

	/* Seed the IP-ID state randomly so identifiers start unpredictable. */
	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

	/* ip_tstamps shares the same allocation, laid out after ip_idents. */
	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts are carved from the same slab as regular ones. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Defaults; gc_thresh is also writable via the sysctl table above. */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	/* Non-fatal: routing works without the proc files. */
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}
3659*4882a593Smuzhiyun
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the global (init_net) route sysctl table early in boot. */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif
3670