xref: /OK3568_Linux_fs/kernel/drivers/net/tap.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-only
2*4882a593Smuzhiyun #include <linux/etherdevice.h>
3*4882a593Smuzhiyun #include <linux/if_tap.h>
4*4882a593Smuzhiyun #include <linux/if_vlan.h>
5*4882a593Smuzhiyun #include <linux/interrupt.h>
6*4882a593Smuzhiyun #include <linux/nsproxy.h>
7*4882a593Smuzhiyun #include <linux/compat.h>
8*4882a593Smuzhiyun #include <linux/if_tun.h>
9*4882a593Smuzhiyun #include <linux/module.h>
10*4882a593Smuzhiyun #include <linux/skbuff.h>
11*4882a593Smuzhiyun #include <linux/cache.h>
12*4882a593Smuzhiyun #include <linux/sched/signal.h>
13*4882a593Smuzhiyun #include <linux/types.h>
14*4882a593Smuzhiyun #include <linux/slab.h>
15*4882a593Smuzhiyun #include <linux/wait.h>
16*4882a593Smuzhiyun #include <linux/cdev.h>
17*4882a593Smuzhiyun #include <linux/idr.h>
18*4882a593Smuzhiyun #include <linux/fs.h>
19*4882a593Smuzhiyun #include <linux/uio.h>
20*4882a593Smuzhiyun 
21*4882a593Smuzhiyun #include <net/net_namespace.h>
22*4882a593Smuzhiyun #include <net/rtnetlink.h>
23*4882a593Smuzhiyun #include <net/sock.h>
24*4882a593Smuzhiyun #include <linux/virtio_net.h>
25*4882a593Smuzhiyun #include <linux/skb_array.h>
26*4882a593Smuzhiyun 
27*4882a593Smuzhiyun #define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
28*4882a593Smuzhiyun 
29*4882a593Smuzhiyun #define TAP_VNET_LE 0x80000000
30*4882a593Smuzhiyun #define TAP_VNET_BE 0x40000000
31*4882a593Smuzhiyun 
32*4882a593Smuzhiyun #ifdef CONFIG_TUN_VNET_CROSS_LE
tap_legacy_is_little_endian(struct tap_queue * q)33*4882a593Smuzhiyun static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
34*4882a593Smuzhiyun {
35*4882a593Smuzhiyun 	return q->flags & TAP_VNET_BE ? false :
36*4882a593Smuzhiyun 		virtio_legacy_is_little_endian();
37*4882a593Smuzhiyun }
38*4882a593Smuzhiyun 
/*
 * Store 1 in the user-space int at @sp when TAP_VNET_BE is set on @q,
 * 0 otherwise.  Returns 0, or -EFAULT if the store to user space fails.
 */
static long tap_get_vnet_be(struct tap_queue *q, int __user *sp)
{
	int be = (q->flags & TAP_VNET_BE) ? 1 : 0;

	return put_user(be, sp) ? -EFAULT : 0;
}
48*4882a593Smuzhiyun 
/*
 * Read an int from user space at @sp: a non-zero value sets TAP_VNET_BE
 * on @q, zero clears it.  Returns 0, or -EFAULT if the read fails (in
 * which case the flags are left untouched).
 */
static long tap_set_vnet_be(struct tap_queue *q, int __user *sp)
{
	int be;

	if (get_user(be, sp))
		return -EFAULT;

	if (!be) {
		q->flags &= ~TAP_VNET_BE;
		return 0;
	}

	q->flags |= TAP_VNET_BE;
	return 0;
}
63*4882a593Smuzhiyun #else
tap_legacy_is_little_endian(struct tap_queue * q)64*4882a593Smuzhiyun static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
65*4882a593Smuzhiyun {
66*4882a593Smuzhiyun 	return virtio_legacy_is_little_endian();
67*4882a593Smuzhiyun }
68*4882a593Smuzhiyun 
/* Stub for this configuration: the request is always rejected with -EINVAL. */
static long tap_get_vnet_be(struct tap_queue *q, int __user *argp)
{
	long ret = -EINVAL;

	return ret;
}
73*4882a593Smuzhiyun 
/* Stub for this configuration: the request is always rejected with -EINVAL. */
static long tap_set_vnet_be(struct tap_queue *q, int __user *argp)
{
	long ret = -EINVAL;

	return ret;
}
78*4882a593Smuzhiyun #endif /* CONFIG_TUN_VNET_CROSS_LE */
79*4882a593Smuzhiyun 
tap_is_little_endian(struct tap_queue * q)80*4882a593Smuzhiyun static inline bool tap_is_little_endian(struct tap_queue *q)
81*4882a593Smuzhiyun {
82*4882a593Smuzhiyun 	return q->flags & TAP_VNET_LE ||
83*4882a593Smuzhiyun 		tap_legacy_is_little_endian(q);
84*4882a593Smuzhiyun }
85*4882a593Smuzhiyun 
tap16_to_cpu(struct tap_queue * q,__virtio16 val)86*4882a593Smuzhiyun static inline u16 tap16_to_cpu(struct tap_queue *q, __virtio16 val)
87*4882a593Smuzhiyun {
88*4882a593Smuzhiyun 	return __virtio16_to_cpu(tap_is_little_endian(q), val);
89*4882a593Smuzhiyun }
90*4882a593Smuzhiyun 
cpu_to_tap16(struct tap_queue * q,u16 val)91*4882a593Smuzhiyun static inline __virtio16 cpu_to_tap16(struct tap_queue *q, u16 val)
92*4882a593Smuzhiyun {
93*4882a593Smuzhiyun 	return __cpu_to_virtio16(tap_is_little_endian(q), val);
94*4882a593Smuzhiyun }
95*4882a593Smuzhiyun 
96*4882a593Smuzhiyun static struct proto tap_proto = {
97*4882a593Smuzhiyun 	.name = "tap",
98*4882a593Smuzhiyun 	.owner = THIS_MODULE,
99*4882a593Smuzhiyun 	.obj_size = sizeof(struct tap_queue),
100*4882a593Smuzhiyun };
101*4882a593Smuzhiyun 
102*4882a593Smuzhiyun #define TAP_NUM_DEVS (1U << MINORBITS)
103*4882a593Smuzhiyun 
104*4882a593Smuzhiyun static LIST_HEAD(major_list);
105*4882a593Smuzhiyun 
106*4882a593Smuzhiyun struct major_info {
107*4882a593Smuzhiyun 	struct rcu_head rcu;
108*4882a593Smuzhiyun 	dev_t major;
109*4882a593Smuzhiyun 	struct idr minor_idr;
110*4882a593Smuzhiyun 	spinlock_t minor_lock;
111*4882a593Smuzhiyun 	const char *device_name;
112*4882a593Smuzhiyun 	struct list_head next;
113*4882a593Smuzhiyun };
114*4882a593Smuzhiyun 
115*4882a593Smuzhiyun #define GOODCOPY_LEN 128
116*4882a593Smuzhiyun 
117*4882a593Smuzhiyun static const struct proto_ops tap_socket_ops;
118*4882a593Smuzhiyun 
119*4882a593Smuzhiyun #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
120*4882a593Smuzhiyun #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
121*4882a593Smuzhiyun 
tap_dev_get_rcu(const struct net_device * dev)122*4882a593Smuzhiyun static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev)
123*4882a593Smuzhiyun {
124*4882a593Smuzhiyun 	return rcu_dereference(dev->rx_handler_data);
125*4882a593Smuzhiyun }
126*4882a593Smuzhiyun 
127*4882a593Smuzhiyun /*
128*4882a593Smuzhiyun  * RCU usage:
129*4882a593Smuzhiyun  * The tap_queue and the macvlan_dev are loosely coupled, the
130*4882a593Smuzhiyun  * pointers from one to the other can only be read while rcu_read_lock
131*4882a593Smuzhiyun  * or rtnl is held.
132*4882a593Smuzhiyun  *
133*4882a593Smuzhiyun  * Both the file and the macvlan_dev hold a reference on the tap_queue
134*4882a593Smuzhiyun  * through sock_hold(&q->sk). When the macvlan_dev goes away first,
135*4882a593Smuzhiyun  * q->tap becomes inaccessible. When the file gets closed,
136*4882a593Smuzhiyun  * tap_get_queue() fails.
137*4882a593Smuzhiyun  *
138*4882a593Smuzhiyun  * There may still be references to the struct sock inside of the
139*4882a593Smuzhiyun  * queue from outbound SKBs, but these never reference back to the
140*4882a593Smuzhiyun  * file or the dev. The data structure is freed through __sk_free
141*4882a593Smuzhiyun  * when both our references and any pending SKBs are gone.
142*4882a593Smuzhiyun  */
143*4882a593Smuzhiyun 
tap_enable_queue(struct tap_dev * tap,struct file * file,struct tap_queue * q)144*4882a593Smuzhiyun static int tap_enable_queue(struct tap_dev *tap, struct file *file,
145*4882a593Smuzhiyun 			    struct tap_queue *q)
146*4882a593Smuzhiyun {
147*4882a593Smuzhiyun 	int err = -EINVAL;
148*4882a593Smuzhiyun 
149*4882a593Smuzhiyun 	ASSERT_RTNL();
150*4882a593Smuzhiyun 
151*4882a593Smuzhiyun 	if (q->enabled)
152*4882a593Smuzhiyun 		goto out;
153*4882a593Smuzhiyun 
154*4882a593Smuzhiyun 	err = 0;
155*4882a593Smuzhiyun 	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
156*4882a593Smuzhiyun 	q->queue_index = tap->numvtaps;
157*4882a593Smuzhiyun 	q->enabled = true;
158*4882a593Smuzhiyun 
159*4882a593Smuzhiyun 	tap->numvtaps++;
160*4882a593Smuzhiyun out:
161*4882a593Smuzhiyun 	return err;
162*4882a593Smuzhiyun }
163*4882a593Smuzhiyun 
164*4882a593Smuzhiyun /* Requires RTNL */
tap_set_queue(struct tap_dev * tap,struct file * file,struct tap_queue * q)165*4882a593Smuzhiyun static int tap_set_queue(struct tap_dev *tap, struct file *file,
166*4882a593Smuzhiyun 			 struct tap_queue *q)
167*4882a593Smuzhiyun {
168*4882a593Smuzhiyun 	if (tap->numqueues == MAX_TAP_QUEUES)
169*4882a593Smuzhiyun 		return -EBUSY;
170*4882a593Smuzhiyun 
171*4882a593Smuzhiyun 	rcu_assign_pointer(q->tap, tap);
172*4882a593Smuzhiyun 	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
173*4882a593Smuzhiyun 	sock_hold(&q->sk);
174*4882a593Smuzhiyun 
175*4882a593Smuzhiyun 	q->file = file;
176*4882a593Smuzhiyun 	q->queue_index = tap->numvtaps;
177*4882a593Smuzhiyun 	q->enabled = true;
178*4882a593Smuzhiyun 	file->private_data = q;
179*4882a593Smuzhiyun 	list_add_tail(&q->next, &tap->queue_list);
180*4882a593Smuzhiyun 
181*4882a593Smuzhiyun 	tap->numvtaps++;
182*4882a593Smuzhiyun 	tap->numqueues++;
183*4882a593Smuzhiyun 
184*4882a593Smuzhiyun 	return 0;
185*4882a593Smuzhiyun }
186*4882a593Smuzhiyun 
tap_disable_queue(struct tap_queue * q)187*4882a593Smuzhiyun static int tap_disable_queue(struct tap_queue *q)
188*4882a593Smuzhiyun {
189*4882a593Smuzhiyun 	struct tap_dev *tap;
190*4882a593Smuzhiyun 	struct tap_queue *nq;
191*4882a593Smuzhiyun 
192*4882a593Smuzhiyun 	ASSERT_RTNL();
193*4882a593Smuzhiyun 	if (!q->enabled)
194*4882a593Smuzhiyun 		return -EINVAL;
195*4882a593Smuzhiyun 
196*4882a593Smuzhiyun 	tap = rtnl_dereference(q->tap);
197*4882a593Smuzhiyun 
198*4882a593Smuzhiyun 	if (tap) {
199*4882a593Smuzhiyun 		int index = q->queue_index;
200*4882a593Smuzhiyun 		BUG_ON(index >= tap->numvtaps);
201*4882a593Smuzhiyun 		nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]);
202*4882a593Smuzhiyun 		nq->queue_index = index;
203*4882a593Smuzhiyun 
204*4882a593Smuzhiyun 		rcu_assign_pointer(tap->taps[index], nq);
205*4882a593Smuzhiyun 		RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL);
206*4882a593Smuzhiyun 		q->enabled = false;
207*4882a593Smuzhiyun 
208*4882a593Smuzhiyun 		tap->numvtaps--;
209*4882a593Smuzhiyun 	}
210*4882a593Smuzhiyun 
211*4882a593Smuzhiyun 	return 0;
212*4882a593Smuzhiyun }
213*4882a593Smuzhiyun 
214*4882a593Smuzhiyun /*
215*4882a593Smuzhiyun  * The file owning the queue got closed, give up both
216*4882a593Smuzhiyun  * the reference that the files holds as well as the
217*4882a593Smuzhiyun  * one from the macvlan_dev if that still exists.
218*4882a593Smuzhiyun  *
219*4882a593Smuzhiyun  * Using the spinlock makes sure that we don't get
220*4882a593Smuzhiyun  * to the queue again after destroying it.
221*4882a593Smuzhiyun  */
tap_put_queue(struct tap_queue * q)222*4882a593Smuzhiyun static void tap_put_queue(struct tap_queue *q)
223*4882a593Smuzhiyun {
224*4882a593Smuzhiyun 	struct tap_dev *tap;
225*4882a593Smuzhiyun 
226*4882a593Smuzhiyun 	rtnl_lock();
227*4882a593Smuzhiyun 	tap = rtnl_dereference(q->tap);
228*4882a593Smuzhiyun 
229*4882a593Smuzhiyun 	if (tap) {
230*4882a593Smuzhiyun 		if (q->enabled)
231*4882a593Smuzhiyun 			BUG_ON(tap_disable_queue(q));
232*4882a593Smuzhiyun 
233*4882a593Smuzhiyun 		tap->numqueues--;
234*4882a593Smuzhiyun 		RCU_INIT_POINTER(q->tap, NULL);
235*4882a593Smuzhiyun 		sock_put(&q->sk);
236*4882a593Smuzhiyun 		list_del_init(&q->next);
237*4882a593Smuzhiyun 	}
238*4882a593Smuzhiyun 
239*4882a593Smuzhiyun 	rtnl_unlock();
240*4882a593Smuzhiyun 
241*4882a593Smuzhiyun 	synchronize_rcu();
242*4882a593Smuzhiyun 	sock_put(&q->sk);
243*4882a593Smuzhiyun }
244*4882a593Smuzhiyun 
245*4882a593Smuzhiyun /*
246*4882a593Smuzhiyun  * Select a queue based on the rxq of the device on which this packet
247*4882a593Smuzhiyun  * arrived. If the incoming device is not mq, calculate a flow hash
248*4882a593Smuzhiyun  * to select a queue. If all fails, find the first available queue.
249*4882a593Smuzhiyun  * Cache vlan->numvtaps since it can become zero during the execution
250*4882a593Smuzhiyun  * of this function.
251*4882a593Smuzhiyun  */
tap_get_queue(struct tap_dev * tap,struct sk_buff * skb)252*4882a593Smuzhiyun static struct tap_queue *tap_get_queue(struct tap_dev *tap,
253*4882a593Smuzhiyun 				       struct sk_buff *skb)
254*4882a593Smuzhiyun {
255*4882a593Smuzhiyun 	struct tap_queue *queue = NULL;
256*4882a593Smuzhiyun 	/* Access to taps array is protected by rcu, but access to numvtaps
257*4882a593Smuzhiyun 	 * isn't. Below we use it to lookup a queue, but treat it as a hint
258*4882a593Smuzhiyun 	 * and validate that the result isn't NULL - in case we are
259*4882a593Smuzhiyun 	 * racing against queue removal.
260*4882a593Smuzhiyun 	 */
261*4882a593Smuzhiyun 	int numvtaps = READ_ONCE(tap->numvtaps);
262*4882a593Smuzhiyun 	__u32 rxq;
263*4882a593Smuzhiyun 
264*4882a593Smuzhiyun 	if (!numvtaps)
265*4882a593Smuzhiyun 		goto out;
266*4882a593Smuzhiyun 
267*4882a593Smuzhiyun 	if (numvtaps == 1)
268*4882a593Smuzhiyun 		goto single;
269*4882a593Smuzhiyun 
270*4882a593Smuzhiyun 	/* Check if we can use flow to select a queue */
271*4882a593Smuzhiyun 	rxq = skb_get_hash(skb);
272*4882a593Smuzhiyun 	if (rxq) {
273*4882a593Smuzhiyun 		queue = rcu_dereference(tap->taps[rxq % numvtaps]);
274*4882a593Smuzhiyun 		goto out;
275*4882a593Smuzhiyun 	}
276*4882a593Smuzhiyun 
277*4882a593Smuzhiyun 	if (likely(skb_rx_queue_recorded(skb))) {
278*4882a593Smuzhiyun 		rxq = skb_get_rx_queue(skb);
279*4882a593Smuzhiyun 
280*4882a593Smuzhiyun 		while (unlikely(rxq >= numvtaps))
281*4882a593Smuzhiyun 			rxq -= numvtaps;
282*4882a593Smuzhiyun 
283*4882a593Smuzhiyun 		queue = rcu_dereference(tap->taps[rxq]);
284*4882a593Smuzhiyun 		goto out;
285*4882a593Smuzhiyun 	}
286*4882a593Smuzhiyun 
287*4882a593Smuzhiyun single:
288*4882a593Smuzhiyun 	queue = rcu_dereference(tap->taps[0]);
289*4882a593Smuzhiyun out:
290*4882a593Smuzhiyun 	return queue;
291*4882a593Smuzhiyun }
292*4882a593Smuzhiyun 
293*4882a593Smuzhiyun /*
294*4882a593Smuzhiyun  * The net_device is going away, give up the reference
295*4882a593Smuzhiyun  * that it holds on all queues and safely set the pointer
296*4882a593Smuzhiyun  * from the queues to NULL.
297*4882a593Smuzhiyun  */
tap_del_queues(struct tap_dev * tap)298*4882a593Smuzhiyun void tap_del_queues(struct tap_dev *tap)
299*4882a593Smuzhiyun {
300*4882a593Smuzhiyun 	struct tap_queue *q, *tmp;
301*4882a593Smuzhiyun 
302*4882a593Smuzhiyun 	ASSERT_RTNL();
303*4882a593Smuzhiyun 	list_for_each_entry_safe(q, tmp, &tap->queue_list, next) {
304*4882a593Smuzhiyun 		list_del_init(&q->next);
305*4882a593Smuzhiyun 		RCU_INIT_POINTER(q->tap, NULL);
306*4882a593Smuzhiyun 		if (q->enabled)
307*4882a593Smuzhiyun 			tap->numvtaps--;
308*4882a593Smuzhiyun 		tap->numqueues--;
309*4882a593Smuzhiyun 		sock_put(&q->sk);
310*4882a593Smuzhiyun 	}
311*4882a593Smuzhiyun 	BUG_ON(tap->numvtaps);
312*4882a593Smuzhiyun 	BUG_ON(tap->numqueues);
313*4882a593Smuzhiyun 	/* guarantee that any future tap_set_queue will fail */
314*4882a593Smuzhiyun 	tap->numvtaps = MAX_TAP_QUEUES;
315*4882a593Smuzhiyun }
316*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tap_del_queues);
317*4882a593Smuzhiyun 
tap_handle_frame(struct sk_buff ** pskb)318*4882a593Smuzhiyun rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
319*4882a593Smuzhiyun {
320*4882a593Smuzhiyun 	struct sk_buff *skb = *pskb;
321*4882a593Smuzhiyun 	struct net_device *dev = skb->dev;
322*4882a593Smuzhiyun 	struct tap_dev *tap;
323*4882a593Smuzhiyun 	struct tap_queue *q;
324*4882a593Smuzhiyun 	netdev_features_t features = TAP_FEATURES;
325*4882a593Smuzhiyun 
326*4882a593Smuzhiyun 	tap = tap_dev_get_rcu(dev);
327*4882a593Smuzhiyun 	if (!tap)
328*4882a593Smuzhiyun 		return RX_HANDLER_PASS;
329*4882a593Smuzhiyun 
330*4882a593Smuzhiyun 	q = tap_get_queue(tap, skb);
331*4882a593Smuzhiyun 	if (!q)
332*4882a593Smuzhiyun 		return RX_HANDLER_PASS;
333*4882a593Smuzhiyun 
334*4882a593Smuzhiyun 	skb_push(skb, ETH_HLEN);
335*4882a593Smuzhiyun 
336*4882a593Smuzhiyun 	/* Apply the forward feature mask so that we perform segmentation
337*4882a593Smuzhiyun 	 * according to users wishes.  This only works if VNET_HDR is
338*4882a593Smuzhiyun 	 * enabled.
339*4882a593Smuzhiyun 	 */
340*4882a593Smuzhiyun 	if (q->flags & IFF_VNET_HDR)
341*4882a593Smuzhiyun 		features |= tap->tap_features;
342*4882a593Smuzhiyun 	if (netif_needs_gso(skb, features)) {
343*4882a593Smuzhiyun 		struct sk_buff *segs = __skb_gso_segment(skb, features, false);
344*4882a593Smuzhiyun 		struct sk_buff *next;
345*4882a593Smuzhiyun 
346*4882a593Smuzhiyun 		if (IS_ERR(segs))
347*4882a593Smuzhiyun 			goto drop;
348*4882a593Smuzhiyun 
349*4882a593Smuzhiyun 		if (!segs) {
350*4882a593Smuzhiyun 			if (ptr_ring_produce(&q->ring, skb))
351*4882a593Smuzhiyun 				goto drop;
352*4882a593Smuzhiyun 			goto wake_up;
353*4882a593Smuzhiyun 		}
354*4882a593Smuzhiyun 
355*4882a593Smuzhiyun 		consume_skb(skb);
356*4882a593Smuzhiyun 		skb_list_walk_safe(segs, skb, next) {
357*4882a593Smuzhiyun 			skb_mark_not_on_list(skb);
358*4882a593Smuzhiyun 			if (ptr_ring_produce(&q->ring, skb)) {
359*4882a593Smuzhiyun 				kfree_skb(skb);
360*4882a593Smuzhiyun 				kfree_skb_list(next);
361*4882a593Smuzhiyun 				break;
362*4882a593Smuzhiyun 			}
363*4882a593Smuzhiyun 		}
364*4882a593Smuzhiyun 	} else {
365*4882a593Smuzhiyun 		/* If we receive a partial checksum and the tap side
366*4882a593Smuzhiyun 		 * doesn't support checksum offload, compute the checksum.
367*4882a593Smuzhiyun 		 * Note: it doesn't matter which checksum feature to
368*4882a593Smuzhiyun 		 *	  check, we either support them all or none.
369*4882a593Smuzhiyun 		 */
370*4882a593Smuzhiyun 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
371*4882a593Smuzhiyun 		    !(features & NETIF_F_CSUM_MASK) &&
372*4882a593Smuzhiyun 		    skb_checksum_help(skb))
373*4882a593Smuzhiyun 			goto drop;
374*4882a593Smuzhiyun 		if (ptr_ring_produce(&q->ring, skb))
375*4882a593Smuzhiyun 			goto drop;
376*4882a593Smuzhiyun 	}
377*4882a593Smuzhiyun 
378*4882a593Smuzhiyun wake_up:
379*4882a593Smuzhiyun 	wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND);
380*4882a593Smuzhiyun 	return RX_HANDLER_CONSUMED;
381*4882a593Smuzhiyun 
382*4882a593Smuzhiyun drop:
383*4882a593Smuzhiyun 	/* Count errors/drops only here, thus don't care about args. */
384*4882a593Smuzhiyun 	if (tap->count_rx_dropped)
385*4882a593Smuzhiyun 		tap->count_rx_dropped(tap);
386*4882a593Smuzhiyun 	kfree_skb(skb);
387*4882a593Smuzhiyun 	return RX_HANDLER_CONSUMED;
388*4882a593Smuzhiyun }
389*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tap_handle_frame);
390*4882a593Smuzhiyun 
tap_get_major(int major)391*4882a593Smuzhiyun static struct major_info *tap_get_major(int major)
392*4882a593Smuzhiyun {
393*4882a593Smuzhiyun 	struct major_info *tap_major;
394*4882a593Smuzhiyun 
395*4882a593Smuzhiyun 	list_for_each_entry_rcu(tap_major, &major_list, next) {
396*4882a593Smuzhiyun 		if (tap_major->major == major)
397*4882a593Smuzhiyun 			return tap_major;
398*4882a593Smuzhiyun 	}
399*4882a593Smuzhiyun 
400*4882a593Smuzhiyun 	return NULL;
401*4882a593Smuzhiyun }
402*4882a593Smuzhiyun 
/*
 * Allocate a minor number (>= 1, below TAP_NUM_DEVS) for @tap under the
 * major of @major and store it in tap->minor.
 *
 * Returns 0 on success, -EINVAL if the major is unknown or no minor is
 * left, or any other negative error reported by idr_alloc().
 */
int tap_get_minor(dev_t major, struct tap_dev *tap)
{
	struct major_info *tap_major;
	int err = -EINVAL;
	int id;

	rcu_read_lock();
	tap_major = tap_get_major(MAJOR(major));
	if (!tap_major)
		goto unlock;

	spin_lock(&tap_major->minor_lock);
	id = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_ATOMIC);
	if (id >= 0) {
		tap->minor = id;
		err = 0;
	} else if (id == -ENOSPC) {
		netdev_err(tap->dev, "Too many tap devices\n");
		err = -EINVAL;
	} else {
		err = id;
	}
	spin_unlock(&tap_major->minor_lock);

unlock:
	rcu_read_unlock();
	return err;
}
EXPORT_SYMBOL_GPL(tap_get_minor);
430*4882a593Smuzhiyun 
/*
 * Release the minor number held by @tap (if any) under the major of
 * @major and reset tap->minor to 0.  Does nothing when the major is
 * unknown.
 */
void tap_free_minor(dev_t major, struct tap_dev *tap)
{
	struct major_info *tap_major;

	rcu_read_lock();
	tap_major = tap_get_major(MAJOR(major));
	if (tap_major) {
		spin_lock(&tap_major->minor_lock);
		if (tap->minor) {
			idr_remove(&tap_major->minor_idr, tap->minor);
			tap->minor = 0;
		}
		spin_unlock(&tap_major->minor_lock);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(tap_free_minor);
452*4882a593Smuzhiyun 
dev_get_by_tap_file(int major,int minor)453*4882a593Smuzhiyun static struct tap_dev *dev_get_by_tap_file(int major, int minor)
454*4882a593Smuzhiyun {
455*4882a593Smuzhiyun 	struct net_device *dev = NULL;
456*4882a593Smuzhiyun 	struct tap_dev *tap;
457*4882a593Smuzhiyun 	struct major_info *tap_major;
458*4882a593Smuzhiyun 
459*4882a593Smuzhiyun 	rcu_read_lock();
460*4882a593Smuzhiyun 	tap_major = tap_get_major(major);
461*4882a593Smuzhiyun 	if (!tap_major) {
462*4882a593Smuzhiyun 		tap = NULL;
463*4882a593Smuzhiyun 		goto unlock;
464*4882a593Smuzhiyun 	}
465*4882a593Smuzhiyun 
466*4882a593Smuzhiyun 	spin_lock(&tap_major->minor_lock);
467*4882a593Smuzhiyun 	tap = idr_find(&tap_major->minor_idr, minor);
468*4882a593Smuzhiyun 	if (tap) {
469*4882a593Smuzhiyun 		dev = tap->dev;
470*4882a593Smuzhiyun 		dev_hold(dev);
471*4882a593Smuzhiyun 	}
472*4882a593Smuzhiyun 	spin_unlock(&tap_major->minor_lock);
473*4882a593Smuzhiyun 
474*4882a593Smuzhiyun unlock:
475*4882a593Smuzhiyun 	rcu_read_unlock();
476*4882a593Smuzhiyun 	return tap;
477*4882a593Smuzhiyun }
478*4882a593Smuzhiyun 
tap_sock_write_space(struct sock * sk)479*4882a593Smuzhiyun static void tap_sock_write_space(struct sock *sk)
480*4882a593Smuzhiyun {
481*4882a593Smuzhiyun 	wait_queue_head_t *wqueue;
482*4882a593Smuzhiyun 
483*4882a593Smuzhiyun 	if (!sock_writeable(sk) ||
484*4882a593Smuzhiyun 	    !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
485*4882a593Smuzhiyun 		return;
486*4882a593Smuzhiyun 
487*4882a593Smuzhiyun 	wqueue = sk_sleep(sk);
488*4882a593Smuzhiyun 	if (wqueue && waitqueue_active(wqueue))
489*4882a593Smuzhiyun 		wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
490*4882a593Smuzhiyun }
491*4882a593Smuzhiyun 
tap_sock_destruct(struct sock * sk)492*4882a593Smuzhiyun static void tap_sock_destruct(struct sock *sk)
493*4882a593Smuzhiyun {
494*4882a593Smuzhiyun 	struct tap_queue *q = container_of(sk, struct tap_queue, sk);
495*4882a593Smuzhiyun 
496*4882a593Smuzhiyun 	ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb);
497*4882a593Smuzhiyun }
498*4882a593Smuzhiyun 
tap_open(struct inode * inode,struct file * file)499*4882a593Smuzhiyun static int tap_open(struct inode *inode, struct file *file)
500*4882a593Smuzhiyun {
501*4882a593Smuzhiyun 	struct net *net = current->nsproxy->net_ns;
502*4882a593Smuzhiyun 	struct tap_dev *tap;
503*4882a593Smuzhiyun 	struct tap_queue *q;
504*4882a593Smuzhiyun 	int err = -ENODEV;
505*4882a593Smuzhiyun 
506*4882a593Smuzhiyun 	rtnl_lock();
507*4882a593Smuzhiyun 	tap = dev_get_by_tap_file(imajor(inode), iminor(inode));
508*4882a593Smuzhiyun 	if (!tap)
509*4882a593Smuzhiyun 		goto err;
510*4882a593Smuzhiyun 
511*4882a593Smuzhiyun 	err = -ENOMEM;
512*4882a593Smuzhiyun 	q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
513*4882a593Smuzhiyun 					     &tap_proto, 0);
514*4882a593Smuzhiyun 	if (!q)
515*4882a593Smuzhiyun 		goto err;
516*4882a593Smuzhiyun 	if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) {
517*4882a593Smuzhiyun 		sk_free(&q->sk);
518*4882a593Smuzhiyun 		goto err;
519*4882a593Smuzhiyun 	}
520*4882a593Smuzhiyun 
521*4882a593Smuzhiyun 	init_waitqueue_head(&q->sock.wq.wait);
522*4882a593Smuzhiyun 	q->sock.type = SOCK_RAW;
523*4882a593Smuzhiyun 	q->sock.state = SS_CONNECTED;
524*4882a593Smuzhiyun 	q->sock.file = file;
525*4882a593Smuzhiyun 	q->sock.ops = &tap_socket_ops;
526*4882a593Smuzhiyun 	sock_init_data(&q->sock, &q->sk);
527*4882a593Smuzhiyun 	q->sk.sk_write_space = tap_sock_write_space;
528*4882a593Smuzhiyun 	q->sk.sk_destruct = tap_sock_destruct;
529*4882a593Smuzhiyun 	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
530*4882a593Smuzhiyun 	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
531*4882a593Smuzhiyun 
532*4882a593Smuzhiyun 	/*
533*4882a593Smuzhiyun 	 * so far only KVM virtio_net uses tap, enable zero copy between
534*4882a593Smuzhiyun 	 * guest kernel and host kernel when lower device supports zerocopy
535*4882a593Smuzhiyun 	 *
536*4882a593Smuzhiyun 	 * The macvlan supports zerocopy iff the lower device supports zero
537*4882a593Smuzhiyun 	 * copy so we don't have to look at the lower device directly.
538*4882a593Smuzhiyun 	 */
539*4882a593Smuzhiyun 	if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
540*4882a593Smuzhiyun 		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
541*4882a593Smuzhiyun 
542*4882a593Smuzhiyun 	err = tap_set_queue(tap, file, q);
543*4882a593Smuzhiyun 	if (err) {
544*4882a593Smuzhiyun 		/* tap_sock_destruct() will take care of freeing ptr_ring */
545*4882a593Smuzhiyun 		goto err_put;
546*4882a593Smuzhiyun 	}
547*4882a593Smuzhiyun 
548*4882a593Smuzhiyun 	dev_put(tap->dev);
549*4882a593Smuzhiyun 
550*4882a593Smuzhiyun 	rtnl_unlock();
551*4882a593Smuzhiyun 	return err;
552*4882a593Smuzhiyun 
553*4882a593Smuzhiyun err_put:
554*4882a593Smuzhiyun 	sock_put(&q->sk);
555*4882a593Smuzhiyun err:
556*4882a593Smuzhiyun 	if (tap)
557*4882a593Smuzhiyun 		dev_put(tap->dev);
558*4882a593Smuzhiyun 
559*4882a593Smuzhiyun 	rtnl_unlock();
560*4882a593Smuzhiyun 	return err;
561*4882a593Smuzhiyun }
562*4882a593Smuzhiyun 
tap_release(struct inode * inode,struct file * file)563*4882a593Smuzhiyun static int tap_release(struct inode *inode, struct file *file)
564*4882a593Smuzhiyun {
565*4882a593Smuzhiyun 	struct tap_queue *q = file->private_data;
566*4882a593Smuzhiyun 	tap_put_queue(q);
567*4882a593Smuzhiyun 	return 0;
568*4882a593Smuzhiyun }
569*4882a593Smuzhiyun 
tap_poll(struct file * file,poll_table * wait)570*4882a593Smuzhiyun static __poll_t tap_poll(struct file *file, poll_table *wait)
571*4882a593Smuzhiyun {
572*4882a593Smuzhiyun 	struct tap_queue *q = file->private_data;
573*4882a593Smuzhiyun 	__poll_t mask = EPOLLERR;
574*4882a593Smuzhiyun 
575*4882a593Smuzhiyun 	if (!q)
576*4882a593Smuzhiyun 		goto out;
577*4882a593Smuzhiyun 
578*4882a593Smuzhiyun 	mask = 0;
579*4882a593Smuzhiyun 	poll_wait(file, &q->sock.wq.wait, wait);
580*4882a593Smuzhiyun 
581*4882a593Smuzhiyun 	if (!ptr_ring_empty(&q->ring))
582*4882a593Smuzhiyun 		mask |= EPOLLIN | EPOLLRDNORM;
583*4882a593Smuzhiyun 
584*4882a593Smuzhiyun 	if (sock_writeable(&q->sk) ||
585*4882a593Smuzhiyun 	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
586*4882a593Smuzhiyun 	     sock_writeable(&q->sk)))
587*4882a593Smuzhiyun 		mask |= EPOLLOUT | EPOLLWRNORM;
588*4882a593Smuzhiyun 
589*4882a593Smuzhiyun out:
590*4882a593Smuzhiyun 	return mask;
591*4882a593Smuzhiyun }
592*4882a593Smuzhiyun 
/*
 * Allocate an skb for @len bytes of data with @prepad bytes of headroom.
 *
 * @linear is the number of data bytes requested in the linear area; the
 * remaining (@len - @linear) bytes are accounted as paged data.  When
 * @prepad + @len is below PAGE_SIZE, or @linear is 0, everything is
 * made linear.
 *
 * Returns the skb, or NULL with *@err set by sock_alloc_send_pskb().
 */
static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
					    size_t len, size_t linear,
					    int noblock, int *err)
{
	struct sk_buff *skb;
	size_t paged;

	/* Under a page?  Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	paged = len - linear;

	skb = sock_alloc_send_pskb(sk, prepad + linear, paged, noblock,
				   err, 0);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = paged;
	skb->len += paged;

	return skb;
}
615*4882a593Smuzhiyun 
616*4882a593Smuzhiyun /* Neighbour code has some assumptions on HH_DATA_MOD alignment */
617*4882a593Smuzhiyun #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
618*4882a593Smuzhiyun 
619*4882a593Smuzhiyun /* Get packet from user space buffer */
tap_get_user(struct tap_queue * q,void * msg_control,struct iov_iter * from,int noblock)620*4882a593Smuzhiyun static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
621*4882a593Smuzhiyun 			    struct iov_iter *from, int noblock)
622*4882a593Smuzhiyun {
623*4882a593Smuzhiyun 	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
624*4882a593Smuzhiyun 	struct sk_buff *skb;
625*4882a593Smuzhiyun 	struct tap_dev *tap;
626*4882a593Smuzhiyun 	unsigned long total_len = iov_iter_count(from);
627*4882a593Smuzhiyun 	unsigned long len = total_len;
628*4882a593Smuzhiyun 	int err;
629*4882a593Smuzhiyun 	struct virtio_net_hdr vnet_hdr = { 0 };
630*4882a593Smuzhiyun 	int vnet_hdr_len = 0;
631*4882a593Smuzhiyun 	int copylen = 0;
632*4882a593Smuzhiyun 	int depth;
633*4882a593Smuzhiyun 	bool zerocopy = false;
634*4882a593Smuzhiyun 	size_t linear;
635*4882a593Smuzhiyun 
636*4882a593Smuzhiyun 	if (q->flags & IFF_VNET_HDR) {
637*4882a593Smuzhiyun 		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
638*4882a593Smuzhiyun 
639*4882a593Smuzhiyun 		err = -EINVAL;
640*4882a593Smuzhiyun 		if (len < vnet_hdr_len)
641*4882a593Smuzhiyun 			goto err;
642*4882a593Smuzhiyun 		len -= vnet_hdr_len;
643*4882a593Smuzhiyun 
644*4882a593Smuzhiyun 		err = -EFAULT;
645*4882a593Smuzhiyun 		if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
646*4882a593Smuzhiyun 			goto err;
647*4882a593Smuzhiyun 		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
648*4882a593Smuzhiyun 		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
649*4882a593Smuzhiyun 		     tap16_to_cpu(q, vnet_hdr.csum_start) +
650*4882a593Smuzhiyun 		     tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
651*4882a593Smuzhiyun 			     tap16_to_cpu(q, vnet_hdr.hdr_len))
652*4882a593Smuzhiyun 			vnet_hdr.hdr_len = cpu_to_tap16(q,
653*4882a593Smuzhiyun 				 tap16_to_cpu(q, vnet_hdr.csum_start) +
654*4882a593Smuzhiyun 				 tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
655*4882a593Smuzhiyun 		err = -EINVAL;
656*4882a593Smuzhiyun 		if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
657*4882a593Smuzhiyun 			goto err;
658*4882a593Smuzhiyun 	}
659*4882a593Smuzhiyun 
660*4882a593Smuzhiyun 	err = -EINVAL;
661*4882a593Smuzhiyun 	if (unlikely(len < ETH_HLEN))
662*4882a593Smuzhiyun 		goto err;
663*4882a593Smuzhiyun 
664*4882a593Smuzhiyun 	if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
665*4882a593Smuzhiyun 		struct iov_iter i;
666*4882a593Smuzhiyun 
667*4882a593Smuzhiyun 		copylen = vnet_hdr.hdr_len ?
668*4882a593Smuzhiyun 			tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
669*4882a593Smuzhiyun 		if (copylen > good_linear)
670*4882a593Smuzhiyun 			copylen = good_linear;
671*4882a593Smuzhiyun 		else if (copylen < ETH_HLEN)
672*4882a593Smuzhiyun 			copylen = ETH_HLEN;
673*4882a593Smuzhiyun 		linear = copylen;
674*4882a593Smuzhiyun 		i = *from;
675*4882a593Smuzhiyun 		iov_iter_advance(&i, copylen);
676*4882a593Smuzhiyun 		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
677*4882a593Smuzhiyun 			zerocopy = true;
678*4882a593Smuzhiyun 	}
679*4882a593Smuzhiyun 
680*4882a593Smuzhiyun 	if (!zerocopy) {
681*4882a593Smuzhiyun 		copylen = len;
682*4882a593Smuzhiyun 		linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
683*4882a593Smuzhiyun 		if (linear > good_linear)
684*4882a593Smuzhiyun 			linear = good_linear;
685*4882a593Smuzhiyun 		else if (linear < ETH_HLEN)
686*4882a593Smuzhiyun 			linear = ETH_HLEN;
687*4882a593Smuzhiyun 	}
688*4882a593Smuzhiyun 
689*4882a593Smuzhiyun 	skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen,
690*4882a593Smuzhiyun 			    linear, noblock, &err);
691*4882a593Smuzhiyun 	if (!skb)
692*4882a593Smuzhiyun 		goto err;
693*4882a593Smuzhiyun 
694*4882a593Smuzhiyun 	if (zerocopy)
695*4882a593Smuzhiyun 		err = zerocopy_sg_from_iter(skb, from);
696*4882a593Smuzhiyun 	else
697*4882a593Smuzhiyun 		err = skb_copy_datagram_from_iter(skb, 0, from, len);
698*4882a593Smuzhiyun 
699*4882a593Smuzhiyun 	if (err)
700*4882a593Smuzhiyun 		goto err_kfree;
701*4882a593Smuzhiyun 
702*4882a593Smuzhiyun 	skb_set_network_header(skb, ETH_HLEN);
703*4882a593Smuzhiyun 	skb_reset_mac_header(skb);
704*4882a593Smuzhiyun 	skb->protocol = eth_hdr(skb)->h_proto;
705*4882a593Smuzhiyun 
706*4882a593Smuzhiyun 	if (vnet_hdr_len) {
707*4882a593Smuzhiyun 		err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
708*4882a593Smuzhiyun 					    tap_is_little_endian(q));
709*4882a593Smuzhiyun 		if (err)
710*4882a593Smuzhiyun 			goto err_kfree;
711*4882a593Smuzhiyun 	}
712*4882a593Smuzhiyun 
713*4882a593Smuzhiyun 	skb_probe_transport_header(skb);
714*4882a593Smuzhiyun 
715*4882a593Smuzhiyun 	/* Move network header to the right position for VLAN tagged packets */
716*4882a593Smuzhiyun 	if ((skb->protocol == htons(ETH_P_8021Q) ||
717*4882a593Smuzhiyun 	     skb->protocol == htons(ETH_P_8021AD)) &&
718*4882a593Smuzhiyun 	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
719*4882a593Smuzhiyun 		skb_set_network_header(skb, depth);
720*4882a593Smuzhiyun 
721*4882a593Smuzhiyun 	rcu_read_lock();
722*4882a593Smuzhiyun 	tap = rcu_dereference(q->tap);
723*4882a593Smuzhiyun 	/* copy skb_ubuf_info for callback when skb has no error */
724*4882a593Smuzhiyun 	if (zerocopy) {
725*4882a593Smuzhiyun 		skb_shinfo(skb)->destructor_arg = msg_control;
726*4882a593Smuzhiyun 		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
727*4882a593Smuzhiyun 		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
728*4882a593Smuzhiyun 	} else if (msg_control) {
729*4882a593Smuzhiyun 		struct ubuf_info *uarg = msg_control;
730*4882a593Smuzhiyun 		uarg->callback(uarg, false);
731*4882a593Smuzhiyun 	}
732*4882a593Smuzhiyun 
733*4882a593Smuzhiyun 	if (tap) {
734*4882a593Smuzhiyun 		skb->dev = tap->dev;
735*4882a593Smuzhiyun 		dev_queue_xmit(skb);
736*4882a593Smuzhiyun 	} else {
737*4882a593Smuzhiyun 		kfree_skb(skb);
738*4882a593Smuzhiyun 	}
739*4882a593Smuzhiyun 	rcu_read_unlock();
740*4882a593Smuzhiyun 
741*4882a593Smuzhiyun 	return total_len;
742*4882a593Smuzhiyun 
743*4882a593Smuzhiyun err_kfree:
744*4882a593Smuzhiyun 	kfree_skb(skb);
745*4882a593Smuzhiyun 
746*4882a593Smuzhiyun err:
747*4882a593Smuzhiyun 	rcu_read_lock();
748*4882a593Smuzhiyun 	tap = rcu_dereference(q->tap);
749*4882a593Smuzhiyun 	if (tap && tap->count_tx_dropped)
750*4882a593Smuzhiyun 		tap->count_tx_dropped(tap);
751*4882a593Smuzhiyun 	rcu_read_unlock();
752*4882a593Smuzhiyun 
753*4882a593Smuzhiyun 	return err;
754*4882a593Smuzhiyun }
755*4882a593Smuzhiyun 
tap_write_iter(struct kiocb * iocb,struct iov_iter * from)756*4882a593Smuzhiyun static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from)
757*4882a593Smuzhiyun {
758*4882a593Smuzhiyun 	struct file *file = iocb->ki_filp;
759*4882a593Smuzhiyun 	struct tap_queue *q = file->private_data;
760*4882a593Smuzhiyun 
761*4882a593Smuzhiyun 	return tap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
762*4882a593Smuzhiyun }
763*4882a593Smuzhiyun 
764*4882a593Smuzhiyun /* Put packet to the user space buffer */
tap_put_user(struct tap_queue * q,const struct sk_buff * skb,struct iov_iter * iter)765*4882a593Smuzhiyun static ssize_t tap_put_user(struct tap_queue *q,
766*4882a593Smuzhiyun 			    const struct sk_buff *skb,
767*4882a593Smuzhiyun 			    struct iov_iter *iter)
768*4882a593Smuzhiyun {
769*4882a593Smuzhiyun 	int ret;
770*4882a593Smuzhiyun 	int vnet_hdr_len = 0;
771*4882a593Smuzhiyun 	int vlan_offset = 0;
772*4882a593Smuzhiyun 	int total;
773*4882a593Smuzhiyun 
774*4882a593Smuzhiyun 	if (q->flags & IFF_VNET_HDR) {
775*4882a593Smuzhiyun 		int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
776*4882a593Smuzhiyun 		struct virtio_net_hdr vnet_hdr;
777*4882a593Smuzhiyun 
778*4882a593Smuzhiyun 		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
779*4882a593Smuzhiyun 		if (iov_iter_count(iter) < vnet_hdr_len)
780*4882a593Smuzhiyun 			return -EINVAL;
781*4882a593Smuzhiyun 
782*4882a593Smuzhiyun 		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
783*4882a593Smuzhiyun 					    tap_is_little_endian(q), true,
784*4882a593Smuzhiyun 					    vlan_hlen))
785*4882a593Smuzhiyun 			BUG();
786*4882a593Smuzhiyun 
787*4882a593Smuzhiyun 		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
788*4882a593Smuzhiyun 		    sizeof(vnet_hdr))
789*4882a593Smuzhiyun 			return -EFAULT;
790*4882a593Smuzhiyun 
791*4882a593Smuzhiyun 		iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
792*4882a593Smuzhiyun 	}
793*4882a593Smuzhiyun 	total = vnet_hdr_len;
794*4882a593Smuzhiyun 	total += skb->len;
795*4882a593Smuzhiyun 
796*4882a593Smuzhiyun 	if (skb_vlan_tag_present(skb)) {
797*4882a593Smuzhiyun 		struct {
798*4882a593Smuzhiyun 			__be16 h_vlan_proto;
799*4882a593Smuzhiyun 			__be16 h_vlan_TCI;
800*4882a593Smuzhiyun 		} veth;
801*4882a593Smuzhiyun 		veth.h_vlan_proto = skb->vlan_proto;
802*4882a593Smuzhiyun 		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
803*4882a593Smuzhiyun 
804*4882a593Smuzhiyun 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
805*4882a593Smuzhiyun 		total += VLAN_HLEN;
806*4882a593Smuzhiyun 
807*4882a593Smuzhiyun 		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
808*4882a593Smuzhiyun 		if (ret || !iov_iter_count(iter))
809*4882a593Smuzhiyun 			goto done;
810*4882a593Smuzhiyun 
811*4882a593Smuzhiyun 		ret = copy_to_iter(&veth, sizeof(veth), iter);
812*4882a593Smuzhiyun 		if (ret != sizeof(veth) || !iov_iter_count(iter))
813*4882a593Smuzhiyun 			goto done;
814*4882a593Smuzhiyun 	}
815*4882a593Smuzhiyun 
816*4882a593Smuzhiyun 	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
817*4882a593Smuzhiyun 				     skb->len - vlan_offset);
818*4882a593Smuzhiyun 
819*4882a593Smuzhiyun done:
820*4882a593Smuzhiyun 	return ret ? ret : total;
821*4882a593Smuzhiyun }
822*4882a593Smuzhiyun 
tap_do_read(struct tap_queue * q,struct iov_iter * to,int noblock,struct sk_buff * skb)823*4882a593Smuzhiyun static ssize_t tap_do_read(struct tap_queue *q,
824*4882a593Smuzhiyun 			   struct iov_iter *to,
825*4882a593Smuzhiyun 			   int noblock, struct sk_buff *skb)
826*4882a593Smuzhiyun {
827*4882a593Smuzhiyun 	DEFINE_WAIT(wait);
828*4882a593Smuzhiyun 	ssize_t ret = 0;
829*4882a593Smuzhiyun 
830*4882a593Smuzhiyun 	if (!iov_iter_count(to)) {
831*4882a593Smuzhiyun 		kfree_skb(skb);
832*4882a593Smuzhiyun 		return 0;
833*4882a593Smuzhiyun 	}
834*4882a593Smuzhiyun 
835*4882a593Smuzhiyun 	if (skb)
836*4882a593Smuzhiyun 		goto put;
837*4882a593Smuzhiyun 
838*4882a593Smuzhiyun 	while (1) {
839*4882a593Smuzhiyun 		if (!noblock)
840*4882a593Smuzhiyun 			prepare_to_wait(sk_sleep(&q->sk), &wait,
841*4882a593Smuzhiyun 					TASK_INTERRUPTIBLE);
842*4882a593Smuzhiyun 
843*4882a593Smuzhiyun 		/* Read frames from the queue */
844*4882a593Smuzhiyun 		skb = ptr_ring_consume(&q->ring);
845*4882a593Smuzhiyun 		if (skb)
846*4882a593Smuzhiyun 			break;
847*4882a593Smuzhiyun 		if (noblock) {
848*4882a593Smuzhiyun 			ret = -EAGAIN;
849*4882a593Smuzhiyun 			break;
850*4882a593Smuzhiyun 		}
851*4882a593Smuzhiyun 		if (signal_pending(current)) {
852*4882a593Smuzhiyun 			ret = -ERESTARTSYS;
853*4882a593Smuzhiyun 			break;
854*4882a593Smuzhiyun 		}
855*4882a593Smuzhiyun 		/* Nothing to read, let's sleep */
856*4882a593Smuzhiyun 		schedule();
857*4882a593Smuzhiyun 	}
858*4882a593Smuzhiyun 	if (!noblock)
859*4882a593Smuzhiyun 		finish_wait(sk_sleep(&q->sk), &wait);
860*4882a593Smuzhiyun 
861*4882a593Smuzhiyun put:
862*4882a593Smuzhiyun 	if (skb) {
863*4882a593Smuzhiyun 		ret = tap_put_user(q, skb, to);
864*4882a593Smuzhiyun 		if (unlikely(ret < 0))
865*4882a593Smuzhiyun 			kfree_skb(skb);
866*4882a593Smuzhiyun 		else
867*4882a593Smuzhiyun 			consume_skb(skb);
868*4882a593Smuzhiyun 	}
869*4882a593Smuzhiyun 	return ret;
870*4882a593Smuzhiyun }
871*4882a593Smuzhiyun 
tap_read_iter(struct kiocb * iocb,struct iov_iter * to)872*4882a593Smuzhiyun static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
873*4882a593Smuzhiyun {
874*4882a593Smuzhiyun 	struct file *file = iocb->ki_filp;
875*4882a593Smuzhiyun 	struct tap_queue *q = file->private_data;
876*4882a593Smuzhiyun 	ssize_t len = iov_iter_count(to), ret;
877*4882a593Smuzhiyun 
878*4882a593Smuzhiyun 	ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL);
879*4882a593Smuzhiyun 	ret = min_t(ssize_t, ret, len);
880*4882a593Smuzhiyun 	if (ret > 0)
881*4882a593Smuzhiyun 		iocb->ki_pos = ret;
882*4882a593Smuzhiyun 	return ret;
883*4882a593Smuzhiyun }
884*4882a593Smuzhiyun 
tap_get_tap_dev(struct tap_queue * q)885*4882a593Smuzhiyun static struct tap_dev *tap_get_tap_dev(struct tap_queue *q)
886*4882a593Smuzhiyun {
887*4882a593Smuzhiyun 	struct tap_dev *tap;
888*4882a593Smuzhiyun 
889*4882a593Smuzhiyun 	ASSERT_RTNL();
890*4882a593Smuzhiyun 	tap = rtnl_dereference(q->tap);
891*4882a593Smuzhiyun 	if (tap)
892*4882a593Smuzhiyun 		dev_hold(tap->dev);
893*4882a593Smuzhiyun 
894*4882a593Smuzhiyun 	return tap;
895*4882a593Smuzhiyun }
896*4882a593Smuzhiyun 
tap_put_tap_dev(struct tap_dev * tap)897*4882a593Smuzhiyun static void tap_put_tap_dev(struct tap_dev *tap)
898*4882a593Smuzhiyun {
899*4882a593Smuzhiyun 	dev_put(tap->dev);
900*4882a593Smuzhiyun }
901*4882a593Smuzhiyun 
tap_ioctl_set_queue(struct file * file,unsigned int flags)902*4882a593Smuzhiyun static int tap_ioctl_set_queue(struct file *file, unsigned int flags)
903*4882a593Smuzhiyun {
904*4882a593Smuzhiyun 	struct tap_queue *q = file->private_data;
905*4882a593Smuzhiyun 	struct tap_dev *tap;
906*4882a593Smuzhiyun 	int ret;
907*4882a593Smuzhiyun 
908*4882a593Smuzhiyun 	tap = tap_get_tap_dev(q);
909*4882a593Smuzhiyun 	if (!tap)
910*4882a593Smuzhiyun 		return -EINVAL;
911*4882a593Smuzhiyun 
912*4882a593Smuzhiyun 	if (flags & IFF_ATTACH_QUEUE)
913*4882a593Smuzhiyun 		ret = tap_enable_queue(tap, file, q);
914*4882a593Smuzhiyun 	else if (flags & IFF_DETACH_QUEUE)
915*4882a593Smuzhiyun 		ret = tap_disable_queue(q);
916*4882a593Smuzhiyun 	else
917*4882a593Smuzhiyun 		ret = -EINVAL;
918*4882a593Smuzhiyun 
919*4882a593Smuzhiyun 	tap_put_tap_dev(tap);
920*4882a593Smuzhiyun 	return ret;
921*4882a593Smuzhiyun }
922*4882a593Smuzhiyun 
set_offload(struct tap_queue * q,unsigned long arg)923*4882a593Smuzhiyun static int set_offload(struct tap_queue *q, unsigned long arg)
924*4882a593Smuzhiyun {
925*4882a593Smuzhiyun 	struct tap_dev *tap;
926*4882a593Smuzhiyun 	netdev_features_t features;
927*4882a593Smuzhiyun 	netdev_features_t feature_mask = 0;
928*4882a593Smuzhiyun 
929*4882a593Smuzhiyun 	tap = rtnl_dereference(q->tap);
930*4882a593Smuzhiyun 	if (!tap)
931*4882a593Smuzhiyun 		return -ENOLINK;
932*4882a593Smuzhiyun 
933*4882a593Smuzhiyun 	features = tap->dev->features;
934*4882a593Smuzhiyun 
935*4882a593Smuzhiyun 	if (arg & TUN_F_CSUM) {
936*4882a593Smuzhiyun 		feature_mask = NETIF_F_HW_CSUM;
937*4882a593Smuzhiyun 
938*4882a593Smuzhiyun 		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
939*4882a593Smuzhiyun 			if (arg & TUN_F_TSO_ECN)
940*4882a593Smuzhiyun 				feature_mask |= NETIF_F_TSO_ECN;
941*4882a593Smuzhiyun 			if (arg & TUN_F_TSO4)
942*4882a593Smuzhiyun 				feature_mask |= NETIF_F_TSO;
943*4882a593Smuzhiyun 			if (arg & TUN_F_TSO6)
944*4882a593Smuzhiyun 				feature_mask |= NETIF_F_TSO6;
945*4882a593Smuzhiyun 		}
946*4882a593Smuzhiyun 	}
947*4882a593Smuzhiyun 
948*4882a593Smuzhiyun 	/* tun/tap driver inverts the usage for TSO offloads, where
949*4882a593Smuzhiyun 	 * setting the TSO bit means that the userspace wants to
950*4882a593Smuzhiyun 	 * accept TSO frames and turning it off means that user space
951*4882a593Smuzhiyun 	 * does not support TSO.
952*4882a593Smuzhiyun 	 * For tap, we have to invert it to mean the same thing.
953*4882a593Smuzhiyun 	 * When user space turns off TSO, we turn off GSO/LRO so that
954*4882a593Smuzhiyun 	 * user-space will not receive TSO frames.
955*4882a593Smuzhiyun 	 */
956*4882a593Smuzhiyun 	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6))
957*4882a593Smuzhiyun 		features |= RX_OFFLOADS;
958*4882a593Smuzhiyun 	else
959*4882a593Smuzhiyun 		features &= ~RX_OFFLOADS;
960*4882a593Smuzhiyun 
961*4882a593Smuzhiyun 	/* tap_features are the same as features on tun/tap and
962*4882a593Smuzhiyun 	 * reflect user expectations.
963*4882a593Smuzhiyun 	 */
964*4882a593Smuzhiyun 	tap->tap_features = feature_mask;
965*4882a593Smuzhiyun 	if (tap->update_features)
966*4882a593Smuzhiyun 		tap->update_features(tap, features);
967*4882a593Smuzhiyun 
968*4882a593Smuzhiyun 	return 0;
969*4882a593Smuzhiyun }
970*4882a593Smuzhiyun 
971*4882a593Smuzhiyun /*
972*4882a593Smuzhiyun  * provide compatibility with generic tun/tap interface
973*4882a593Smuzhiyun  */
tap_ioctl(struct file * file,unsigned int cmd,unsigned long arg)974*4882a593Smuzhiyun static long tap_ioctl(struct file *file, unsigned int cmd,
975*4882a593Smuzhiyun 		      unsigned long arg)
976*4882a593Smuzhiyun {
977*4882a593Smuzhiyun 	struct tap_queue *q = file->private_data;
978*4882a593Smuzhiyun 	struct tap_dev *tap;
979*4882a593Smuzhiyun 	void __user *argp = (void __user *)arg;
980*4882a593Smuzhiyun 	struct ifreq __user *ifr = argp;
981*4882a593Smuzhiyun 	unsigned int __user *up = argp;
982*4882a593Smuzhiyun 	unsigned short u;
983*4882a593Smuzhiyun 	int __user *sp = argp;
984*4882a593Smuzhiyun 	struct sockaddr sa;
985*4882a593Smuzhiyun 	int s;
986*4882a593Smuzhiyun 	int ret;
987*4882a593Smuzhiyun 
988*4882a593Smuzhiyun 	switch (cmd) {
989*4882a593Smuzhiyun 	case TUNSETIFF:
990*4882a593Smuzhiyun 		/* ignore the name, just look at flags */
991*4882a593Smuzhiyun 		if (get_user(u, &ifr->ifr_flags))
992*4882a593Smuzhiyun 			return -EFAULT;
993*4882a593Smuzhiyun 
994*4882a593Smuzhiyun 		ret = 0;
995*4882a593Smuzhiyun 		if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP))
996*4882a593Smuzhiyun 			ret = -EINVAL;
997*4882a593Smuzhiyun 		else
998*4882a593Smuzhiyun 			q->flags = (q->flags & ~TAP_IFFEATURES) | u;
999*4882a593Smuzhiyun 
1000*4882a593Smuzhiyun 		return ret;
1001*4882a593Smuzhiyun 
1002*4882a593Smuzhiyun 	case TUNGETIFF:
1003*4882a593Smuzhiyun 		rtnl_lock();
1004*4882a593Smuzhiyun 		tap = tap_get_tap_dev(q);
1005*4882a593Smuzhiyun 		if (!tap) {
1006*4882a593Smuzhiyun 			rtnl_unlock();
1007*4882a593Smuzhiyun 			return -ENOLINK;
1008*4882a593Smuzhiyun 		}
1009*4882a593Smuzhiyun 
1010*4882a593Smuzhiyun 		ret = 0;
1011*4882a593Smuzhiyun 		u = q->flags;
1012*4882a593Smuzhiyun 		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
1013*4882a593Smuzhiyun 		    put_user(u, &ifr->ifr_flags))
1014*4882a593Smuzhiyun 			ret = -EFAULT;
1015*4882a593Smuzhiyun 		tap_put_tap_dev(tap);
1016*4882a593Smuzhiyun 		rtnl_unlock();
1017*4882a593Smuzhiyun 		return ret;
1018*4882a593Smuzhiyun 
1019*4882a593Smuzhiyun 	case TUNSETQUEUE:
1020*4882a593Smuzhiyun 		if (get_user(u, &ifr->ifr_flags))
1021*4882a593Smuzhiyun 			return -EFAULT;
1022*4882a593Smuzhiyun 		rtnl_lock();
1023*4882a593Smuzhiyun 		ret = tap_ioctl_set_queue(file, u);
1024*4882a593Smuzhiyun 		rtnl_unlock();
1025*4882a593Smuzhiyun 		return ret;
1026*4882a593Smuzhiyun 
1027*4882a593Smuzhiyun 	case TUNGETFEATURES:
1028*4882a593Smuzhiyun 		if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up))
1029*4882a593Smuzhiyun 			return -EFAULT;
1030*4882a593Smuzhiyun 		return 0;
1031*4882a593Smuzhiyun 
1032*4882a593Smuzhiyun 	case TUNSETSNDBUF:
1033*4882a593Smuzhiyun 		if (get_user(s, sp))
1034*4882a593Smuzhiyun 			return -EFAULT;
1035*4882a593Smuzhiyun 		if (s <= 0)
1036*4882a593Smuzhiyun 			return -EINVAL;
1037*4882a593Smuzhiyun 
1038*4882a593Smuzhiyun 		q->sk.sk_sndbuf = s;
1039*4882a593Smuzhiyun 		return 0;
1040*4882a593Smuzhiyun 
1041*4882a593Smuzhiyun 	case TUNGETVNETHDRSZ:
1042*4882a593Smuzhiyun 		s = q->vnet_hdr_sz;
1043*4882a593Smuzhiyun 		if (put_user(s, sp))
1044*4882a593Smuzhiyun 			return -EFAULT;
1045*4882a593Smuzhiyun 		return 0;
1046*4882a593Smuzhiyun 
1047*4882a593Smuzhiyun 	case TUNSETVNETHDRSZ:
1048*4882a593Smuzhiyun 		if (get_user(s, sp))
1049*4882a593Smuzhiyun 			return -EFAULT;
1050*4882a593Smuzhiyun 		if (s < (int)sizeof(struct virtio_net_hdr))
1051*4882a593Smuzhiyun 			return -EINVAL;
1052*4882a593Smuzhiyun 
1053*4882a593Smuzhiyun 		q->vnet_hdr_sz = s;
1054*4882a593Smuzhiyun 		return 0;
1055*4882a593Smuzhiyun 
1056*4882a593Smuzhiyun 	case TUNGETVNETLE:
1057*4882a593Smuzhiyun 		s = !!(q->flags & TAP_VNET_LE);
1058*4882a593Smuzhiyun 		if (put_user(s, sp))
1059*4882a593Smuzhiyun 			return -EFAULT;
1060*4882a593Smuzhiyun 		return 0;
1061*4882a593Smuzhiyun 
1062*4882a593Smuzhiyun 	case TUNSETVNETLE:
1063*4882a593Smuzhiyun 		if (get_user(s, sp))
1064*4882a593Smuzhiyun 			return -EFAULT;
1065*4882a593Smuzhiyun 		if (s)
1066*4882a593Smuzhiyun 			q->flags |= TAP_VNET_LE;
1067*4882a593Smuzhiyun 		else
1068*4882a593Smuzhiyun 			q->flags &= ~TAP_VNET_LE;
1069*4882a593Smuzhiyun 		return 0;
1070*4882a593Smuzhiyun 
1071*4882a593Smuzhiyun 	case TUNGETVNETBE:
1072*4882a593Smuzhiyun 		return tap_get_vnet_be(q, sp);
1073*4882a593Smuzhiyun 
1074*4882a593Smuzhiyun 	case TUNSETVNETBE:
1075*4882a593Smuzhiyun 		return tap_set_vnet_be(q, sp);
1076*4882a593Smuzhiyun 
1077*4882a593Smuzhiyun 	case TUNSETOFFLOAD:
1078*4882a593Smuzhiyun 		/* let the user check for future flags */
1079*4882a593Smuzhiyun 		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
1080*4882a593Smuzhiyun 			    TUN_F_TSO_ECN | TUN_F_UFO))
1081*4882a593Smuzhiyun 			return -EINVAL;
1082*4882a593Smuzhiyun 
1083*4882a593Smuzhiyun 		rtnl_lock();
1084*4882a593Smuzhiyun 		ret = set_offload(q, arg);
1085*4882a593Smuzhiyun 		rtnl_unlock();
1086*4882a593Smuzhiyun 		return ret;
1087*4882a593Smuzhiyun 
1088*4882a593Smuzhiyun 	case SIOCGIFHWADDR:
1089*4882a593Smuzhiyun 		rtnl_lock();
1090*4882a593Smuzhiyun 		tap = tap_get_tap_dev(q);
1091*4882a593Smuzhiyun 		if (!tap) {
1092*4882a593Smuzhiyun 			rtnl_unlock();
1093*4882a593Smuzhiyun 			return -ENOLINK;
1094*4882a593Smuzhiyun 		}
1095*4882a593Smuzhiyun 		ret = 0;
1096*4882a593Smuzhiyun 		dev_get_mac_address(&sa, dev_net(tap->dev), tap->dev->name);
1097*4882a593Smuzhiyun 		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
1098*4882a593Smuzhiyun 		    copy_to_user(&ifr->ifr_hwaddr, &sa, sizeof(sa)))
1099*4882a593Smuzhiyun 			ret = -EFAULT;
1100*4882a593Smuzhiyun 		tap_put_tap_dev(tap);
1101*4882a593Smuzhiyun 		rtnl_unlock();
1102*4882a593Smuzhiyun 		return ret;
1103*4882a593Smuzhiyun 
1104*4882a593Smuzhiyun 	case SIOCSIFHWADDR:
1105*4882a593Smuzhiyun 		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
1106*4882a593Smuzhiyun 			return -EFAULT;
1107*4882a593Smuzhiyun 		rtnl_lock();
1108*4882a593Smuzhiyun 		tap = tap_get_tap_dev(q);
1109*4882a593Smuzhiyun 		if (!tap) {
1110*4882a593Smuzhiyun 			rtnl_unlock();
1111*4882a593Smuzhiyun 			return -ENOLINK;
1112*4882a593Smuzhiyun 		}
1113*4882a593Smuzhiyun 		ret = dev_set_mac_address_user(tap->dev, &sa, NULL);
1114*4882a593Smuzhiyun 		tap_put_tap_dev(tap);
1115*4882a593Smuzhiyun 		rtnl_unlock();
1116*4882a593Smuzhiyun 		return ret;
1117*4882a593Smuzhiyun 
1118*4882a593Smuzhiyun 	default:
1119*4882a593Smuzhiyun 		return -EINVAL;
1120*4882a593Smuzhiyun 	}
1121*4882a593Smuzhiyun }
1122*4882a593Smuzhiyun 
1123*4882a593Smuzhiyun static const struct file_operations tap_fops = {
1124*4882a593Smuzhiyun 	.owner		= THIS_MODULE,
1125*4882a593Smuzhiyun 	.open		= tap_open,
1126*4882a593Smuzhiyun 	.release	= tap_release,
1127*4882a593Smuzhiyun 	.read_iter	= tap_read_iter,
1128*4882a593Smuzhiyun 	.write_iter	= tap_write_iter,
1129*4882a593Smuzhiyun 	.poll		= tap_poll,
1130*4882a593Smuzhiyun 	.llseek		= no_llseek,
1131*4882a593Smuzhiyun 	.unlocked_ioctl	= tap_ioctl,
1132*4882a593Smuzhiyun 	.compat_ioctl	= compat_ptr_ioctl,
1133*4882a593Smuzhiyun };
1134*4882a593Smuzhiyun 
tap_get_user_xdp(struct tap_queue * q,struct xdp_buff * xdp)1135*4882a593Smuzhiyun static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
1136*4882a593Smuzhiyun {
1137*4882a593Smuzhiyun 	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
1138*4882a593Smuzhiyun 	struct virtio_net_hdr *gso = &hdr->gso;
1139*4882a593Smuzhiyun 	int buflen = hdr->buflen;
1140*4882a593Smuzhiyun 	int vnet_hdr_len = 0;
1141*4882a593Smuzhiyun 	struct tap_dev *tap;
1142*4882a593Smuzhiyun 	struct sk_buff *skb;
1143*4882a593Smuzhiyun 	int err, depth;
1144*4882a593Smuzhiyun 
1145*4882a593Smuzhiyun 	if (q->flags & IFF_VNET_HDR)
1146*4882a593Smuzhiyun 		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
1147*4882a593Smuzhiyun 
1148*4882a593Smuzhiyun 	skb = build_skb(xdp->data_hard_start, buflen);
1149*4882a593Smuzhiyun 	if (!skb) {
1150*4882a593Smuzhiyun 		err = -ENOMEM;
1151*4882a593Smuzhiyun 		goto err;
1152*4882a593Smuzhiyun 	}
1153*4882a593Smuzhiyun 
1154*4882a593Smuzhiyun 	skb_reserve(skb, xdp->data - xdp->data_hard_start);
1155*4882a593Smuzhiyun 	skb_put(skb, xdp->data_end - xdp->data);
1156*4882a593Smuzhiyun 
1157*4882a593Smuzhiyun 	skb_set_network_header(skb, ETH_HLEN);
1158*4882a593Smuzhiyun 	skb_reset_mac_header(skb);
1159*4882a593Smuzhiyun 	skb->protocol = eth_hdr(skb)->h_proto;
1160*4882a593Smuzhiyun 
1161*4882a593Smuzhiyun 	if (vnet_hdr_len) {
1162*4882a593Smuzhiyun 		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
1163*4882a593Smuzhiyun 		if (err)
1164*4882a593Smuzhiyun 			goto err_kfree;
1165*4882a593Smuzhiyun 	}
1166*4882a593Smuzhiyun 
1167*4882a593Smuzhiyun 	/* Move network header to the right position for VLAN tagged packets */
1168*4882a593Smuzhiyun 	if ((skb->protocol == htons(ETH_P_8021Q) ||
1169*4882a593Smuzhiyun 	     skb->protocol == htons(ETH_P_8021AD)) &&
1170*4882a593Smuzhiyun 	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
1171*4882a593Smuzhiyun 		skb_set_network_header(skb, depth);
1172*4882a593Smuzhiyun 
1173*4882a593Smuzhiyun 	rcu_read_lock();
1174*4882a593Smuzhiyun 	tap = rcu_dereference(q->tap);
1175*4882a593Smuzhiyun 	if (tap) {
1176*4882a593Smuzhiyun 		skb->dev = tap->dev;
1177*4882a593Smuzhiyun 		skb_probe_transport_header(skb);
1178*4882a593Smuzhiyun 		dev_queue_xmit(skb);
1179*4882a593Smuzhiyun 	} else {
1180*4882a593Smuzhiyun 		kfree_skb(skb);
1181*4882a593Smuzhiyun 	}
1182*4882a593Smuzhiyun 	rcu_read_unlock();
1183*4882a593Smuzhiyun 
1184*4882a593Smuzhiyun 	return 0;
1185*4882a593Smuzhiyun 
1186*4882a593Smuzhiyun err_kfree:
1187*4882a593Smuzhiyun 	kfree_skb(skb);
1188*4882a593Smuzhiyun err:
1189*4882a593Smuzhiyun 	rcu_read_lock();
1190*4882a593Smuzhiyun 	tap = rcu_dereference(q->tap);
1191*4882a593Smuzhiyun 	if (tap && tap->count_tx_dropped)
1192*4882a593Smuzhiyun 		tap->count_tx_dropped(tap);
1193*4882a593Smuzhiyun 	rcu_read_unlock();
1194*4882a593Smuzhiyun 	return err;
1195*4882a593Smuzhiyun }
1196*4882a593Smuzhiyun 
/*
 * sendmsg socket operation.
 *
 * When the control data is a struct tun_msg_ctl of type TUN_MSG_PTR, its
 * ptr is treated as an array of ctl->num xdp_buff entries and each one is
 * passed to tap_get_user_xdp(); per-buffer errors are not reported and the
 * call returns 0.  Otherwise the message iterator is passed to
 * tap_get_user(), with ctl->ptr (if any) as its control argument.
 */
static int tap_sendmsg(struct socket *sock, struct msghdr *m,
		       size_t total_len)
{
	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
	struct tun_msg_ctl *ctl = m->msg_control;
	struct xdp_buff *batch;
	int idx;

	if (m->msg_controllen != sizeof(struct tun_msg_ctl) ||
	    !ctl || ctl->type != TUN_MSG_PTR)
		return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
				    m->msg_flags & MSG_DONTWAIT);

	batch = ctl->ptr;
	for (idx = 0; idx < ctl->num; idx++)
		tap_get_user_xdp(q, &batch[idx]);

	return 0;
}
1217*4882a593Smuzhiyun 
/*
 * recvmsg socket operation.
 *
 * m->msg_control may carry an skb to deliver instead of consuming one from
 * the ring; it is freed here if @flags is rejected and handed to
 * tap_do_read() otherwise.  Only MSG_DONTWAIT and MSG_TRUNC are accepted.
 *
 * Returns the number of bytes delivered, or a negative errno.  When the
 * packet is longer than @total_len, MSG_TRUNC is set in m->msg_flags and
 * the return value is the full packet length if the caller passed
 * MSG_TRUNC, else @total_len.
 */
static int tap_recvmsg(struct socket *sock, struct msghdr *m,
		       size_t total_len, int flags)
{
	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
	struct sk_buff *skb = m->msg_control;
	int ret;

	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
		kfree_skb(skb);
		return -EINVAL;
	}
	ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb);
	/*
	 * Compare as signed: without the cast a negative errno is promoted
	 * to a huge size_t, looks like a truncated packet, and is either
	 * tagged MSG_TRUNC or replaced by total_len.
	 */
	if (ret > (ssize_t)total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}
1235*4882a593Smuzhiyun 
tap_peek_len(struct socket * sock)1236*4882a593Smuzhiyun static int tap_peek_len(struct socket *sock)
1237*4882a593Smuzhiyun {
1238*4882a593Smuzhiyun 	struct tap_queue *q = container_of(sock, struct tap_queue,
1239*4882a593Smuzhiyun 					       sock);
1240*4882a593Smuzhiyun 	return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag);
1241*4882a593Smuzhiyun }
1242*4882a593Smuzhiyun 
1243*4882a593Smuzhiyun /* Ops structure to mimic raw sockets with tun */
1244*4882a593Smuzhiyun static const struct proto_ops tap_socket_ops = {
1245*4882a593Smuzhiyun 	.sendmsg = tap_sendmsg,
1246*4882a593Smuzhiyun 	.recvmsg = tap_recvmsg,
1247*4882a593Smuzhiyun 	.peek_len = tap_peek_len,
1248*4882a593Smuzhiyun };
1249*4882a593Smuzhiyun 
1250*4882a593Smuzhiyun /* Get an underlying socket object from tun file.  Returns error unless file is
1251*4882a593Smuzhiyun  * attached to a device.  The returned object works like a packet socket, it
1252*4882a593Smuzhiyun  * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
1253*4882a593Smuzhiyun  * holding a reference to the file for as long as the socket is in use. */
tap_get_socket(struct file * file)1254*4882a593Smuzhiyun struct socket *tap_get_socket(struct file *file)
1255*4882a593Smuzhiyun {
1256*4882a593Smuzhiyun 	struct tap_queue *q;
1257*4882a593Smuzhiyun 	if (file->f_op != &tap_fops)
1258*4882a593Smuzhiyun 		return ERR_PTR(-EINVAL);
1259*4882a593Smuzhiyun 	q = file->private_data;
1260*4882a593Smuzhiyun 	if (!q)
1261*4882a593Smuzhiyun 		return ERR_PTR(-EBADFD);
1262*4882a593Smuzhiyun 	return &q->sock;
1263*4882a593Smuzhiyun }
1264*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tap_get_socket);
1265*4882a593Smuzhiyun 
tap_get_ptr_ring(struct file * file)1266*4882a593Smuzhiyun struct ptr_ring *tap_get_ptr_ring(struct file *file)
1267*4882a593Smuzhiyun {
1268*4882a593Smuzhiyun 	struct tap_queue *q;
1269*4882a593Smuzhiyun 
1270*4882a593Smuzhiyun 	if (file->f_op != &tap_fops)
1271*4882a593Smuzhiyun 		return ERR_PTR(-EINVAL);
1272*4882a593Smuzhiyun 	q = file->private_data;
1273*4882a593Smuzhiyun 	if (!q)
1274*4882a593Smuzhiyun 		return ERR_PTR(-EBADFD);
1275*4882a593Smuzhiyun 	return &q->ring;
1276*4882a593Smuzhiyun }
1277*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
1278*4882a593Smuzhiyun 
tap_queue_resize(struct tap_dev * tap)1279*4882a593Smuzhiyun int tap_queue_resize(struct tap_dev *tap)
1280*4882a593Smuzhiyun {
1281*4882a593Smuzhiyun 	struct net_device *dev = tap->dev;
1282*4882a593Smuzhiyun 	struct tap_queue *q;
1283*4882a593Smuzhiyun 	struct ptr_ring **rings;
1284*4882a593Smuzhiyun 	int n = tap->numqueues;
1285*4882a593Smuzhiyun 	int ret, i = 0;
1286*4882a593Smuzhiyun 
1287*4882a593Smuzhiyun 	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
1288*4882a593Smuzhiyun 	if (!rings)
1289*4882a593Smuzhiyun 		return -ENOMEM;
1290*4882a593Smuzhiyun 
1291*4882a593Smuzhiyun 	list_for_each_entry(q, &tap->queue_list, next)
1292*4882a593Smuzhiyun 		rings[i++] = &q->ring;
1293*4882a593Smuzhiyun 
1294*4882a593Smuzhiyun 	ret = ptr_ring_resize_multiple(rings, n,
1295*4882a593Smuzhiyun 				       dev->tx_queue_len, GFP_KERNEL,
1296*4882a593Smuzhiyun 				       __skb_array_destroy_skb);
1297*4882a593Smuzhiyun 
1298*4882a593Smuzhiyun 	kfree(rings);
1299*4882a593Smuzhiyun 	return ret;
1300*4882a593Smuzhiyun }
1301*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(tap_queue_resize);
1302*4882a593Smuzhiyun 
/*
 * Allocate a major_info record for @major, initialise its minor IDR and
 * lock, and append it to major_list.  @device_name is stored by pointer,
 * not copied.  Returns 0 or -ENOMEM.
 */
static int tap_list_add(dev_t major, const char *device_name)
{
	struct major_info *info = kzalloc(sizeof(*info), GFP_ATOMIC);

	if (!info)
		return -ENOMEM;

	info->major = MAJOR(major);
	info->device_name = device_name;

	idr_init(&info->minor_idr);
	spin_lock_init(&info->minor_lock);

	/* Publish only once the record is fully set up. */
	list_add_tail_rcu(&info->next, &major_list);

	return 0;
}
1321*4882a593Smuzhiyun 
/*
 * Set up the character device side of a tap driver: allocate a region of
 * TAP_NUM_DEVS minors (the device number is stored in *@tap_major),
 * register @tap_cdev with tap_fops and @module as owner, and record the
 * major in major_list.  Each step is undone if a later one fails.
 * Returns 0 or the error of the failing step.
 */
int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major,
		    const char *device_name, struct module *module)
{
	int rc;

	rc = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name);
	if (rc)
		return rc;

	cdev_init(tap_cdev, &tap_fops);
	tap_cdev->owner = module;

	rc = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS);
	if (rc)
		goto release_region;

	rc = tap_list_add(*tap_major, device_name);
	if (rc)
		goto remove_cdev;

	return 0;

remove_cdev:
	cdev_del(tap_cdev);
release_region:
	unregister_chrdev_region(*tap_major, TAP_NUM_DEVS);
	return rc;
}
EXPORT_SYMBOL_GPL(tap_create_cdev);
1351*4882a593Smuzhiyun 
/*
 * Undo tap_create_cdev(): remove the cdev, release the minor region, and
 * drop every major_list entry whose major matches @major.  Entries are
 * unlinked with list_del_rcu() and freed with kfree_rcu().
 */
void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
{
	struct major_info *entry, *next_entry;

	cdev_del(tap_cdev);
	unregister_chrdev_region(major, TAP_NUM_DEVS);

	list_for_each_entry_safe(entry, next_entry, &major_list, next) {
		if (entry->major != MAJOR(major))
			continue;

		idr_destroy(&entry->minor_idr);
		list_del_rcu(&entry->next);
		kfree_rcu(entry, rcu);
	}
}
EXPORT_SYMBOL_GPL(tap_destroy_cdev);
1367*4882a593Smuzhiyun 
1368*4882a593Smuzhiyun MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
1369*4882a593Smuzhiyun MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>");
1370*4882a593Smuzhiyun MODULE_LICENSE("GPL");
1371