// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

/*
 * AF_XDP user-space access library.
 *
 * Copyright(c) 2018 - 2019 Intel Corporation.
 *
 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
 */

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <asm/barrier.h>
#include <linux/compiler.h>
#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/if_xdp.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/sockios.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>

#include "bpf.h"
#include "libbpf.h"
#include "libbpf_internal.h"
#include "xsk.h"

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

#ifndef AF_XDP
#define AF_XDP 44
#endif

#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif

struct xsk_umem {
	struct xsk_ring_prod *fill_save;
	struct xsk_ring_cons *comp_save;
	char *umem_area;
	struct xsk_umem_config config;
	int fd;
	int refcount;
	struct list_head ctx_list;
	bool rx_ring_setup_done;
	bool tx_ring_setup_done;
};

struct xsk_ctx {
	struct xsk_ring_prod *fill;
	struct xsk_ring_cons *comp;
	__u32 queue_id;
	struct xsk_umem *umem;
	int refcount;
	int ifindex;
	struct list_head list;
	int prog_fd;
	int xsks_map_fd;
	char ifname[IFNAMSIZ];
};

struct xsk_socket {
	struct xsk_ring_cons *rx;
	struct xsk_ring_prod *tx;
	__u64 outstanding_tx;
	struct xsk_ctx *ctx;
	struct xsk_socket_config config;
	int fd;
};

struct xsk_nl_info {
	bool xdp_prog_attached;
	int ifindex;
	int fd;
};

/* Up until and including Linux 5.3 */
struct xdp_ring_offset_v1 {
	__u64 producer;
	__u64 consumer;
	__u64 desc;
};

/* Up until and including Linux 5.3 */
struct xdp_mmap_offsets_v1 {
	struct xdp_ring_offset_v1 rx;
	struct xdp_ring_offset_v1 tx;
	struct xdp_ring_offset_v1 fr;
	struct xdp_ring_offset_v1 cr;
};

int xsk_umem__fd(const struct xsk_umem *umem)
{
	return umem ? umem->fd : -EINVAL;
}

int xsk_socket__fd(const struct xsk_socket *xsk)
{
	return xsk ? xsk->fd : -EINVAL;
}

static bool xsk_page_aligned(void *buffer)
{
	unsigned long addr = (unsigned long)buffer;

	return !(addr & (getpagesize() - 1));
}
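
/* Callers typically satisfy the alignment check above by allocating the
 * umem area with mmap() or posix_memalign(). A minimal sketch (the names
 * "bufs" and NUM_FRAMES are illustrative, not part of this library):
 *
 *	void *bufs;
 *	size_t size = NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *
 *	if (posix_memalign(&bufs, getpagesize(), size))
 *		exit(EXIT_FAILURE);
 */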

static void xsk_set_umem_config(struct xsk_umem_config *cfg,
				const struct xsk_umem_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
		return;
	}

	cfg->fill_size = usr_cfg->fill_size;
	cfg->comp_size = usr_cfg->comp_size;
	cfg->frame_size = usr_cfg->frame_size;
	cfg->frame_headroom = usr_cfg->frame_headroom;
	cfg->flags = usr_cfg->flags;
}

static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
				     const struct xsk_socket_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->libbpf_flags = 0;
		cfg->xdp_flags = 0;
		cfg->bind_flags = 0;
		return 0;
	}

	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
		return -EINVAL;

	cfg->rx_size = usr_cfg->rx_size;
	cfg->tx_size = usr_cfg->tx_size;
	cfg->libbpf_flags = usr_cfg->libbpf_flags;
	cfg->xdp_flags = usr_cfg->xdp_flags;
	cfg->bind_flags = usr_cfg->bind_flags;

	return 0;
}
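
/* Since every field of the user config is copied, callers should
 * zero-initialize it and set only what they need, e.g. (illustrative
 * sketch; XDP_FLAGS_DRV_MODE is from linux/if_link.h):
 *
 *	struct xsk_socket_config cfg = {
 *		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 *		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 *		.xdp_flags = XDP_FLAGS_DRV_MODE,
 *		.bind_flags = XDP_USE_NEED_WAKEUP,
 *	};
 */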

static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
{
	struct xdp_mmap_offsets_v1 off_v1;

	/* getsockopt on a kernel <= 5.3 has no flags fields.
	 * Copy over the offsets to the correct places in the >=5.4 format
	 * and put the flags where they would have been on that kernel.
	 */
	memcpy(&off_v1, off, sizeof(off_v1));

	off->rx.producer = off_v1.rx.producer;
	off->rx.consumer = off_v1.rx.consumer;
	off->rx.desc = off_v1.rx.desc;
	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);

	off->tx.producer = off_v1.tx.producer;
	off->tx.consumer = off_v1.tx.consumer;
	off->tx.desc = off_v1.tx.desc;
	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);

	off->fr.producer = off_v1.fr.producer;
	off->fr.consumer = off_v1.fr.consumer;
	off->fr.desc = off_v1.fr.desc;
	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);

	off->cr.producer = off_v1.cr.producer;
	off->cr.consumer = off_v1.cr.consumer;
	off->cr.desc = off_v1.cr.desc;
	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
}

static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
{
	socklen_t optlen;
	int err;

	optlen = sizeof(*off);
	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
	if (err)
		return err;

	if (optlen == sizeof(*off))
		return 0;

	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
		xsk_mmap_offsets_v1(off);
		return 0;
	}

	return -EINVAL;
}

static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
				 struct xsk_ring_prod *fill,
				 struct xsk_ring_cons *comp)
{
	struct xdp_mmap_offsets off;
	void *map;
	int err;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
			 &umem->config.fill_size,
			 sizeof(umem->config.fill_size));
	if (err)
		return -errno;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
			 &umem->config.comp_size,
			 sizeof(umem->config.comp_size));
	if (err)
		return -errno;

	err = xsk_get_mmap_offsets(fd, &off);
	if (err)
		return -errno;

	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_FILL_RING);
	if (map == MAP_FAILED)
		return -errno;

	fill->mask = umem->config.fill_size - 1;
	fill->size = umem->config.fill_size;
	fill->producer = map + off.fr.producer;
	fill->consumer = map + off.fr.consumer;
	fill->flags = map + off.fr.flags;
	fill->ring = map + off.fr.desc;
	fill->cached_cons = umem->config.fill_size;

	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_COMPLETION_RING);
	if (map == MAP_FAILED) {
		err = -errno;
		goto out_mmap;
	}

	comp->mask = umem->config.comp_size - 1;
	comp->size = umem->config.comp_size;
	comp->producer = map + off.cr.producer;
	comp->consumer = map + off.cr.consumer;
	comp->flags = map + off.cr.flags;
	comp->ring = map + off.cr.desc;

	return 0;

out_mmap:
	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
	return err;
}

int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area,
			    __u64 size, struct xsk_ring_prod *fill,
			    struct xsk_ring_cons *comp,
			    const struct xsk_umem_config *usr_config)
{
	struct xdp_umem_reg mr;
	struct xsk_umem *umem;
	int err;

	if (!umem_area || !umem_ptr || !fill || !comp)
		return -EFAULT;
	if (!size && !xsk_page_aligned(umem_area))
		return -EINVAL;

	umem = calloc(1, sizeof(*umem));
	if (!umem)
		return -ENOMEM;

	umem->fd = socket(AF_XDP, SOCK_RAW, 0);
	if (umem->fd < 0) {
		err = -errno;
		goto out_umem_alloc;
	}

	umem->umem_area = umem_area;
	INIT_LIST_HEAD(&umem->ctx_list);
	xsk_set_umem_config(&umem->config, usr_config);

	memset(&mr, 0, sizeof(mr));
	mr.addr = (uintptr_t)umem_area;
	mr.len = size;
	mr.chunk_size = umem->config.frame_size;
	mr.headroom = umem->config.frame_headroom;
	mr.flags = umem->config.flags;

	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
	if (err) {
		err = -errno;
		goto out_socket;
	}

	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
	if (err)
		goto out_socket;

	umem->fill_save = fill;
	umem->comp_save = comp;
	*umem_ptr = umem;
	return 0;

out_socket:
	close(umem->fd);
out_umem_alloc:
	free(umem);
	return err;
}
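
/* Typical call sequence (illustrative sketch, reusing the page-aligned
 * "bufs"/"size" allocation shown earlier; a NULL config selects the
 * defaults from xsk_set_umem_config()):
 *
 *	struct xsk_ring_prod fq;
 *	struct xsk_ring_cons cq;
 *	struct xsk_umem *umem;
 *	int err;
 *
 *	err = xsk_umem__create(&umem, bufs, size, &fq, &cq, NULL);
 *	if (err)
 *		exit(EXIT_FAILURE);
 */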

struct xsk_umem_config_v1 {
	__u32 fill_size;
	__u32 comp_size;
	__u32 frame_size;
	__u32 frame_headroom;
};

int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
			    __u64 size, struct xsk_ring_prod *fill,
			    struct xsk_ring_cons *comp,
			    const struct xsk_umem_config *usr_config)
{
	struct xsk_umem_config config;

	memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1));
	config.flags = 0;

	return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp,
				       &config);
}
COMPAT_VERSION(xsk_umem__create_v0_0_2, xsk_umem__create, LIBBPF_0.0.2)
DEFAULT_VERSION(xsk_umem__create_v0_0_4, xsk_umem__create, LIBBPF_0.0.4)
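
/* COMPAT_VERSION and DEFAULT_VERSION (from libbpf_internal.h) expand to
 * ELF .symver directives, so binaries linked against libbpf 0.0.2 keep
 * resolving xsk_umem__create@LIBBPF_0.0.2 (no flags field in the config),
 * while new links get xsk_umem__create@@LIBBPF_0.0.4 by default.
 */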

static int xsk_load_xdp_prog(struct xsk_socket *xsk)
{
	static const int log_buf_size = 16 * 1024;
	struct xsk_ctx *ctx = xsk->ctx;
	char log_buf[log_buf_size];
	int err, prog_fd;

	/* This is the C-program:
	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
	 * {
	 *     int ret, index = ctx->rx_queue_index;
	 *
	 *     // A set entry here means that the corresponding queue_id
	 *     // has an active AF_XDP socket bound to it.
	 *     ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
	 *     if (ret > 0)
	 *         return ret;
	 *
	 *     // Fallback for pre-5.3 kernels, not supporting default
	 *     // action in the flags parameter.
	 *     if (bpf_map_lookup_elem(&xsks_map, &index))
	 *         return bpf_redirect_map(&xsks_map, index, 0);
	 *     return XDP_PASS;
	 * }
	 */
	struct bpf_insn prog[] = {
		/* r2 = *(u32 *)(r1 + 16) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
		/* *(u32 *)(r10 - 4) = r2 */
		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_3, 2),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* if w0 > 0 goto pc+13 */
		BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
		/* r2 = r10 */
		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
		/* r2 += -4 */
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* call bpf_map_lookup_elem */
		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
		/* r1 = r0 */
		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
		/* r0 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_0, 2),
		/* if r1 == 0 goto pc+5 */
		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
		/* r2 = *(u32 *)(r10 - 4) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = 0 */
		BPF_MOV64_IMM(BPF_REG_3, 0),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* The jumps are to this instruction */
		BPF_EXIT_INSN(),
	};
	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);

	prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt,
				   "LGPL-2.1 or BSD-2-Clause", 0, log_buf,
				   log_buf_size);
	if (prog_fd < 0) {
		pr_warn("BPF log buffer:\n%s", log_buf);
		return prog_fd;
	}

	err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd,
				  xsk->config.xdp_flags);
	if (err) {
		close(prog_fd);
		return err;
	}

	ctx->prog_fd = prog_fd;
	return 0;
}

static int xsk_get_max_queues(struct xsk_socket *xsk)
{
	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
	struct xsk_ctx *ctx = xsk->ctx;
	struct ifreq ifr = {};
	int fd, err, ret;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -errno;

	ifr.ifr_data = (void *)&channels;
	memcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ - 1);
	ifr.ifr_name[IFNAMSIZ - 1] = '\0';
	err = ioctl(fd, SIOCETHTOOL, &ifr);
	if (err && errno != EOPNOTSUPP) {
		ret = -errno;
		goto out;
	}

	if (err) {
		/* If the device says it has no channels, then all traffic
		 * is sent to a single stream, so max queues = 1.
		 */
		ret = 1;
	} else {
		/* Take the max of rx, tx, combined. Drivers return
		 * the number of channels in different ways.
		 */
		ret = max(channels.max_rx, channels.max_tx);
		ret = max(ret, (int)channels.max_combined);
	}

out:
	close(fd);
	return ret;
}

static int xsk_create_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;
	int max_queues;
	int fd;

	max_queues = xsk_get_max_queues(xsk);
	if (max_queues < 0)
		return max_queues;

	fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map",
				 sizeof(int), sizeof(int), max_queues, 0);
	if (fd < 0)
		return fd;

	ctx->xsks_map_fd = fd;

	return 0;
}

static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;

	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
	close(ctx->xsks_map_fd);
}

static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
{
	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
	__u32 map_len = sizeof(struct bpf_map_info);
	struct bpf_prog_info prog_info = {};
	struct xsk_ctx *ctx = xsk->ctx;
	struct bpf_map_info map_info;
	int fd, err;

	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
	if (err)
		return err;

	num_maps = prog_info.nr_map_ids;

	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
	if (!map_ids)
		return -ENOMEM;

	memset(&prog_info, 0, prog_len);
	prog_info.nr_map_ids = num_maps;
	prog_info.map_ids = (__u64)(unsigned long)map_ids;

	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
	if (err)
		goto out_map_ids;

	ctx->xsks_map_fd = -1;

	for (i = 0; i < prog_info.nr_map_ids; i++) {
		fd = bpf_map_get_fd_by_id(map_ids[i]);
		if (fd < 0)
			continue;

		memset(&map_info, 0, map_len);
		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
		if (err) {
			close(fd);
			continue;
		}

		if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
			ctx->xsks_map_fd = fd;
			break;
		}

		close(fd);
	}

	err = 0;
	if (ctx->xsks_map_fd == -1)
		err = -ENOENT;

out_map_ids:
	free(map_ids);
	return err;
}

static int xsk_set_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;

	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
				   &xsk->fd, 0);
}

static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;
	__u32 prog_id = 0;
	int err;

	err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id,
				  xsk->config.xdp_flags);
	if (err)
		return err;

	if (!prog_id) {
		err = xsk_create_bpf_maps(xsk);
		if (err)
			return err;

		err = xsk_load_xdp_prog(xsk);
		if (err) {
			xsk_delete_bpf_maps(xsk);
			return err;
		}
	} else {
		ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
		if (ctx->prog_fd < 0)
			return -errno;
		err = xsk_lookup_bpf_maps(xsk);
		if (err) {
			close(ctx->prog_fd);
			return err;
		}
	}

	if (xsk->rx)
		err = xsk_set_bpf_maps(xsk);
	if (err) {
		xsk_delete_bpf_maps(xsk);
		close(ctx->prog_fd);
		return err;
	}

	return 0;
}
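
/* Applications that pass XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD skip all of
 * the above and must attach their own XDP program and register the socket
 * in their own BPF_MAP_TYPE_XSKMAP. A hedged sketch ("my_xsks_map_fd" is
 * the application's map, not something this file provides):
 *
 *	int fd = xsk_socket__fd(xsk);
 *	int key = queue_id;
 *
 *	err = bpf_map_update_elem(my_xsks_map_fd, &key, &fd, 0);
 */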

static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
				   __u32 queue_id)
{
	struct xsk_ctx *ctx;

	if (list_empty(&umem->ctx_list))
		return NULL;

	list_for_each_entry(ctx, &umem->ctx_list, list) {
		if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
			ctx->refcount++;
			return ctx;
		}
	}

	return NULL;
}

static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
{
	struct xsk_umem *umem = ctx->umem;
	struct xdp_mmap_offsets off;
	int err;

	if (--ctx->refcount)
		return;

	if (!unmap)
		goto out_free;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (err)
		goto out_free;

	munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
	       sizeof(__u64));
	munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
	       sizeof(__u64));

out_free:
	list_del(&ctx->list);
	free(ctx);
}

static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
				      struct xsk_umem *umem, int ifindex,
				      const char *ifname, __u32 queue_id,
				      struct xsk_ring_prod *fill,
				      struct xsk_ring_cons *comp)
{
	struct xsk_ctx *ctx;
	int err;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;

	if (!umem->fill_save) {
		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
		if (err) {
			free(ctx);
			return NULL;
		}
	} else if (umem->fill_save != fill || umem->comp_save != comp) {
		/* Copy over rings to new structs. */
		memcpy(fill, umem->fill_save, sizeof(*fill));
		memcpy(comp, umem->comp_save, sizeof(*comp));
	}

	ctx->ifindex = ifindex;
	ctx->refcount = 1;
	ctx->umem = umem;
	ctx->queue_id = queue_id;
	memcpy(ctx->ifname, ifname, IFNAMSIZ - 1);
	ctx->ifname[IFNAMSIZ - 1] = '\0';

	ctx->fill = fill;
	ctx->comp = comp;
	list_add(&ctx->list, &umem->ctx_list);
	return ctx;
}

int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
			      const char *ifname,
			      __u32 queue_id, struct xsk_umem *umem,
			      struct xsk_ring_cons *rx,
			      struct xsk_ring_prod *tx,
			      struct xsk_ring_prod *fill,
			      struct xsk_ring_cons *comp,
			      const struct xsk_socket_config *usr_config)
{
	bool unmap, rx_setup_done = false, tx_setup_done = false;
	void *rx_map = NULL, *tx_map = NULL;
	struct sockaddr_xdp sxdp = {};
	struct xdp_mmap_offsets off;
	struct xsk_socket *xsk;
	struct xsk_ctx *ctx;
	int err, ifindex;

	if (!umem || !xsk_ptr || !(rx || tx))
		return -EFAULT;

	unmap = umem->fill_save != fill;

	xsk = calloc(1, sizeof(*xsk));
	if (!xsk)
		return -ENOMEM;

	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
	if (err)
		goto out_xsk_alloc;

	xsk->outstanding_tx = 0;
	ifindex = if_nametoindex(ifname);
	if (!ifindex) {
		err = -errno;
		goto out_xsk_alloc;
	}

	if (umem->refcount++ > 0) {
		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
		if (xsk->fd < 0) {
			err = -errno;
			goto out_xsk_alloc;
		}
	} else {
		xsk->fd = umem->fd;
		rx_setup_done = umem->rx_ring_setup_done;
		tx_setup_done = umem->tx_ring_setup_done;
	}

	ctx = xsk_get_ctx(umem, ifindex, queue_id);
	if (!ctx) {
		if (!fill || !comp) {
			err = -EFAULT;
			goto out_socket;
		}

		ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
				     fill, comp);
		if (!ctx) {
			err = -ENOMEM;
			goto out_socket;
		}
	}
	xsk->ctx = ctx;

	if (rx && !rx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
				 &xsk->config.rx_size,
				 sizeof(xsk->config.rx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->rx_ring_setup_done = true;
	}
	if (tx && !tx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
				 &xsk->config.tx_size,
				 sizeof(xsk->config.tx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->tx_ring_setup_done = true;
	}

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (err) {
		err = -errno;
		goto out_put_ctx;
	}

	if (rx) {
		rx_map = mmap(NULL, off.rx.desc +
			      xsk->config.rx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_RX_RING);
		if (rx_map == MAP_FAILED) {
			err = -errno;
			goto out_put_ctx;
		}

		rx->mask = xsk->config.rx_size - 1;
		rx->size = xsk->config.rx_size;
		rx->producer = rx_map + off.rx.producer;
		rx->consumer = rx_map + off.rx.consumer;
		rx->flags = rx_map + off.rx.flags;
		rx->ring = rx_map + off.rx.desc;
		rx->cached_prod = *rx->producer;
		rx->cached_cons = *rx->consumer;
	}
	xsk->rx = rx;

	if (tx) {
		tx_map = mmap(NULL, off.tx.desc +
			      xsk->config.tx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_TX_RING);
		if (tx_map == MAP_FAILED) {
			err = -errno;
			goto out_mmap_rx;
		}

		tx->mask = xsk->config.tx_size - 1;
		tx->size = xsk->config.tx_size;
		tx->producer = tx_map + off.tx.producer;
		tx->consumer = tx_map + off.tx.consumer;
		tx->flags = tx_map + off.tx.flags;
		tx->ring = tx_map + off.tx.desc;
		tx->cached_prod = *tx->producer;
		/* cached_cons is r->size bigger than the real consumer pointer.
		 * See xsk_prod_nb_free().
		 */
		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
	}
	xsk->tx = tx;

	sxdp.sxdp_family = PF_XDP;
	sxdp.sxdp_ifindex = ctx->ifindex;
	sxdp.sxdp_queue_id = ctx->queue_id;
	if (umem->refcount > 1) {
		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
		sxdp.sxdp_shared_umem_fd = umem->fd;
	} else {
		sxdp.sxdp_flags = xsk->config.bind_flags;
	}

	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
	if (err) {
		err = -errno;
		goto out_mmap_tx;
	}

	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
		err = xsk_setup_xdp_prog(xsk);
		if (err)
			goto out_mmap_tx;
	}

	*xsk_ptr = xsk;
	umem->fill_save = NULL;
	umem->comp_save = NULL;
	return 0;

out_mmap_tx:
	if (tx)
		munmap(tx_map, off.tx.desc +
		       xsk->config.tx_size * sizeof(struct xdp_desc));
out_mmap_rx:
	if (rx)
		munmap(rx_map, off.rx.desc +
		       xsk->config.rx_size * sizeof(struct xdp_desc));
out_put_ctx:
	xsk_put_ctx(ctx, unmap);
out_socket:
	if (--umem->refcount)
		close(xsk->fd);
out_xsk_alloc:
	free(xsk);
	return err;
}
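
/* Sharing one umem across queues: the first call reuses the umem's fd and
 * saved rings; each later call passes fresh fill/completion rings for its
 * queue and is bound with XDP_SHARED_UMEM. Illustrative sketch (the ring
 * and socket variables are the caller's own):
 *
 *	struct xsk_ring_prod fq2;
 *	struct xsk_ring_cons cq2;
 *	struct xsk_socket *xsk2;
 *
 *	err = xsk_socket__create_shared(&xsk2, "eth0", 1, umem, &rx2, &tx2,
 *					&fq2, &cq2, NULL);
 */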

int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
		       __u32 queue_id, struct xsk_umem *umem,
		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
		       const struct xsk_socket_config *usr_config)
{
	if (!umem)
		return -EFAULT;

	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
					 rx, tx, umem->fill_save,
					 umem->comp_save, usr_config);
}
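
/* A minimal receive loop over the rings set up above, using the inline
 * helpers from xsk.h (illustrative sketch; "rx", "fq" and "bufs" come
 * from the earlier examples):
 *
 *	__u32 idx_rx = 0, idx_fq = 0;
 *	unsigned int i, rcvd = xsk_ring_cons__peek(&rx, 64, &idx_rx);
 *
 *	if (rcvd && xsk_ring_prod__reserve(&fq, rcvd, &idx_fq) == rcvd) {
 *		for (i = 0; i < rcvd; i++) {
 *			const struct xdp_desc *desc =
 *				xsk_ring_cons__rx_desc(&rx, idx_rx++);
 *
 *			// packet data lives at xsk_umem__get_data(bufs, desc->addr)
 *			*xsk_ring_prod__fill_addr(&fq, idx_fq++) = desc->addr;
 *		}
 *		xsk_ring_prod__submit(&fq, rcvd);
 *		xsk_ring_cons__release(&rx, rcvd);
 *	}
 */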

int xsk_umem__delete(struct xsk_umem *umem)
{
	struct xdp_mmap_offsets off;
	int err;

	if (!umem)
		return 0;

	if (umem->refcount)
		return -EBUSY;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (!err && umem->fill_save && umem->comp_save) {
		munmap(umem->fill_save->ring - off.fr.desc,
		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
		munmap(umem->comp_save->ring - off.cr.desc,
		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
	}

	close(umem->fd);
	free(umem);

	return 0;
}
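
/* Because of the refcount check above, teardown must delete every socket
 * before the umem, or xsk_umem__delete() returns -EBUSY:
 *
 *	xsk_socket__delete(xsk);
 *	xsk_umem__delete(umem);
 *	free(bufs);
 */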

void xsk_socket__delete(struct xsk_socket *xsk)
{
	size_t desc_sz = sizeof(struct xdp_desc);
	struct xdp_mmap_offsets off;
	struct xsk_umem *umem;
	struct xsk_ctx *ctx;
	int err;

	if (!xsk)
		return;

	ctx = xsk->ctx;
	umem = ctx->umem;

	if (ctx->refcount == 1) {
		xsk_delete_bpf_maps(xsk);
		close(ctx->prog_fd);
	}

	xsk_put_ctx(ctx, true);

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (!err) {
		if (xsk->rx) {
			munmap(xsk->rx->ring - off.rx.desc,
			       off.rx.desc + xsk->config.rx_size * desc_sz);
		}
		if (xsk->tx) {
			munmap(xsk->tx->ring - off.tx.desc,
			       off.tx.desc + xsk->config.tx_size * desc_sz);
		}
	}

	umem->refcount--;
	/* Do not close an fd that also has an associated umem connected
	 * to it.
	 */
	if (xsk->fd != umem->fd)
		close(xsk->fd);
	free(xsk);
}