xref: /OK3568_Linux_fs/kernel/kernel/bpf/cgroup.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-only
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Functions to manage eBPF programs attached to cgroups
4*4882a593Smuzhiyun  *
5*4882a593Smuzhiyun  * Copyright (c) 2016 Daniel Mack
6*4882a593Smuzhiyun  */
7*4882a593Smuzhiyun 
8*4882a593Smuzhiyun #include <linux/kernel.h>
9*4882a593Smuzhiyun #include <linux/atomic.h>
10*4882a593Smuzhiyun #include <linux/cgroup.h>
11*4882a593Smuzhiyun #include <linux/filter.h>
12*4882a593Smuzhiyun #include <linux/slab.h>
13*4882a593Smuzhiyun #include <linux/sysctl.h>
14*4882a593Smuzhiyun #include <linux/string.h>
15*4882a593Smuzhiyun #include <linux/bpf.h>
16*4882a593Smuzhiyun #include <linux/bpf-cgroup.h>
17*4882a593Smuzhiyun #include <net/sock.h>
18*4882a593Smuzhiyun #include <net/bpf_sk_storage.h>
19*4882a593Smuzhiyun 
20*4882a593Smuzhiyun #include "../cgroup/cgroup-internal.h"
21*4882a593Smuzhiyun 
/* Static key flipped while at least one bpf prog is attached to any cgroup;
 * gates the cgroup-bpf hooks on fast paths.  Incremented per attached entry
 * in __cgroup_bpf_attach() and decremented in cgroup_bpf_release().
 */
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24*4882a593Smuzhiyun 
/**
 * cgroup_bpf_offline() - start release of the cgroup's bpf resources
 * @cgrp: the cgroup going offline
 *
 * Kills the bpf percpu refcnt; once it drains, cgroup_bpf_release_fn()
 * schedules the actual teardown work.
 */
void cgroup_bpf_offline(struct cgroup *cgrp)
{
	/* pairs with the cgroup_put() at the end of cgroup_bpf_release() */
	cgroup_get(cgrp);
	percpu_ref_kill(&cgrp->bpf.refcnt);
}
30*4882a593Smuzhiyun 
/* Free the per-type storages in @storages[].  Callers pass partially
 * filled arrays, so unset slots are presumably NULL and tolerated by
 * bpf_cgroup_storage_free() — confirm in bpf/local_storage.c.
 */
static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
{
	enum bpf_cgroup_storage_type stype;

	for_each_cgroup_storage_type(stype)
		bpf_cgroup_storage_free(storages[stype]);
}
38*4882a593Smuzhiyun 
/* Allocate (or reuse) cgroup storage for every storage type used by @prog.
 *
 * On success @storages[] holds one storage per type the prog uses.
 * Storages that had to be allocated by this call are also recorded in
 * @new_storages[], so callers (and the error path below) can free exactly
 * the newly created ones without touching reused storages.
 *
 * Return: 0 on success, -ENOMEM if any allocation fails (everything this
 * call allocated is freed before returning).
 */
static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
				     struct bpf_cgroup_storage *new_storages[],
				     enum bpf_attach_type type,
				     struct bpf_prog *prog,
				     struct cgroup *cgrp)
{
	enum bpf_cgroup_storage_type stype;
	struct bpf_cgroup_storage_key key;
	struct bpf_map *map;

	key.cgroup_inode_id = cgroup_id(cgrp);
	key.attach_type = type;

	for_each_cgroup_storage_type(stype) {
		/* prog doesn't use this storage type */
		map = prog->aux->cgroup_storage[stype];
		if (!map)
			continue;

		/* reuse an existing storage for this (cgroup, attach type) */
		storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
		if (storages[stype])
			continue;

		storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
		if (IS_ERR(storages[stype])) {
			/* free only what this call allocated so far */
			bpf_cgroup_storages_free(new_storages);
			return -ENOMEM;
		}

		new_storages[stype] = storages[stype];
	}

	return 0;
}
72*4882a593Smuzhiyun 
/* Copy the per-type storage pointer array from @src to @dst. */
static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
				       struct bpf_cgroup_storage *src[])
{
	enum bpf_cgroup_storage_type stype;

	for_each_cgroup_storage_type(stype)
		dst[stype] = src[stype];
}
81*4882a593Smuzhiyun 
/* Link each per-type storage in @storages[] to @cgrp for @attach_type via
 * bpf_cgroup_storage_link().  NULL slots are presumably ignored by the
 * callee — confirm in bpf/local_storage.c.
 */
static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
				     struct cgroup *cgrp,
				     enum bpf_attach_type attach_type)
{
	enum bpf_cgroup_storage_type stype;

	for_each_cgroup_storage_type(stype)
		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
}
91*4882a593Smuzhiyun 
/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
 * It drops the cgroup refcount and marks the bpf_link as defunct. It
 * doesn't free link memory, which will eventually be done by bpf_link's
 * release() callback, when its last FD is closed.
 */
static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
{
	cgroup_put(link->cgroup);
	/* link->cgroup == NULL is how other paths recognize a defunct
	 * link — see the -ENOLINK check in cgroup_bpf_replace()
	 */
	link->cgroup = NULL;
}
102*4882a593Smuzhiyun 
/**
 * cgroup_bpf_release() - put references of all bpf programs and
 *                        release all cgroup bpf data
 * @work: work structure embedded into the cgroup to modify
 *
 * Runs in workqueue context (scheduled by cgroup_bpf_release_fn()) so it
 * may take cgroup_mutex.
 */
static void cgroup_bpf_release(struct work_struct *work)
{
	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
					       bpf.release_work);
	struct bpf_prog_array *old_array;
	struct list_head *storages = &cgrp->bpf.storages;
	struct bpf_cgroup_storage *storage, *stmp;

	unsigned int type;

	mutex_lock(&cgroup_mutex);

	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
		struct list_head *progs = &cgrp->bpf.progs[type];
		struct bpf_prog_list *pl, *pltmp;

		list_for_each_entry_safe(pl, pltmp, progs, node) {
			list_del(&pl->node);
			/* an entry holds either a direct prog or a link */
			if (pl->prog)
				bpf_prog_put(pl->prog);
			if (pl->link)
				bpf_cgroup_link_auto_detach(pl->link);
			kfree(pl);
			/* one static-key count per attached entry */
			static_branch_dec(&cgroup_bpf_enabled_key);
		}
		/* drop this cgroup's effective array for the type */
		old_array = rcu_dereference_protected(
				cgrp->bpf.effective[type],
				lockdep_is_held(&cgroup_mutex));
		bpf_prog_array_free(old_array);
	}

	/* release every storage still linked to this cgroup */
	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
		bpf_cgroup_storage_unlink(storage);
		bpf_cgroup_storage_free(storage);
	}

	mutex_unlock(&cgroup_mutex);

	/* drop the per-ancestor refs taken in cgroup_bpf_inherit() */
	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
		cgroup_bpf_put(p);

	percpu_ref_exit(&cgrp->bpf.refcnt);
	/* pairs with cgroup_get() in cgroup_bpf_offline() */
	cgroup_put(cgrp);
}
152*4882a593Smuzhiyun 
/**
 * cgroup_bpf_release_fn() - callback used to schedule releasing
 *                           of bpf cgroup data
 * @ref: percpu ref counter structure
 *
 * percpu_ref release callback (registered in cgroup_bpf_inherit()); defers
 * the actual teardown to the workqueue because cgroup_bpf_release() needs
 * to take cgroup_mutex.
 */
static void cgroup_bpf_release_fn(struct percpu_ref *ref)
{
	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);

	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
	queue_work(system_wq, &cgrp->bpf.release_work);
}
165*4882a593Smuzhiyun 
166*4882a593Smuzhiyun /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
167*4882a593Smuzhiyun  * link or direct prog.
168*4882a593Smuzhiyun  */
prog_list_prog(struct bpf_prog_list * pl)169*4882a593Smuzhiyun static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
170*4882a593Smuzhiyun {
171*4882a593Smuzhiyun 	if (pl->prog)
172*4882a593Smuzhiyun 		return pl->prog;
173*4882a593Smuzhiyun 	if (pl->link)
174*4882a593Smuzhiyun 		return pl->link->link.prog;
175*4882a593Smuzhiyun 	return NULL;
176*4882a593Smuzhiyun }
177*4882a593Smuzhiyun 
178*4882a593Smuzhiyun /* count number of elements in the list.
179*4882a593Smuzhiyun  * it's slow but the list cannot be long
180*4882a593Smuzhiyun  */
prog_list_length(struct list_head * head)181*4882a593Smuzhiyun static u32 prog_list_length(struct list_head *head)
182*4882a593Smuzhiyun {
183*4882a593Smuzhiyun 	struct bpf_prog_list *pl;
184*4882a593Smuzhiyun 	u32 cnt = 0;
185*4882a593Smuzhiyun 
186*4882a593Smuzhiyun 	list_for_each_entry(pl, head, node) {
187*4882a593Smuzhiyun 		if (!prog_list_prog(pl))
188*4882a593Smuzhiyun 			continue;
189*4882a593Smuzhiyun 		cnt++;
190*4882a593Smuzhiyun 	}
191*4882a593Smuzhiyun 	return cnt;
192*4882a593Smuzhiyun }
193*4882a593Smuzhiyun 
194*4882a593Smuzhiyun /* if parent has non-overridable prog attached,
195*4882a593Smuzhiyun  * disallow attaching new programs to the descendent cgroup.
196*4882a593Smuzhiyun  * if parent has overridable or multi-prog, allow attaching
197*4882a593Smuzhiyun  */
hierarchy_allows_attach(struct cgroup * cgrp,enum bpf_attach_type type)198*4882a593Smuzhiyun static bool hierarchy_allows_attach(struct cgroup *cgrp,
199*4882a593Smuzhiyun 				    enum bpf_attach_type type)
200*4882a593Smuzhiyun {
201*4882a593Smuzhiyun 	struct cgroup *p;
202*4882a593Smuzhiyun 
203*4882a593Smuzhiyun 	p = cgroup_parent(cgrp);
204*4882a593Smuzhiyun 	if (!p)
205*4882a593Smuzhiyun 		return true;
206*4882a593Smuzhiyun 	do {
207*4882a593Smuzhiyun 		u32 flags = p->bpf.flags[type];
208*4882a593Smuzhiyun 		u32 cnt;
209*4882a593Smuzhiyun 
210*4882a593Smuzhiyun 		if (flags & BPF_F_ALLOW_MULTI)
211*4882a593Smuzhiyun 			return true;
212*4882a593Smuzhiyun 		cnt = prog_list_length(&p->bpf.progs[type]);
213*4882a593Smuzhiyun 		WARN_ON_ONCE(cnt > 1);
214*4882a593Smuzhiyun 		if (cnt == 1)
215*4882a593Smuzhiyun 			return !!(flags & BPF_F_ALLOW_OVERRIDE);
216*4882a593Smuzhiyun 		p = cgroup_parent(p);
217*4882a593Smuzhiyun 	} while (p);
218*4882a593Smuzhiyun 	return true;
219*4882a593Smuzhiyun }
220*4882a593Smuzhiyun 
/* compute a chain of effective programs for a given cgroup:
 * start from the list of programs in this cgroup and add
 * all parent programs.
 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
 * to programs in this cgroup
 *
 * On success *@array holds a freshly allocated bpf_prog_array; the caller
 * owns it (see activate_effective_progs() / bpf_prog_array_free()).
 *
 * Return: 0 on success, -ENOMEM if the array allocation fails.
 */
static int compute_effective_progs(struct cgroup *cgrp,
				   enum bpf_attach_type type,
				   struct bpf_prog_array **array)
{
	struct bpf_prog_array_item *item;
	struct bpf_prog_array *progs;
	struct bpf_prog_list *pl;
	struct cgroup *p = cgrp;
	int cnt = 0;

	/* count number of effective programs by walking parents */
	do {
		/* once any descendant contributed (cnt > 0), ancestors only
		 * contribute if they were attached with BPF_F_ALLOW_MULTI
		 */
		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			cnt += prog_list_length(&p->bpf.progs[type]);
		p = cgroup_parent(p);
	} while (p);

	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
	if (!progs)
		return -ENOMEM;

	/* populate the array with effective progs */
	cnt = 0;
	p = cgrp;
	do {
		/* continue jumps to the while clause, advancing to parent */
		if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			continue;

		list_for_each_entry(pl, &p->bpf.progs[type], node) {
			/* skip entries with neither prog nor link prog */
			if (!prog_list_prog(pl))
				continue;

			item = &progs->items[cnt];
			item->prog = prog_list_prog(pl);
			bpf_cgroup_storages_assign(item->cgroup_storage,
						   pl->storage);
			cnt++;
		}
	} while ((p = cgroup_parent(p)));

	*array = progs;
	return 0;
}
270*4882a593Smuzhiyun 
/* Publish @old_array as the new effective array for @type and free the
 * previously active one.  Must be called with cgroup_mutex held (enforced
 * by the rcu_replace_pointer() lockdep condition).
 */
static void activate_effective_progs(struct cgroup *cgrp,
				     enum bpf_attach_type type,
				     struct bpf_prog_array *old_array)
{
	/* old_array is reused: carries the new array in, the replaced out */
	old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
					lockdep_is_held(&cgroup_mutex));
	/* free prog array after grace period, since __cgroup_bpf_run_*()
	 * might be still walking the array
	 */
	bpf_prog_array_free(old_array);
}
282*4882a593Smuzhiyun 
/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 *
 * Return: 0 on success, percpu_ref_init()'s error or -ENOMEM otherwise.
 */
int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use macro instead of const int, since compiler thinks
 * that array below is variable length
 */
#define	NR ARRAY_SIZE(cgrp->bpf.effective)
	struct bpf_prog_array *arrays[NR] = {};
	struct cgroup *p;
	int ret, i;

	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
			      GFP_KERNEL);
	if (ret)
		return ret;

	/* hold every ancestor's bpf refcnt; dropped in cgroup_bpf_release()
	 * or on the cleanup path below
	 */
	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
		cgroup_bpf_get(p);

	for (i = 0; i < NR; i++)
		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

	INIT_LIST_HEAD(&cgrp->bpf.storages);

	/* compute all effective arrays first so activation cannot fail */
	for (i = 0; i < NR; i++)
		if (compute_effective_progs(cgrp, i, &arrays[i]))
			goto cleanup;

	for (i = 0; i < NR; i++)
		activate_effective_progs(cgrp, i, arrays[i]);

	return 0;
cleanup:
	/* arrays[] was zero-initialized, so unset slots are NULL
	 * (presumably tolerated by bpf_prog_array_free() — confirm)
	 */
	for (i = 0; i < NR; i++)
		bpf_prog_array_free(arrays[i]);

	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
		cgroup_bpf_put(p);

	percpu_ref_exit(&cgrp->bpf.refcnt);

	return -ENOMEM;
}
329*4882a593Smuzhiyun 
/* Recompute and swap in the effective prog arrays of @cgrp and all its
 * descendants for @type.  Two-pass scheme: allocate everything first into
 * ->bpf.inactive, then activate, so an allocation failure leaves every
 * cgroup's active array untouched.  Callers hold cgroup_mutex (see the
 * lockdep condition inside activate_effective_progs()).
 *
 * Return: 0 on success, compute_effective_progs()'s error otherwise.
 */
static int update_effective_progs(struct cgroup *cgrp,
				  enum bpf_attach_type type)
{
	struct cgroup_subsys_state *css;
	int err;

	/* allocate and recompute effective prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		/* skip cgroups whose bpf refcnt already drained */
		if (percpu_ref_is_zero(&desc->bpf.refcnt))
			continue;

		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
		if (err)
			goto cleanup;
	}

	/* all allocations were successful. Activate all prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
			/* refcnt apparently drained between the two walks;
			 * drop the array computed in pass one, if any
			 */
			if (unlikely(desc->bpf.inactive)) {
				bpf_prog_array_free(desc->bpf.inactive);
				desc->bpf.inactive = NULL;
			}
			continue;
		}

		activate_effective_progs(desc, type, desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return 0;

cleanup:
	/* oom while computing effective. Free all computed effective arrays
	 * since they were not activated
	 */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		bpf_prog_array_free(desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return err;
}
379*4882a593Smuzhiyun 
380*4882a593Smuzhiyun #define BPF_CGROUP_MAX_PROGS 64
381*4882a593Smuzhiyun 
/* Locate the bpf_prog_list entry to (re)use for an attach request.
 * Returns an existing entry to replace, NULL if a new entry should be
 * appended, or an ERR_PTR on invalid requests.
 */
static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
					       struct bpf_prog *prog,
					       struct bpf_cgroup_link *link,
					       struct bpf_prog *replace_prog,
					       bool allow_multi)
{
	struct bpf_prog_list *pl;

	/* single-attach case: at most one entry exists, replace it */
	if (!allow_multi)
		return list_empty(progs) ? NULL :
		       list_first_entry(progs, typeof(*pl), node);

	/* multi-attach: disallow attaching the same prog or link twice */
	list_for_each_entry(pl, progs, node) {
		if (prog && pl->prog == prog && prog != replace_prog)
			return ERR_PTR(-EINVAL);
		if (link && pl->link == link)
			return ERR_PTR(-EINVAL);
	}

	if (!replace_prog)
		return NULL;

	/* direct prog multi-attach w/ replacement case */
	list_for_each_entry(pl, progs, node) {
		if (pl->prog == replace_prog)
			return pl;
	}

	/* prog to replace not found for cgroup */
	return ERR_PTR(-ENOENT);
}
419*4882a593Smuzhiyun 
/**
 * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup which descendants to traverse
 * @prog: A program to attach
 * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
 * @link: A link to attach
 * @type: Type of attach operation
 * @flags: Option flags
 *
 * Exactly one of @prog or @link can be non-null.
 * Must be called with cgroup_mutex held.
 *
 * Return: 0 on success, negative error code otherwise.
 */
int __cgroup_bpf_attach(struct cgroup *cgrp,
			struct bpf_prog *prog, struct bpf_prog *replace_prog,
			struct bpf_cgroup_link *link,
			enum bpf_attach_type type, u32 flags)
{
	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
	struct list_head *progs = &cgrp->bpf.progs[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
	struct bpf_prog_list *pl;
	int err;

	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
		/* invalid combination */
		return -EINVAL;
	if (link && (prog || replace_prog))
		/* only either link or prog/replace_prog can be specified */
		return -EINVAL;
	if (!!replace_prog != !!(flags & BPF_F_REPLACE))
		/* replace_prog implies BPF_F_REPLACE, and vice versa */
		return -EINVAL;

	if (!hierarchy_allows_attach(cgrp, type))
		return -EPERM;

	if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
		/* Disallow attaching non-overridable on top
		 * of existing overridable in this cgroup.
		 * Disallow attaching multi-prog if overridable or none
		 */
		return -EPERM;

	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
		return -E2BIG;

	/* NULL means "append a new entry", ERR_PTR rejects the request */
	pl = find_attach_entry(progs, prog, link, replace_prog,
			       flags & BPF_F_ALLOW_MULTI);
	if (IS_ERR(pl))
		return PTR_ERR(pl);

	if (bpf_cgroup_storages_alloc(storage, new_storage, type,
				      prog ? : link->link.prog, cgrp))
		return -ENOMEM;

	if (pl) {
		/* replacing an existing entry; old prog is put on success */
		old_prog = pl->prog;
	} else {
		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
		if (!pl) {
			bpf_cgroup_storages_free(new_storage);
			return -ENOMEM;
		}
		list_add_tail(&pl->node, progs);
	}

	pl->prog = prog;
	pl->link = link;
	bpf_cgroup_storages_assign(pl->storage, storage);
	cgrp->bpf.flags[type] = saved_flags;

	err = update_effective_progs(cgrp, type);
	if (err)
		goto cleanup;

	if (old_prog)
		bpf_prog_put(old_prog);
	else
		/* brand-new entry: bump the fast-path static key */
		static_branch_inc(&cgroup_bpf_enabled_key);
	bpf_cgroup_storages_link(new_storage, cgrp, type);
	return 0;

cleanup:
	/* roll the list entry back to its pre-attach state */
	if (old_prog) {
		pl->prog = old_prog;
		pl->link = NULL;
	}
	bpf_cgroup_storages_free(new_storage);
	if (!old_prog) {
		list_del(&pl->node);
		kfree(pl);
	}
	return err;
}
518*4882a593Smuzhiyun 
/* Swap updated BPF program for given link in effective program arrays across
 * all descendant cgroups. This function is guaranteed to succeed.
 *
 * Must be called with cgroup_mutex held (see the rcu_dereference_protected()
 * lockdep condition below).
 */
static void replace_effective_prog(struct cgroup *cgrp,
				   enum bpf_attach_type type,
				   struct bpf_cgroup_link *link)
{
	struct bpf_prog_array_item *item;
	struct cgroup_subsys_state *css;
	struct bpf_prog_array *progs;
	struct bpf_prog_list *pl;
	struct list_head *head;
	struct cgroup *cg;
	int pos;

	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		/* skip cgroups that are already being torn down */
		if (percpu_ref_is_zero(&desc->bpf.refcnt))
			continue;

		/* find position of link in effective progs array;
		 * this walk mirrors the one in compute_effective_progs()
		 */
		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
			if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
				continue;

			head = &cg->bpf.progs[type];
			list_for_each_entry(pl, head, node) {
				if (!prog_list_prog(pl))
					continue;
				if (pl->link == link)
					goto found;
				pos++;
			}
		}
found:
		/* cg == NULL means the link was not found anywhere */
		BUG_ON(!cg);
		progs = rcu_dereference_protected(
				desc->bpf.effective[type],
				lockdep_is_held(&cgroup_mutex));
		item = &progs->items[pos];
		/* readers may be walking the array concurrently */
		WRITE_ONCE(item->prog, link->link.prog);
	}
}
563*4882a593Smuzhiyun 
564*4882a593Smuzhiyun /**
565*4882a593Smuzhiyun  * __cgroup_bpf_replace() - Replace link's program and propagate the change
566*4882a593Smuzhiyun  *                          to descendants
567*4882a593Smuzhiyun  * @cgrp: The cgroup which descendants to traverse
568*4882a593Smuzhiyun  * @link: A link for which to replace BPF program
569*4882a593Smuzhiyun  * @type: Type of attach operation
570*4882a593Smuzhiyun  *
571*4882a593Smuzhiyun  * Must be called with cgroup_mutex held.
572*4882a593Smuzhiyun  */
__cgroup_bpf_replace(struct cgroup * cgrp,struct bpf_cgroup_link * link,struct bpf_prog * new_prog)573*4882a593Smuzhiyun static int __cgroup_bpf_replace(struct cgroup *cgrp,
574*4882a593Smuzhiyun 				struct bpf_cgroup_link *link,
575*4882a593Smuzhiyun 				struct bpf_prog *new_prog)
576*4882a593Smuzhiyun {
577*4882a593Smuzhiyun 	struct list_head *progs = &cgrp->bpf.progs[link->type];
578*4882a593Smuzhiyun 	struct bpf_prog *old_prog;
579*4882a593Smuzhiyun 	struct bpf_prog_list *pl;
580*4882a593Smuzhiyun 	bool found = false;
581*4882a593Smuzhiyun 
582*4882a593Smuzhiyun 	if (link->link.prog->type != new_prog->type)
583*4882a593Smuzhiyun 		return -EINVAL;
584*4882a593Smuzhiyun 
585*4882a593Smuzhiyun 	list_for_each_entry(pl, progs, node) {
586*4882a593Smuzhiyun 		if (pl->link == link) {
587*4882a593Smuzhiyun 			found = true;
588*4882a593Smuzhiyun 			break;
589*4882a593Smuzhiyun 		}
590*4882a593Smuzhiyun 	}
591*4882a593Smuzhiyun 	if (!found)
592*4882a593Smuzhiyun 		return -ENOENT;
593*4882a593Smuzhiyun 
594*4882a593Smuzhiyun 	old_prog = xchg(&link->link.prog, new_prog);
595*4882a593Smuzhiyun 	replace_effective_prog(cgrp, link->type, link);
596*4882a593Smuzhiyun 	bpf_prog_put(old_prog);
597*4882a593Smuzhiyun 	return 0;
598*4882a593Smuzhiyun }
599*4882a593Smuzhiyun 
cgroup_bpf_replace(struct bpf_link * link,struct bpf_prog * new_prog,struct bpf_prog * old_prog)600*4882a593Smuzhiyun static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
601*4882a593Smuzhiyun 			      struct bpf_prog *old_prog)
602*4882a593Smuzhiyun {
603*4882a593Smuzhiyun 	struct bpf_cgroup_link *cg_link;
604*4882a593Smuzhiyun 	int ret;
605*4882a593Smuzhiyun 
606*4882a593Smuzhiyun 	cg_link = container_of(link, struct bpf_cgroup_link, link);
607*4882a593Smuzhiyun 
608*4882a593Smuzhiyun 	mutex_lock(&cgroup_mutex);
609*4882a593Smuzhiyun 	/* link might have been auto-released by dying cgroup, so fail */
610*4882a593Smuzhiyun 	if (!cg_link->cgroup) {
611*4882a593Smuzhiyun 		ret = -ENOLINK;
612*4882a593Smuzhiyun 		goto out_unlock;
613*4882a593Smuzhiyun 	}
614*4882a593Smuzhiyun 	if (old_prog && link->prog != old_prog) {
615*4882a593Smuzhiyun 		ret = -EPERM;
616*4882a593Smuzhiyun 		goto out_unlock;
617*4882a593Smuzhiyun 	}
618*4882a593Smuzhiyun 	ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
619*4882a593Smuzhiyun out_unlock:
620*4882a593Smuzhiyun 	mutex_unlock(&cgroup_mutex);
621*4882a593Smuzhiyun 	return ret;
622*4882a593Smuzhiyun }
623*4882a593Smuzhiyun 
/* Locate the bpf_prog_list entry to remove for a detach request, or an
 * ERR_PTR when the request cannot be satisfied.
 */
static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
					       struct bpf_prog *prog,
					       struct bpf_cgroup_link *link,
					       bool allow_multi)
{
	struct bpf_prog_list *pl;

	if (!allow_multi) {
		/* report error when trying to detach and nothing is attached */
		if (list_empty(progs))
			return ERR_PTR(-ENOENT);
		/* to maintain backward compatibility NONE and OVERRIDE cgroups
		 * allow detaching with invalid FD (prog==NULL) in legacy mode
		 */
		return list_first_entry(progs, typeof(*pl), node);
	}

	/* to detach MULTI prog the user has to specify valid FD
	 * of the program or link to be detached
	 */
	if (!prog && !link)
		return ERR_PTR(-EINVAL);

	/* find the prog or link and detach it */
	list_for_each_entry(pl, progs, node) {
		if (pl->prog == prog && pl->link == link)
			return pl;
	}
	return ERR_PTR(-ENOENT);
}
655*4882a593Smuzhiyun 
656*4882a593Smuzhiyun /**
657*4882a593Smuzhiyun  * purge_effective_progs() - After compute_effective_progs fails to alloc new
658*4882a593Smuzhiyun  *			     cgrp->bpf.inactive table we can recover by
659*4882a593Smuzhiyun  *			     recomputing the array in place.
660*4882a593Smuzhiyun  *
 * @cgrp: The cgroup which descendants to traverse
662*4882a593Smuzhiyun  * @prog: A program to detach or NULL
663*4882a593Smuzhiyun  * @link: A link to detach or NULL
664*4882a593Smuzhiyun  * @type: Type of detach operation
665*4882a593Smuzhiyun  */
static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
				  struct bpf_cgroup_link *link,
				  enum bpf_attach_type type)
{
	struct cgroup_subsys_state *css;
	struct bpf_prog_array *progs;
	struct bpf_prog_list *pl;
	struct list_head *head;
	struct cgroup *cg;
	int pos;

	/* recompute effective prog array in place */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		/* refcnt already killed (cgroup went through
		 * cgroup_bpf_offline()); nothing to purge for it
		 */
		if (percpu_ref_is_zero(&desc->bpf.refcnt))
			continue;

		/* find position of link or prog in effective progs array */
		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
			/* once something was counted, an ancestor only
			 * contributes entries if it attached with
			 * BPF_F_ALLOW_MULTI
			 */
			if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
				continue;

			head = &cg->bpf.progs[type];
			list_for_each_entry(pl, head, node) {
				/* skip entries already marked detached */
				if (!prog_list_prog(pl))
					continue;
				if (pl->prog == prog && pl->link == link)
					goto found;
				pos++;
			}
		}

		/* no link or prog match, skip the cgroup of this layer */
		continue;
found:
		progs = rcu_dereference_protected(
				desc->bpf.effective[type],
				lockdep_is_held(&cgroup_mutex));

		/* Remove the program from the array */
		WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
			  "Failed to purge a prog from array at index %d", pos);
	}
}
711*4882a593Smuzhiyun 
712*4882a593Smuzhiyun /**
713*4882a593Smuzhiyun  * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
714*4882a593Smuzhiyun  *                         propagate the change to descendants
715*4882a593Smuzhiyun  * @cgrp: The cgroup which descendants to traverse
716*4882a593Smuzhiyun  * @prog: A program to detach or NULL
 * @link: A link to detach or NULL
718*4882a593Smuzhiyun  * @type: Type of detach operation
719*4882a593Smuzhiyun  *
720*4882a593Smuzhiyun  * At most one of @prog or @link can be non-NULL.
721*4882a593Smuzhiyun  * Must be called with cgroup_mutex held.
722*4882a593Smuzhiyun  */
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
			struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog_list *pl;
	struct bpf_prog *old_prog;

	if (prog && link)
		/* only one of prog or link can be specified */
		return -EINVAL;

	pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
	if (IS_ERR(pl))
		return PTR_ERR(pl);

	/* mark it deleted, so it's ignored while recomputing effective */
	old_prog = pl->prog;
	pl->prog = NULL;
	pl->link = NULL;

	/* update_effective_progs() allocates new arrays and can fail;
	 * recover by restoring the entry and editing the existing
	 * effective arrays in place instead
	 */
	if (update_effective_progs(cgrp, type)) {
		/* if update effective array failed replace the prog with a dummy prog*/
		pl->prog = old_prog;
		pl->link = link;
		purge_effective_progs(cgrp, old_prog, link, type);
	}

	/* now can actually delete it from this cgroup list */
	list_del(&pl->node);
	kfree(pl);
	if (list_empty(progs))
		/* last program was detached, reset flags to zero */
		cgrp->bpf.flags[type] = 0;
	/* old_prog is NULL when the entry was link-backed; the link holds
	 * its own program reference
	 */
	if (old_prog)
		bpf_prog_put(old_prog);
	static_branch_dec(&cgroup_bpf_enabled_key);
	return 0;
}
762*4882a593Smuzhiyun 
763*4882a593Smuzhiyun /* Must be called with cgroup_mutex held to avoid races. */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		       union bpf_attr __user *uattr)
{
	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
	enum bpf_attach_type type = attr->query.attach_type;
	struct list_head *progs = &cgrp->bpf.progs[type];
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog_array *effective;
	struct bpf_prog *prog;
	int cnt, ret = 0, i;

	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
					      lockdep_is_held(&cgroup_mutex));

	/* the effective array includes inherited programs; the local list
	 * only those attached directly to @cgrp
	 */
	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
		cnt = bpf_prog_array_length(effective);
	else
		cnt = prog_list_length(progs);

	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
		return -EFAULT;
	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
		return -EFAULT;
	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
		/* return early if user requested only program count + flags */
		return 0;
	/* user buffer too small: copy as many IDs as fit, report -ENOSPC */
	if (attr->query.prog_cnt < cnt) {
		cnt = attr->query.prog_cnt;
		ret = -ENOSPC;
	}

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
	} else {
		struct bpf_prog_list *pl;
		u32 id;

		i = 0;
		list_for_each_entry(pl, progs, node) {
			prog = prog_list_prog(pl);
			id = prog->aux->id;
			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
				return -EFAULT;
			if (++i == cnt)
				break;
		}
	}
	return ret;
}
813*4882a593Smuzhiyun 
cgroup_bpf_prog_attach(const union bpf_attr * attr,enum bpf_prog_type ptype,struct bpf_prog * prog)814*4882a593Smuzhiyun int cgroup_bpf_prog_attach(const union bpf_attr *attr,
815*4882a593Smuzhiyun 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
816*4882a593Smuzhiyun {
817*4882a593Smuzhiyun 	struct bpf_prog *replace_prog = NULL;
818*4882a593Smuzhiyun 	struct cgroup *cgrp;
819*4882a593Smuzhiyun 	int ret;
820*4882a593Smuzhiyun 
821*4882a593Smuzhiyun 	cgrp = cgroup_get_from_fd(attr->target_fd);
822*4882a593Smuzhiyun 	if (IS_ERR(cgrp))
823*4882a593Smuzhiyun 		return PTR_ERR(cgrp);
824*4882a593Smuzhiyun 
825*4882a593Smuzhiyun 	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
826*4882a593Smuzhiyun 	    (attr->attach_flags & BPF_F_REPLACE)) {
827*4882a593Smuzhiyun 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
828*4882a593Smuzhiyun 		if (IS_ERR(replace_prog)) {
829*4882a593Smuzhiyun 			cgroup_put(cgrp);
830*4882a593Smuzhiyun 			return PTR_ERR(replace_prog);
831*4882a593Smuzhiyun 		}
832*4882a593Smuzhiyun 	}
833*4882a593Smuzhiyun 
834*4882a593Smuzhiyun 	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
835*4882a593Smuzhiyun 				attr->attach_type, attr->attach_flags);
836*4882a593Smuzhiyun 
837*4882a593Smuzhiyun 	if (replace_prog)
838*4882a593Smuzhiyun 		bpf_prog_put(replace_prog);
839*4882a593Smuzhiyun 	cgroup_put(cgrp);
840*4882a593Smuzhiyun 	return ret;
841*4882a593Smuzhiyun }
842*4882a593Smuzhiyun 
cgroup_bpf_prog_detach(const union bpf_attr * attr,enum bpf_prog_type ptype)843*4882a593Smuzhiyun int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
844*4882a593Smuzhiyun {
845*4882a593Smuzhiyun 	struct bpf_prog *prog;
846*4882a593Smuzhiyun 	struct cgroup *cgrp;
847*4882a593Smuzhiyun 	int ret;
848*4882a593Smuzhiyun 
849*4882a593Smuzhiyun 	cgrp = cgroup_get_from_fd(attr->target_fd);
850*4882a593Smuzhiyun 	if (IS_ERR(cgrp))
851*4882a593Smuzhiyun 		return PTR_ERR(cgrp);
852*4882a593Smuzhiyun 
853*4882a593Smuzhiyun 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
854*4882a593Smuzhiyun 	if (IS_ERR(prog))
855*4882a593Smuzhiyun 		prog = NULL;
856*4882a593Smuzhiyun 
857*4882a593Smuzhiyun 	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
858*4882a593Smuzhiyun 	if (prog)
859*4882a593Smuzhiyun 		bpf_prog_put(prog);
860*4882a593Smuzhiyun 
861*4882a593Smuzhiyun 	cgroup_put(cgrp);
862*4882a593Smuzhiyun 	return ret;
863*4882a593Smuzhiyun }
864*4882a593Smuzhiyun 
/* Detach the link's program from its cgroup (if the cgroup is still
 * alive) and drop the link's cgroup reference.  An unlocked fast-path
 * check is followed by a re-check under cgroup_mutex, because a dying
 * cgroup clears cg_link->cgroup when it auto-detaches links.
 */
static void bpf_cgroup_link_release(struct bpf_link *link)
{
	struct bpf_cgroup_link *cg_link =
		container_of(link, struct bpf_cgroup_link, link);
	struct cgroup *cg;

	/* link might have been auto-detached by dying cgroup already,
	 * in that case our work is done here
	 */
	if (!cg_link->cgroup)
		return;

	mutex_lock(&cgroup_mutex);

	/* re-check cgroup under lock again */
	if (!cg_link->cgroup) {
		mutex_unlock(&cgroup_mutex);
		return;
	}

	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
				    cg_link->type));

	/* clear the pointer under the mutex, drop the ref outside it */
	cg = cg_link->cgroup;
	cg_link->cgroup = NULL;

	mutex_unlock(&cgroup_mutex);

	cgroup_put(cg);
}
895*4882a593Smuzhiyun 
bpf_cgroup_link_dealloc(struct bpf_link * link)896*4882a593Smuzhiyun static void bpf_cgroup_link_dealloc(struct bpf_link *link)
897*4882a593Smuzhiyun {
898*4882a593Smuzhiyun 	struct bpf_cgroup_link *cg_link =
899*4882a593Smuzhiyun 		container_of(link, struct bpf_cgroup_link, link);
900*4882a593Smuzhiyun 
901*4882a593Smuzhiyun 	kfree(cg_link);
902*4882a593Smuzhiyun }
903*4882a593Smuzhiyun 
/* LINK_DETACH entry point: forced detach is just a release; the link
 * object itself stays alive until its FD is closed.
 */
static int bpf_cgroup_link_detach(struct bpf_link *link)
{
	bpf_cgroup_link_release(link);
	return 0;
}
910*4882a593Smuzhiyun 
bpf_cgroup_link_show_fdinfo(const struct bpf_link * link,struct seq_file * seq)911*4882a593Smuzhiyun static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
912*4882a593Smuzhiyun 					struct seq_file *seq)
913*4882a593Smuzhiyun {
914*4882a593Smuzhiyun 	struct bpf_cgroup_link *cg_link =
915*4882a593Smuzhiyun 		container_of(link, struct bpf_cgroup_link, link);
916*4882a593Smuzhiyun 	u64 cg_id = 0;
917*4882a593Smuzhiyun 
918*4882a593Smuzhiyun 	mutex_lock(&cgroup_mutex);
919*4882a593Smuzhiyun 	if (cg_link->cgroup)
920*4882a593Smuzhiyun 		cg_id = cgroup_id(cg_link->cgroup);
921*4882a593Smuzhiyun 	mutex_unlock(&cgroup_mutex);
922*4882a593Smuzhiyun 
923*4882a593Smuzhiyun 	seq_printf(seq,
924*4882a593Smuzhiyun 		   "cgroup_id:\t%llu\n"
925*4882a593Smuzhiyun 		   "attach_type:\t%d\n",
926*4882a593Smuzhiyun 		   cg_id,
927*4882a593Smuzhiyun 		   cg_link->type);
928*4882a593Smuzhiyun }
929*4882a593Smuzhiyun 
bpf_cgroup_link_fill_link_info(const struct bpf_link * link,struct bpf_link_info * info)930*4882a593Smuzhiyun static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
931*4882a593Smuzhiyun 					  struct bpf_link_info *info)
932*4882a593Smuzhiyun {
933*4882a593Smuzhiyun 	struct bpf_cgroup_link *cg_link =
934*4882a593Smuzhiyun 		container_of(link, struct bpf_cgroup_link, link);
935*4882a593Smuzhiyun 	u64 cg_id = 0;
936*4882a593Smuzhiyun 
937*4882a593Smuzhiyun 	mutex_lock(&cgroup_mutex);
938*4882a593Smuzhiyun 	if (cg_link->cgroup)
939*4882a593Smuzhiyun 		cg_id = cgroup_id(cg_link->cgroup);
940*4882a593Smuzhiyun 	mutex_unlock(&cgroup_mutex);
941*4882a593Smuzhiyun 
942*4882a593Smuzhiyun 	info->cgroup.cgroup_id = cg_id;
943*4882a593Smuzhiyun 	info->cgroup.attach_type = cg_link->type;
944*4882a593Smuzhiyun 	return 0;
945*4882a593Smuzhiyun }
946*4882a593Smuzhiyun 
/* bpf_link operations backing BPF_LINK_TYPE_CGROUP links */
static const struct bpf_link_ops bpf_cgroup_link_lops = {
	.release = bpf_cgroup_link_release,
	.dealloc = bpf_cgroup_link_dealloc,
	.detach = bpf_cgroup_link_detach,
	.update_prog = cgroup_bpf_replace,
	.show_fdinfo = bpf_cgroup_link_show_fdinfo,
	.fill_link_info = bpf_cgroup_link_fill_link_info,
};
955*4882a593Smuzhiyun 
/* Create a bpf_link attaching @prog to the cgroup identified by
 * attr->link_create.target_fd; returns a new link FD or -errno.
 */
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	struct bpf_link_primer link_primer;
	struct bpf_cgroup_link *link;
	struct cgroup *cgrp;
	int err;

	if (attr->link_create.flags)
		return -EINVAL;

	cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link) {
		err = -ENOMEM;
		goto out_put_cgroup;
	}
	bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
		      prog);
	link->cgroup = cgrp;
	link->type = attr->link_create.attach_type;

	err  = bpf_link_prime(&link->link, &link_primer);
	if (err) {
		/* not primed yet, so plain kfree() is the right teardown */
		kfree(link);
		goto out_put_cgroup;
	}

	/* link-based attachments always use BPF_F_ALLOW_MULTI mode */
	err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
				BPF_F_ALLOW_MULTI);
	if (err) {
		/* NOTE(review): bpf_link_cleanup() tears down the primed
		 * link without going through ->release (it never attached)
		 * — confirm against the bpf_link core
		 */
		bpf_link_cleanup(&link_primer);
		goto out_put_cgroup;
	}

	return bpf_link_settle(&link_primer);

out_put_cgroup:
	cgroup_put(cgrp);
	return err;
}
999*4882a593Smuzhiyun 
/* Handle BPF_PROG_QUERY for cgroup attach types: resolve the target
 * cgroup FD, delegate to cgroup_bpf_query(), and drop the reference.
 */
int cgroup_bpf_prog_query(const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	struct cgroup *cgrp = cgroup_get_from_fd(attr->query.target_fd);
	int err;

	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	err = cgroup_bpf_query(cgrp, attr, uattr);
	cgroup_put(cgrp);
	return err;
}
1015*4882a593Smuzhiyun 
1016*4882a593Smuzhiyun /**
1017*4882a593Smuzhiyun  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1018*4882a593Smuzhiyun  * @sk: The socket sending or receiving traffic
1019*4882a593Smuzhiyun  * @skb: The skb that is being sent or received
 * @type: The type of program to be executed
1021*4882a593Smuzhiyun  *
1022*4882a593Smuzhiyun  * If no socket is passed, or the socket is not of type INET or INET6,
1023*4882a593Smuzhiyun  * this function does nothing and returns 0.
1024*4882a593Smuzhiyun  *
1025*4882a593Smuzhiyun  * The program type passed in via @type must be suitable for network
1026*4882a593Smuzhiyun  * filtering. No further check is performed to assert that.
1027*4882a593Smuzhiyun  *
1028*4882a593Smuzhiyun  * For egress packets, this function can return:
1029*4882a593Smuzhiyun  *   NET_XMIT_SUCCESS    (0)	- continue with packet output
1030*4882a593Smuzhiyun  *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
1031*4882a593Smuzhiyun  *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
1032*4882a593Smuzhiyun  *				  to call cwr
1033*4882a593Smuzhiyun  *   -EPERM			- drop packet
1034*4882a593Smuzhiyun  *
1035*4882a593Smuzhiyun  * For ingress packets, this function will return -EPERM if any
1036*4882a593Smuzhiyun  * attached program was found and if it returned != 1 during execution.
1037*4882a593Smuzhiyun  * Otherwise 0 is returned.
1038*4882a593Smuzhiyun  */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
				struct sk_buff *skb,
				enum bpf_attach_type type)
{
	/* distance from the current data pointer back to the L3 header */
	unsigned int offset = skb->data - skb_network_header(skb);
	struct sock *save_sk;
	void *saved_data_end;
	struct cgroup *cgrp;
	int ret;

	if (!sk || !sk_fullsock(sk))
		return 0;

	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	/* temporarily point skb->sk at the full socket and expose the
	 * packet from its network header; both are restored below
	 */
	save_sk = skb->sk;
	skb->sk = sk;
	__skb_push(skb, offset);

	/* compute pointers for the bpf prog */
	bpf_compute_and_save_data_end(skb, &saved_data_end);

	if (type == BPF_CGROUP_INET_EGRESS) {
		/* egress preserves the richer NET_XMIT_* / -EPERM return
		 * codes instead of collapsing to 0 / -EPERM
		 */
		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
	} else {
		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
					  __bpf_prog_run_save_cb);
		ret = (ret == 1 ? 0 : -EPERM);
	}
	bpf_restore_data_end(skb, saved_data_end);
	__skb_pull(skb, offset);
	skb->sk = save_sk;

	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
1078*4882a593Smuzhiyun 
1079*4882a593Smuzhiyun /**
1080*4882a593Smuzhiyun  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
1081*4882a593Smuzhiyun  * @sk: sock structure to manipulate
 * @type: The type of program to be executed
1083*4882a593Smuzhiyun  *
1084*4882a593Smuzhiyun  * socket is passed is expected to be of type INET or INET6.
1085*4882a593Smuzhiyun  *
1086*4882a593Smuzhiyun  * The program type passed in via @type must be suitable for sock
1087*4882a593Smuzhiyun  * filtering. No further check is performed to assert that.
1088*4882a593Smuzhiyun  *
 * This function will return %-EPERM if an attached program was found
 * and if it returned != 1 during execution. In all other cases, 0 is returned.
1091*4882a593Smuzhiyun  */
__cgroup_bpf_run_filter_sk(struct sock * sk,enum bpf_attach_type type)1092*4882a593Smuzhiyun int __cgroup_bpf_run_filter_sk(struct sock *sk,
1093*4882a593Smuzhiyun 			       enum bpf_attach_type type)
1094*4882a593Smuzhiyun {
1095*4882a593Smuzhiyun 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1096*4882a593Smuzhiyun 	int ret;
1097*4882a593Smuzhiyun 
1098*4882a593Smuzhiyun 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
1099*4882a593Smuzhiyun 	return ret == 1 ? 0 : -EPERM;
1100*4882a593Smuzhiyun }
1101*4882a593Smuzhiyun EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1102*4882a593Smuzhiyun 
1103*4882a593Smuzhiyun /**
1104*4882a593Smuzhiyun  * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1105*4882a593Smuzhiyun  *                                       provided by user sockaddr
1106*4882a593Smuzhiyun  * @sk: sock struct that will use sockaddr
1107*4882a593Smuzhiyun  * @uaddr: sockaddr struct provided by user
 * @type: The type of program to be executed
1109*4882a593Smuzhiyun  * @t_ctx: Pointer to attach type specific context
1110*4882a593Smuzhiyun  *
1111*4882a593Smuzhiyun  * socket is expected to be of type INET or INET6.
1112*4882a593Smuzhiyun  *
1113*4882a593Smuzhiyun  * This function will return %-EPERM if an attached program is found and
1114*4882a593Smuzhiyun  * returned value != 1 during execution. In all other cases, 0 is returned.
1115*4882a593Smuzhiyun  */
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
				      struct sockaddr *uaddr,
				      enum bpf_attach_type type,
				      void *t_ctx)
{
	struct bpf_sock_addr_kern ctx = {
		.sk = sk,
		.uaddr = uaddr,
		.t_ctx = t_ctx,
	};
	struct sockaddr_storage unspec;
	struct cgroup *cgrp;
	int ret;

	/* Check socket family since not all sockets represent network
	 * endpoint (e.g. AF_UNIX).
	 */
	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	/* programs always get a non-NULL address: substitute a zeroed
	 * (unspecified) sockaddr when the caller passed none
	 */
	if (!ctx.uaddr) {
		memset(&unspec, 0, sizeof(unspec));
		ctx.uaddr = (struct sockaddr *)&unspec;
	}

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);

	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1147*4882a593Smuzhiyun 
1148*4882a593Smuzhiyun /**
1149*4882a593Smuzhiyun  * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1150*4882a593Smuzhiyun  * @sk: socket to get cgroup from
1151*4882a593Smuzhiyun  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1152*4882a593Smuzhiyun  * sk with connection information (IP addresses, etc.) May not contain
1153*4882a593Smuzhiyun  * cgroup info if it is a req sock.
 * @type: The type of program to be executed
1155*4882a593Smuzhiyun  *
1156*4882a593Smuzhiyun  * socket passed is expected to be of type INET or INET6.
1157*4882a593Smuzhiyun  *
1158*4882a593Smuzhiyun  * The program type passed in via @type must be suitable for sock_ops
1159*4882a593Smuzhiyun  * filtering. No further check is performed to assert that.
1160*4882a593Smuzhiyun  *
 * This function will return %-EPERM if an attached program was found
 * and if it returned != 1 during execution. In all other cases, 0 is returned.
1163*4882a593Smuzhiyun  */
__cgroup_bpf_run_filter_sock_ops(struct sock * sk,struct bpf_sock_ops_kern * sock_ops,enum bpf_attach_type type)1164*4882a593Smuzhiyun int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1165*4882a593Smuzhiyun 				     struct bpf_sock_ops_kern *sock_ops,
1166*4882a593Smuzhiyun 				     enum bpf_attach_type type)
1167*4882a593Smuzhiyun {
1168*4882a593Smuzhiyun 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1169*4882a593Smuzhiyun 	int ret;
1170*4882a593Smuzhiyun 
1171*4882a593Smuzhiyun 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
1172*4882a593Smuzhiyun 				 BPF_PROG_RUN);
1173*4882a593Smuzhiyun 	return ret == 1 ? 0 : -EPERM;
1174*4882a593Smuzhiyun }
1175*4882a593Smuzhiyun EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1176*4882a593Smuzhiyun 
/* Run the device-cgroup programs of current's default cgroup against a
 * device access request.  Returns 0 when access is allowed (all
 * programs returned 1), nonzero when it is denied.
 */
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
				      short access, enum bpf_attach_type type)
{
	struct bpf_cgroup_dev_ctx ctx = {
		/* access mode packed into the high half, device type low */
		.access_type = (access << 16) | dev_type,
		.major = major,
		.minor = minor,
	};
	struct cgroup *cgrp;
	int allow;

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
				   BPF_PROG_RUN);
	rcu_read_unlock();

	return !allow;
}
1196*4882a593Smuzhiyun 
1197*4882a593Smuzhiyun static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id,const struct bpf_prog * prog)1198*4882a593Smuzhiyun cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1199*4882a593Smuzhiyun {
1200*4882a593Smuzhiyun 	switch (func_id) {
1201*4882a593Smuzhiyun 	case BPF_FUNC_get_current_uid_gid:
1202*4882a593Smuzhiyun 		return &bpf_get_current_uid_gid_proto;
1203*4882a593Smuzhiyun 	case BPF_FUNC_get_local_storage:
1204*4882a593Smuzhiyun 		return &bpf_get_local_storage_proto;
1205*4882a593Smuzhiyun 	case BPF_FUNC_get_current_cgroup_id:
1206*4882a593Smuzhiyun 		return &bpf_get_current_cgroup_id_proto;
1207*4882a593Smuzhiyun 	case BPF_FUNC_perf_event_output:
1208*4882a593Smuzhiyun 		return &bpf_event_output_data_proto;
1209*4882a593Smuzhiyun 	default:
1210*4882a593Smuzhiyun 		return bpf_base_func_proto(func_id);
1211*4882a593Smuzhiyun 	}
1212*4882a593Smuzhiyun }
1213*4882a593Smuzhiyun 
1214*4882a593Smuzhiyun static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id,const struct bpf_prog * prog)1215*4882a593Smuzhiyun cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1216*4882a593Smuzhiyun {
1217*4882a593Smuzhiyun 	return cgroup_base_func_proto(func_id, prog);
1218*4882a593Smuzhiyun }
1219*4882a593Smuzhiyun 
/* Verifier callback: validate a load from struct bpf_cgroup_dev_ctx.
 * The context is read-only; access_type may be loaded narrowly
 * (byte/half), every other field must be read as a full 32-bit word.
 */
static bool cgroup_dev_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	/* The device context is read-only to BPF programs. */
	if (type == BPF_WRITE)
		return false;

	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
		return false;
	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;

	switch (off) {
	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
		/* Narrow (sub-word) loads are allowed for access_type. */
		bpf_ctx_record_field_size(info, size_default);
		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
			return false;
		break;
	default:
		/* All remaining fields require full 32-bit loads. */
		if (size != size_default)
			return false;
	}

	return true;
}
1249*4882a593Smuzhiyun 
/* cgroup device programs need no run-time (prog_ops) specializations. */
const struct bpf_prog_ops cg_dev_prog_ops = {
};

/* Verifier hooks for BPF_PROG_TYPE_CGROUP_DEVICE. */
const struct bpf_verifier_ops cg_dev_verifier_ops = {
	.get_func_proto		= cgroup_dev_func_proto,
	.is_valid_access	= cgroup_dev_is_valid_access,
};
1257*4882a593Smuzhiyun 
1258*4882a593Smuzhiyun /**
1259*4882a593Smuzhiyun  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
1260*4882a593Smuzhiyun  *
1261*4882a593Smuzhiyun  * @head: sysctl table header
1262*4882a593Smuzhiyun  * @table: sysctl table
1263*4882a593Smuzhiyun  * @write: sysctl is being read (= 0) or written (= 1)
1264*4882a593Smuzhiyun  * @buf: pointer to buffer (in and out)
1265*4882a593Smuzhiyun  * @pcount: value-result argument: value is size of buffer pointed to by @buf,
1266*4882a593Smuzhiyun  *	result is size of @new_buf if program set new value, initial value
1267*4882a593Smuzhiyun  *	otherwise
1268*4882a593Smuzhiyun  * @ppos: value-result argument: value is position at which read from or write
1269*4882a593Smuzhiyun  *	to sysctl is happening, result is new position if program overrode it,
1270*4882a593Smuzhiyun  *	initial value otherwise
1271*4882a593Smuzhiyun  * @type: type of program to be executed
1272*4882a593Smuzhiyun  *
1273*4882a593Smuzhiyun  * Program is run when sysctl is being accessed, either read or written, and
1274*4882a593Smuzhiyun  * can allow or deny such access.
1275*4882a593Smuzhiyun  *
1276*4882a593Smuzhiyun  * This function will return %-EPERM if an attached program is found and
1277*4882a593Smuzhiyun  * returned value != 1 during execution. In all other cases 0 is returned.
1278*4882a593Smuzhiyun  */
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
				   struct ctl_table *table, int write,
				   char **buf, size_t *pcount, loff_t *ppos,
				   enum bpf_attach_type type)
{
	struct bpf_sysctl_kern ctx = {
		.head = head,
		.table = table,
		.write = write,
		.ppos = ppos,
		.cur_val = NULL,
		.cur_len = PAGE_SIZE,
		.new_val = NULL,
		.new_len = 0,
		.new_updated = 0,
	};
	struct cgroup *cgrp;
	loff_t pos = 0;
	int ret;

	/* Snapshot the current sysctl value (read from position 0,
	 * independent of *ppos) so the program can inspect it via
	 * bpf_sysctl_get_current_value().
	 */
	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
	if (!ctx.cur_val ||
	    table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
		/* Let BPF program decide how to proceed. */
		ctx.cur_len = 0;
	}

	if (write && *buf && *pcount) {
		/* BPF program should be able to override new value with a
		 * buffer bigger than provided by user.
		 */
		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
		if (ctx.new_val) {
			memcpy(ctx.new_val, *buf, ctx.new_len);
		} else {
			/* Let BPF program decide how to proceed. */
			ctx.new_len = 0;
		}
	}

	/* Run every effective program for this attach type on the current
	 * task's default-hierarchy cgroup; ret is 1 only if all allowed.
	 */
	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
	rcu_read_unlock();

	kfree(ctx.cur_val);

	if (ret == 1 && ctx.new_updated) {
		/* Program replaced the value: hand ctx.new_val to the
		 * caller, who now owns it instead of the original *buf.
		 */
		kfree(*buf);
		*buf = ctx.new_val;
		*pcount = ctx.new_len;
	} else {
		kfree(ctx.new_val);
	}

	return ret == 1 ? 0 : -EPERM;
}
1337*4882a593Smuzhiyun 
1338*4882a593Smuzhiyun #ifdef CONFIG_NET
__cgroup_bpf_prog_array_is_empty(struct cgroup * cgrp,enum bpf_attach_type attach_type)1339*4882a593Smuzhiyun static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
1340*4882a593Smuzhiyun 					     enum bpf_attach_type attach_type)
1341*4882a593Smuzhiyun {
1342*4882a593Smuzhiyun 	struct bpf_prog_array *prog_array;
1343*4882a593Smuzhiyun 	bool empty;
1344*4882a593Smuzhiyun 
1345*4882a593Smuzhiyun 	rcu_read_lock();
1346*4882a593Smuzhiyun 	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
1347*4882a593Smuzhiyun 	empty = bpf_prog_array_is_empty(prog_array);
1348*4882a593Smuzhiyun 	rcu_read_unlock();
1349*4882a593Smuzhiyun 
1350*4882a593Smuzhiyun 	return empty;
1351*4882a593Smuzhiyun }
1352*4882a593Smuzhiyun 
/* Allocate the kernel-side option buffer exposed to BPF sockopt programs.
 *
 * Returns the (possibly clamped) usable buffer size on success, -EINVAL
 * for a negative length or -ENOMEM on allocation failure.  On success
 * ctx->optval / ctx->optval_end delimit the zeroed buffer.
 */
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
{
	if (unlikely(max_optlen < 0))
		return -EINVAL;

	if (unlikely(max_optlen > PAGE_SIZE)) {
		/* We don't expose optvals that are greater than PAGE_SIZE
		 * to the BPF program.
		 */
		max_optlen = PAGE_SIZE;
	}

	ctx->optval = kzalloc(max_optlen, GFP_USER);
	if (!ctx->optval)
		return -ENOMEM;

	ctx->optval_end = ctx->optval + max_optlen;

	return max_optlen;
}
1373*4882a593Smuzhiyun 
/* Release the option buffer allocated by sockopt_alloc_buf(). */
static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
{
	kfree(ctx->optval);
}
1378*4882a593Smuzhiyun 
/* Run all BPF_CGROUP_SETSOCKOPT programs of @sk's cgroup before the
 * kernel setsockopt handler.
 *
 * Returns 0 to continue into the kernel handler (possibly with *level,
 * *optname, *optlen and *kernel_optval rewritten by a program), 1 to
 * bypass the kernel handler entirely (program set optlen to -1), or a
 * negative error.  When *kernel_optval is set non-NULL, ownership of
 * that buffer transfers to the caller.
 */
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
				       int *optname, char __user *optval,
				       int *optlen, char **kernel_optval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = *level,
		.optname = *optname,
	};
	int ret, max_optlen;

	/* Opportunistic check to see whether we have any BPF program
	 * attached to the hook so we don't waste time allocating
	 * memory and locking the socket.
	 */
	if (!cgroup_bpf_enabled ||
	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
		return 0;

	/* Allocate a bit more than the initial user buffer for
	 * BPF program. The canonical use case is overriding
	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
	 */
	max_optlen = max_t(int, 16, *optlen);

	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
	if (max_optlen < 0)
		return max_optlen;

	ctx.optlen = *optlen;

	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
		ret = -EFAULT;
		goto out;
	}

	/* Programs inspect/rewrite ctx with the socket locked. */
	lock_sock(sk);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
				 &ctx, BPF_PROG_RUN);
	release_sock(sk);

	/* BPF_PROG_RUN_ARRAY yields 0 if any program denied the call. */
	if (!ret) {
		ret = -EPERM;
		goto out;
	}

	if (ctx.optlen == -1) {
		/* optlen set to -1, bypass kernel */
		ret = 1;
	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
		/* optlen is out of bounds */
		ret = -EFAULT;
	} else {
		/* optlen within bounds, run kernel handler */
		ret = 0;

		/* export any potential modifications */
		*level = ctx.level;
		*optname = ctx.optname;

		/* optlen == 0 from BPF indicates that we should
		 * use original userspace data.
		 */
		if (ctx.optlen != 0) {
			*optlen = ctx.optlen;
			*kernel_optval = ctx.optval;
			/* export and don't free sockopt buf */
			return 0;
		}
	}

out:
	sockopt_free_buf(&ctx);
	return ret;
}
1455*4882a593Smuzhiyun 
/* Run all BPF_CGROUP_GETSOCKOPT programs of @sk's cgroup after the
 * kernel getsockopt handler.
 *
 * @retval is the kernel handler's result.  On kernel success the
 * kernel-returned value/optlen are copied in for the programs to inspect
 * and possibly rewrite.  Programs may only change retval to 0, not to an
 * arbitrary value.  Returns the (possibly updated) retval, or a negative
 * error if a program denied the call or produced out-of-bounds output.
 */
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
				       int optname, char __user *optval,
				       int __user *optlen, int max_optlen,
				       int retval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = level,
		.optname = optname,
		.retval = retval,
	};
	int ret;

	/* Opportunistic check to see whether we have any BPF program
	 * attached to the hook so we don't waste time allocating
	 * memory and locking the socket.
	 */
	if (!cgroup_bpf_enabled ||
	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
		return retval;

	/* Default to the caller-supplied buffer size; replaced below with
	 * the kernel-returned optlen when the kernel handler succeeded.
	 */
	ctx.optlen = max_optlen;

	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
	if (max_optlen < 0)
		return max_optlen;

	if (!retval) {
		/* If kernel getsockopt finished successfully,
		 * copy whatever was returned to the user back
		 * into our temporary buffer. Set optlen to the
		 * one that kernel returned as well to let
		 * BPF programs inspect the value.
		 */

		if (get_user(ctx.optlen, optlen)) {
			ret = -EFAULT;
			goto out;
		}

		if (ctx.optlen < 0) {
			ret = -EFAULT;
			goto out;
		}

		if (copy_from_user(ctx.optval, optval,
				   min(ctx.optlen, max_optlen)) != 0) {
			ret = -EFAULT;
			goto out;
		}
	}

	lock_sock(sk);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
				 &ctx, BPF_PROG_RUN);
	release_sock(sk);

	/* BPF_PROG_RUN_ARRAY yields 0 if any program denied the call. */
	if (!ret) {
		ret = -EPERM;
		goto out;
	}

	if (ctx.optlen > max_optlen || ctx.optlen < 0) {
		ret = -EFAULT;
		goto out;
	}

	/* BPF programs only allowed to set retval to 0, not some
	 * arbitrary value.
	 */
	if (ctx.retval != 0 && ctx.retval != retval) {
		ret = -EFAULT;
		goto out;
	}

	/* optlen == 0 means leave the user buffer untouched. */
	if (ctx.optlen != 0) {
		if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
		    put_user(ctx.optlen, optlen)) {
			ret = -EFAULT;
			goto out;
		}
	}

	ret = ctx.retval;

out:
	sockopt_free_buf(&ctx);
	return ret;
}
1546*4882a593Smuzhiyun #endif
1547*4882a593Smuzhiyun 
/* Recursively copy the sysctl directory path of @dir into *bufp as
 * "dir1/dir2/.../", advancing *bufp and shrinking *lenp as it writes.
 *
 * Returns the total number of characters written (excluding the NUL) or
 * a negative strscpy() error (-E2BIG on truncation).
 */
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
			      size_t *lenp)
{
	ssize_t tmp_ret = 0, ret;

	/* Emit ancestors first so the path reads root-to-leaf. */
	if (dir->header.parent) {
		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
	if (ret < 0)
		return ret;
	*bufp += ret;
	*lenp -= ret;
	ret += tmp_ret;

	/* Avoid leading slash. */
	if (!ret)
		return ret;

	/* Separator after each emitted component. */
	tmp_ret = strscpy(*bufp, "/", *lenp);
	if (tmp_ret < 0)
		return tmp_ret;
	*bufp += tmp_ret;
	*lenp -= tmp_ret;

	return ret + tmp_ret;
}
1578*4882a593Smuzhiyun 
/* bpf_sysctl_get_name() helper: copy the sysctl's name into @buf.
 * Unless BPF_F_SYSCTL_BASE_NAME is set in @flags, the directory path is
 * prepended ("dir1/dir2/name").  Returns the number of characters copied
 * or a negative strscpy() error (-E2BIG if @buf was too small).
 */
BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len, u64, flags)
{
	ssize_t tmp_ret = 0, ret;

	if (!buf)
		return -EINVAL;

	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
		if (!ctx->head)
			return -EINVAL;
		/* sysctl_cpy_dir() advances buf/buf_len past the path. */
		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(buf, ctx->table->procname, buf_len);

	return ret < 0 ? ret : tmp_ret + ret;
}

static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
	.func		= bpf_sysctl_get_name,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};
1609*4882a593Smuzhiyun 
/* Copy a sysctl value from @src into the program-supplied buffer @dst.
 *
 * On success the remainder of @dst is zeroed and the source length is
 * returned.  Returns -EINVAL when there is no destination, or no source
 * value (then @dst is fully zeroed), and -E2BIG when @dst is too small;
 * in that case @dst holds a truncated, NUL-terminated copy.
 */
static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
			     size_t src_len)
{
	size_t ncopy;

	if (!dst)
		return -EINVAL;

	if (!dst_len)
		return -E2BIG;

	if (!src || !src_len) {
		memset(dst, 0, dst_len);
		return -EINVAL;
	}

	ncopy = dst_len < src_len ? dst_len : src_len;
	memcpy(dst, src, ncopy);

	if (src_len >= dst_len) {
		/* Truncated: keep the result NUL-terminated. */
		dst[dst_len - 1] = '\0';
		return -E2BIG;
	}

	/* Room to spare: zero the tail and report the value's length. */
	memset(dst + src_len, '\0', dst_len - src_len);
	return src_len;
}
1635*4882a593Smuzhiyun 
/* bpf_sysctl_get_current_value() helper: copy the sysctl's current value
 * (snapshotted into ctx->cur_val by __cgroup_bpf_run_filter_sysctl())
 * into @buf.  See copy_sysctl_value() for the return convention.
 */
BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
	   char *, buf, size_t, buf_len)
{
	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
}

static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
	.func		= bpf_sysctl_get_current_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
1650*4882a593Smuzhiyun 
/* bpf_sysctl_get_new_value() helper: copy the value being written to the
 * sysctl into @buf.  Only meaningful for writes; on a read the buffer is
 * zeroed and -EINVAL returned.  Otherwise see copy_sysctl_value() for
 * the return convention.
 */
BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len)
{
	if (!ctx->write) {
		if (buf && buf_len)
			memset(buf, '\0', buf_len);
		return -EINVAL;
	}
	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
}

static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
	.func		= bpf_sysctl_get_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
1670*4882a593Smuzhiyun 
/* bpf_sysctl_set_new_value() helper: override the value being written to
 * the sysctl with @buf.  Requires a write access with a staging buffer
 * (ctx->new_val); @buf_len is limited to PAGE_SIZE - 1.  Returns 0 on
 * success, -EINVAL or -E2BIG otherwise.
 */
BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
	   const char *, buf, size_t, buf_len)
{
	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
		return -EINVAL;

	if (buf_len > PAGE_SIZE - 1)
		return -E2BIG;

	memcpy(ctx->new_val, buf, buf_len);
	ctx->new_len = buf_len;
	/* Tells __cgroup_bpf_run_filter_sysctl() to export new_val. */
	ctx->new_updated = 1;

	return 0;
}

static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
	.func		= bpf_sysctl_set_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
1695*4882a593Smuzhiyun 
/* Resolve helper protos for BPF_PROG_TYPE_CGROUP_SYSCTL programs:
 * numeric string parsing plus the sysctl name/value accessors, falling
 * back to the common cgroup helper set.
 */
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_strtol:
		return &bpf_strtol_proto;
	case BPF_FUNC_strtoul:
		return &bpf_strtoul_proto;
	case BPF_FUNC_sysctl_get_name:
		return &bpf_sysctl_get_name_proto;
	case BPF_FUNC_sysctl_get_current_value:
		return &bpf_sysctl_get_current_value_proto;
	case BPF_FUNC_sysctl_get_new_value:
		return &bpf_sysctl_get_new_value_proto;
	case BPF_FUNC_sysctl_set_new_value:
		return &bpf_sysctl_set_new_value_proto;
	default:
		return cgroup_base_func_proto(func_id, prog);
	}
}
1716*4882a593Smuzhiyun 
/* Verifier callback: validate access to struct bpf_sysctl.  'write' is
 * read-only (narrow loads allowed); 'file_pos' allows narrow reads but
 * only full 32-bit writes.  All other offsets are rejected.
 */
static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
				   const struct bpf_prog *prog,
				   struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
		return false;

	switch (off) {
	case bpf_ctx_range(struct bpf_sysctl, write):
		if (type != BPF_READ)
			return false;
		bpf_ctx_record_field_size(info, size_default);
		return bpf_ctx_narrow_access_ok(off, size, size_default);
	case bpf_ctx_range(struct bpf_sysctl, file_pos):
		if (type == BPF_READ) {
			bpf_ctx_record_field_size(info, size_default);
			return bpf_ctx_narrow_access_ok(off, size, size_default);
		} else {
			return size == size_default;
		}
	default:
		return false;
	}
}
1743*4882a593Smuzhiyun 
/* Translate struct bpf_sysctl field accesses into loads/stores on the
 * kernel-side struct bpf_sysctl_kern backing context.  Returns the
 * number of instructions emitted into @insn_buf.
 */
static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	u32 read_size;

	switch (si->off) {
	case offsetof(struct bpf_sysctl, write):
		/* Direct load of the 'write' flag. */
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct bpf_sysctl_kern, write,
				       sizeof_field(struct bpf_sysctl_kern,
						    write),
				       target_size));
		break;
	case offsetof(struct bpf_sysctl, file_pos):
		/* ppos is a pointer so it should be accessed via indirect
		 * loads and stores. Also for stores additional temporary
		 * register is used since neither src_reg nor dst_reg can be
		 * overridden.
		 */
		if (type == BPF_WRITE) {
			int treg = BPF_REG_9;

			/* Pick a scratch register distinct from both
			 * src_reg and dst_reg; spill/restore it around
			 * the store through *ppos.
			 */
			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			*insn++ = BPF_STX_MEM(
				BPF_DW, si->dst_reg, treg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			*insn++ = BPF_STX_MEM(
				BPF_SIZEOF(u32), treg, si->src_reg,
				bpf_ctx_narrow_access_offset(
					0, sizeof(u32), sizeof(loff_t)));
			*insn++ = BPF_LDX_MEM(
				BPF_DW, treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
		} else {
			/* Read: load ppos, then load through it. */
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				si->dst_reg, si->src_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
			*insn++ = BPF_LDX_MEM(
				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
				bpf_ctx_narrow_access_offset(
					0, read_size, sizeof(loff_t)));
		}
		*target_size = sizeof(u32);
		break;
	}

	return insn - insn_buf;
}
1805*4882a593Smuzhiyun 
/* Verifier hooks for BPF_PROG_TYPE_CGROUP_SYSCTL. */
const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
	.get_func_proto		= sysctl_func_proto,
	.is_valid_access	= sysctl_is_valid_access,
	.convert_ctx_access	= sysctl_convert_ctx_access,
};

/* cgroup sysctl programs need no run-time (prog_ops) specializations. */
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
1814*4882a593Smuzhiyun 
/* Resolve helper protos for BPF_PROG_TYPE_CGROUP_SOCKOPT programs:
 * socket-storage and tcp_sock accessors (config permitting), falling
 * back to the common cgroup helper set.
 */
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
#ifdef CONFIG_NET
	case BPF_FUNC_sk_storage_get:
		return &bpf_sk_storage_get_proto;
	case BPF_FUNC_sk_storage_delete:
		return &bpf_sk_storage_delete_proto;
#endif
#ifdef CONFIG_INET
	case BPF_FUNC_tcp_sock:
		return &bpf_tcp_sock_proto;
#endif
	default:
		return cgroup_base_func_proto(func_id, prog);
	}
}
1833*4882a593Smuzhiyun 
/* Verifier callback: validate access to struct bpf_sockopt.
 * Writability of retval vs. level/optname depends on the program's
 * expected attach type (get- vs. setsockopt); optval/optval_end are
 * exposed as a packet-pointer pair for bounds-checked direct access.
 */
static bool cg_sockopt_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off >= sizeof(struct bpf_sockopt))
		return false;

	if (off % size != 0)
		return false;

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct bpf_sockopt, retval):
			/* Only getsockopt programs may write retval. */
			if (size != size_default)
				return false;
			return prog->expected_attach_type ==
				BPF_CGROUP_GETSOCKOPT;
		case offsetof(struct bpf_sockopt, optname):
			fallthrough;
		case offsetof(struct bpf_sockopt, level):
			/* Only setsockopt programs may rewrite these. */
			if (size != size_default)
				return false;
			return prog->expected_attach_type ==
				BPF_CGROUP_SETSOCKOPT;
		case offsetof(struct bpf_sockopt, optlen):
			return size == size_default;
		default:
			return false;
		}
	}

	switch (off) {
	case offsetof(struct bpf_sockopt, sk):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_SOCKET;
		break;
	case offsetof(struct bpf_sockopt, optval):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct bpf_sockopt, optval_end):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_PACKET_END;
		break;
	case offsetof(struct bpf_sockopt, retval):
		/* retval is readable only by getsockopt programs. */
		if (size != size_default)
			return false;
		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
	default:
		if (size != size_default)
			return false;
		break;
	}
	return true;
}
1895*4882a593Smuzhiyun 
1896*4882a593Smuzhiyun #define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
1897*4882a593Smuzhiyun 	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
1898*4882a593Smuzhiyun 	  si->dst_reg, si->src_reg,					\
1899*4882a593Smuzhiyun 	  offsetof(struct bpf_sockopt_kern, F))
1900*4882a593Smuzhiyun 
cg_sockopt_convert_ctx_access(enum bpf_access_type type,const struct bpf_insn * si,struct bpf_insn * insn_buf,struct bpf_prog * prog,u32 * target_size)1901*4882a593Smuzhiyun static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
1902*4882a593Smuzhiyun 					 const struct bpf_insn *si,
1903*4882a593Smuzhiyun 					 struct bpf_insn *insn_buf,
1904*4882a593Smuzhiyun 					 struct bpf_prog *prog,
1905*4882a593Smuzhiyun 					 u32 *target_size)
1906*4882a593Smuzhiyun {
1907*4882a593Smuzhiyun 	struct bpf_insn *insn = insn_buf;
1908*4882a593Smuzhiyun 
1909*4882a593Smuzhiyun 	switch (si->off) {
1910*4882a593Smuzhiyun 	case offsetof(struct bpf_sockopt, sk):
1911*4882a593Smuzhiyun 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
1912*4882a593Smuzhiyun 		break;
1913*4882a593Smuzhiyun 	case offsetof(struct bpf_sockopt, level):
1914*4882a593Smuzhiyun 		if (type == BPF_WRITE)
1915*4882a593Smuzhiyun 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
1916*4882a593Smuzhiyun 		else
1917*4882a593Smuzhiyun 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
1918*4882a593Smuzhiyun 		break;
1919*4882a593Smuzhiyun 	case offsetof(struct bpf_sockopt, optname):
1920*4882a593Smuzhiyun 		if (type == BPF_WRITE)
1921*4882a593Smuzhiyun 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
1922*4882a593Smuzhiyun 		else
1923*4882a593Smuzhiyun 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
1924*4882a593Smuzhiyun 		break;
1925*4882a593Smuzhiyun 	case offsetof(struct bpf_sockopt, optlen):
1926*4882a593Smuzhiyun 		if (type == BPF_WRITE)
1927*4882a593Smuzhiyun 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
1928*4882a593Smuzhiyun 		else
1929*4882a593Smuzhiyun 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
1930*4882a593Smuzhiyun 		break;
1931*4882a593Smuzhiyun 	case offsetof(struct bpf_sockopt, retval):
1932*4882a593Smuzhiyun 		if (type == BPF_WRITE)
1933*4882a593Smuzhiyun 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
1934*4882a593Smuzhiyun 		else
1935*4882a593Smuzhiyun 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
1936*4882a593Smuzhiyun 		break;
1937*4882a593Smuzhiyun 	case offsetof(struct bpf_sockopt, optval):
1938*4882a593Smuzhiyun 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
1939*4882a593Smuzhiyun 		break;
1940*4882a593Smuzhiyun 	case offsetof(struct bpf_sockopt, optval_end):
1941*4882a593Smuzhiyun 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
1942*4882a593Smuzhiyun 		break;
1943*4882a593Smuzhiyun 	}
1944*4882a593Smuzhiyun 
1945*4882a593Smuzhiyun 	return insn - insn_buf;
1946*4882a593Smuzhiyun }
1947*4882a593Smuzhiyun 
cg_sockopt_get_prologue(struct bpf_insn * insn_buf,bool direct_write,const struct bpf_prog * prog)1948*4882a593Smuzhiyun static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
1949*4882a593Smuzhiyun 				   bool direct_write,
1950*4882a593Smuzhiyun 				   const struct bpf_prog *prog)
1951*4882a593Smuzhiyun {
1952*4882a593Smuzhiyun 	/* Nothing to do for sockopt argument. The data is kzalloc'ated.
1953*4882a593Smuzhiyun 	 */
1954*4882a593Smuzhiyun 	return 0;
1955*4882a593Smuzhiyun }
1956*4882a593Smuzhiyun 
1957*4882a593Smuzhiyun const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
1958*4882a593Smuzhiyun 	.get_func_proto		= cg_sockopt_func_proto,
1959*4882a593Smuzhiyun 	.is_valid_access	= cg_sockopt_is_valid_access,
1960*4882a593Smuzhiyun 	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
1961*4882a593Smuzhiyun 	.gen_prologue		= cg_sockopt_get_prologue,
1962*4882a593Smuzhiyun };
1963*4882a593Smuzhiyun 
1964*4882a593Smuzhiyun const struct bpf_prog_ops cg_sockopt_prog_ops = {
1965*4882a593Smuzhiyun };
1966