xref: /OK3568_Linux_fs/kernel/net/sched/sch_fq.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-or-later
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
4*4882a593Smuzhiyun  *
5*4882a593Smuzhiyun  *  Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
6*4882a593Smuzhiyun  *
7*4882a593Smuzhiyun  *  Meant to be mostly used for locally generated traffic :
8*4882a593Smuzhiyun  *  Fast classification depends on skb->sk being set before reaching us.
9*4882a593Smuzhiyun  *  If not (router workload), we use the rxhash as fallback, with a 32-bit wide hash.
10*4882a593Smuzhiyun  *  All packets belonging to a socket are considered as a 'flow'.
11*4882a593Smuzhiyun  *
12*4882a593Smuzhiyun  *  Flows are dynamically allocated and stored in a hash table of RB trees
13*4882a593Smuzhiyun  *  They are also part of one of two Round Robin 'queues' (new or old flows)
14*4882a593Smuzhiyun  *
15*4882a593Smuzhiyun  *  Burst avoidance (aka pacing) capability :
16*4882a593Smuzhiyun  *
17*4882a593Smuzhiyun  *  Transport (eg TCP) can set a rate in sk->sk_pacing_rate, enqueue a
18*4882a593Smuzhiyun  *  bunch of packets, and this packet scheduler adds delay between
19*4882a593Smuzhiyun  *  packets to respect rate limitation.
20*4882a593Smuzhiyun  *
21*4882a593Smuzhiyun  *  enqueue() :
22*4882a593Smuzhiyun  *   - lookup one RB tree (out of 1024 or more) to find the flow.
23*4882a593Smuzhiyun  *     If the flow does not exist, create it and add it to the tree.
24*4882a593Smuzhiyun  *     Add skb to the per flow list of skb (fifo).
25*4882a593Smuzhiyun  *   - Use a special fifo for high prio packets
26*4882a593Smuzhiyun  *
27*4882a593Smuzhiyun  *  dequeue() : serves flows in Round Robin
28*4882a593Smuzhiyun  *  Note : When a flow becomes empty, we do not immediately remove it from
29*4882a593Smuzhiyun  *  rb trees, for performance reasons (it's expected to send additional packets,
30*4882a593Smuzhiyun  *  or the SLAB cache will reuse the socket for another flow)
31*4882a593Smuzhiyun  */
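A minimal user-space sketch of the pacing knob described above (illustrative only, not part of this file; the helper name is made up): a sender can cap its own pacing rate with the SO_MAX_PACING_RATE socket option, expressed in bytes per second, and fq then spaces that socket's packets so the rate is not exceeded.

#include <sys/socket.h>

#ifndef SO_MAX_PACING_RATE
#define SO_MAX_PACING_RATE 47	/* value from asm-generic/socket.h on most arches */
#endif

/* Illustrative helper: cap this socket's pacing rate to 100 Mbit/s.
 * sch_fq reads sk->sk_pacing_rate at dequeue time and delays packets
 * so the flow stays below this rate (pacing is enabled by default).
 */
static int cap_pacing_rate(int fd)
{
	unsigned int rate = 100 * 1000 * 1000 / 8;	/* bytes per second */

	return setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
			  &rate, sizeof(rate));
}

TCP computes sk->sk_pacing_rate on its own, so most locally generated flows are paced even without this option.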
32*4882a593Smuzhiyun 
33*4882a593Smuzhiyun #include <linux/module.h>
34*4882a593Smuzhiyun #include <linux/types.h>
35*4882a593Smuzhiyun #include <linux/kernel.h>
36*4882a593Smuzhiyun #include <linux/jiffies.h>
37*4882a593Smuzhiyun #include <linux/string.h>
38*4882a593Smuzhiyun #include <linux/in.h>
39*4882a593Smuzhiyun #include <linux/errno.h>
40*4882a593Smuzhiyun #include <linux/init.h>
41*4882a593Smuzhiyun #include <linux/skbuff.h>
42*4882a593Smuzhiyun #include <linux/slab.h>
43*4882a593Smuzhiyun #include <linux/rbtree.h>
44*4882a593Smuzhiyun #include <linux/hash.h>
45*4882a593Smuzhiyun #include <linux/prefetch.h>
46*4882a593Smuzhiyun #include <linux/vmalloc.h>
47*4882a593Smuzhiyun #include <net/netlink.h>
48*4882a593Smuzhiyun #include <net/pkt_sched.h>
49*4882a593Smuzhiyun #include <net/sock.h>
50*4882a593Smuzhiyun #include <net/tcp_states.h>
51*4882a593Smuzhiyun #include <net/tcp.h>
52*4882a593Smuzhiyun 
53*4882a593Smuzhiyun struct fq_skb_cb {
54*4882a593Smuzhiyun 	u64	        time_to_send;
55*4882a593Smuzhiyun };
56*4882a593Smuzhiyun 
57*4882a593Smuzhiyun static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
58*4882a593Smuzhiyun {
59*4882a593Smuzhiyun 	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
60*4882a593Smuzhiyun 	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
61*4882a593Smuzhiyun }
62*4882a593Smuzhiyun 
63*4882a593Smuzhiyun /*
64*4882a593Smuzhiyun  * Per flow structure, dynamically allocated.
65*4882a593Smuzhiyun  * If packets have monotonically increasing time_to_send, they are placed in O(1)
66*4882a593Smuzhiyun  * in a linear list (head,tail), otherwise they are placed in an rbtree (t_root).
67*4882a593Smuzhiyun  */
68*4882a593Smuzhiyun struct fq_flow {
69*4882a593Smuzhiyun /* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
70*4882a593Smuzhiyun 	struct rb_root	t_root;
71*4882a593Smuzhiyun 	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
72*4882a593Smuzhiyun 	union {
73*4882a593Smuzhiyun 		struct sk_buff *tail;	/* last skb in the list */
74*4882a593Smuzhiyun 		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
75*4882a593Smuzhiyun 	};
76*4882a593Smuzhiyun 	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
77*4882a593Smuzhiyun 	struct sock	*sk;
78*4882a593Smuzhiyun 	u32		socket_hash;	/* sk_hash */
79*4882a593Smuzhiyun 	int		qlen;		/* number of packets in flow queue */
80*4882a593Smuzhiyun 
81*4882a593Smuzhiyun /* Second cache line, used in fq_dequeue() */
82*4882a593Smuzhiyun 	int		credit;
83*4882a593Smuzhiyun 	/* 32bit hole on 64bit arches */
84*4882a593Smuzhiyun 
85*4882a593Smuzhiyun 	struct fq_flow *next;		/* next pointer in RR lists */
86*4882a593Smuzhiyun 
87*4882a593Smuzhiyun 	struct rb_node  rate_node;	/* anchor in q->delayed tree */
88*4882a593Smuzhiyun 	u64		time_next_packet;
89*4882a593Smuzhiyun } ____cacheline_aligned_in_smp;
90*4882a593Smuzhiyun 
91*4882a593Smuzhiyun struct fq_flow_head {
92*4882a593Smuzhiyun 	struct fq_flow *first;
93*4882a593Smuzhiyun 	struct fq_flow *last;
94*4882a593Smuzhiyun };
95*4882a593Smuzhiyun 
96*4882a593Smuzhiyun struct fq_sched_data {
97*4882a593Smuzhiyun 	struct fq_flow_head new_flows;
98*4882a593Smuzhiyun 
99*4882a593Smuzhiyun 	struct fq_flow_head old_flows;
100*4882a593Smuzhiyun 
101*4882a593Smuzhiyun 	struct rb_root	delayed;	/* for rate limited flows */
102*4882a593Smuzhiyun 	u64		time_next_delayed_flow;
103*4882a593Smuzhiyun 	u64		ktime_cache;	/* copy of last ktime_get_ns() */
104*4882a593Smuzhiyun 	unsigned long	unthrottle_latency_ns;
105*4882a593Smuzhiyun 
106*4882a593Smuzhiyun 	struct fq_flow	internal;	/* for non classified or high prio packets */
107*4882a593Smuzhiyun 	u32		quantum;
108*4882a593Smuzhiyun 	u32		initial_quantum;
109*4882a593Smuzhiyun 	u32		flow_refill_delay;
110*4882a593Smuzhiyun 	u32		flow_plimit;	/* max packets per flow */
111*4882a593Smuzhiyun 	unsigned long	flow_max_rate;	/* optional max rate per flow */
112*4882a593Smuzhiyun 	u64		ce_threshold;
113*4882a593Smuzhiyun 	u64		horizon;	/* horizon in ns */
114*4882a593Smuzhiyun 	u32		orphan_mask;	/* mask for orphaned skb */
115*4882a593Smuzhiyun 	u32		low_rate_threshold;
116*4882a593Smuzhiyun 	struct rb_root	*fq_root;
117*4882a593Smuzhiyun 	u8		rate_enable;
118*4882a593Smuzhiyun 	u8		fq_trees_log;
119*4882a593Smuzhiyun 	u8		horizon_drop;
120*4882a593Smuzhiyun 	u32		flows;
121*4882a593Smuzhiyun 	u32		inactive_flows;
122*4882a593Smuzhiyun 	u32		throttled_flows;
123*4882a593Smuzhiyun 
124*4882a593Smuzhiyun 	u64		stat_gc_flows;
125*4882a593Smuzhiyun 	u64		stat_internal_packets;
126*4882a593Smuzhiyun 	u64		stat_throttled;
127*4882a593Smuzhiyun 	u64		stat_ce_mark;
128*4882a593Smuzhiyun 	u64		stat_horizon_drops;
129*4882a593Smuzhiyun 	u64		stat_horizon_caps;
130*4882a593Smuzhiyun 	u64		stat_flows_plimit;
131*4882a593Smuzhiyun 	u64		stat_pkts_too_long;
132*4882a593Smuzhiyun 	u64		stat_allocation_errors;
133*4882a593Smuzhiyun 
134*4882a593Smuzhiyun 	u32		timer_slack; /* hrtimer slack in ns */
135*4882a593Smuzhiyun 	struct qdisc_watchdog watchdog;
136*4882a593Smuzhiyun };
137*4882a593Smuzhiyun 
138*4882a593Smuzhiyun /*
139*4882a593Smuzhiyun  * f->tail and f->age share the same location.
140*4882a593Smuzhiyun  * We can use the low order bit to differentiate whether this location points
141*4882a593Smuzhiyun  * to a sk_buff or contains a jiffies value, provided we force that value to be odd.
142*4882a593Smuzhiyun  * This assumes f->tail's low order bit is 0, since alignof(struct sk_buff) >= 2
143*4882a593Smuzhiyun  */
144*4882a593Smuzhiyun static void fq_flow_set_detached(struct fq_flow *f)
145*4882a593Smuzhiyun {
146*4882a593Smuzhiyun 	f->age = jiffies | 1UL;
147*4882a593Smuzhiyun }
148*4882a593Smuzhiyun 
149*4882a593Smuzhiyun static bool fq_flow_is_detached(const struct fq_flow *f)
150*4882a593Smuzhiyun {
151*4882a593Smuzhiyun 	return !!(f->age & 1UL);
152*4882a593Smuzhiyun }
153*4882a593Smuzhiyun 
154*4882a593Smuzhiyun /* special value to mark a throttled flow (not on old/new list) */
155*4882a593Smuzhiyun static struct fq_flow throttled;
156*4882a593Smuzhiyun 
157*4882a593Smuzhiyun static bool fq_flow_is_throttled(const struct fq_flow *f)
158*4882a593Smuzhiyun {
159*4882a593Smuzhiyun 	return f->next == &throttled;
160*4882a593Smuzhiyun }
161*4882a593Smuzhiyun 
162*4882a593Smuzhiyun static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
163*4882a593Smuzhiyun {
164*4882a593Smuzhiyun 	if (head->first)
165*4882a593Smuzhiyun 		head->last->next = flow;
166*4882a593Smuzhiyun 	else
167*4882a593Smuzhiyun 		head->first = flow;
168*4882a593Smuzhiyun 	head->last = flow;
169*4882a593Smuzhiyun 	flow->next = NULL;
170*4882a593Smuzhiyun }
171*4882a593Smuzhiyun 
172*4882a593Smuzhiyun static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
173*4882a593Smuzhiyun {
174*4882a593Smuzhiyun 	rb_erase(&f->rate_node, &q->delayed);
175*4882a593Smuzhiyun 	q->throttled_flows--;
176*4882a593Smuzhiyun 	fq_flow_add_tail(&q->old_flows, f);
177*4882a593Smuzhiyun }
178*4882a593Smuzhiyun 
179*4882a593Smuzhiyun static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
180*4882a593Smuzhiyun {
181*4882a593Smuzhiyun 	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
182*4882a593Smuzhiyun 
183*4882a593Smuzhiyun 	while (*p) {
184*4882a593Smuzhiyun 		struct fq_flow *aux;
185*4882a593Smuzhiyun 
186*4882a593Smuzhiyun 		parent = *p;
187*4882a593Smuzhiyun 		aux = rb_entry(parent, struct fq_flow, rate_node);
188*4882a593Smuzhiyun 		if (f->time_next_packet >= aux->time_next_packet)
189*4882a593Smuzhiyun 			p = &parent->rb_right;
190*4882a593Smuzhiyun 		else
191*4882a593Smuzhiyun 			p = &parent->rb_left;
192*4882a593Smuzhiyun 	}
193*4882a593Smuzhiyun 	rb_link_node(&f->rate_node, parent, p);
194*4882a593Smuzhiyun 	rb_insert_color(&f->rate_node, &q->delayed);
195*4882a593Smuzhiyun 	q->throttled_flows++;
196*4882a593Smuzhiyun 	q->stat_throttled++;
197*4882a593Smuzhiyun 
198*4882a593Smuzhiyun 	f->next = &throttled;
199*4882a593Smuzhiyun 	if (q->time_next_delayed_flow > f->time_next_packet)
200*4882a593Smuzhiyun 		q->time_next_delayed_flow = f->time_next_packet;
201*4882a593Smuzhiyun }
202*4882a593Smuzhiyun 
203*4882a593Smuzhiyun 
204*4882a593Smuzhiyun static struct kmem_cache *fq_flow_cachep __read_mostly;
205*4882a593Smuzhiyun 
206*4882a593Smuzhiyun 
207*4882a593Smuzhiyun /* limit number of collected flows per round */
208*4882a593Smuzhiyun #define FQ_GC_MAX 8
209*4882a593Smuzhiyun #define FQ_GC_AGE (3*HZ)
210*4882a593Smuzhiyun 
211*4882a593Smuzhiyun static bool fq_gc_candidate(const struct fq_flow *f)
212*4882a593Smuzhiyun {
213*4882a593Smuzhiyun 	return fq_flow_is_detached(f) &&
214*4882a593Smuzhiyun 	       time_after(jiffies, f->age + FQ_GC_AGE);
215*4882a593Smuzhiyun }
216*4882a593Smuzhiyun 
217*4882a593Smuzhiyun static void fq_gc(struct fq_sched_data *q,
218*4882a593Smuzhiyun 		  struct rb_root *root,
219*4882a593Smuzhiyun 		  struct sock *sk)
220*4882a593Smuzhiyun {
221*4882a593Smuzhiyun 	struct rb_node **p, *parent;
222*4882a593Smuzhiyun 	void *tofree[FQ_GC_MAX];
223*4882a593Smuzhiyun 	struct fq_flow *f;
224*4882a593Smuzhiyun 	int i, fcnt = 0;
225*4882a593Smuzhiyun 
226*4882a593Smuzhiyun 	p = &root->rb_node;
227*4882a593Smuzhiyun 	parent = NULL;
228*4882a593Smuzhiyun 	while (*p) {
229*4882a593Smuzhiyun 		parent = *p;
230*4882a593Smuzhiyun 
231*4882a593Smuzhiyun 		f = rb_entry(parent, struct fq_flow, fq_node);
232*4882a593Smuzhiyun 		if (f->sk == sk)
233*4882a593Smuzhiyun 			break;
234*4882a593Smuzhiyun 
235*4882a593Smuzhiyun 		if (fq_gc_candidate(f)) {
236*4882a593Smuzhiyun 			tofree[fcnt++] = f;
237*4882a593Smuzhiyun 			if (fcnt == FQ_GC_MAX)
238*4882a593Smuzhiyun 				break;
239*4882a593Smuzhiyun 		}
240*4882a593Smuzhiyun 
241*4882a593Smuzhiyun 		if (f->sk > sk)
242*4882a593Smuzhiyun 			p = &parent->rb_right;
243*4882a593Smuzhiyun 		else
244*4882a593Smuzhiyun 			p = &parent->rb_left;
245*4882a593Smuzhiyun 	}
246*4882a593Smuzhiyun 
247*4882a593Smuzhiyun 	if (!fcnt)
248*4882a593Smuzhiyun 		return;
249*4882a593Smuzhiyun 
250*4882a593Smuzhiyun 	for (i = fcnt; i > 0; ) {
251*4882a593Smuzhiyun 		f = tofree[--i];
252*4882a593Smuzhiyun 		rb_erase(&f->fq_node, root);
253*4882a593Smuzhiyun 	}
254*4882a593Smuzhiyun 	q->flows -= fcnt;
255*4882a593Smuzhiyun 	q->inactive_flows -= fcnt;
256*4882a593Smuzhiyun 	q->stat_gc_flows += fcnt;
257*4882a593Smuzhiyun 
258*4882a593Smuzhiyun 	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
259*4882a593Smuzhiyun }
260*4882a593Smuzhiyun 
261*4882a593Smuzhiyun static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
262*4882a593Smuzhiyun {
263*4882a593Smuzhiyun 	struct rb_node **p, *parent;
264*4882a593Smuzhiyun 	struct sock *sk = skb->sk;
265*4882a593Smuzhiyun 	struct rb_root *root;
266*4882a593Smuzhiyun 	struct fq_flow *f;
267*4882a593Smuzhiyun 
268*4882a593Smuzhiyun 	/* warning: no starvation prevention... */
269*4882a593Smuzhiyun 	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
270*4882a593Smuzhiyun 		return &q->internal;
271*4882a593Smuzhiyun 
272*4882a593Smuzhiyun 	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
273*4882a593Smuzhiyun 	 * or a listener (SYNCOOKIE mode)
274*4882a593Smuzhiyun 	 * 1) request sockets are not full blown,
275*4882a593Smuzhiyun 	 *    they do not contain sk_pacing_rate
276*4882a593Smuzhiyun 	 * 2) They are not part of a 'flow' yet
277*4882a593Smuzhiyun 	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
278*4882a593Smuzhiyun 	 *    especially if the listener set SO_MAX_PACING_RATE
279*4882a593Smuzhiyun 	 * 4) We pretend they are orphaned
280*4882a593Smuzhiyun 	 */
281*4882a593Smuzhiyun 	if (!sk || sk_listener(sk)) {
282*4882a593Smuzhiyun 		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
283*4882a593Smuzhiyun 
284*4882a593Smuzhiyun 		/* By forcing low order bit to 1, we make sure to not
285*4882a593Smuzhiyun 		 * collide with a local flow (socket pointers are word aligned)
286*4882a593Smuzhiyun 		 */
287*4882a593Smuzhiyun 		sk = (struct sock *)((hash << 1) | 1UL);
288*4882a593Smuzhiyun 		skb_orphan(skb);
289*4882a593Smuzhiyun 	} else if (sk->sk_state == TCP_CLOSE) {
290*4882a593Smuzhiyun 		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
291*4882a593Smuzhiyun 		/*
292*4882a593Smuzhiyun 		 * Sockets in TCP_CLOSE are not connected.
293*4882a593Smuzhiyun 		 * Typical use case is UDP sockets, they can send packets
294*4882a593Smuzhiyun 		 * with sendto() to many different destinations.
295*4882a593Smuzhiyun 		 * We probably could use a generic bit advertising
296*4882a593Smuzhiyun 		 * non connected sockets, instead of sk_state == TCP_CLOSE,
297*4882a593Smuzhiyun 		 * if we care enough.
298*4882a593Smuzhiyun 		 */
299*4882a593Smuzhiyun 		sk = (struct sock *)((hash << 1) | 1UL);
300*4882a593Smuzhiyun 	}
301*4882a593Smuzhiyun 
302*4882a593Smuzhiyun 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
303*4882a593Smuzhiyun 
304*4882a593Smuzhiyun 	if (q->flows >= (2U << q->fq_trees_log) &&
305*4882a593Smuzhiyun 	    q->inactive_flows > q->flows/2)
306*4882a593Smuzhiyun 		fq_gc(q, root, sk);
307*4882a593Smuzhiyun 
308*4882a593Smuzhiyun 	p = &root->rb_node;
309*4882a593Smuzhiyun 	parent = NULL;
310*4882a593Smuzhiyun 	while (*p) {
311*4882a593Smuzhiyun 		parent = *p;
312*4882a593Smuzhiyun 
313*4882a593Smuzhiyun 		f = rb_entry(parent, struct fq_flow, fq_node);
314*4882a593Smuzhiyun 		if (f->sk == sk) {
315*4882a593Smuzhiyun 			/* socket might have been reallocated, so check
316*4882a593Smuzhiyun 			 * if its sk_hash is the same.
317*4882a593Smuzhiyun 			 * If not, we need to refill credit with
318*4882a593Smuzhiyun 			 * initial quantum
319*4882a593Smuzhiyun 			 */
320*4882a593Smuzhiyun 			if (unlikely(skb->sk == sk &&
321*4882a593Smuzhiyun 				     f->socket_hash != sk->sk_hash)) {
322*4882a593Smuzhiyun 				f->credit = q->initial_quantum;
323*4882a593Smuzhiyun 				f->socket_hash = sk->sk_hash;
324*4882a593Smuzhiyun 				if (q->rate_enable)
325*4882a593Smuzhiyun 					smp_store_release(&sk->sk_pacing_status,
326*4882a593Smuzhiyun 							  SK_PACING_FQ);
327*4882a593Smuzhiyun 				if (fq_flow_is_throttled(f))
328*4882a593Smuzhiyun 					fq_flow_unset_throttled(q, f);
329*4882a593Smuzhiyun 				f->time_next_packet = 0ULL;
330*4882a593Smuzhiyun 			}
331*4882a593Smuzhiyun 			return f;
332*4882a593Smuzhiyun 		}
333*4882a593Smuzhiyun 		if (f->sk > sk)
334*4882a593Smuzhiyun 			p = &parent->rb_right;
335*4882a593Smuzhiyun 		else
336*4882a593Smuzhiyun 			p = &parent->rb_left;
337*4882a593Smuzhiyun 	}
338*4882a593Smuzhiyun 
339*4882a593Smuzhiyun 	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
340*4882a593Smuzhiyun 	if (unlikely(!f)) {
341*4882a593Smuzhiyun 		q->stat_allocation_errors++;
342*4882a593Smuzhiyun 		return &q->internal;
343*4882a593Smuzhiyun 	}
344*4882a593Smuzhiyun 	/* f->t_root is already zeroed after kmem_cache_zalloc() */
345*4882a593Smuzhiyun 
346*4882a593Smuzhiyun 	fq_flow_set_detached(f);
347*4882a593Smuzhiyun 	f->sk = sk;
348*4882a593Smuzhiyun 	if (skb->sk == sk) {
349*4882a593Smuzhiyun 		f->socket_hash = sk->sk_hash;
350*4882a593Smuzhiyun 		if (q->rate_enable)
351*4882a593Smuzhiyun 			smp_store_release(&sk->sk_pacing_status,
352*4882a593Smuzhiyun 					  SK_PACING_FQ);
353*4882a593Smuzhiyun 	}
354*4882a593Smuzhiyun 	f->credit = q->initial_quantum;
355*4882a593Smuzhiyun 
356*4882a593Smuzhiyun 	rb_link_node(&f->fq_node, parent, p);
357*4882a593Smuzhiyun 	rb_insert_color(&f->fq_node, root);
358*4882a593Smuzhiyun 
359*4882a593Smuzhiyun 	q->flows++;
360*4882a593Smuzhiyun 	q->inactive_flows++;
361*4882a593Smuzhiyun 	return f;
362*4882a593Smuzhiyun }
363*4882a593Smuzhiyun 
364*4882a593Smuzhiyun static struct sk_buff *fq_peek(struct fq_flow *flow)
365*4882a593Smuzhiyun {
366*4882a593Smuzhiyun 	struct sk_buff *skb = skb_rb_first(&flow->t_root);
367*4882a593Smuzhiyun 	struct sk_buff *head = flow->head;
368*4882a593Smuzhiyun 
369*4882a593Smuzhiyun 	if (!skb)
370*4882a593Smuzhiyun 		return head;
371*4882a593Smuzhiyun 
372*4882a593Smuzhiyun 	if (!head)
373*4882a593Smuzhiyun 		return skb;
374*4882a593Smuzhiyun 
375*4882a593Smuzhiyun 	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
376*4882a593Smuzhiyun 		return skb;
377*4882a593Smuzhiyun 	return head;
378*4882a593Smuzhiyun }
379*4882a593Smuzhiyun 
380*4882a593Smuzhiyun static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
381*4882a593Smuzhiyun 			  struct sk_buff *skb)
382*4882a593Smuzhiyun {
383*4882a593Smuzhiyun 	if (skb == flow->head) {
384*4882a593Smuzhiyun 		flow->head = skb->next;
385*4882a593Smuzhiyun 	} else {
386*4882a593Smuzhiyun 		rb_erase(&skb->rbnode, &flow->t_root);
387*4882a593Smuzhiyun 		skb->dev = qdisc_dev(sch);
388*4882a593Smuzhiyun 	}
389*4882a593Smuzhiyun }
390*4882a593Smuzhiyun 
391*4882a593Smuzhiyun /* Remove one skb from flow queue.
392*4882a593Smuzhiyun  * This skb must be the return value of prior fq_peek().
393*4882a593Smuzhiyun  */
394*4882a593Smuzhiyun static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
395*4882a593Smuzhiyun 			   struct sk_buff *skb)
396*4882a593Smuzhiyun {
397*4882a593Smuzhiyun 	fq_erase_head(sch, flow, skb);
398*4882a593Smuzhiyun 	skb_mark_not_on_list(skb);
399*4882a593Smuzhiyun 	flow->qlen--;
400*4882a593Smuzhiyun 	qdisc_qstats_backlog_dec(sch, skb);
401*4882a593Smuzhiyun 	sch->q.qlen--;
402*4882a593Smuzhiyun }
403*4882a593Smuzhiyun 
404*4882a593Smuzhiyun static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
405*4882a593Smuzhiyun {
406*4882a593Smuzhiyun 	struct rb_node **p, *parent;
407*4882a593Smuzhiyun 	struct sk_buff *head, *aux;
408*4882a593Smuzhiyun 
409*4882a593Smuzhiyun 	head = flow->head;
410*4882a593Smuzhiyun 	if (!head ||
411*4882a593Smuzhiyun 	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
412*4882a593Smuzhiyun 		if (!head)
413*4882a593Smuzhiyun 			flow->head = skb;
414*4882a593Smuzhiyun 		else
415*4882a593Smuzhiyun 			flow->tail->next = skb;
416*4882a593Smuzhiyun 		flow->tail = skb;
417*4882a593Smuzhiyun 		skb->next = NULL;
418*4882a593Smuzhiyun 		return;
419*4882a593Smuzhiyun 	}
420*4882a593Smuzhiyun 
421*4882a593Smuzhiyun 	p = &flow->t_root.rb_node;
422*4882a593Smuzhiyun 	parent = NULL;
423*4882a593Smuzhiyun 
424*4882a593Smuzhiyun 	while (*p) {
425*4882a593Smuzhiyun 		parent = *p;
426*4882a593Smuzhiyun 		aux = rb_to_skb(parent);
427*4882a593Smuzhiyun 		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
428*4882a593Smuzhiyun 			p = &parent->rb_right;
429*4882a593Smuzhiyun 		else
430*4882a593Smuzhiyun 			p = &parent->rb_left;
431*4882a593Smuzhiyun 	}
432*4882a593Smuzhiyun 	rb_link_node(&skb->rbnode, parent, p);
433*4882a593Smuzhiyun 	rb_insert_color(&skb->rbnode, &flow->t_root);
434*4882a593Smuzhiyun }
435*4882a593Smuzhiyun 
436*4882a593Smuzhiyun static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
437*4882a593Smuzhiyun 				    const struct fq_sched_data *q)
438*4882a593Smuzhiyun {
439*4882a593Smuzhiyun 	return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
440*4882a593Smuzhiyun }
441*4882a593Smuzhiyun 
442*4882a593Smuzhiyun static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
443*4882a593Smuzhiyun 		      struct sk_buff **to_free)
444*4882a593Smuzhiyun {
445*4882a593Smuzhiyun 	struct fq_sched_data *q = qdisc_priv(sch);
446*4882a593Smuzhiyun 	struct fq_flow *f;
447*4882a593Smuzhiyun 
448*4882a593Smuzhiyun 	if (unlikely(sch->q.qlen >= sch->limit))
449*4882a593Smuzhiyun 		return qdisc_drop(skb, sch, to_free);
450*4882a593Smuzhiyun 
451*4882a593Smuzhiyun 	if (!skb->tstamp) {
452*4882a593Smuzhiyun 		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
453*4882a593Smuzhiyun 	} else {
454*4882a593Smuzhiyun 		/* Check if packet timestamp is too far in the future.
455*4882a593Smuzhiyun 		 * Try first with our cached value, to avoid the ktime_get_ns()
456*4882a593Smuzhiyun 		 * cost in most cases.
457*4882a593Smuzhiyun 		 */
458*4882a593Smuzhiyun 		if (fq_packet_beyond_horizon(skb, q)) {
459*4882a593Smuzhiyun 			/* Refresh our cache and check another time */
460*4882a593Smuzhiyun 			q->ktime_cache = ktime_get_ns();
461*4882a593Smuzhiyun 			if (fq_packet_beyond_horizon(skb, q)) {
462*4882a593Smuzhiyun 				if (q->horizon_drop) {
463*4882a593Smuzhiyun 					q->stat_horizon_drops++;
464*4882a593Smuzhiyun 					return qdisc_drop(skb, sch, to_free);
465*4882a593Smuzhiyun 				}
466*4882a593Smuzhiyun 				q->stat_horizon_caps++;
467*4882a593Smuzhiyun 				skb->tstamp = q->ktime_cache + q->horizon;
468*4882a593Smuzhiyun 			}
469*4882a593Smuzhiyun 		}
470*4882a593Smuzhiyun 		fq_skb_cb(skb)->time_to_send = skb->tstamp;
471*4882a593Smuzhiyun 	}
472*4882a593Smuzhiyun 
473*4882a593Smuzhiyun 	f = fq_classify(skb, q);
474*4882a593Smuzhiyun 	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
475*4882a593Smuzhiyun 		q->stat_flows_plimit++;
476*4882a593Smuzhiyun 		return qdisc_drop(skb, sch, to_free);
477*4882a593Smuzhiyun 	}
478*4882a593Smuzhiyun 
479*4882a593Smuzhiyun 	f->qlen++;
480*4882a593Smuzhiyun 	qdisc_qstats_backlog_inc(sch, skb);
481*4882a593Smuzhiyun 	if (fq_flow_is_detached(f)) {
482*4882a593Smuzhiyun 		fq_flow_add_tail(&q->new_flows, f);
483*4882a593Smuzhiyun 		if (time_after(jiffies, f->age + q->flow_refill_delay))
484*4882a593Smuzhiyun 			f->credit = max_t(u32, f->credit, q->quantum);
485*4882a593Smuzhiyun 		q->inactive_flows--;
486*4882a593Smuzhiyun 	}
487*4882a593Smuzhiyun 
488*4882a593Smuzhiyun 	/* Note: this overwrites f->age */
489*4882a593Smuzhiyun 	flow_queue_add(f, skb);
490*4882a593Smuzhiyun 
491*4882a593Smuzhiyun 	if (unlikely(f == &q->internal)) {
492*4882a593Smuzhiyun 		q->stat_internal_packets++;
493*4882a593Smuzhiyun 	}
494*4882a593Smuzhiyun 	sch->q.qlen++;
495*4882a593Smuzhiyun 
496*4882a593Smuzhiyun 	return NET_XMIT_SUCCESS;
497*4882a593Smuzhiyun }
498*4882a593Smuzhiyun 
499*4882a593Smuzhiyun static void fq_check_throttled(struct fq_sched_data *q, u64 now)
500*4882a593Smuzhiyun {
501*4882a593Smuzhiyun 	unsigned long sample;
502*4882a593Smuzhiyun 	struct rb_node *p;
503*4882a593Smuzhiyun 
504*4882a593Smuzhiyun 	if (q->time_next_delayed_flow > now)
505*4882a593Smuzhiyun 		return;
506*4882a593Smuzhiyun 
507*4882a593Smuzhiyun 	/* Update unthrottle latency EWMA.
508*4882a593Smuzhiyun 	 * This is cheap and can help diagnose timer/latency problems.
509*4882a593Smuzhiyun 	 */
510*4882a593Smuzhiyun 	sample = (unsigned long)(now - q->time_next_delayed_flow);
511*4882a593Smuzhiyun 	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
512*4882a593Smuzhiyun 	q->unthrottle_latency_ns += sample >> 3;
513*4882a593Smuzhiyun 
514*4882a593Smuzhiyun 	q->time_next_delayed_flow = ~0ULL;
515*4882a593Smuzhiyun 	while ((p = rb_first(&q->delayed)) != NULL) {
516*4882a593Smuzhiyun 		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
517*4882a593Smuzhiyun 
518*4882a593Smuzhiyun 		if (f->time_next_packet > now) {
519*4882a593Smuzhiyun 			q->time_next_delayed_flow = f->time_next_packet;
520*4882a593Smuzhiyun 			break;
521*4882a593Smuzhiyun 		}
522*4882a593Smuzhiyun 		fq_flow_unset_throttled(q, f);
523*4882a593Smuzhiyun 	}
524*4882a593Smuzhiyun }
525*4882a593Smuzhiyun 
526*4882a593Smuzhiyun static struct sk_buff *fq_dequeue(struct Qdisc *sch)
527*4882a593Smuzhiyun {
528*4882a593Smuzhiyun 	struct fq_sched_data *q = qdisc_priv(sch);
529*4882a593Smuzhiyun 	struct fq_flow_head *head;
530*4882a593Smuzhiyun 	struct sk_buff *skb;
531*4882a593Smuzhiyun 	struct fq_flow *f;
532*4882a593Smuzhiyun 	unsigned long rate;
533*4882a593Smuzhiyun 	u32 plen;
534*4882a593Smuzhiyun 	u64 now;
535*4882a593Smuzhiyun 
536*4882a593Smuzhiyun 	if (!sch->q.qlen)
537*4882a593Smuzhiyun 		return NULL;
538*4882a593Smuzhiyun 
539*4882a593Smuzhiyun 	skb = fq_peek(&q->internal);
540*4882a593Smuzhiyun 	if (unlikely(skb)) {
541*4882a593Smuzhiyun 		fq_dequeue_skb(sch, &q->internal, skb);
542*4882a593Smuzhiyun 		goto out;
543*4882a593Smuzhiyun 	}
544*4882a593Smuzhiyun 
545*4882a593Smuzhiyun 	q->ktime_cache = now = ktime_get_ns();
546*4882a593Smuzhiyun 	fq_check_throttled(q, now);
547*4882a593Smuzhiyun begin:
548*4882a593Smuzhiyun 	head = &q->new_flows;
549*4882a593Smuzhiyun 	if (!head->first) {
550*4882a593Smuzhiyun 		head = &q->old_flows;
551*4882a593Smuzhiyun 		if (!head->first) {
552*4882a593Smuzhiyun 			if (q->time_next_delayed_flow != ~0ULL)
553*4882a593Smuzhiyun 				qdisc_watchdog_schedule_range_ns(&q->watchdog,
554*4882a593Smuzhiyun 							q->time_next_delayed_flow,
555*4882a593Smuzhiyun 							q->timer_slack);
556*4882a593Smuzhiyun 			return NULL;
557*4882a593Smuzhiyun 		}
558*4882a593Smuzhiyun 	}
559*4882a593Smuzhiyun 	f = head->first;
560*4882a593Smuzhiyun 
561*4882a593Smuzhiyun 	if (f->credit <= 0) {
562*4882a593Smuzhiyun 		f->credit += q->quantum;
563*4882a593Smuzhiyun 		head->first = f->next;
564*4882a593Smuzhiyun 		fq_flow_add_tail(&q->old_flows, f);
565*4882a593Smuzhiyun 		goto begin;
566*4882a593Smuzhiyun 	}
567*4882a593Smuzhiyun 
568*4882a593Smuzhiyun 	skb = fq_peek(f);
569*4882a593Smuzhiyun 	if (skb) {
570*4882a593Smuzhiyun 		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
571*4882a593Smuzhiyun 					     f->time_next_packet);
572*4882a593Smuzhiyun 
573*4882a593Smuzhiyun 		if (now < time_next_packet) {
574*4882a593Smuzhiyun 			head->first = f->next;
575*4882a593Smuzhiyun 			f->time_next_packet = time_next_packet;
576*4882a593Smuzhiyun 			fq_flow_set_throttled(q, f);
577*4882a593Smuzhiyun 			goto begin;
578*4882a593Smuzhiyun 		}
579*4882a593Smuzhiyun 		prefetch(&skb->end);
580*4882a593Smuzhiyun 		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
581*4882a593Smuzhiyun 			INET_ECN_set_ce(skb);
582*4882a593Smuzhiyun 			q->stat_ce_mark++;
583*4882a593Smuzhiyun 		}
584*4882a593Smuzhiyun 		fq_dequeue_skb(sch, f, skb);
585*4882a593Smuzhiyun 	} else {
586*4882a593Smuzhiyun 		head->first = f->next;
587*4882a593Smuzhiyun 		/* force a pass through old_flows to prevent starvation */
588*4882a593Smuzhiyun 		if ((head == &q->new_flows) && q->old_flows.first) {
589*4882a593Smuzhiyun 			fq_flow_add_tail(&q->old_flows, f);
590*4882a593Smuzhiyun 		} else {
591*4882a593Smuzhiyun 			fq_flow_set_detached(f);
592*4882a593Smuzhiyun 			q->inactive_flows++;
593*4882a593Smuzhiyun 		}
594*4882a593Smuzhiyun 		goto begin;
595*4882a593Smuzhiyun 	}
596*4882a593Smuzhiyun 	plen = qdisc_pkt_len(skb);
597*4882a593Smuzhiyun 	f->credit -= plen;
598*4882a593Smuzhiyun 
599*4882a593Smuzhiyun 	if (!q->rate_enable)
600*4882a593Smuzhiyun 		goto out;
601*4882a593Smuzhiyun 
602*4882a593Smuzhiyun 	rate = q->flow_max_rate;
603*4882a593Smuzhiyun 
604*4882a593Smuzhiyun 	/* If EDT time was provided for this skb, we need to
605*4882a593Smuzhiyun 	 * update f->time_next_packet only if this qdisc enforces
606*4882a593Smuzhiyun 	 * a flow max rate.
607*4882a593Smuzhiyun 	 */
608*4882a593Smuzhiyun 	if (!skb->tstamp) {
609*4882a593Smuzhiyun 		if (skb->sk)
610*4882a593Smuzhiyun 			rate = min(skb->sk->sk_pacing_rate, rate);
611*4882a593Smuzhiyun 
612*4882a593Smuzhiyun 		if (rate <= q->low_rate_threshold) {
613*4882a593Smuzhiyun 			f->credit = 0;
614*4882a593Smuzhiyun 		} else {
615*4882a593Smuzhiyun 			plen = max(plen, q->quantum);
616*4882a593Smuzhiyun 			if (f->credit > 0)
617*4882a593Smuzhiyun 				goto out;
618*4882a593Smuzhiyun 		}
619*4882a593Smuzhiyun 	}
620*4882a593Smuzhiyun 	if (rate != ~0UL) {
621*4882a593Smuzhiyun 		u64 len = (u64)plen * NSEC_PER_SEC;
622*4882a593Smuzhiyun 
623*4882a593Smuzhiyun 		if (likely(rate))
624*4882a593Smuzhiyun 			len = div64_ul(len, rate);
625*4882a593Smuzhiyun 		/* Since socket rate can change later,
626*4882a593Smuzhiyun 		 * clamp the delay to 1 second.
627*4882a593Smuzhiyun 		 * Really, providers of too-big packets should be fixed!
628*4882a593Smuzhiyun 		 */
629*4882a593Smuzhiyun 		if (unlikely(len > NSEC_PER_SEC)) {
630*4882a593Smuzhiyun 			len = NSEC_PER_SEC;
631*4882a593Smuzhiyun 			q->stat_pkts_too_long++;
632*4882a593Smuzhiyun 		}
633*4882a593Smuzhiyun 		/* Account for scheduler/timer drifts.
634*4882a593Smuzhiyun 		 * f->time_next_packet was set when prior packet was sent,
635*4882a593Smuzhiyun 		 * and current time (@now) can be too late by tens of us.
636*4882a593Smuzhiyun 		 */
637*4882a593Smuzhiyun 		if (f->time_next_packet)
638*4882a593Smuzhiyun 			len -= min(len/2, now - f->time_next_packet);
639*4882a593Smuzhiyun 		f->time_next_packet = now + len;
640*4882a593Smuzhiyun 	}
641*4882a593Smuzhiyun out:
642*4882a593Smuzhiyun 	qdisc_bstats_update(sch, skb);
643*4882a593Smuzhiyun 	return skb;
644*4882a593Smuzhiyun }
645*4882a593Smuzhiyun 
646*4882a593Smuzhiyun static void fq_flow_purge(struct fq_flow *flow)
647*4882a593Smuzhiyun {
648*4882a593Smuzhiyun 	struct rb_node *p = rb_first(&flow->t_root);
649*4882a593Smuzhiyun 
650*4882a593Smuzhiyun 	while (p) {
651*4882a593Smuzhiyun 		struct sk_buff *skb = rb_to_skb(p);
652*4882a593Smuzhiyun 
653*4882a593Smuzhiyun 		p = rb_next(p);
654*4882a593Smuzhiyun 		rb_erase(&skb->rbnode, &flow->t_root);
655*4882a593Smuzhiyun 		rtnl_kfree_skbs(skb, skb);
656*4882a593Smuzhiyun 	}
657*4882a593Smuzhiyun 	rtnl_kfree_skbs(flow->head, flow->tail);
658*4882a593Smuzhiyun 	flow->head = NULL;
659*4882a593Smuzhiyun 	flow->qlen = 0;
660*4882a593Smuzhiyun }
661*4882a593Smuzhiyun 
662*4882a593Smuzhiyun static void fq_reset(struct Qdisc *sch)
663*4882a593Smuzhiyun {
664*4882a593Smuzhiyun 	struct fq_sched_data *q = qdisc_priv(sch);
665*4882a593Smuzhiyun 	struct rb_root *root;
666*4882a593Smuzhiyun 	struct rb_node *p;
667*4882a593Smuzhiyun 	struct fq_flow *f;
668*4882a593Smuzhiyun 	unsigned int idx;
669*4882a593Smuzhiyun 
670*4882a593Smuzhiyun 	sch->q.qlen = 0;
671*4882a593Smuzhiyun 	sch->qstats.backlog = 0;
672*4882a593Smuzhiyun 
673*4882a593Smuzhiyun 	fq_flow_purge(&q->internal);
674*4882a593Smuzhiyun 
675*4882a593Smuzhiyun 	if (!q->fq_root)
676*4882a593Smuzhiyun 		return;
677*4882a593Smuzhiyun 
678*4882a593Smuzhiyun 	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
679*4882a593Smuzhiyun 		root = &q->fq_root[idx];
680*4882a593Smuzhiyun 		while ((p = rb_first(root)) != NULL) {
681*4882a593Smuzhiyun 			f = rb_entry(p, struct fq_flow, fq_node);
682*4882a593Smuzhiyun 			rb_erase(p, root);
683*4882a593Smuzhiyun 
684*4882a593Smuzhiyun 			fq_flow_purge(f);
685*4882a593Smuzhiyun 
686*4882a593Smuzhiyun 			kmem_cache_free(fq_flow_cachep, f);
687*4882a593Smuzhiyun 		}
688*4882a593Smuzhiyun 	}
689*4882a593Smuzhiyun 	q->new_flows.first	= NULL;
690*4882a593Smuzhiyun 	q->old_flows.first	= NULL;
691*4882a593Smuzhiyun 	q->delayed		= RB_ROOT;
692*4882a593Smuzhiyun 	q->flows		= 0;
693*4882a593Smuzhiyun 	q->inactive_flows	= 0;
694*4882a593Smuzhiyun 	q->throttled_flows	= 0;
695*4882a593Smuzhiyun }
696*4882a593Smuzhiyun 
697*4882a593Smuzhiyun static void fq_rehash(struct fq_sched_data *q,
698*4882a593Smuzhiyun 		      struct rb_root *old_array, u32 old_log,
699*4882a593Smuzhiyun 		      struct rb_root *new_array, u32 new_log)
700*4882a593Smuzhiyun {
701*4882a593Smuzhiyun 	struct rb_node *op, **np, *parent;
702*4882a593Smuzhiyun 	struct rb_root *oroot, *nroot;
703*4882a593Smuzhiyun 	struct fq_flow *of, *nf;
704*4882a593Smuzhiyun 	int fcnt = 0;
705*4882a593Smuzhiyun 	u32 idx;
706*4882a593Smuzhiyun 
707*4882a593Smuzhiyun 	for (idx = 0; idx < (1U << old_log); idx++) {
708*4882a593Smuzhiyun 		oroot = &old_array[idx];
709*4882a593Smuzhiyun 		while ((op = rb_first(oroot)) != NULL) {
710*4882a593Smuzhiyun 			rb_erase(op, oroot);
711*4882a593Smuzhiyun 			of = rb_entry(op, struct fq_flow, fq_node);
712*4882a593Smuzhiyun 			if (fq_gc_candidate(of)) {
713*4882a593Smuzhiyun 				fcnt++;
714*4882a593Smuzhiyun 				kmem_cache_free(fq_flow_cachep, of);
715*4882a593Smuzhiyun 				continue;
716*4882a593Smuzhiyun 			}
717*4882a593Smuzhiyun 			nroot = &new_array[hash_ptr(of->sk, new_log)];
718*4882a593Smuzhiyun 
719*4882a593Smuzhiyun 			np = &nroot->rb_node;
720*4882a593Smuzhiyun 			parent = NULL;
721*4882a593Smuzhiyun 			while (*np) {
722*4882a593Smuzhiyun 				parent = *np;
723*4882a593Smuzhiyun 
724*4882a593Smuzhiyun 				nf = rb_entry(parent, struct fq_flow, fq_node);
725*4882a593Smuzhiyun 				BUG_ON(nf->sk == of->sk);
726*4882a593Smuzhiyun 
727*4882a593Smuzhiyun 				if (nf->sk > of->sk)
728*4882a593Smuzhiyun 					np = &parent->rb_right;
729*4882a593Smuzhiyun 				else
730*4882a593Smuzhiyun 					np = &parent->rb_left;
731*4882a593Smuzhiyun 			}
732*4882a593Smuzhiyun 
733*4882a593Smuzhiyun 			rb_link_node(&of->fq_node, parent, np);
734*4882a593Smuzhiyun 			rb_insert_color(&of->fq_node, nroot);
735*4882a593Smuzhiyun 		}
736*4882a593Smuzhiyun 	}
737*4882a593Smuzhiyun 	q->flows -= fcnt;
738*4882a593Smuzhiyun 	q->inactive_flows -= fcnt;
739*4882a593Smuzhiyun 	q->stat_gc_flows += fcnt;
740*4882a593Smuzhiyun }
741*4882a593Smuzhiyun 
742*4882a593Smuzhiyun static void fq_free(void *addr)
743*4882a593Smuzhiyun {
744*4882a593Smuzhiyun 	kvfree(addr);
745*4882a593Smuzhiyun }
746*4882a593Smuzhiyun 
747*4882a593Smuzhiyun static int fq_resize(struct Qdisc *sch, u32 log)
748*4882a593Smuzhiyun {
749*4882a593Smuzhiyun 	struct fq_sched_data *q = qdisc_priv(sch);
750*4882a593Smuzhiyun 	struct rb_root *array;
751*4882a593Smuzhiyun 	void *old_fq_root;
752*4882a593Smuzhiyun 	u32 idx;
753*4882a593Smuzhiyun 
754*4882a593Smuzhiyun 	if (q->fq_root && log == q->fq_trees_log)
755*4882a593Smuzhiyun 		return 0;
756*4882a593Smuzhiyun 
757*4882a593Smuzhiyun 	/* If XPS was set up, we can allocate memory on the right NUMA node */
758*4882a593Smuzhiyun 	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
759*4882a593Smuzhiyun 			      netdev_queue_numa_node_read(sch->dev_queue));
760*4882a593Smuzhiyun 	if (!array)
761*4882a593Smuzhiyun 		return -ENOMEM;
762*4882a593Smuzhiyun 
763*4882a593Smuzhiyun 	for (idx = 0; idx < (1U << log); idx++)
764*4882a593Smuzhiyun 		array[idx] = RB_ROOT;
765*4882a593Smuzhiyun 
766*4882a593Smuzhiyun 	sch_tree_lock(sch);
767*4882a593Smuzhiyun 
768*4882a593Smuzhiyun 	old_fq_root = q->fq_root;
769*4882a593Smuzhiyun 	if (old_fq_root)
770*4882a593Smuzhiyun 		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
771*4882a593Smuzhiyun 
772*4882a593Smuzhiyun 	q->fq_root = array;
773*4882a593Smuzhiyun 	q->fq_trees_log = log;
774*4882a593Smuzhiyun 
775*4882a593Smuzhiyun 	sch_tree_unlock(sch);
776*4882a593Smuzhiyun 
777*4882a593Smuzhiyun 	fq_free(old_fq_root);
778*4882a593Smuzhiyun 
779*4882a593Smuzhiyun 	return 0;
780*4882a593Smuzhiyun }
781*4882a593Smuzhiyun 
782*4882a593Smuzhiyun static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
783*4882a593Smuzhiyun 	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },
784*4882a593Smuzhiyun 
785*4882a593Smuzhiyun 	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
786*4882a593Smuzhiyun 	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
787*4882a593Smuzhiyun 	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
788*4882a593Smuzhiyun 	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
789*4882a593Smuzhiyun 	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
790*4882a593Smuzhiyun 	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
791*4882a593Smuzhiyun 	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
792*4882a593Smuzhiyun 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
793*4882a593Smuzhiyun 	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
794*4882a593Smuzhiyun 	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
795*4882a593Smuzhiyun 	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
796*4882a593Smuzhiyun 	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
797*4882a593Smuzhiyun 	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
798*4882a593Smuzhiyun 	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
799*4882a593Smuzhiyun 	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
800*4882a593Smuzhiyun };
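These attributes are normally filled in from user space by iproute2's tc, which exposes them as fq options of roughly the same names (see tc-fq(8)); an illustrative invocation, option spellings per that man page:

	tc qdisc replace dev eth0 root fq limit 10000 flow_limit 100 maxrate 1gbit ce_threshold 4ms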
801*4882a593Smuzhiyun 
802*4882a593Smuzhiyun static int fq_change(struct Qdisc *sch, struct nlattr *opt,
803*4882a593Smuzhiyun 		     struct netlink_ext_ack *extack)
804*4882a593Smuzhiyun {
805*4882a593Smuzhiyun 	struct fq_sched_data *q = qdisc_priv(sch);
806*4882a593Smuzhiyun 	struct nlattr *tb[TCA_FQ_MAX + 1];
807*4882a593Smuzhiyun 	int err, drop_count = 0;
808*4882a593Smuzhiyun 	unsigned drop_len = 0;
809*4882a593Smuzhiyun 	u32 fq_log;
810*4882a593Smuzhiyun 
811*4882a593Smuzhiyun 	if (!opt)
812*4882a593Smuzhiyun 		return -EINVAL;
813*4882a593Smuzhiyun 
814*4882a593Smuzhiyun 	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
815*4882a593Smuzhiyun 					  NULL);
816*4882a593Smuzhiyun 	if (err < 0)
817*4882a593Smuzhiyun 		return err;
818*4882a593Smuzhiyun 
819*4882a593Smuzhiyun 	sch_tree_lock(sch);
820*4882a593Smuzhiyun 
821*4882a593Smuzhiyun 	fq_log = q->fq_trees_log;
822*4882a593Smuzhiyun 
823*4882a593Smuzhiyun 	if (tb[TCA_FQ_BUCKETS_LOG]) {
824*4882a593Smuzhiyun 		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
825*4882a593Smuzhiyun 
826*4882a593Smuzhiyun 		if (nval >= 1 && nval <= ilog2(256*1024))
827*4882a593Smuzhiyun 			fq_log = nval;
828*4882a593Smuzhiyun 		else
829*4882a593Smuzhiyun 			err = -EINVAL;
830*4882a593Smuzhiyun 	}
831*4882a593Smuzhiyun 	if (tb[TCA_FQ_PLIMIT])
832*4882a593Smuzhiyun 		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
833*4882a593Smuzhiyun 
834*4882a593Smuzhiyun 	if (tb[TCA_FQ_FLOW_PLIMIT])
835*4882a593Smuzhiyun 		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
836*4882a593Smuzhiyun 
837*4882a593Smuzhiyun 	if (tb[TCA_FQ_QUANTUM]) {
838*4882a593Smuzhiyun 		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
839*4882a593Smuzhiyun 
840*4882a593Smuzhiyun 		if (quantum > 0 && quantum <= (1 << 20)) {
841*4882a593Smuzhiyun 			q->quantum = quantum;
842*4882a593Smuzhiyun 		} else {
843*4882a593Smuzhiyun 			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
844*4882a593Smuzhiyun 			err = -EINVAL;
845*4882a593Smuzhiyun 		}
846*4882a593Smuzhiyun 	}
847*4882a593Smuzhiyun 
848*4882a593Smuzhiyun 	if (tb[TCA_FQ_INITIAL_QUANTUM])
849*4882a593Smuzhiyun 		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
850*4882a593Smuzhiyun 
851*4882a593Smuzhiyun 	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
852*4882a593Smuzhiyun 		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
853*4882a593Smuzhiyun 				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
854*4882a593Smuzhiyun 
855*4882a593Smuzhiyun 	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
856*4882a593Smuzhiyun 		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
857*4882a593Smuzhiyun 
858*4882a593Smuzhiyun 		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
859*4882a593Smuzhiyun 	}
860*4882a593Smuzhiyun 	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
861*4882a593Smuzhiyun 		q->low_rate_threshold =
862*4882a593Smuzhiyun 			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
863*4882a593Smuzhiyun 
864*4882a593Smuzhiyun 	if (tb[TCA_FQ_RATE_ENABLE]) {
865*4882a593Smuzhiyun 		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
866*4882a593Smuzhiyun 
867*4882a593Smuzhiyun 		if (enable <= 1)
868*4882a593Smuzhiyun 			q->rate_enable = enable;
869*4882a593Smuzhiyun 		else
870*4882a593Smuzhiyun 			err = -EINVAL;
871*4882a593Smuzhiyun 	}
872*4882a593Smuzhiyun 
873*4882a593Smuzhiyun 	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
874*4882a593Smuzhiyun 		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
875*4882a593Smuzhiyun 
876*4882a593Smuzhiyun 		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
877*4882a593Smuzhiyun 	}
878*4882a593Smuzhiyun 
879*4882a593Smuzhiyun 	if (tb[TCA_FQ_ORPHAN_MASK])
880*4882a593Smuzhiyun 		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
881*4882a593Smuzhiyun 
882*4882a593Smuzhiyun 	if (tb[TCA_FQ_CE_THRESHOLD])
883*4882a593Smuzhiyun 		q->ce_threshold = (u64)NSEC_PER_USEC *
884*4882a593Smuzhiyun 				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
885*4882a593Smuzhiyun 
886*4882a593Smuzhiyun 	if (tb[TCA_FQ_TIMER_SLACK])
887*4882a593Smuzhiyun 		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
888*4882a593Smuzhiyun 
889*4882a593Smuzhiyun 	if (tb[TCA_FQ_HORIZON])
890*4882a593Smuzhiyun 		q->horizon = (u64)NSEC_PER_USEC *
891*4882a593Smuzhiyun 				  nla_get_u32(tb[TCA_FQ_HORIZON]);
892*4882a593Smuzhiyun 
893*4882a593Smuzhiyun 	if (tb[TCA_FQ_HORIZON_DROP])
894*4882a593Smuzhiyun 		q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);
895*4882a593Smuzhiyun 
896*4882a593Smuzhiyun 	if (!err) {
897*4882a593Smuzhiyun 
898*4882a593Smuzhiyun 		sch_tree_unlock(sch);
899*4882a593Smuzhiyun 		err = fq_resize(sch, fq_log);
900*4882a593Smuzhiyun 		sch_tree_lock(sch);
901*4882a593Smuzhiyun 	}
902*4882a593Smuzhiyun 	while (sch->q.qlen > sch->limit) {
903*4882a593Smuzhiyun 		struct sk_buff *skb = fq_dequeue(sch);
904*4882a593Smuzhiyun 
905*4882a593Smuzhiyun 		if (!skb)
906*4882a593Smuzhiyun 			break;
907*4882a593Smuzhiyun 		drop_len += qdisc_pkt_len(skb);
908*4882a593Smuzhiyun 		rtnl_kfree_skbs(skb, skb);
909*4882a593Smuzhiyun 		drop_count++;
910*4882a593Smuzhiyun 	}
911*4882a593Smuzhiyun 	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
912*4882a593Smuzhiyun 
913*4882a593Smuzhiyun 	sch_tree_unlock(sch);
914*4882a593Smuzhiyun 	return err;
915*4882a593Smuzhiyun }
916*4882a593Smuzhiyun 
917*4882a593Smuzhiyun static void fq_destroy(struct Qdisc *sch)
918*4882a593Smuzhiyun {
919*4882a593Smuzhiyun 	struct fq_sched_data *q = qdisc_priv(sch);
920*4882a593Smuzhiyun 
921*4882a593Smuzhiyun 	fq_reset(sch);
922*4882a593Smuzhiyun 	fq_free(q->fq_root);
923*4882a593Smuzhiyun 	qdisc_watchdog_cancel(&q->watchdog);
924*4882a593Smuzhiyun }
925*4882a593Smuzhiyun 
926*4882a593Smuzhiyun static int fq_init(struct Qdisc *sch, struct nlattr *opt,
927*4882a593Smuzhiyun 		   struct netlink_ext_ack *extack)
928*4882a593Smuzhiyun {
929*4882a593Smuzhiyun 	struct fq_sched_data *q = qdisc_priv(sch);
930*4882a593Smuzhiyun 	int err;
931*4882a593Smuzhiyun 
932*4882a593Smuzhiyun 	sch->limit		= 10000;
933*4882a593Smuzhiyun 	q->flow_plimit		= 100;
934*4882a593Smuzhiyun 	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
935*4882a593Smuzhiyun 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
936*4882a593Smuzhiyun 	q->flow_refill_delay	= msecs_to_jiffies(40);
937*4882a593Smuzhiyun 	q->flow_max_rate	= ~0UL;
938*4882a593Smuzhiyun 	q->time_next_delayed_flow = ~0ULL;
939*4882a593Smuzhiyun 	q->rate_enable		= 1;
940*4882a593Smuzhiyun 	q->new_flows.first	= NULL;
941*4882a593Smuzhiyun 	q->old_flows.first	= NULL;
942*4882a593Smuzhiyun 	q->delayed		= RB_ROOT;
943*4882a593Smuzhiyun 	q->fq_root		= NULL;
944*4882a593Smuzhiyun 	q->fq_trees_log		= ilog2(1024);
945*4882a593Smuzhiyun 	q->orphan_mask		= 1024 - 1;
946*4882a593Smuzhiyun 	q->low_rate_threshold	= 550000 / 8;
947*4882a593Smuzhiyun 
948*4882a593Smuzhiyun 	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
949*4882a593Smuzhiyun 
950*4882a593Smuzhiyun 	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
951*4882a593Smuzhiyun 	q->horizon_drop = 1; /* by default, drop packets beyond horizon */
952*4882a593Smuzhiyun 
953*4882a593Smuzhiyun 	/* Default ce_threshold of 4294 seconds */
954*4882a593Smuzhiyun 	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;
955*4882a593Smuzhiyun 
956*4882a593Smuzhiyun 	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
957*4882a593Smuzhiyun 
958*4882a593Smuzhiyun 	if (opt)
959*4882a593Smuzhiyun 		err = fq_change(sch, opt, extack);
960*4882a593Smuzhiyun 	else
961*4882a593Smuzhiyun 		err = fq_resize(sch, q->fq_trees_log);
962*4882a593Smuzhiyun 
963*4882a593Smuzhiyun 	return err;
964*4882a593Smuzhiyun }
965*4882a593Smuzhiyun 
966*4882a593Smuzhiyun static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
967*4882a593Smuzhiyun {
968*4882a593Smuzhiyun 	struct fq_sched_data *q = qdisc_priv(sch);
969*4882a593Smuzhiyun 	u64 ce_threshold = q->ce_threshold;
970*4882a593Smuzhiyun 	u64 horizon = q->horizon;
971*4882a593Smuzhiyun 	struct nlattr *opts;
972*4882a593Smuzhiyun 
973*4882a593Smuzhiyun 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
974*4882a593Smuzhiyun 	if (opts == NULL)
975*4882a593Smuzhiyun 		goto nla_put_failure;
976*4882a593Smuzhiyun 
977*4882a593Smuzhiyun 	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
978*4882a593Smuzhiyun 
979*4882a593Smuzhiyun 	do_div(ce_threshold, NSEC_PER_USEC);
980*4882a593Smuzhiyun 	do_div(horizon, NSEC_PER_USEC);
981*4882a593Smuzhiyun 
982*4882a593Smuzhiyun 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
983*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
984*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
985*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
986*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
987*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
988*4882a593Smuzhiyun 			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
989*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
990*4882a593Smuzhiyun 			jiffies_to_usecs(q->flow_refill_delay)) ||
991*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
992*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
993*4882a593Smuzhiyun 			q->low_rate_threshold) ||
994*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
995*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
996*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
997*4882a593Smuzhiyun 	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
998*4882a593Smuzhiyun 	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
999*4882a593Smuzhiyun 		goto nla_put_failure;
1000*4882a593Smuzhiyun 
1001*4882a593Smuzhiyun 	return nla_nest_end(skb, opts);
1002*4882a593Smuzhiyun 
1003*4882a593Smuzhiyun nla_put_failure:
1004*4882a593Smuzhiyun 	return -1;
1005*4882a593Smuzhiyun }
1006*4882a593Smuzhiyun 
1007*4882a593Smuzhiyun static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1008*4882a593Smuzhiyun {
1009*4882a593Smuzhiyun 	struct fq_sched_data *q = qdisc_priv(sch);
1010*4882a593Smuzhiyun 	struct tc_fq_qd_stats st;
1011*4882a593Smuzhiyun 
1012*4882a593Smuzhiyun 	sch_tree_lock(sch);
1013*4882a593Smuzhiyun 
1014*4882a593Smuzhiyun 	st.gc_flows		  = q->stat_gc_flows;
1015*4882a593Smuzhiyun 	st.highprio_packets	  = q->stat_internal_packets;
1016*4882a593Smuzhiyun 	st.tcp_retrans		  = 0;
1017*4882a593Smuzhiyun 	st.throttled		  = q->stat_throttled;
1018*4882a593Smuzhiyun 	st.flows_plimit		  = q->stat_flows_plimit;
1019*4882a593Smuzhiyun 	st.pkts_too_long	  = q->stat_pkts_too_long;
1020*4882a593Smuzhiyun 	st.allocation_errors	  = q->stat_allocation_errors;
1021*4882a593Smuzhiyun 	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
1022*4882a593Smuzhiyun 				    ktime_get_ns();
1023*4882a593Smuzhiyun 	st.flows		  = q->flows;
1024*4882a593Smuzhiyun 	st.inactive_flows	  = q->inactive_flows;
1025*4882a593Smuzhiyun 	st.throttled_flows	  = q->throttled_flows;
1026*4882a593Smuzhiyun 	st.unthrottle_latency_ns  = min_t(unsigned long,
1027*4882a593Smuzhiyun 					  q->unthrottle_latency_ns, ~0U);
1028*4882a593Smuzhiyun 	st.ce_mark		  = q->stat_ce_mark;
1029*4882a593Smuzhiyun 	st.horizon_drops	  = q->stat_horizon_drops;
1030*4882a593Smuzhiyun 	st.horizon_caps		  = q->stat_horizon_caps;
1031*4882a593Smuzhiyun 	sch_tree_unlock(sch);
1032*4882a593Smuzhiyun 
1033*4882a593Smuzhiyun 	return gnet_stats_copy_app(d, &st, sizeof(st));
1034*4882a593Smuzhiyun }
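The tc_fq_qd_stats structure copied out here is what iproute2's tc decodes when showing statistics (e.g. tc -s qdisc show dev eth0): the flows/inactive/throttled counts, gc, throttled, ce_mark and horizon counters above are the quickest way to confirm that pacing, throttling or the horizon cap is actually taking effect.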
1035*4882a593Smuzhiyun 
1036*4882a593Smuzhiyun static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
1037*4882a593Smuzhiyun 	.id		=	"fq",
1038*4882a593Smuzhiyun 	.priv_size	=	sizeof(struct fq_sched_data),
1039*4882a593Smuzhiyun 
1040*4882a593Smuzhiyun 	.enqueue	=	fq_enqueue,
1041*4882a593Smuzhiyun 	.dequeue	=	fq_dequeue,
1042*4882a593Smuzhiyun 	.peek		=	qdisc_peek_dequeued,
1043*4882a593Smuzhiyun 	.init		=	fq_init,
1044*4882a593Smuzhiyun 	.reset		=	fq_reset,
1045*4882a593Smuzhiyun 	.destroy	=	fq_destroy,
1046*4882a593Smuzhiyun 	.change		=	fq_change,
1047*4882a593Smuzhiyun 	.dump		=	fq_dump,
1048*4882a593Smuzhiyun 	.dump_stats	=	fq_dump_stats,
1049*4882a593Smuzhiyun 	.owner		=	THIS_MODULE,
1050*4882a593Smuzhiyun };
1051*4882a593Smuzhiyun 
1052*4882a593Smuzhiyun static int __init fq_module_init(void)
1053*4882a593Smuzhiyun {
1054*4882a593Smuzhiyun 	int ret;
1055*4882a593Smuzhiyun 
1056*4882a593Smuzhiyun 	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
1057*4882a593Smuzhiyun 					   sizeof(struct fq_flow),
1058*4882a593Smuzhiyun 					   0, 0, NULL);
1059*4882a593Smuzhiyun 	if (!fq_flow_cachep)
1060*4882a593Smuzhiyun 		return -ENOMEM;
1061*4882a593Smuzhiyun 
1062*4882a593Smuzhiyun 	ret = register_qdisc(&fq_qdisc_ops);
1063*4882a593Smuzhiyun 	if (ret)
1064*4882a593Smuzhiyun 		kmem_cache_destroy(fq_flow_cachep);
1065*4882a593Smuzhiyun 	return ret;
1066*4882a593Smuzhiyun }
1067*4882a593Smuzhiyun 
1068*4882a593Smuzhiyun static void __exit fq_module_exit(void)
1069*4882a593Smuzhiyun {
1070*4882a593Smuzhiyun 	unregister_qdisc(&fq_qdisc_ops);
1071*4882a593Smuzhiyun 	kmem_cache_destroy(fq_flow_cachep);
1072*4882a593Smuzhiyun }
1073*4882a593Smuzhiyun 
1074*4882a593Smuzhiyun module_init(fq_module_init)
1075*4882a593Smuzhiyun module_exit(fq_module_exit)
1076*4882a593Smuzhiyun MODULE_AUTHOR("Eric Dumazet");
1077*4882a593Smuzhiyun MODULE_LICENSE("GPL");
1078*4882a593Smuzhiyun MODULE_DESCRIPTION("Fair Queue Packet Scheduler");
1079