// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
 *
 * Meant to be mostly used for locally generated traffic:
 * Fast classification depends on skb->sk being set before reaching us.
 * If not (router workload), we use rxhash as fallback, with a 32-bit wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one Round Robin list ('new' or 'old' flows).
 *
 * Burst avoidance (aka pacing) capability:
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 * - lookup one RB tree (out of 1024 or more) to find the flow.
 *   If the flow does not exist, create it and add it to the tree.
 *   Add skb to the per flow list of skb (fifo).
 * - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (it is expected to send additional
 * packets, or the SLAB cache will reuse the socket for another flow)
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>

struct fq_skb_cb {
	u64 time_to_send;
};

static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}

/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in a linear list (head,tail), otherwise they are placed in a rbtree (t_root).
 */
struct fq_flow {
/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
	};
	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
	struct sock	*sk;
	u32		socket_hash;	/* sk_hash */
	int		qlen;		/* number of packets in flow queue */

/* Second cache line, used in fq_dequeue() */
	int		credit;
	/* 32bit hole on 64bit arches */

	struct fq_flow	*next;		/* next pointer in RR lists */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
} ____cacheline_aligned_in_smp;

struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

struct fq_sched_data {
	struct fq_flow_head new_flows;

	struct fq_flow_head old_flows;

	struct rb_root	delayed;	/* for rate limited flows */
	u64		time_next_delayed_flow;
	u64		ktime_cache;	/* copy of last ktime_get_ns() */
	unsigned long	unthrottle_latency_ns;

	struct fq_flow	internal;	/* for non classified or high prio packets */
	u32		quantum;
	u32		initial_quantum;
	u32		flow_refill_delay;
	u32		flow_plimit;	/* max packets per flow */
	unsigned long	flow_max_rate;	/* optional max rate per flow */
	u64		ce_threshold;
	u64		horizon;	/* horizon in ns */
	u32		orphan_mask;	/* mask for orphaned skb */
	u32		low_rate_threshold;
	struct rb_root	*fq_root;
	u8		rate_enable;
	u8		fq_trees_log;
	u8		horizon_drop;
	u32		flows;
	u32		inactive_flows;
	u32		throttled_flows;

	u64		stat_gc_flows;
	u64		stat_internal_packets;
	u64		stat_throttled;
	u64		stat_ce_mark;
	u64		stat_horizon_drops;
	u64		stat_horizon_caps;
	u64		stat_flows_plimit;
	u64		stat_pkts_too_long;
	u64		stat_allocation_errors;

	u32		timer_slack;	/* hrtimer slack in ns */
	struct qdisc_watchdog watchdog;
};

/*
 * f->tail and f->age share the same location.
 * We can use the low order bit to differentiate if this location points
 * to a sk_buff or contains a jiffies value, if we force this value to be odd.
 * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
 */
static void fq_flow_set_detached(struct fq_flow *f)
{
	f->age = jiffies | 1UL;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return !!(f->age & 1UL);
}

/* special value to mark a throttled flow (not on old/new list) */
static struct fq_flow throttled;

static bool fq_flow_is_throttled(const struct fq_flow *f)
{
	return f->next == &throttled;
}

static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}

static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	rb_erase(&f->rate_node, &q->delayed);
	q->throttled_flows--;
	fq_flow_add_tail(&q->old_flows, f);
}

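/* Insert the flow into the q->delayed rbtree, keyed by time_next_packet,
 * and mark it throttled: its ->next points to the 'throttled' sentinel,
 * so it is not on the new/old RR lists.
 */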
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = rb_entry(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}


static struct kmem_cache *fq_flow_cachep __read_mostly;


/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}

static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct rb_node **p, *parent;
	void *tofree[FQ_GC_MAX];
	struct fq_flow *f;
	int i, fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
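	/* Walk the tree towards @sk, opportunistically collecting up to
	 * FQ_GC_MAX flows that have been detached for more than FQ_GC_AGE
	 * along the path. They are unlinked and bulk-freed below.
	 */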
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	if (!fcnt)
		return;

	for (i = fcnt; i > 0; ) {
		f = tofree[--i];
		rb_erase(&f->fq_node, root);
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;

	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}

static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;

	/* warning: no starvation prevention... */
	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
		return &q->internal;

	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
	 * or a listener (SYNCOOKIE mode)
	 * 1) request sockets are not full blown,
	 *    they do not contain sk_pacing_rate
	 * 2) They are not part of a 'flow' yet
	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
	 *    especially if the listener set SO_MAX_PACING_RATE
	 * 4) We pretend they are orphaned
	 */
	if (!sk || sk_listener(sk)) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	} else if (sk->sk_state == TCP_CLOSE) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
		/*
		 * Sockets in TCP_CLOSE are not connected.
		 * Typical use case is UDP sockets, they can send packets
		 * with sendto() to many different destinations.
		 * We probably could use a generic bit advertising
		 * non connected sockets, instead of sk_state == TCP_CLOSE,
		 * if we care enough.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];

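	/* Opportunistic garbage collection: only when the hash table is
	 * crowded (two or more flows per bucket on average) and at least
	 * half of the flows are inactive.
	 */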
	if (q->flows >= (2U << q->fq_trees_log) &&
	    q->inactive_flows > q->flows/2)
		fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
				if (q->rate_enable)
					smp_store_release(&sk->sk_pacing_status,
							  SK_PACING_FQ);
				if (fq_flow_is_throttled(f))
					fq_flow_unset_throttled(q, f);
				f->time_next_packet = 0ULL;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	/* f->t_root is already zeroed after kmem_cache_zalloc() */

	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk == sk) {
		f->socket_hash = sk->sk_hash;
		if (q->rate_enable)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
	}
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}

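/* Return, without dequeuing it, the skb with the smallest time_to_send:
 * either the leftmost node of t_root or the head of the linear FIFO.
 */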
static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *skb = skb_rb_first(&flow->t_root);
	struct sk_buff *head = flow->head;

	if (!skb)
		return head;

	if (!head)
		return skb;

	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
		return skb;
	return head;
}

static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb == flow->head) {
		flow->head = skb->next;
	} else {
		rb_erase(&skb->rbnode, &flow->t_root);
		skb->dev = qdisc_dev(sch);
	}
}

/* Remove one skb from flow queue.
 * This skb must be the return value of prior fq_peek().
 */
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	flow->qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
}

static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct rb_node **p, *parent;
	struct sk_buff *head, *aux;

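	/* Common case: this skb's time_to_send is not earlier than the last
	 * queued packet's, so we can append to the linear FIFO in O(1).
	 * Otherwise, insert into the t_root rbtree sorted by time_to_send.
	 */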
	head = flow->head;
	if (!head ||
	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
		if (!head)
			flow->head = skb;
		else
			flow->tail->next = skb;
		flow->tail = skb;
		skb->next = NULL;
		return;
	}

	p = &flow->t_root.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		aux = rb_to_skb(parent);
		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &flow->t_root);
}

static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
				     const struct fq_sched_data *q)
{
	return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
}

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		      struct sk_buff **to_free)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop(skb, sch, to_free);

	if (!skb->tstamp) {
		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
	} else {
		/* Check if packet timestamp is too far in the future.
		 * Try our cached value first, to avoid ktime_get_ns()
		 * cost in most cases.
		 */
		if (fq_packet_beyond_horizon(skb, q)) {
			/* Refresh our cache and check another time */
			q->ktime_cache = ktime_get_ns();
			if (fq_packet_beyond_horizon(skb, q)) {
				if (q->horizon_drop) {
					q->stat_horizon_drops++;
					return qdisc_drop(skb, sch, to_free);
				}
				q->stat_horizon_caps++;
				skb->tstamp = q->ktime_cache + q->horizon;
			}
		}
		fq_skb_cb(skb)->time_to_send = skb->tstamp;
	}

	f = fq_classify(skb, q);
	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
		q->stat_flows_plimit++;
		return qdisc_drop(skb, sch, to_free);
	}

	f->qlen++;
	qdisc_qstats_backlog_inc(sch, skb);
	if (fq_flow_is_detached(f)) {
		fq_flow_add_tail(&q->new_flows, f);
		if (time_after(jiffies, f->age + q->flow_refill_delay))
			f->credit = max_t(u32, f->credit, q->quantum);
		q->inactive_flows--;
	}

	/* Note: this overwrites f->age */
	flow_queue_add(f, skb);

	if (unlikely(f == &q->internal)) {
		q->stat_internal_packets++;
	}
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}

static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	unsigned long sample;
	struct rb_node *p;

	if (q->time_next_delayed_flow > now)
		return;

	/* Update unthrottle latency EWMA.
	 * This is cheap and can help diagnosing timer/latency problems.
	 */
	sample = (unsigned long)(now - q->time_next_delayed_flow);
	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
	q->unthrottle_latency_ns += sample >> 3;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		fq_flow_unset_throttled(q, f);
	}
}

static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;
	unsigned long rate;
	u32 plen;
	u64 now;

	if (!sch->q.qlen)
		return NULL;

	skb = fq_peek(&q->internal);
	if (unlikely(skb)) {
		fq_dequeue_skb(sch, &q->internal, skb);
		goto out;
	}

	q->ktime_cache = now = ktime_get_ns();
	fq_check_throttled(q, now);
begin:
	head = &q->new_flows;
	if (!head->first) {
		head = &q->old_flows;
		if (!head->first) {
			if (q->time_next_delayed_flow != ~0ULL)
				qdisc_watchdog_schedule_range_ns(&q->watchdog,
							q->time_next_delayed_flow,
							q->timer_slack);
			return NULL;
		}
	}
	f = head->first;

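	/* Deficit Round Robin: a flow that has exhausted its credit gets a
	 * quantum refill and is moved to the tail of the old flows list.
	 */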
	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(&q->old_flows, f);
		goto begin;
	}

	skb = fq_peek(f);
	if (skb) {
		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now < time_next_packet) {
			head->first = f->next;
			f->time_next_packet = time_next_packet;
			fq_flow_set_throttled(q, f);
			goto begin;
		}
		prefetch(&skb->end);
		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
			INET_ECN_set_ce(skb);
			q->stat_ce_mark++;
		}
		fq_dequeue_skb(sch, f, skb);
	} else {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && q->old_flows.first) {
			fq_flow_add_tail(&q->old_flows, f);
		} else {
			fq_flow_set_detached(f);
			q->inactive_flows++;
		}
		goto begin;
	}
	plen = qdisc_pkt_len(skb);
	f->credit -= plen;

	if (!q->rate_enable)
		goto out;

	rate = q->flow_max_rate;

	/* If EDT time was provided for this skb, we need to
	 * update f->time_next_packet only if this qdisc enforces
	 * a flow max rate.
	 */
	if (!skb->tstamp) {
		if (skb->sk)
			rate = min(skb->sk->sk_pacing_rate, rate);

		if (rate <= q->low_rate_threshold) {
			f->credit = 0;
		} else {
			plen = max(plen, q->quantum);
			if (f->credit > 0)
				goto out;
		}
	}
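	/* Pacing: delay this flow's next packet by the time @plen bytes
	 * take to serialize at @rate, i.e. plen * NSEC_PER_SEC / rate
	 * nanoseconds, clamped to one second below.
	 */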
	if (rate != ~0UL) {
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			len = div64_ul(len, rate);
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed !
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}

static void fq_flow_purge(struct fq_flow *flow)
{
	struct rb_node *p = rb_first(&flow->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &flow->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;
}

static void fq_reset(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	struct fq_flow *f;
	unsigned int idx;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	fq_flow_purge(&q->internal);

	if (!q->fq_root)
		return;

	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
		root = &q->fq_root[idx];
		while ((p = rb_first(root)) != NULL) {
			f = rb_entry(p, struct fq_flow, fq_node);
			rb_erase(p, root);

			fq_flow_purge(f);

			kmem_cache_free(fq_flow_cachep, f);
		}
	}
	q->new_flows.first = NULL;
	q->old_flows.first = NULL;
	q->delayed = RB_ROOT;
	q->flows = 0;
	q->inactive_flows = 0;
	q->throttled_flows = 0;
}

static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = rb_entry(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_ptr(of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = rb_entry(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}

static void fq_free(void *addr)
{
	kvfree(addr);
}

static int fq_resize(struct Qdisc *sch, u32 log)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *array;
	void *old_fq_root;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	/* If XPS was setup, we can allocate memory on the right NUMA node */
	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
			      netdev_queue_numa_node_read(sch->dev_queue));
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	sch_tree_lock(sch);

	old_fq_root = q->fq_root;
	if (old_fq_root)
		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);

	q->fq_root = array;
	q->fq_trees_log = log;

	sch_tree_unlock(sch);

	fq_free(old_fq_root);

	return 0;
}

static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },

	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
};

static int fq_change(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	unsigned drop_len = 0;
	u32 fq_log;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
					  NULL);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);

	if (tb[TCA_FQ_FLOW_PLIMIT])
		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);

	if (tb[TCA_FQ_QUANTUM]) {
		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

		if (quantum > 0 && quantum <= (1 << 20)) {
			q->quantum = quantum;
		} else {
			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
			err = -EINVAL;
		}
	}

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
	}
	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
		q->low_rate_threshold =
			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			q->rate_enable = enable;
		else
			err = -EINVAL;
	}

	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);

		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
	}

	if (tb[TCA_FQ_ORPHAN_MASK])
		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);

	if (tb[TCA_FQ_CE_THRESHOLD])
		q->ce_threshold = (u64)NSEC_PER_USEC *
				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);

	if (tb[TCA_FQ_TIMER_SLACK])
		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);

	if (tb[TCA_FQ_HORIZON])
		q->horizon = (u64)NSEC_PER_USEC *
			     nla_get_u32(tb[TCA_FQ_HORIZON]);

	if (tb[TCA_FQ_HORIZON_DROP])
		q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);

	if (!err) {

		sch_tree_unlock(sch);
		err = fq_resize(sch, fq_log);
		sch_tree_lock(sch);
	}
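	/* The limit may have been lowered: trim the queue, accounting the
	 * dropped packets and bytes so parent qdiscs stay consistent.
	 */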
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		if (!skb)
			break;
		drop_len += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
		drop_count++;
	}
	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);

	sch_tree_unlock(sch);
	return err;
}

static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);

	fq_reset(sch);
	fq_free(q->fq_root);
	qdisc_watchdog_cancel(&q->watchdog);
}

static int fq_init(struct Qdisc *sch, struct nlattr *opt,
		   struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int err;

	sch->limit = 10000;
	q->flow_plimit = 100;
	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
	q->flow_refill_delay = msecs_to_jiffies(40);
	q->flow_max_rate = ~0UL;
	q->time_next_delayed_flow = ~0ULL;
	q->rate_enable = 1;
	q->new_flows.first = NULL;
	q->old_flows.first = NULL;
	q->delayed = RB_ROOT;
	q->fq_root = NULL;
	q->fq_trees_log = ilog2(1024);
	q->orphan_mask = 1024 - 1;
	q->low_rate_threshold = 550000 / 8;

	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */

	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
	q->horizon_drop = 1; /* by default, drop packets beyond horizon */

	/* Default ce_threshold of 4294 seconds */
	q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;

	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);

	if (opt)
		err = fq_change(sch, opt, extack);
	else
		err = fq_resize(sch, q->fq_trees_log);

	return err;
}

static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 ce_threshold = q->ce_threshold;
	u64 horizon = q->horizon;
	struct nlattr *opts;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */

	do_div(ce_threshold, NSEC_PER_USEC);
	do_div(horizon, NSEC_PER_USEC);

	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(q->flow_refill_delay)) ||
	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
			q->low_rate_threshold) ||
	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}

static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_fq_qd_stats st;

	sch_tree_lock(sch);

	st.gc_flows = q->stat_gc_flows;
	st.highprio_packets = q->stat_internal_packets;
	st.tcp_retrans = 0;
	st.throttled = q->stat_throttled;
	st.flows_plimit = q->stat_flows_plimit;
	st.pkts_too_long = q->stat_pkts_too_long;
	st.allocation_errors = q->stat_allocation_errors;
	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
				    ktime_get_ns();
	st.flows = q->flows;
	st.inactive_flows = q->inactive_flows;
	st.throttled_flows = q->throttled_flows;
	st.unthrottle_latency_ns = min_t(unsigned long,
					 q->unthrottle_latency_ns, ~0U);
	st.ce_mark = q->stat_ce_mark;
	st.horizon_drops = q->stat_horizon_drops;
	st.horizon_caps = q->stat_horizon_caps;
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id		= "fq",
	.priv_size	= sizeof(struct fq_sched_data),

	.enqueue	= fq_enqueue,
	.dequeue	= fq_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= fq_init,
	.reset		= fq_reset,
	.destroy	= fq_destroy,
	.change		= fq_change,
	.dump		= fq_dump,
	.dump_stats	= fq_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, 0, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}

static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Fair Queue Packet Scheduler");