1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-only
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun * TCP Low Priority (TCP-LP)
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * TCP Low Priority is a distributed algorithm whose goal is to utilize only
6*4882a593Smuzhiyun * the excess network bandwidth as compared to the ``fair share`` of
7*4882a593Smuzhiyun * bandwidth as targeted by TCP.
8*4882a593Smuzhiyun *
9*4882a593Smuzhiyun * As of 2.6.13, Linux supports pluggable congestion control algorithms.
10*4882a593Smuzhiyun * Due to the limitation of the API, we take the following changes from
11*4882a593Smuzhiyun * the original TCP-LP implementation:
12*4882a593Smuzhiyun * o We use newReno in most core CA handling. Only add some checking
13*4882a593Smuzhiyun * within cong_avoid.
 *   o Error correction for remote HZ: the remote HZ estimate is kept under
 *     continuous checking and updating.
 *   o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
 *     OWD has a similar meaning to RTT. Also correct the buggy formula.
18*4882a593Smuzhiyun * o Handle reaction for Early Congestion Indication (ECI) within
19*4882a593Smuzhiyun * pkts_acked, as mentioned within pseudo code.
 *   o OWD is handled in relative format, where the local time stamp will be
 *     in tcp_time_stamp format.
22*4882a593Smuzhiyun *
23*4882a593Smuzhiyun * Original Author:
24*4882a593Smuzhiyun * Aleksandar Kuzmanovic <akuzma@northwestern.edu>
25*4882a593Smuzhiyun * Available from:
26*4882a593Smuzhiyun * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
27*4882a593Smuzhiyun * Original implementation for 2.4.19:
28*4882a593Smuzhiyun * http://www-ece.rice.edu/networks/TCP-LP/
29*4882a593Smuzhiyun *
30*4882a593Smuzhiyun * 2.6.x module Authors:
31*4882a593Smuzhiyun * Wong Hoi Sing, Edison <hswong3i@gmail.com>
32*4882a593Smuzhiyun * Hung Hing Lun, Mike <hlhung3i@gmail.com>
33*4882a593Smuzhiyun * SourceForge project page:
34*4882a593Smuzhiyun * http://tcp-lp-mod.sourceforge.net/
35*4882a593Smuzhiyun */
36*4882a593Smuzhiyun
37*4882a593Smuzhiyun #include <linux/module.h>
38*4882a593Smuzhiyun #include <net/tcp.h>
39*4882a593Smuzhiyun
/* Resolution in which OWD values are expressed (defined as TCP_TS_HZ). */
#define LP_RESOL	TCP_TS_HZ
42*4882a593Smuzhiyun
/**
 * enum tcp_lp_state - TCP-LP state bits
 * @LP_VALID_RHZ: the remote HZ estimate is valid
 * @LP_VALID_OWD: the latest OWD sample is valid
 * @LP_WITHIN_THR: the smoothed OWD is within the threshold
 * @LP_WITHIN_INF: the connection is within the inference period
 *
 * Bit masks that are OR-ed into the TCP-LP flag word.
 * The set exists mainly as a debugging aid.  Bit 2 is not used.
 */
enum tcp_lp_state {
	LP_VALID_RHZ	= 0x01,
	LP_VALID_OWD	= 0x02,
	LP_WITHIN_THR	= 0x08,
	LP_WITHIN_INF	= 0x10,
};
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun /**
61*4882a593Smuzhiyun * struct lp
62*4882a593Smuzhiyun * @flag: TCP-LP state flag
63*4882a593Smuzhiyun * @sowd: smoothed OWD << 3
64*4882a593Smuzhiyun * @owd_min: min OWD
65*4882a593Smuzhiyun * @owd_max: max OWD
66*4882a593Smuzhiyun * @owd_max_rsv: resrved max owd
67*4882a593Smuzhiyun * @remote_hz: estimated remote HZ
68*4882a593Smuzhiyun * @remote_ref_time: remote reference time
69*4882a593Smuzhiyun * @local_ref_time: local reference time
70*4882a593Smuzhiyun * @last_drop: time for last active drop
71*4882a593Smuzhiyun * @inference: current inference
72*4882a593Smuzhiyun *
73*4882a593Smuzhiyun * TCP-LP's private struct.
74*4882a593Smuzhiyun * We get the idea from original TCP-LP implementation where only left those we
75*4882a593Smuzhiyun * found are really useful.
76*4882a593Smuzhiyun */
77*4882a593Smuzhiyun struct lp {
78*4882a593Smuzhiyun u32 flag;
79*4882a593Smuzhiyun u32 sowd;
80*4882a593Smuzhiyun u32 owd_min;
81*4882a593Smuzhiyun u32 owd_max;
82*4882a593Smuzhiyun u32 owd_max_rsv;
83*4882a593Smuzhiyun u32 remote_hz;
84*4882a593Smuzhiyun u32 remote_ref_time;
85*4882a593Smuzhiyun u32 local_ref_time;
86*4882a593Smuzhiyun u32 last_drop;
87*4882a593Smuzhiyun u32 inference;
88*4882a593Smuzhiyun };
89*4882a593Smuzhiyun
90*4882a593Smuzhiyun /**
91*4882a593Smuzhiyun * tcp_lp_init
92*4882a593Smuzhiyun *
93*4882a593Smuzhiyun * Init all required variables.
94*4882a593Smuzhiyun * Clone the handling from Vegas module implementation.
95*4882a593Smuzhiyun */
tcp_lp_init(struct sock * sk)96*4882a593Smuzhiyun static void tcp_lp_init(struct sock *sk)
97*4882a593Smuzhiyun {
98*4882a593Smuzhiyun struct lp *lp = inet_csk_ca(sk);
99*4882a593Smuzhiyun
100*4882a593Smuzhiyun lp->flag = 0;
101*4882a593Smuzhiyun lp->sowd = 0;
102*4882a593Smuzhiyun lp->owd_min = 0xffffffff;
103*4882a593Smuzhiyun lp->owd_max = 0;
104*4882a593Smuzhiyun lp->owd_max_rsv = 0;
105*4882a593Smuzhiyun lp->remote_hz = 0;
106*4882a593Smuzhiyun lp->remote_ref_time = 0;
107*4882a593Smuzhiyun lp->local_ref_time = 0;
108*4882a593Smuzhiyun lp->last_drop = 0;
109*4882a593Smuzhiyun lp->inference = 0;
110*4882a593Smuzhiyun }
111*4882a593Smuzhiyun
112*4882a593Smuzhiyun /**
113*4882a593Smuzhiyun * tcp_lp_cong_avoid
114*4882a593Smuzhiyun *
115*4882a593Smuzhiyun * Implementation of cong_avoid.
116*4882a593Smuzhiyun * Will only call newReno CA when away from inference.
117*4882a593Smuzhiyun * From TCP-LP's paper, this will be handled in additive increasement.
118*4882a593Smuzhiyun */
tcp_lp_cong_avoid(struct sock * sk,u32 ack,u32 acked)119*4882a593Smuzhiyun static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
120*4882a593Smuzhiyun {
121*4882a593Smuzhiyun struct lp *lp = inet_csk_ca(sk);
122*4882a593Smuzhiyun
123*4882a593Smuzhiyun if (!(lp->flag & LP_WITHIN_INF))
124*4882a593Smuzhiyun tcp_reno_cong_avoid(sk, ack, acked);
125*4882a593Smuzhiyun }
126*4882a593Smuzhiyun
127*4882a593Smuzhiyun /**
128*4882a593Smuzhiyun * tcp_lp_remote_hz_estimator
129*4882a593Smuzhiyun *
130*4882a593Smuzhiyun * Estimate remote HZ.
131*4882a593Smuzhiyun * We keep on updating the estimated value, where original TCP-LP
132*4882a593Smuzhiyun * implementation only guest it for once and use forever.
133*4882a593Smuzhiyun */
tcp_lp_remote_hz_estimator(struct sock * sk)134*4882a593Smuzhiyun static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
135*4882a593Smuzhiyun {
136*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
137*4882a593Smuzhiyun struct lp *lp = inet_csk_ca(sk);
138*4882a593Smuzhiyun s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */
139*4882a593Smuzhiyun s64 m = 0;
140*4882a593Smuzhiyun
141*4882a593Smuzhiyun /* not yet record reference time
142*4882a593Smuzhiyun * go away!! record it before come back!! */
143*4882a593Smuzhiyun if (lp->remote_ref_time == 0 || lp->local_ref_time == 0)
144*4882a593Smuzhiyun goto out;
145*4882a593Smuzhiyun
146*4882a593Smuzhiyun /* we can't calc remote HZ with no different!! */
147*4882a593Smuzhiyun if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
148*4882a593Smuzhiyun tp->rx_opt.rcv_tsecr == lp->local_ref_time)
149*4882a593Smuzhiyun goto out;
150*4882a593Smuzhiyun
151*4882a593Smuzhiyun m = TCP_TS_HZ *
152*4882a593Smuzhiyun (tp->rx_opt.rcv_tsval - lp->remote_ref_time) /
153*4882a593Smuzhiyun (tp->rx_opt.rcv_tsecr - lp->local_ref_time);
154*4882a593Smuzhiyun if (m < 0)
155*4882a593Smuzhiyun m = -m;
156*4882a593Smuzhiyun
157*4882a593Smuzhiyun if (rhz > 0) {
158*4882a593Smuzhiyun m -= rhz >> 6; /* m is now error in remote HZ est */
159*4882a593Smuzhiyun rhz += m; /* 63/64 old + 1/64 new */
160*4882a593Smuzhiyun } else
161*4882a593Smuzhiyun rhz = m << 6;
162*4882a593Smuzhiyun
163*4882a593Smuzhiyun out:
164*4882a593Smuzhiyun /* record time for successful remote HZ calc */
165*4882a593Smuzhiyun if ((rhz >> 6) > 0)
166*4882a593Smuzhiyun lp->flag |= LP_VALID_RHZ;
167*4882a593Smuzhiyun else
168*4882a593Smuzhiyun lp->flag &= ~LP_VALID_RHZ;
169*4882a593Smuzhiyun
170*4882a593Smuzhiyun /* record reference time stamp */
171*4882a593Smuzhiyun lp->remote_ref_time = tp->rx_opt.rcv_tsval;
172*4882a593Smuzhiyun lp->local_ref_time = tp->rx_opt.rcv_tsecr;
173*4882a593Smuzhiyun
174*4882a593Smuzhiyun return rhz >> 6;
175*4882a593Smuzhiyun }
176*4882a593Smuzhiyun
177*4882a593Smuzhiyun /**
178*4882a593Smuzhiyun * tcp_lp_owd_calculator
179*4882a593Smuzhiyun *
180*4882a593Smuzhiyun * Calculate one way delay (in relative format).
181*4882a593Smuzhiyun * Original implement OWD as minus of remote time difference to local time
182*4882a593Smuzhiyun * difference directly. As this time difference just simply equal to RTT, when
183*4882a593Smuzhiyun * the network status is stable, remote RTT will equal to local RTT, and result
184*4882a593Smuzhiyun * OWD into zero.
185*4882a593Smuzhiyun * It seems to be a bug and so we fixed it.
186*4882a593Smuzhiyun */
tcp_lp_owd_calculator(struct sock * sk)187*4882a593Smuzhiyun static u32 tcp_lp_owd_calculator(struct sock *sk)
188*4882a593Smuzhiyun {
189*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
190*4882a593Smuzhiyun struct lp *lp = inet_csk_ca(sk);
191*4882a593Smuzhiyun s64 owd = 0;
192*4882a593Smuzhiyun
193*4882a593Smuzhiyun lp->remote_hz = tcp_lp_remote_hz_estimator(sk);
194*4882a593Smuzhiyun
195*4882a593Smuzhiyun if (lp->flag & LP_VALID_RHZ) {
196*4882a593Smuzhiyun owd =
197*4882a593Smuzhiyun tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
198*4882a593Smuzhiyun tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ);
199*4882a593Smuzhiyun if (owd < 0)
200*4882a593Smuzhiyun owd = -owd;
201*4882a593Smuzhiyun }
202*4882a593Smuzhiyun
203*4882a593Smuzhiyun if (owd > 0)
204*4882a593Smuzhiyun lp->flag |= LP_VALID_OWD;
205*4882a593Smuzhiyun else
206*4882a593Smuzhiyun lp->flag &= ~LP_VALID_OWD;
207*4882a593Smuzhiyun
208*4882a593Smuzhiyun return owd;
209*4882a593Smuzhiyun }
210*4882a593Smuzhiyun
211*4882a593Smuzhiyun /**
212*4882a593Smuzhiyun * tcp_lp_rtt_sample
213*4882a593Smuzhiyun *
214*4882a593Smuzhiyun * Implementation or rtt_sample.
215*4882a593Smuzhiyun * Will take the following action,
216*4882a593Smuzhiyun * 1. calc OWD,
217*4882a593Smuzhiyun * 2. record the min/max OWD,
218*4882a593Smuzhiyun * 3. calc smoothed OWD (SOWD).
219*4882a593Smuzhiyun * Most ideas come from the original TCP-LP implementation.
220*4882a593Smuzhiyun */
tcp_lp_rtt_sample(struct sock * sk,u32 rtt)221*4882a593Smuzhiyun static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
222*4882a593Smuzhiyun {
223*4882a593Smuzhiyun struct lp *lp = inet_csk_ca(sk);
224*4882a593Smuzhiyun s64 mowd = tcp_lp_owd_calculator(sk);
225*4882a593Smuzhiyun
226*4882a593Smuzhiyun /* sorry that we don't have valid data */
227*4882a593Smuzhiyun if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD))
228*4882a593Smuzhiyun return;
229*4882a593Smuzhiyun
230*4882a593Smuzhiyun /* record the next min owd */
231*4882a593Smuzhiyun if (mowd < lp->owd_min)
232*4882a593Smuzhiyun lp->owd_min = mowd;
233*4882a593Smuzhiyun
234*4882a593Smuzhiyun /* always forget the max of the max
235*4882a593Smuzhiyun * we just set owd_max as one below it */
236*4882a593Smuzhiyun if (mowd > lp->owd_max) {
237*4882a593Smuzhiyun if (mowd > lp->owd_max_rsv) {
238*4882a593Smuzhiyun if (lp->owd_max_rsv == 0)
239*4882a593Smuzhiyun lp->owd_max = mowd;
240*4882a593Smuzhiyun else
241*4882a593Smuzhiyun lp->owd_max = lp->owd_max_rsv;
242*4882a593Smuzhiyun lp->owd_max_rsv = mowd;
243*4882a593Smuzhiyun } else
244*4882a593Smuzhiyun lp->owd_max = mowd;
245*4882a593Smuzhiyun }
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun /* calc for smoothed owd */
248*4882a593Smuzhiyun if (lp->sowd != 0) {
249*4882a593Smuzhiyun mowd -= lp->sowd >> 3; /* m is now error in owd est */
250*4882a593Smuzhiyun lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */
251*4882a593Smuzhiyun } else
252*4882a593Smuzhiyun lp->sowd = mowd << 3; /* take the measured time be owd */
253*4882a593Smuzhiyun }
254*4882a593Smuzhiyun
255*4882a593Smuzhiyun /**
256*4882a593Smuzhiyun * tcp_lp_pkts_acked
257*4882a593Smuzhiyun *
258*4882a593Smuzhiyun * Implementation of pkts_acked.
259*4882a593Smuzhiyun * Deal with active drop under Early Congestion Indication.
260*4882a593Smuzhiyun * Only drop to half and 1 will be handle, because we hope to use back
261*4882a593Smuzhiyun * newReno in increase case.
262*4882a593Smuzhiyun * We work it out by following the idea from TCP-LP's paper directly
263*4882a593Smuzhiyun */
tcp_lp_pkts_acked(struct sock * sk,const struct ack_sample * sample)264*4882a593Smuzhiyun static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
265*4882a593Smuzhiyun {
266*4882a593Smuzhiyun struct tcp_sock *tp = tcp_sk(sk);
267*4882a593Smuzhiyun struct lp *lp = inet_csk_ca(sk);
268*4882a593Smuzhiyun u32 now = tcp_time_stamp(tp);
269*4882a593Smuzhiyun u32 delta;
270*4882a593Smuzhiyun
271*4882a593Smuzhiyun if (sample->rtt_us > 0)
272*4882a593Smuzhiyun tcp_lp_rtt_sample(sk, sample->rtt_us);
273*4882a593Smuzhiyun
274*4882a593Smuzhiyun /* calc inference */
275*4882a593Smuzhiyun delta = now - tp->rx_opt.rcv_tsecr;
276*4882a593Smuzhiyun if ((s32)delta > 0)
277*4882a593Smuzhiyun lp->inference = 3 * delta;
278*4882a593Smuzhiyun
279*4882a593Smuzhiyun /* test if within inference */
280*4882a593Smuzhiyun if (lp->last_drop && (now - lp->last_drop < lp->inference))
281*4882a593Smuzhiyun lp->flag |= LP_WITHIN_INF;
282*4882a593Smuzhiyun else
283*4882a593Smuzhiyun lp->flag &= ~LP_WITHIN_INF;
284*4882a593Smuzhiyun
285*4882a593Smuzhiyun /* test if within threshold */
286*4882a593Smuzhiyun if (lp->sowd >> 3 <
287*4882a593Smuzhiyun lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100)
288*4882a593Smuzhiyun lp->flag |= LP_WITHIN_THR;
289*4882a593Smuzhiyun else
290*4882a593Smuzhiyun lp->flag &= ~LP_WITHIN_THR;
291*4882a593Smuzhiyun
292*4882a593Smuzhiyun pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
293*4882a593Smuzhiyun tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
294*4882a593Smuzhiyun lp->sowd >> 3);
295*4882a593Smuzhiyun
296*4882a593Smuzhiyun if (lp->flag & LP_WITHIN_THR)
297*4882a593Smuzhiyun return;
298*4882a593Smuzhiyun
299*4882a593Smuzhiyun /* FIXME: try to reset owd_min and owd_max here
300*4882a593Smuzhiyun * so decrease the chance the min/max is no longer suitable
301*4882a593Smuzhiyun * and will usually within threshold when whithin inference */
302*4882a593Smuzhiyun lp->owd_min = lp->sowd >> 3;
303*4882a593Smuzhiyun lp->owd_max = lp->sowd >> 2;
304*4882a593Smuzhiyun lp->owd_max_rsv = lp->sowd >> 2;
305*4882a593Smuzhiyun
306*4882a593Smuzhiyun /* happened within inference
307*4882a593Smuzhiyun * drop snd_cwnd into 1 */
308*4882a593Smuzhiyun if (lp->flag & LP_WITHIN_INF)
309*4882a593Smuzhiyun tp->snd_cwnd = 1U;
310*4882a593Smuzhiyun
311*4882a593Smuzhiyun /* happened after inference
312*4882a593Smuzhiyun * cut snd_cwnd into half */
313*4882a593Smuzhiyun else
314*4882a593Smuzhiyun tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
315*4882a593Smuzhiyun
316*4882a593Smuzhiyun /* record this drop time */
317*4882a593Smuzhiyun lp->last_drop = now;
318*4882a593Smuzhiyun }
319*4882a593Smuzhiyun
320*4882a593Smuzhiyun static struct tcp_congestion_ops tcp_lp __read_mostly = {
321*4882a593Smuzhiyun .init = tcp_lp_init,
322*4882a593Smuzhiyun .ssthresh = tcp_reno_ssthresh,
323*4882a593Smuzhiyun .undo_cwnd = tcp_reno_undo_cwnd,
324*4882a593Smuzhiyun .cong_avoid = tcp_lp_cong_avoid,
325*4882a593Smuzhiyun .pkts_acked = tcp_lp_pkts_acked,
326*4882a593Smuzhiyun
327*4882a593Smuzhiyun .owner = THIS_MODULE,
328*4882a593Smuzhiyun .name = "lp"
329*4882a593Smuzhiyun };
330*4882a593Smuzhiyun
tcp_lp_register(void)331*4882a593Smuzhiyun static int __init tcp_lp_register(void)
332*4882a593Smuzhiyun {
333*4882a593Smuzhiyun BUILD_BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE);
334*4882a593Smuzhiyun return tcp_register_congestion_control(&tcp_lp);
335*4882a593Smuzhiyun }
336*4882a593Smuzhiyun
tcp_lp_unregister(void)337*4882a593Smuzhiyun static void __exit tcp_lp_unregister(void)
338*4882a593Smuzhiyun {
339*4882a593Smuzhiyun tcp_unregister_congestion_control(&tcp_lp);
340*4882a593Smuzhiyun }
341*4882a593Smuzhiyun
342*4882a593Smuzhiyun module_init(tcp_lp_register);
343*4882a593Smuzhiyun module_exit(tcp_lp_unregister);
344*4882a593Smuzhiyun
345*4882a593Smuzhiyun MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun Mike");
346*4882a593Smuzhiyun MODULE_LICENSE("GPL");
347*4882a593Smuzhiyun MODULE_DESCRIPTION("TCP Low Priority");
348