xref: /OK3568_Linux_fs/kernel/drivers/md/dm-historical-service-time.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Historical Service Time
4*4882a593Smuzhiyun  *
5*4882a593Smuzhiyun  *  Keeps a time-weighted exponential moving average of the historical
6*4882a593Smuzhiyun  *  service time. Estimates future service time based on the historical
7*4882a593Smuzhiyun  *  service time and the number of outstanding requests.
8*4882a593Smuzhiyun  *
9*4882a593Smuzhiyun  *  Marks paths stale if they have not finished within hst *
10*4882a593Smuzhiyun  *  num_paths. If a path is stale and unused, we will send a single
11*4882a593Smuzhiyun  *  request to probe in case the path has improved. This situation
12*4882a593Smuzhiyun  *  generally arises if the path is so much worse than others that it
13*4882a593Smuzhiyun  *  will never have the best estimated service time, or if the entire
14*4882a593Smuzhiyun  *  multipath device is unused. If a path is stale and in use, limit the
15*4882a593Smuzhiyun  *  number of requests it can receive with the assumption that the path
16*4882a593Smuzhiyun  *  has become degraded.
17*4882a593Smuzhiyun  *
18*4882a593Smuzhiyun  *  To avoid repeatedly calculating exponents for time weighting, times
 *  are split into HST_WEIGHT_COUNT buckets each (1 << HST_BUCKET_SHIFT)
20*4882a593Smuzhiyun  *  ns, and the weighting is pre-calculated.
21*4882a593Smuzhiyun  *
22*4882a593Smuzhiyun  */
23*4882a593Smuzhiyun 
24*4882a593Smuzhiyun #include "dm.h"
25*4882a593Smuzhiyun #include "dm-path-selector.h"
26*4882a593Smuzhiyun 
27*4882a593Smuzhiyun #include <linux/blkdev.h>
28*4882a593Smuzhiyun #include <linux/slab.h>
29*4882a593Smuzhiyun #include <linux/module.h>
30*4882a593Smuzhiyun 
31*4882a593Smuzhiyun 
#define DM_MSG_PREFIX	"multipath historical-service-time"
#define HST_MIN_IO 1		/* default per-path repeat_count */
#define HST_VERSION "0.1.1"

#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)	/* 1.0 in fixed point */
#define HST_FIXED_95 972	/* ~0.95 in fixed point (972/1024) */
#define HST_MAX_INFLIGHT HST_FIXED_1
#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
#define HST_WEIGHT_COUNT 64ULL
43*4882a593Smuzhiyun 
/* Per-path-group state shared by all paths of this selector instance. */
struct selector {
	struct list_head valid_paths;	/* paths eligible for selection */
	struct list_head failed_paths;	/* paths taken out of service */
	int valid_count;		/* number of entries on valid_paths */
	spinlock_t lock;		/* guards both lists and valid_count */

	/* Pre-computed EMA decay factors, one per elapsed-time bucket. */
	unsigned int weights[HST_WEIGHT_COUNT];
	/* Latency-similarity threshold; see hst_create() for semantics. */
	unsigned int threshold_multiplier;
};
53*4882a593Smuzhiyun 
/* Per-path state; hangs off dm_path->pscontext. */
struct path_info {
	struct list_head list;		/* on selector valid/failed list */
	struct dm_path *path;
	unsigned int repeat_count;

	spinlock_t lock;		/* guards the fields below */

	u64 historical_service_time; /* Fixed point */

	/* After this time a path with no completions is considered stale. */
	u64 stale_after;
	u64 last_finish;	/* completion time of the last request, ns */

	u64 outstanding;	/* requests currently in flight on this path */
};
68*4882a593Smuzhiyun 
69*4882a593Smuzhiyun /**
70*4882a593Smuzhiyun  * fixed_power - compute: x^n, in O(log n) time
71*4882a593Smuzhiyun  *
72*4882a593Smuzhiyun  * @x:         base of the power
73*4882a593Smuzhiyun  * @frac_bits: fractional bits of @x
74*4882a593Smuzhiyun  * @n:         power to raise @x to.
75*4882a593Smuzhiyun  *
76*4882a593Smuzhiyun  * By exploiting the relation between the definition of the natural power
77*4882a593Smuzhiyun  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
78*4882a593Smuzhiyun  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
79*4882a593Smuzhiyun  * (where: n_i \elem {0, 1}, the binary vector representing n),
80*4882a593Smuzhiyun  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
81*4882a593Smuzhiyun  * of course trivially computable in O(log_2 n), the length of our binary
82*4882a593Smuzhiyun  * vector.
83*4882a593Smuzhiyun  *
84*4882a593Smuzhiyun  * (see: kernel/sched/loadavg.c)
85*4882a593Smuzhiyun  */
static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
{
	/*
	 * NOTE(review): result is unsigned long (32 bits on 32-bit
	 * architectures), matching the kernel/sched/loadavg.c original
	 * this mirrors — confirm the value range is acceptable here.
	 */
	unsigned long result = 1UL << frac_bits;

	if (n) {
		for (;;) {
			/* Fold in this binary digit of n, rounding to
			 * nearest before dropping the fraction bits.
			 */
			if (n & 1) {
				result *= x;
				result += 1UL << (frac_bits - 1);
				result >>= frac_bits;
			}
			n >>= 1;
			if (!n)
				break;
			/* Square the base for the next binary digit. */
			x *= x;
			x += 1UL << (frac_bits - 1);
			x >>= frac_bits;
		}
	}

	return result;
}
108*4882a593Smuzhiyun 
109*4882a593Smuzhiyun /*
110*4882a593Smuzhiyun  * Calculate the next value of an exponential moving average
111*4882a593Smuzhiyun  * a_1 = a_0 * e + a * (1 - e)
112*4882a593Smuzhiyun  *
113*4882a593Smuzhiyun  * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
114*4882a593Smuzhiyun  * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
115*4882a593Smuzhiyun  * @weight: [0, HST_FIXED_1]
116*4882a593Smuzhiyun  *
117*4882a593Smuzhiyun  * Note:
118*4882a593Smuzhiyun  *   To account for multiple periods in the same calculation,
119*4882a593Smuzhiyun  *   a_n = a_0 * e^n + a * (1 - e^n),
120*4882a593Smuzhiyun  *   so call fixed_ema(last, next, pow(weight, N))
121*4882a593Smuzhiyun  */
static u64 fixed_ema(u64 last, u64 next, u64 weight)
{
	/* Blend old and new; acc carries HST_FIXED_SHIFT fraction bits. */
	u64 acc = last * weight + next * (HST_FIXED_1 - weight);

	/* Round to nearest before dropping the fraction bits. */
	acc += 1ULL << (HST_FIXED_SHIFT - 1);
	return acc >> HST_FIXED_SHIFT;
}
129*4882a593Smuzhiyun 
alloc_selector(void)130*4882a593Smuzhiyun static struct selector *alloc_selector(void)
131*4882a593Smuzhiyun {
132*4882a593Smuzhiyun 	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
133*4882a593Smuzhiyun 
134*4882a593Smuzhiyun 	if (s) {
135*4882a593Smuzhiyun 		INIT_LIST_HEAD(&s->valid_paths);
136*4882a593Smuzhiyun 		INIT_LIST_HEAD(&s->failed_paths);
137*4882a593Smuzhiyun 		spin_lock_init(&s->lock);
138*4882a593Smuzhiyun 		s->valid_count = 0;
139*4882a593Smuzhiyun 	}
140*4882a593Smuzhiyun 
141*4882a593Smuzhiyun 	return s;
142*4882a593Smuzhiyun }
143*4882a593Smuzhiyun 
144*4882a593Smuzhiyun /*
145*4882a593Smuzhiyun  * Get the weight for a given time span.
146*4882a593Smuzhiyun  */
static u64 hst_weight(struct path_selector *ps, u64 delta)
{
	struct selector *s = ps->context;
	u64 bucket = delta >> HST_BUCKET_SHIFT;

	/* Deltas beyond the table land in the final (zero-weight) bucket. */
	if (bucket > HST_WEIGHT_COUNT - 1)
		bucket = HST_WEIGHT_COUNT - 1;

	return s->weights[bucket];
}
155*4882a593Smuzhiyun 
156*4882a593Smuzhiyun /*
157*4882a593Smuzhiyun  * Set up the weights array.
158*4882a593Smuzhiyun  *
159*4882a593Smuzhiyun  * weights[len-1] = 0
160*4882a593Smuzhiyun  * weights[n] = base ^ (n + 1)
161*4882a593Smuzhiyun  */
hst_set_weights(struct path_selector * ps,unsigned int base)162*4882a593Smuzhiyun static void hst_set_weights(struct path_selector *ps, unsigned int base)
163*4882a593Smuzhiyun {
164*4882a593Smuzhiyun 	struct selector *s = ps->context;
165*4882a593Smuzhiyun 	int i;
166*4882a593Smuzhiyun 
167*4882a593Smuzhiyun 	if (base >= HST_FIXED_1)
168*4882a593Smuzhiyun 		return;
169*4882a593Smuzhiyun 
170*4882a593Smuzhiyun 	for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
171*4882a593Smuzhiyun 		s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
172*4882a593Smuzhiyun 	s->weights[HST_WEIGHT_COUNT - 1] = 0;
173*4882a593Smuzhiyun }
174*4882a593Smuzhiyun 
hst_create(struct path_selector * ps,unsigned int argc,char ** argv)175*4882a593Smuzhiyun static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
176*4882a593Smuzhiyun {
177*4882a593Smuzhiyun 	struct selector *s;
178*4882a593Smuzhiyun 	unsigned int base_weight = HST_FIXED_95;
179*4882a593Smuzhiyun 	unsigned int threshold_multiplier = 0;
180*4882a593Smuzhiyun 	char dummy;
181*4882a593Smuzhiyun 
182*4882a593Smuzhiyun 	/*
183*4882a593Smuzhiyun 	 * Arguments: [<base_weight> [<threshold_multiplier>]]
184*4882a593Smuzhiyun 	 *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
185*4882a593Smuzhiyun 	 *                  value of 0 will completely ignore any history.
186*4882a593Smuzhiyun 	 *                  If not given, default (HST_FIXED_95) is used.
187*4882a593Smuzhiyun 	 *   <threshold_multiplier>: Minimum threshold multiplier for paths to
188*4882a593Smuzhiyun 	 *                  be considered different. That is, a path is
189*4882a593Smuzhiyun 	 *                  considered different iff (p1 > N * p2) where p1
190*4882a593Smuzhiyun 	 *                  is the path with higher service time. A threshold
191*4882a593Smuzhiyun 	 *                  of 1 or 0 has no effect. Defaults to 0.
192*4882a593Smuzhiyun 	 */
193*4882a593Smuzhiyun 	if (argc > 2)
194*4882a593Smuzhiyun 		return -EINVAL;
195*4882a593Smuzhiyun 
196*4882a593Smuzhiyun 	if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
197*4882a593Smuzhiyun 	     base_weight >= HST_FIXED_1)) {
198*4882a593Smuzhiyun 		return -EINVAL;
199*4882a593Smuzhiyun 	}
200*4882a593Smuzhiyun 
201*4882a593Smuzhiyun 	if (argc > 1 && (sscanf(argv[1], "%u%c",
202*4882a593Smuzhiyun 				&threshold_multiplier, &dummy) != 1)) {
203*4882a593Smuzhiyun 		return -EINVAL;
204*4882a593Smuzhiyun 	}
205*4882a593Smuzhiyun 
206*4882a593Smuzhiyun 	s = alloc_selector();
207*4882a593Smuzhiyun 	if (!s)
208*4882a593Smuzhiyun 		return -ENOMEM;
209*4882a593Smuzhiyun 
210*4882a593Smuzhiyun 	ps->context = s;
211*4882a593Smuzhiyun 
212*4882a593Smuzhiyun 	hst_set_weights(ps, base_weight);
213*4882a593Smuzhiyun 	s->threshold_multiplier = threshold_multiplier;
214*4882a593Smuzhiyun 	return 0;
215*4882a593Smuzhiyun }
216*4882a593Smuzhiyun 
free_paths(struct list_head * paths)217*4882a593Smuzhiyun static void free_paths(struct list_head *paths)
218*4882a593Smuzhiyun {
219*4882a593Smuzhiyun 	struct path_info *pi, *next;
220*4882a593Smuzhiyun 
221*4882a593Smuzhiyun 	list_for_each_entry_safe(pi, next, paths, list) {
222*4882a593Smuzhiyun 		list_del(&pi->list);
223*4882a593Smuzhiyun 		kfree(pi);
224*4882a593Smuzhiyun 	}
225*4882a593Smuzhiyun }
226*4882a593Smuzhiyun 
hst_destroy(struct path_selector * ps)227*4882a593Smuzhiyun static void hst_destroy(struct path_selector *ps)
228*4882a593Smuzhiyun {
229*4882a593Smuzhiyun 	struct selector *s = ps->context;
230*4882a593Smuzhiyun 
231*4882a593Smuzhiyun 	free_paths(&s->valid_paths);
232*4882a593Smuzhiyun 	free_paths(&s->failed_paths);
233*4882a593Smuzhiyun 	kfree(s);
234*4882a593Smuzhiyun 	ps->context = NULL;
235*4882a593Smuzhiyun }
236*4882a593Smuzhiyun 
static int hst_status(struct path_selector *ps, struct dm_path *path,
		     status_type_t type, char *result, unsigned int maxlen)
{
	unsigned int sz = 0;
	struct path_info *pi;
	struct selector *s;

	if (!path) {
		/* Selector-level status: arg count + create-time parameters. */
		s = ps->context;
		DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
		return sz;
	}

	/* Per-path status. */
	pi = path->pscontext;
	if (type == STATUSTYPE_INFO)
		DMEMIT("%llu %llu %llu ", pi->historical_service_time,
		       pi->outstanding, pi->stale_after);
	else if (type == STATUSTYPE_TABLE)
		DMEMIT("0 ");

	return sz;
}
263*4882a593Smuzhiyun 
hst_add_path(struct path_selector * ps,struct dm_path * path,int argc,char ** argv,char ** error)264*4882a593Smuzhiyun static int hst_add_path(struct path_selector *ps, struct dm_path *path,
265*4882a593Smuzhiyun 		       int argc, char **argv, char **error)
266*4882a593Smuzhiyun {
267*4882a593Smuzhiyun 	struct selector *s = ps->context;
268*4882a593Smuzhiyun 	struct path_info *pi;
269*4882a593Smuzhiyun 	unsigned int repeat_count = HST_MIN_IO;
270*4882a593Smuzhiyun 	char dummy;
271*4882a593Smuzhiyun 	unsigned long flags;
272*4882a593Smuzhiyun 
273*4882a593Smuzhiyun 	/*
274*4882a593Smuzhiyun 	 * Arguments: [<repeat_count>]
275*4882a593Smuzhiyun 	 *   <repeat_count>: The number of I/Os before switching path.
276*4882a593Smuzhiyun 	 *                   If not given, default (HST_MIN_IO) is used.
277*4882a593Smuzhiyun 	 */
278*4882a593Smuzhiyun 	if (argc > 1) {
279*4882a593Smuzhiyun 		*error = "historical-service-time ps: incorrect number of arguments";
280*4882a593Smuzhiyun 		return -EINVAL;
281*4882a593Smuzhiyun 	}
282*4882a593Smuzhiyun 
283*4882a593Smuzhiyun 	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
284*4882a593Smuzhiyun 		*error = "historical-service-time ps: invalid repeat count";
285*4882a593Smuzhiyun 		return -EINVAL;
286*4882a593Smuzhiyun 	}
287*4882a593Smuzhiyun 
288*4882a593Smuzhiyun 	/* allocate the path */
289*4882a593Smuzhiyun 	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
290*4882a593Smuzhiyun 	if (!pi) {
291*4882a593Smuzhiyun 		*error = "historical-service-time ps: Error allocating path context";
292*4882a593Smuzhiyun 		return -ENOMEM;
293*4882a593Smuzhiyun 	}
294*4882a593Smuzhiyun 
295*4882a593Smuzhiyun 	pi->path = path;
296*4882a593Smuzhiyun 	pi->repeat_count = repeat_count;
297*4882a593Smuzhiyun 
298*4882a593Smuzhiyun 	pi->historical_service_time = HST_FIXED_1;
299*4882a593Smuzhiyun 
300*4882a593Smuzhiyun 	spin_lock_init(&pi->lock);
301*4882a593Smuzhiyun 	pi->outstanding = 0;
302*4882a593Smuzhiyun 
303*4882a593Smuzhiyun 	pi->stale_after = 0;
304*4882a593Smuzhiyun 	pi->last_finish = 0;
305*4882a593Smuzhiyun 
306*4882a593Smuzhiyun 	path->pscontext = pi;
307*4882a593Smuzhiyun 
308*4882a593Smuzhiyun 	spin_lock_irqsave(&s->lock, flags);
309*4882a593Smuzhiyun 	list_add_tail(&pi->list, &s->valid_paths);
310*4882a593Smuzhiyun 	s->valid_count++;
311*4882a593Smuzhiyun 	spin_unlock_irqrestore(&s->lock, flags);
312*4882a593Smuzhiyun 
313*4882a593Smuzhiyun 	return 0;
314*4882a593Smuzhiyun }
315*4882a593Smuzhiyun 
hst_fail_path(struct path_selector * ps,struct dm_path * path)316*4882a593Smuzhiyun static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
317*4882a593Smuzhiyun {
318*4882a593Smuzhiyun 	struct selector *s = ps->context;
319*4882a593Smuzhiyun 	struct path_info *pi = path->pscontext;
320*4882a593Smuzhiyun 	unsigned long flags;
321*4882a593Smuzhiyun 
322*4882a593Smuzhiyun 	spin_lock_irqsave(&s->lock, flags);
323*4882a593Smuzhiyun 	list_move(&pi->list, &s->failed_paths);
324*4882a593Smuzhiyun 	s->valid_count--;
325*4882a593Smuzhiyun 	spin_unlock_irqrestore(&s->lock, flags);
326*4882a593Smuzhiyun }
327*4882a593Smuzhiyun 
hst_reinstate_path(struct path_selector * ps,struct dm_path * path)328*4882a593Smuzhiyun static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
329*4882a593Smuzhiyun {
330*4882a593Smuzhiyun 	struct selector *s = ps->context;
331*4882a593Smuzhiyun 	struct path_info *pi = path->pscontext;
332*4882a593Smuzhiyun 	unsigned long flags;
333*4882a593Smuzhiyun 
334*4882a593Smuzhiyun 	spin_lock_irqsave(&s->lock, flags);
335*4882a593Smuzhiyun 	list_move_tail(&pi->list, &s->valid_paths);
336*4882a593Smuzhiyun 	s->valid_count++;
337*4882a593Smuzhiyun 	spin_unlock_irqrestore(&s->lock, flags);
338*4882a593Smuzhiyun 
339*4882a593Smuzhiyun 	return 0;
340*4882a593Smuzhiyun }
341*4882a593Smuzhiyun 
/* Snapshot, under pi->lock, the three fields hst_compare() needs. */
static void hst_fill_compare(struct path_info *pi, u64 *hst,
			     u64 *out, u64 *stale)
{
	unsigned long irqflags;

	spin_lock_irqsave(&pi->lock, irqflags);
	*hst = pi->historical_service_time;
	*out = pi->outstanding;
	*stale = pi->stale_after;
	spin_unlock_irqrestore(&pi->lock, irqflags);
}
353*4882a593Smuzhiyun 
354*4882a593Smuzhiyun /*
355*4882a593Smuzhiyun  * Compare the estimated service time of 2 paths, pi1 and pi2,
356*4882a593Smuzhiyun  * for the incoming I/O.
357*4882a593Smuzhiyun  *
358*4882a593Smuzhiyun  * Returns:
359*4882a593Smuzhiyun  * < 0 : pi1 is better
360*4882a593Smuzhiyun  * 0   : no difference between pi1 and pi2
361*4882a593Smuzhiyun  * > 0 : pi2 is better
362*4882a593Smuzhiyun  *
363*4882a593Smuzhiyun  */
static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
			     u64 time_now, struct path_selector *ps)
{
	struct selector *s = ps->context;
	u64 hst1, hst2;
	/*
	 * NOTE(review): out/stale are signed so the "out1 - out2" returns
	 * below work, but hst_fill_compare() takes u64 * — the &out1 etc.
	 * arguments rely on u64 being unsigned long long here; confirm
	 * this builds warning-free on all target architectures.
	 */
	long long out1, out2, stale1, stale2;
	int pi2_better, over_threshold;

	hst_fill_compare(pi1, &hst1, &out1, &stale1);
	hst_fill_compare(pi2, &hst2, &out2, &stale2);

	/* Check here if estimated latency for two paths are too similar.
	 * If this is the case, we skip extra calculation and just compare
	 * outstanding requests. In this case, any unloaded paths will
	 * be preferred.
	 */
	if (hst1 > hst2)
		over_threshold = hst1 > (s->threshold_multiplier * hst2);
	else
		over_threshold = hst2 > (s->threshold_multiplier * hst1);

	if (!over_threshold)
		return out1 - out2;

	/*
	 * If an unloaded path is stale, choose it. If both paths are unloaded,
	 * choose path that is the most stale.
	 * (If one path is loaded, choose the other)
	 */
	if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
	    (!out1 && !out2))
		return (!out2 * stale1) - (!out1 * stale2);

	/* Compare estimated service time. If outstanding is the same, we
	 * don't need to multiply
	 */
	if (out1 == out2) {
		pi2_better = hst1 > hst2;
	} else {
		/* Potential overflow with out >= 1024 */
		if (unlikely(out1 >= HST_MAX_INFLIGHT ||
			     out2 >= HST_MAX_INFLIGHT)) {
			/* If over 1023 in-flights, we may overflow if hst
			 * is at max. (With this shift we still overflow at
			 * 1048576 in-flights, which is high enough).
			 */
			hst1 >>= HST_FIXED_SHIFT;
			hst2 >>= HST_FIXED_SHIFT;
		}
		/* Estimated completion time of a new request: queue depth
		 * (including the new request) times per-request service time.
		 */
		pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
	}

	/* In the case that the 'winner' is stale, limit to equal usage. */
	if (pi2_better) {
		if (stale2 < time_now)
			return out1 - out2;
		return 1;
	}
	if (stale1 < time_now)
		return out1 - out2;
	return -1;
}
426*4882a593Smuzhiyun 
static struct dm_path *hst_select_path(struct path_selector *ps,
				       size_t nr_bytes)
{
	struct selector *s = ps->context;
	struct path_info *pi, *best = NULL;
	u64 time_now = ktime_get_ns();
	struct dm_path *ret = NULL;
	unsigned long irqflags;

	spin_lock_irqsave(&s->lock, irqflags);

	/* Linear scan for the path with the lowest estimated service time.
	 * An empty valid list simply leaves best == NULL.
	 */
	list_for_each_entry(pi, &s->valid_paths, list) {
		if (!best || hst_compare(pi, best, time_now, ps) < 0)
			best = pi;
	}

	if (best) {
		/* Move last used path to end (least preferred for ties). */
		list_move_tail(&best->list, &s->valid_paths);
		ret = best->path;
	}

	spin_unlock_irqrestore(&s->lock, irqflags);
	return ret;
}
457*4882a593Smuzhiyun 
static int hst_start_io(struct path_selector *ps, struct dm_path *path,
			size_t nr_bytes)
{
	struct path_info *pi = path->pscontext;
	unsigned long irqflags;

	/* Count the request as in flight on this path. */
	spin_lock_irqsave(&pi->lock, irqflags);
	pi->outstanding++;
	spin_unlock_irqrestore(&pi->lock, irqflags);

	return 0;
}
470*4882a593Smuzhiyun 
/*
 * Service time of a completed request in ns, serialized against the
 * path's previous completion. Caller holds pi->lock (updates
 * pi->last_finish).
 */
static u64 path_service_time(struct path_info *pi, u64 start_time)
{
	u64 now = ktime_get_ns();

	/* if a previous disk request has finished after this IO was
	 * sent to the hardware, pretend the submission happened
	 * serially.
	 */
	if (time_after64(pi->last_finish, start_time))
		start_time = pi->last_finish;

	pi->last_finish = now;
	/* Guard against apparent negative spans; report zero instead. */
	if (time_before64(now, start_time))
		return 0;

	return now - start_time;
}
488*4882a593Smuzhiyun 
static int hst_end_io(struct path_selector *ps, struct dm_path *path,
		      size_t nr_bytes, u64 start_time)
{
	struct path_info *pi = path->pscontext;
	struct selector *s = ps->context;
	unsigned long flags;
	u64 st;

	spin_lock_irqsave(&pi->lock, flags);

	st = path_service_time(pi, start_time);
	pi->outstanding--;
	/* Fold the measurement into the EMA; the new sample is converted
	 * to fixed point and clamped so the multiply cannot overflow.
	 */
	pi->historical_service_time =
		fixed_ema(pi->historical_service_time,
			  min(st * HST_FIXED_1, HST_FIXED_MAX),
			  hst_weight(ps, st));

	/*
	 * On request end, mark path as fresh. If a path hasn't
	 * finished any requests within the fresh period, the estimated
	 * service time is considered too optimistic and we limit the
	 * maximum requests on that path.
	 *
	 * NOTE(review): s->valid_count is read here under pi->lock, not
	 * s->lock, so it can race with fail/reinstate; presumably an
	 * acceptable approximation for a staleness heuristic — confirm.
	 */
	pi->stale_after = pi->last_finish +
		(s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));

	spin_unlock_irqrestore(&pi->lock, flags);

	return 0;
}
519*4882a593Smuzhiyun 
/*
 * Selector registration. table_args/info_args match what hst_status()
 * emits: one table arg ("0 ") and three info args
 * (historical_service_time, outstanding, stale_after).
 */
static struct path_selector_type hst_ps = {
	.name		= "historical-service-time",
	.module		= THIS_MODULE,
	.table_args	= 1,
	.info_args	= 3,
	.create		= hst_create,
	.destroy	= hst_destroy,
	.status		= hst_status,
	.add_path	= hst_add_path,
	.fail_path	= hst_fail_path,
	.reinstate_path	= hst_reinstate_path,
	.select_path	= hst_select_path,
	.start_io	= hst_start_io,
	.end_io		= hst_end_io,
};
535*4882a593Smuzhiyun 
dm_hst_init(void)536*4882a593Smuzhiyun static int __init dm_hst_init(void)
537*4882a593Smuzhiyun {
538*4882a593Smuzhiyun 	int r = dm_register_path_selector(&hst_ps);
539*4882a593Smuzhiyun 
540*4882a593Smuzhiyun 	if (r < 0)
541*4882a593Smuzhiyun 		DMERR("register failed %d", r);
542*4882a593Smuzhiyun 
543*4882a593Smuzhiyun 	DMINFO("version " HST_VERSION " loaded");
544*4882a593Smuzhiyun 
545*4882a593Smuzhiyun 	return r;
546*4882a593Smuzhiyun }
547*4882a593Smuzhiyun 
/* Module exit: unregister the selector; log if that fails. */
static void __exit dm_hst_exit(void)
{
	int r = dm_unregister_path_selector(&hst_ps);

	if (r < 0)
		DMERR("unregister failed %d", r);
}
555*4882a593Smuzhiyun 
556*4882a593Smuzhiyun module_init(dm_hst_init);
557*4882a593Smuzhiyun module_exit(dm_hst_exit);
558*4882a593Smuzhiyun 
559*4882a593Smuzhiyun MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
560*4882a593Smuzhiyun MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
561*4882a593Smuzhiyun MODULE_LICENSE("GPL");
562