kernel/block/blk-throttle.c

1 // SPDX-License-Identifier: GPL-2.0
13 #include <linux/blk-cgroup.h>
15 #include "blk-cgroup-rwstat.h"
30 #define DFL_LATENCY_TARGET (-1L)
35  * For HD, very small latency comes from sequential IO. Such IO is helpless to
55  * the parent, they're popped in round-robin order so that no single source
60  * throtl_service_queue and then dispatched in round-robin order.
69 	struct list_head	node;		/* service_queue->queued[] */
86 	 * their ->disptime.
96 	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
178 	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
188 /* We measure latency for request size from <= 4k to >= 1M */
197 	unsigned long latency; /* ns / 1024 */  member
246 	return pd_to_blkg(&tg->pd);  in tg_to_blkg()
250  * sq_to_tg - return the throl_grp the specified service queue belongs to
253  * Return the throtl_grp @sq belongs to.  If @sq is the top-level one
258 	if (sq && sq->parent_sq)  in sq_to_tg()
265  * sq_to_td - return throtl_data the specified service queue belongs to
276 		return tg->td;  in sq_to_td()
282  * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
285  *           every throtl_slice, the limit scales up 1/2 .low limit till the
287  * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
289 static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)  in throtl_adjusted_limit()  argument
292 	if (td->scale < 4096 && time_after_eq(jiffies,  in throtl_adjusted_limit()
293 	    td->low_upgrade_time + td->scale * td->throtl_slice))  in throtl_adjusted_limit()
294 		td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;  in throtl_adjusted_limit()
296 	return low + (low >> 1) * td->scale;  in throtl_adjusted_limit()
305 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)  in tg_bps_limit()
308 	td = tg->td;  in tg_bps_limit()
309 	ret = tg->bps[rw][td->limit_index];  in tg_bps_limit()
310 	if (ret == 0 && td->limit_index == LIMIT_LOW) {  in tg_bps_limit()
312 		if (!list_empty(&blkg->blkcg->css.children) ||  in tg_bps_limit()
313 		    tg->iops[rw][td->limit_index])  in tg_bps_limit()
319 	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&  in tg_bps_limit()
320 	    tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {  in tg_bps_limit()
323 		adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);  in tg_bps_limit()
324 		ret = min(tg->bps[rw][LIMIT_MAX], adjusted);  in tg_bps_limit()
335 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)  in tg_iops_limit()
338 	td = tg->td;  in tg_iops_limit()
339 	ret = tg->iops[rw][td->limit_index];  in tg_iops_limit()
340 	if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {  in tg_iops_limit()
342 		if (!list_empty(&blkg->blkcg->css.children) ||  in tg_iops_limit()
343 		    tg->bps[rw][td->limit_index])  in tg_iops_limit()
349 	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&  in tg_iops_limit()
350 	    tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {  in tg_iops_limit()
353 		adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);  in tg_iops_limit()
356 		ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);  in tg_iops_limit()
362 	clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
365  * throtl_log - log debug message via blktrace
378 	if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
381 		blk_add_cgroup_trace_msg(__td->queue,			\
382 			tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\
384 		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
393 	return bio->bi_iter.bi_size;  in throtl_bio_data_size()
398 	INIT_LIST_HEAD(&qn->node);  in throtl_qnode_init()
399 	bio_list_init(&qn->bios);  in throtl_qnode_init()
400 	qn->tg = tg;  in throtl_qnode_init()
404  * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
407  * @queued: the service_queue->queued[] list @qn belongs to
410  * @qn->tg's reference count is bumped when @qn is activated.  See the
416 	bio_list_add(&qn->bios, bio);  in throtl_qnode_add_bio()
417 	if (list_empty(&qn->node)) {  in throtl_qnode_add_bio()
418 		list_add_tail(&qn->node, queued);  in throtl_qnode_add_bio()
419 		blkg_get(tg_to_blkg(qn->tg));  in throtl_qnode_add_bio()
424  * throtl_peek_queued - peek the first bio on a qnode list
436 	bio = bio_list_peek(&qn->bios);  in throtl_peek_queued()
442  * throtl_pop_queued - pop the first bio form a qnode list
448  * that the popping order is round-robin.
465 	bio = bio_list_pop(&qn->bios);  in throtl_pop_queued()
468 	if (bio_list_empty(&qn->bios)) {  in throtl_pop_queued()
469 		list_del_init(&qn->node);  in throtl_pop_queued()
471 			*tg_to_put = qn->tg;  in throtl_pop_queued()
473 			blkg_put(tg_to_blkg(qn->tg));  in throtl_pop_queued()
475 		list_move_tail(&qn->node, queued);  in throtl_pop_queued()
484 	INIT_LIST_HEAD(&sq->queued[0]);  in throtl_service_queue_init()
485 	INIT_LIST_HEAD(&sq->queued[1]);  in throtl_service_queue_init()
486 	sq->pending_tree = RB_ROOT_CACHED;  in throtl_service_queue_init()
487 	timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);  in throtl_service_queue_init()
497 	tg = kzalloc_node(sizeof(*tg), gfp, q->node);  in throtl_pd_alloc()
501 	if (blkg_rwstat_init(&tg->stat_bytes, gfp))  in throtl_pd_alloc()
504 	if (blkg_rwstat_init(&tg->stat_ios, gfp))  in throtl_pd_alloc()
507 	throtl_service_queue_init(&tg->service_queue);  in throtl_pd_alloc()
510 		throtl_qnode_init(&tg->qnode_on_self[rw], tg);  in throtl_pd_alloc()
511 		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);  in throtl_pd_alloc()
514 	RB_CLEAR_NODE(&tg->rb_node);  in throtl_pd_alloc()
515 	tg->bps[READ][LIMIT_MAX] = U64_MAX;  in throtl_pd_alloc()
516 	tg->bps[WRITE][LIMIT_MAX] = U64_MAX;  in throtl_pd_alloc()
517 	tg->iops[READ][LIMIT_MAX] = UINT_MAX;  in throtl_pd_alloc()
518 	tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;  in throtl_pd_alloc()
519 	tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;  in throtl_pd_alloc()
520 	tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;  in throtl_pd_alloc()
521 	tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;  in throtl_pd_alloc()
522 	tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;  in throtl_pd_alloc()
525 	tg->latency_target = DFL_LATENCY_TARGET;  in throtl_pd_alloc()
526 	tg->latency_target_conf = DFL_LATENCY_TARGET;  in throtl_pd_alloc()
527 	tg->idletime_threshold = DFL_IDLE_THRESHOLD;  in throtl_pd_alloc()
528 	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;  in throtl_pd_alloc()
530 	return &tg->pd;  in throtl_pd_alloc()
533 	blkg_rwstat_exit(&tg->stat_bytes);  in throtl_pd_alloc()
543 	struct throtl_data *td = blkg->q->td;  in throtl_pd_init()
544 	struct throtl_service_queue *sq = &tg->service_queue;  in throtl_pd_init()
559 	sq->parent_sq = &td->service_queue;  in throtl_pd_init()
560 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)  in throtl_pd_init()
561 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;  in throtl_pd_init()
562 	tg->td = td;  in throtl_pd_init()
572 	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);  in tg_update_has_rules()
573 	struct throtl_data *td = tg->td;  in tg_update_has_rules()
577 		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||  in tg_update_has_rules()
578 			(td->limit_valid[td->limit_index] &&  in tg_update_has_rules()
600 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {  in blk_throtl_update_limit_valid()
603 		if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||  in blk_throtl_update_limit_valid()
604 		    tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {  in blk_throtl_update_limit_valid()
611 	td->limit_valid[LIMIT_LOW] = low_valid;  in blk_throtl_update_limit_valid()
619 	tg->bps[READ][LIMIT_LOW] = 0;  in throtl_pd_offline()
620 	tg->bps[WRITE][LIMIT_LOW] = 0;  in throtl_pd_offline()
621 	tg->iops[READ][LIMIT_LOW] = 0;  in throtl_pd_offline()
622 	tg->iops[WRITE][LIMIT_LOW] = 0;  in throtl_pd_offline()
624 	blk_throtl_update_limit_valid(tg->td);  in throtl_pd_offline()
626 	if (!tg->td->limit_valid[tg->td->limit_index])  in throtl_pd_offline()
627 		throtl_upgrade_state(tg->td);  in throtl_pd_offline()
634 	del_timer_sync(&tg->service_queue.pending_timer);  in throtl_pd_free()
635 	blkg_rwstat_exit(&tg->stat_bytes);  in throtl_pd_free()
636 	blkg_rwstat_exit(&tg->stat_ios);  in throtl_pd_free()
645 	n = rb_first_cached(&parent_sq->pending_tree);  in throtl_rb_first()
655 	rb_erase_cached(n, &parent_sq->pending_tree);  in throtl_rb_erase()
657 	--parent_sq->nr_pending;  in throtl_rb_erase()
668 	parent_sq->first_pending_disptime = tg->disptime;  in update_min_dispatch_time()
673 	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;  in tg_service_queue_add()
674 	struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;  in tg_service_queue_add()
677 	unsigned long key = tg->disptime;  in tg_service_queue_add()
684 		if (time_before(key, __tg->disptime))  in tg_service_queue_add()
685 			node = &parent->rb_left;  in tg_service_queue_add()
687 			node = &parent->rb_right;  in tg_service_queue_add()
692 	rb_link_node(&tg->rb_node, parent, node);  in tg_service_queue_add()
693 	rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,  in tg_service_queue_add()
699 	if (!(tg->flags & THROTL_TG_PENDING)) {  in throtl_enqueue_tg()
701 		tg->flags |= THROTL_TG_PENDING;  in throtl_enqueue_tg()
702 		tg->service_queue.parent_sq->nr_pending++;  in throtl_enqueue_tg()
708 	if (tg->flags & THROTL_TG_PENDING) {  in throtl_dequeue_tg()
709 		throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);  in throtl_dequeue_tg()
710 		tg->flags &= ~THROTL_TG_PENDING;  in throtl_dequeue_tg()
718 	unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;  in throtl_schedule_pending_timer()
729 	mod_timer(&sq->pending_timer, expires);  in throtl_schedule_pending_timer()
731 		   expires - jiffies, jiffies);  in throtl_schedule_pending_timer()
735  * throtl_schedule_next_dispatch - schedule the next dispatch cycle
739  * Arm @sq->pending_timer so that the next dispatch cycle starts on the
749  * delay before dispatch starts even if @sq->first_pending_disptime is not
756 	if (!sq->nr_pending)  in throtl_schedule_next_dispatch()
762 	if (force || time_after(sq->first_pending_disptime, jiffies)) {  in throtl_schedule_next_dispatch()
763 		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);  in throtl_schedule_next_dispatch()
774 	tg->bytes_disp[rw] = 0;  in throtl_start_new_slice_with_credit()
775 	tg->io_disp[rw] = 0;  in throtl_start_new_slice_with_credit()
777 	atomic_set(&tg->io_split_cnt[rw], 0);  in throtl_start_new_slice_with_credit()
785 	if (time_after_eq(start, tg->slice_start[rw]))  in throtl_start_new_slice_with_credit()
786 		tg->slice_start[rw] = start;  in throtl_start_new_slice_with_credit()
788 	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;  in throtl_start_new_slice_with_credit()
789 	throtl_log(&tg->service_queue,  in throtl_start_new_slice_with_credit()
791 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],  in throtl_start_new_slice_with_credit()
792 		   tg->slice_end[rw], jiffies);  in throtl_start_new_slice_with_credit()
797 	tg->bytes_disp[rw] = 0;  in throtl_start_new_slice()
798 	tg->io_disp[rw] = 0;  in throtl_start_new_slice()
799 	tg->slice_start[rw] = jiffies;  in throtl_start_new_slice()
800 	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;  in throtl_start_new_slice()
802 	atomic_set(&tg->io_split_cnt[rw], 0);  in throtl_start_new_slice()
804 	throtl_log(&tg->service_queue,  in throtl_start_new_slice()
806 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],  in throtl_start_new_slice()
807 		   tg->slice_end[rw], jiffies);  in throtl_start_new_slice()
813 	tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);  in throtl_set_slice_end()
820 	throtl_log(&tg->service_queue,  in throtl_extend_slice()
822 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],  in throtl_extend_slice()
823 		   tg->slice_end[rw], jiffies);  in throtl_extend_slice()
829 	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))  in throtl_slice_used()
841 	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));  in throtl_trim_slice()
844 	 * If bps are unlimited (-1), then time slice don't get  in throtl_trim_slice()
853 	 * that initially cgroup limit was very low resulting in high  in throtl_trim_slice()
859 	throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);  in throtl_trim_slice()
861 	time_elapsed = jiffies - tg->slice_start[rw];  in throtl_trim_slice()
863 	nr_slices = time_elapsed / tg->td->throtl_slice;  in throtl_trim_slice()
867 	tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;  in throtl_trim_slice()
871 	io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /  in throtl_trim_slice()
877 	if (tg->bytes_disp[rw] >= bytes_trim)  in throtl_trim_slice()
878 		tg->bytes_disp[rw] -= bytes_trim;  in throtl_trim_slice()
880 		tg->bytes_disp[rw] = 0;  in throtl_trim_slice()
882 	if (tg->io_disp[rw] >= io_trim)  in throtl_trim_slice()
883 		tg->io_disp[rw] -= io_trim;  in throtl_trim_slice()
885 		tg->io_disp[rw] = 0;  in throtl_trim_slice()
887 	tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;  in throtl_trim_slice()
889 	throtl_log(&tg->service_queue,  in throtl_trim_slice()
892 		   tg->slice_start[rw], tg->slice_end[rw], jiffies);  in throtl_trim_slice()
909 	jiffy_elapsed = jiffies - tg->slice_start[rw];  in tg_with_in_iops_limit()
912 	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);  in tg_with_in_iops_limit()
929 	if (tg->io_disp[rw] + 1 <= io_allowed) {  in tg_with_in_iops_limit()
936 	jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;  in tg_with_in_iops_limit()
957 	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];  in tg_with_in_bps_limit()
961 		jiffy_elapsed_rnd = tg->td->throtl_slice;  in tg_with_in_bps_limit()
963 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);  in tg_with_in_bps_limit()
967 	if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {  in tg_with_in_bps_limit()
974 	extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;  in tg_with_in_bps_limit()
984 	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);  in tg_with_in_bps_limit()
992  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
1008 	BUG_ON(tg->service_queue.nr_queued[rw] &&  in tg_may_dispatch()
1009 	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));  in tg_may_dispatch()
1011 	/* If tg->bps = -1, then BW is unlimited */  in tg_may_dispatch()
1025 	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))  in tg_may_dispatch()
1028 		if (time_before(tg->slice_end[rw],  in tg_may_dispatch()
1029 		    jiffies + tg->td->throtl_slice))  in tg_may_dispatch()
1031 				jiffies + tg->td->throtl_slice);  in tg_may_dispatch()
1035 		tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0);  in tg_may_dispatch()
1049 	if (time_before(tg->slice_end[rw], jiffies + max_wait))  in tg_may_dispatch()
1061 	tg->bytes_disp[rw] += bio_size;  in throtl_charge_bio()
1062 	tg->io_disp[rw]++;  in throtl_charge_bio()
1063 	tg->last_bytes_disp[rw] += bio_size;  in throtl_charge_bio()
1064 	tg->last_io_disp[rw]++;  in throtl_charge_bio()
1068 	 * more than once as a throttled bio will go through blk-throtl the  in throtl_charge_bio()
1077  * throtl_add_bio_tg - add a bio to the specified throtl_grp
1083  * tg->qnode_on_self[] is used.
1088 	struct throtl_service_queue *sq = &tg->service_queue;  in throtl_add_bio_tg()
1092 		qn = &tg->qnode_on_self[rw];  in throtl_add_bio_tg()
1100 	if (!sq->nr_queued[rw])  in throtl_add_bio_tg()
1101 		tg->flags |= THROTL_TG_WAS_EMPTY;  in throtl_add_bio_tg()
1103 	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);  in throtl_add_bio_tg()
1105 	sq->nr_queued[rw]++;  in throtl_add_bio_tg()
1111 	struct throtl_service_queue *sq = &tg->service_queue;  in tg_update_disptime()
1112 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;  in tg_update_disptime()
1115 	bio = throtl_peek_queued(&sq->queued[READ]);  in tg_update_disptime()
1119 	bio = throtl_peek_queued(&sq->queued[WRITE]);  in tg_update_disptime()
1128 	tg->disptime = disptime;  in tg_update_disptime()
1132 	tg->flags &= ~THROTL_TG_WAS_EMPTY;  in tg_update_disptime()
1140 				child_tg->slice_start[rw]);  in start_parent_slice_with_credit()
1147 	struct throtl_service_queue *sq = &tg->service_queue;  in tg_dispatch_one_bio()
1148 	struct throtl_service_queue *parent_sq = sq->parent_sq;  in tg_dispatch_one_bio()
1159 	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);  in tg_dispatch_one_bio()
1160 	sq->nr_queued[rw]--;  in tg_dispatch_one_bio()
1167 	 * @td->service_queue, @bio is ready to be issued.  Put it on its  in tg_dispatch_one_bio()
1172 		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);  in tg_dispatch_one_bio()
1175 		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],  in tg_dispatch_one_bio()
1176 				     &parent_sq->queued[rw]);  in tg_dispatch_one_bio()
1177 		BUG_ON(tg->td->nr_queued[rw] <= 0);  in tg_dispatch_one_bio()
1178 		tg->td->nr_queued[rw]--;  in tg_dispatch_one_bio()
1189 	struct throtl_service_queue *sq = &tg->service_queue;  in throtl_dispatch_tg()
1192 	unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads;  in throtl_dispatch_tg()
1197 	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&  in throtl_dispatch_tg()
1207 	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&  in throtl_dispatch_tg()
1228 		if (!parent_sq->nr_pending)  in throtl_select_dispatch()
1235 		if (time_before(jiffies, tg->disptime))  in throtl_select_dispatch()
1242 		sq = &tg->service_queue;  in throtl_select_dispatch()
1243 		if (sq->nr_queued[0] || sq->nr_queued[1])  in throtl_select_dispatch()
1256  * throtl_pending_timer_fn - timer function for service_queue->pending_timer
1267  * the top-level service_tree is reached, throtl_data->dispatch_work is
1275 	struct request_queue *q = td->queue;  in throtl_pending_timer_fn()
1280 	spin_lock_irq(&q->queue_lock);  in throtl_pending_timer_fn()
1285 	parent_sq = sq->parent_sq;  in throtl_pending_timer_fn()
1290 			   sq->nr_queued[READ] + sq->nr_queued[WRITE],  in throtl_pending_timer_fn()
1291 			   sq->nr_queued[READ], sq->nr_queued[WRITE]);  in throtl_pending_timer_fn()
1303 		spin_unlock_irq(&q->queue_lock);  in throtl_pending_timer_fn()
1305 		spin_lock_irq(&q->queue_lock);  in throtl_pending_timer_fn()
1313 		if (tg->flags & THROTL_TG_WAS_EMPTY) {  in throtl_pending_timer_fn()
1323 		/* reached the top-level, queue issuing */  in throtl_pending_timer_fn()
1324 		queue_work(kthrotld_workqueue, &td->dispatch_work);  in throtl_pending_timer_fn()
1327 	spin_unlock_irq(&q->queue_lock);  in throtl_pending_timer_fn()
1331  * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
1335  * of throtl_data->service_queue.  Those bios are ready and issued by this
1342 	struct throtl_service_queue *td_sq = &td->service_queue;  in blk_throtl_dispatch_work_fn()
1343 	struct request_queue *q = td->queue;  in blk_throtl_dispatch_work_fn()
1351 	spin_lock_irq(&q->queue_lock);  in blk_throtl_dispatch_work_fn()
1353 		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))  in blk_throtl_dispatch_work_fn()
1355 	spin_unlock_irq(&q->queue_lock);  in blk_throtl_dispatch_work_fn()
1390 			  &blkcg_policy_throtl, seq_cft(sf)->private, false);  in tg_print_conf_u64()
1397 			  &blkcg_policy_throtl, seq_cft(sf)->private, false);  in tg_print_conf_uint()
1403 	struct throtl_service_queue *sq = &tg->service_queue;  in tg_conf_updated()
1407 	throtl_log(&tg->service_queue,  in tg_conf_updated()
1417 	 * blk-throttle.  in tg_conf_updated()
1420 			global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {  in tg_conf_updated()
1426 		if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||  in tg_conf_updated()
1427 		    !blkg->parent->parent)  in tg_conf_updated()
1429 		parent_tg = blkg_to_tg(blkg->parent);  in tg_conf_updated()
1432 		 * higher latency target  in tg_conf_updated()
1434 		this_tg->idletime_threshold = min(this_tg->idletime_threshold,  in tg_conf_updated()
1435 				parent_tg->idletime_threshold);  in tg_conf_updated()
1436 		this_tg->latency_target = max(this_tg->latency_target,  in tg_conf_updated()
1437 				parent_tg->latency_target);  in tg_conf_updated()
1446 	 * account recently dispatched IO with new low rate.  in tg_conf_updated()
1451 	if (tg->flags & THROTL_TG_PENDING) {  in tg_conf_updated()
1453 		throtl_schedule_next_dispatch(sq->parent_sq, true);  in tg_conf_updated()
1470 	ret = -EINVAL;  in tg_set_conf()
1479 		*(u64 *)((void *)tg + of_cft(of)->private) = v;  in tg_set_conf()
1481 		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;  in tg_set_conf()
1506 			  seq_cft(sf)->private, true);  in tg_print_rwstat()
1524 			  seq_cft(sf)->private, true);  in tg_print_rwstat_recursive()
1580 	const char *dname = blkg_dev_name(pd->blkg);  in tg_prfill_limit()
1598 	if (tg->bps_conf[READ][off] == bps_dft &&  in tg_prfill_limit()
1599 	    tg->bps_conf[WRITE][off] == bps_dft &&  in tg_prfill_limit()
1600 	    tg->iops_conf[READ][off] == iops_dft &&  in tg_prfill_limit()
1601 	    tg->iops_conf[WRITE][off] == iops_dft &&  in tg_prfill_limit()
1603 	     (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&  in tg_prfill_limit()
1604 	      tg->latency_target_conf == DFL_LATENCY_TARGET)))  in tg_prfill_limit()
1607 	if (tg->bps_conf[READ][off] != U64_MAX)  in tg_prfill_limit()
1609 			tg->bps_conf[READ][off]);  in tg_prfill_limit()
1610 	if (tg->bps_conf[WRITE][off] != U64_MAX)  in tg_prfill_limit()
1612 			tg->bps_conf[WRITE][off]);  in tg_prfill_limit()
1613 	if (tg->iops_conf[READ][off] != UINT_MAX)  in tg_prfill_limit()
1615 			tg->iops_conf[READ][off]);  in tg_prfill_limit()
1616 	if (tg->iops_conf[WRITE][off] != UINT_MAX)  in tg_prfill_limit()
1618 			tg->iops_conf[WRITE][off]);  in tg_prfill_limit()
1620 		if (tg->idletime_threshold_conf == ULONG_MAX)  in tg_prfill_limit()
1624 				tg->idletime_threshold_conf);  in tg_prfill_limit()
1626 		if (tg->latency_target_conf == ULONG_MAX)  in tg_prfill_limit()
1627 			strcpy(latency_time, " latency=max");  in tg_prfill_limit()
1630 				" latency=%lu", tg->latency_target_conf);  in tg_prfill_limit()
1642 			  &blkcg_policy_throtl, seq_cft(sf)->private, false);  in tg_print_limit()
1656 	int index = of_cft(of)->private;  in tg_set_limit()
1664 	v[0] = tg->bps_conf[READ][index];  in tg_set_limit()
1665 	v[1] = tg->bps_conf[WRITE][index];  in tg_set_limit()
1666 	v[2] = tg->iops_conf[READ][index];  in tg_set_limit()
1667 	v[3] = tg->iops_conf[WRITE][index];  in tg_set_limit()
1669 	idle_time = tg->idletime_threshold_conf;  in tg_set_limit()
1670 	latency_time = tg->latency_target_conf;  in tg_set_limit()
1683 		ret = -EINVAL;  in tg_set_limit()
1689 		ret = -ERANGE;  in tg_set_limit()
1693 		ret = -EINVAL;  in tg_set_limit()
1704 		else if (off == LIMIT_LOW && !strcmp(tok, "latency"))  in tg_set_limit()
1710 	tg->bps_conf[READ][index] = v[0];  in tg_set_limit()
1711 	tg->bps_conf[WRITE][index] = v[1];  in tg_set_limit()
1712 	tg->iops_conf[READ][index] = v[2];  in tg_set_limit()
1713 	tg->iops_conf[WRITE][index] = v[3];  in tg_set_limit()
1716 		tg->bps[READ][index] = v[0];  in tg_set_limit()
1717 		tg->bps[WRITE][index] = v[1];  in tg_set_limit()
1718 		tg->iops[READ][index] = v[2];  in tg_set_limit()
1719 		tg->iops[WRITE][index] = v[3];  in tg_set_limit()
1721 	tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],  in tg_set_limit()
1722 		tg->bps_conf[READ][LIMIT_MAX]);  in tg_set_limit()
1723 	tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],  in tg_set_limit()
1724 		tg->bps_conf[WRITE][LIMIT_MAX]);  in tg_set_limit()
1725 	tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],  in tg_set_limit()
1726 		tg->iops_conf[READ][LIMIT_MAX]);  in tg_set_limit()
1727 	tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],  in tg_set_limit()
1728 		tg->iops_conf[WRITE][LIMIT_MAX]);  in tg_set_limit()
1729 	tg->idletime_threshold_conf = idle_time;  in tg_set_limit()
1730 	tg->latency_target_conf = latency_time;  in tg_set_limit()
1732 	/* force user to configure all settings for low limit  */  in tg_set_limit()
1733 	if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||  in tg_set_limit()
1734 	      tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||  in tg_set_limit()
1735 	    tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||  in tg_set_limit()
1736 	    tg->latency_target_conf == DFL_LATENCY_TARGET) {  in tg_set_limit()
1737 		tg->bps[READ][LIMIT_LOW] = 0;  in tg_set_limit()
1738 		tg->bps[WRITE][LIMIT_LOW] = 0;  in tg_set_limit()
1739 		tg->iops[READ][LIMIT_LOW] = 0;  in tg_set_limit()
1740 		tg->iops[WRITE][LIMIT_LOW] = 0;  in tg_set_limit()
1741 		tg->idletime_threshold = DFL_IDLE_THRESHOLD;  in tg_set_limit()
1742 		tg->latency_target = DFL_LATENCY_TARGET;  in tg_set_limit()
1744 		tg->idletime_threshold = tg->idletime_threshold_conf;  in tg_set_limit()
1745 		tg->latency_target = tg->latency_target_conf;  in tg_set_limit()
1748 	blk_throtl_update_limit_valid(tg->td);  in tg_set_limit()
1749 	if (tg->td->limit_valid[LIMIT_LOW]) {  in tg_set_limit()
1751 			tg->td->limit_index = LIMIT_LOW;  in tg_set_limit()
1753 		tg->td->limit_index = LIMIT_MAX;  in tg_set_limit()
1755 		tg->td->limit_valid[LIMIT_LOW]);  in tg_set_limit()
1765 		.name = "low",
1784 	struct throtl_data *td = q->td;  in throtl_shutdown_wq()
1786 	cancel_work_sync(&td->dispatch_work);  in throtl_shutdown_wq()
1804 	if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])  in __tg_last_low_overflow_time()
1805 		rtime = tg->last_low_overflow_time[READ];  in __tg_last_low_overflow_time()
1806 	if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])  in __tg_last_low_overflow_time()
1807 		wtime = tg->last_low_overflow_time[WRITE];  in __tg_last_low_overflow_time()
1819 		parent_sq = parent->service_queue.parent_sq;  in tg_last_low_overflow_time()
1825 		 * The parent doesn't have low limit, it always reaches low  in tg_last_low_overflow_time()
1828 		if (!parent->bps[READ][LIMIT_LOW] &&  in tg_last_low_overflow_time()
1829 		    !parent->iops[READ][LIMIT_LOW] &&  in tg_last_low_overflow_time()
1830 		    !parent->bps[WRITE][LIMIT_LOW] &&  in tg_last_low_overflow_time()
1831 		    !parent->iops[WRITE][LIMIT_LOW])  in tg_last_low_overflow_time()
1843 	 * - single idle is too long, longer than a fixed value (in case user  in throtl_tg_is_idle()
1845 	 * - average think time is more than threshold  in throtl_tg_is_idle()
1846 	 * - IO latency is largely below threshold  in throtl_tg_is_idle()
1851 	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);  in throtl_tg_is_idle()
1852 	ret = tg->latency_target == DFL_LATENCY_TARGET ||  in throtl_tg_is_idle()
1853 	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||  in throtl_tg_is_idle()
1854 	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||  in throtl_tg_is_idle()
1855 	      tg->avg_idletime > tg->idletime_threshold ||  in throtl_tg_is_idle()
1856 	      (tg->latency_target && tg->bio_cnt &&  in throtl_tg_is_idle()
1857 		tg->bad_bio_cnt * 5 < tg->bio_cnt);  in throtl_tg_is_idle()
1858 	throtl_log(&tg->service_queue,  in throtl_tg_is_idle()
1860 		tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,  in throtl_tg_is_idle()
1861 		tg->bio_cnt, ret, tg->td->scale);  in throtl_tg_is_idle()
1867 	struct throtl_service_queue *sq = &tg->service_queue;  in throtl_tg_can_upgrade()
1871 	 * if cgroup reaches low limit (if low limit is 0, the cgroup always  in throtl_tg_can_upgrade()
1874 	read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];  in throtl_tg_can_upgrade()
1875 	write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];  in throtl_tg_can_upgrade()
1878 	if (read_limit && sq->nr_queued[READ] &&  in throtl_tg_can_upgrade()
1879 	    (!write_limit || sq->nr_queued[WRITE]))  in throtl_tg_can_upgrade()
1881 	if (write_limit && sq->nr_queued[WRITE] &&  in throtl_tg_can_upgrade()
1882 	    (!read_limit || sq->nr_queued[READ]))  in throtl_tg_can_upgrade()
1886 		tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&  in throtl_tg_can_upgrade()
1897 		tg = sq_to_tg(tg->service_queue.parent_sq);  in throtl_hierarchy_can_upgrade()
1898 		if (!tg || !tg_to_blkg(tg)->parent)  in throtl_hierarchy_can_upgrade()
1910 	if (td->limit_index != LIMIT_LOW)  in throtl_can_upgrade()
1913 	if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))  in throtl_can_upgrade()
1917 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {  in throtl_can_upgrade()
1922 		if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))  in throtl_can_upgrade()
1937 	if (tg->td->limit_index != LIMIT_LOW)  in throtl_upgrade_check()
1940 	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))  in throtl_upgrade_check()
1943 	tg->last_check_time = now;  in throtl_upgrade_check()
1946 	     __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))  in throtl_upgrade_check()
1949 	if (throtl_can_upgrade(tg->td, NULL))  in throtl_upgrade_check()
1950 		throtl_upgrade_state(tg->td);  in throtl_upgrade_check()
1958 	throtl_log(&td->service_queue, "upgrade to max");  in throtl_upgrade_state()
1959 	td->limit_index = LIMIT_MAX;  in throtl_upgrade_state()
1960 	td->low_upgrade_time = jiffies;  in throtl_upgrade_state()
1961 	td->scale = 0;  in throtl_upgrade_state()
1963 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {  in throtl_upgrade_state()
1965 		struct throtl_service_queue *sq = &tg->service_queue;  in throtl_upgrade_state()
1967 		tg->disptime = jiffies - 1;  in throtl_upgrade_state()
1972 	throtl_select_dispatch(&td->service_queue);  in throtl_upgrade_state()
1973 	throtl_schedule_next_dispatch(&td->service_queue, true);  in throtl_upgrade_state()
1974 	queue_work(kthrotld_workqueue, &td->dispatch_work);  in throtl_upgrade_state()
1979 	td->scale /= 2;  in throtl_downgrade_state()
1981 	throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);  in throtl_downgrade_state()
1982 	if (td->scale) {  in throtl_downgrade_state()
1983 		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;  in throtl_downgrade_state()
1987 	td->limit_index = LIMIT_LOW;  in throtl_downgrade_state()
1988 	td->low_downgrade_time = jiffies;  in throtl_downgrade_state()
1993 	struct throtl_data *td = tg->td;  in throtl_tg_can_downgrade()
1997 	 * If cgroup is below low limit, consider downgrade and throttle other  in throtl_tg_can_downgrade()
2000 	if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&  in throtl_tg_can_downgrade()
2002 					td->throtl_slice) &&  in throtl_tg_can_downgrade()
2004 	     !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))  in throtl_tg_can_downgrade()
2014 		tg = sq_to_tg(tg->service_queue.parent_sq);  in throtl_hierarchy_can_downgrade()
2015 		if (!tg || !tg_to_blkg(tg)->parent)  in throtl_hierarchy_can_downgrade()
2028 	if (tg->td->limit_index != LIMIT_MAX ||  in throtl_downgrade_check()
2029 	    !tg->td->limit_valid[LIMIT_LOW])  in throtl_downgrade_check()
2031 	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))  in throtl_downgrade_check()
2033 	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))  in throtl_downgrade_check()
2036 	elapsed_time = now - tg->last_check_time;  in throtl_downgrade_check()
2037 	tg->last_check_time = now;  in throtl_downgrade_check()
2040 			tg->td->throtl_slice))  in throtl_downgrade_check()
2043 	if (tg->bps[READ][LIMIT_LOW]) {  in throtl_downgrade_check()
2044 		bps = tg->last_bytes_disp[READ] * HZ;  in throtl_downgrade_check()
2046 		if (bps >= tg->bps[READ][LIMIT_LOW])  in throtl_downgrade_check()
2047 			tg->last_low_overflow_time[READ] = now;  in throtl_downgrade_check()
2050 	if (tg->bps[WRITE][LIMIT_LOW]) {  in throtl_downgrade_check()
2051 		bps = tg->last_bytes_disp[WRITE] * HZ;  in throtl_downgrade_check()
2053 		if (bps >= tg->bps[WRITE][LIMIT_LOW])  in throtl_downgrade_check()
2054 			tg->last_low_overflow_time[WRITE] = now;  in throtl_downgrade_check()
2057 	if (tg->iops[READ][LIMIT_LOW]) {  in throtl_downgrade_check()
2058 		tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0);  in throtl_downgrade_check()
2059 		iops = tg->last_io_disp[READ] * HZ / elapsed_time;  in throtl_downgrade_check()
2060 		if (iops >= tg->iops[READ][LIMIT_LOW])  in throtl_downgrade_check()
2061 			tg->last_low_overflow_time[READ] = now;  in throtl_downgrade_check()
2064 	if (tg->iops[WRITE][LIMIT_LOW]) {  in throtl_downgrade_check()
2065 		tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0);  in throtl_downgrade_check()
2066 		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;  in throtl_downgrade_check()
2067 		if (iops >= tg->iops[WRITE][LIMIT_LOW])  in throtl_downgrade_check()
2068 			tg->last_low_overflow_time[WRITE] = now;  in throtl_downgrade_check()
2072 	 * If cgroup is below low limit, consider downgrade and throttle other  in throtl_downgrade_check()
2076 		throtl_downgrade_state(tg->td);  in throtl_downgrade_check()
2078 	tg->last_bytes_disp[READ] = 0;  in throtl_downgrade_check()
2079 	tg->last_bytes_disp[WRITE] = 0;  in throtl_downgrade_check()
2080 	tg->last_io_disp[READ] = 0;  in throtl_downgrade_check()
2081 	tg->last_io_disp[WRITE] = 0;  in throtl_downgrade_check()
2087 	unsigned long last_finish_time = tg->last_finish_time;  in blk_throtl_update_idletime()
2094 	    last_finish_time == tg->checked_last_finish_time)  in blk_throtl_update_idletime()
2097 	tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;  in blk_throtl_update_idletime()
2098 	tg->checked_last_finish_time = last_finish_time;  in blk_throtl_update_idletime()
2107 	unsigned long latency[2];  in throtl_update_latency_buckets()  local
2109 	if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])  in throtl_update_latency_buckets()
2111 	if (time_before(jiffies, td->last_calculate_time + HZ))  in throtl_update_latency_buckets()
2113 	td->last_calculate_time = jiffies;  in throtl_update_latency_buckets()
2118 			struct latency_bucket *tmp = &td->tmp_buckets[rw][i];  in throtl_update_latency_buckets()
2124 				bucket = per_cpu_ptr(td->latency_buckets[rw],  in throtl_update_latency_buckets()
2126 				tmp->total_latency += bucket[i].total_latency;  in throtl_update_latency_buckets()
2127 				tmp->samples += bucket[i].samples;  in throtl_update_latency_buckets()
2132 			if (tmp->samples >= 32) {  in throtl_update_latency_buckets()
2133 				int samples = tmp->samples;  in throtl_update_latency_buckets()
2135 				latency[rw] = tmp->total_latency;  in throtl_update_latency_buckets()
2137 				tmp->total_latency = 0;  in throtl_update_latency_buckets()
2138 				tmp->samples = 0;  in throtl_update_latency_buckets()
2139 				latency[rw] /= samples;  in throtl_update_latency_buckets()
2140 				if (latency[rw] == 0)  in throtl_update_latency_buckets()
2142 				avg_latency[rw][i].latency = latency[rw];  in throtl_update_latency_buckets()
2149 			if (!avg_latency[rw][i].latency) {  in throtl_update_latency_buckets()
2150 				if (td->avg_buckets[rw][i].latency < last_latency[rw])  in throtl_update_latency_buckets()
2151 					td->avg_buckets[rw][i].latency =  in throtl_update_latency_buckets()
2156 			if (!td->avg_buckets[rw][i].valid)  in throtl_update_latency_buckets()
2157 				latency[rw] = avg_latency[rw][i].latency;  in throtl_update_latency_buckets()
2159 				latency[rw] = (td->avg_buckets[rw][i].latency * 7 +  in throtl_update_latency_buckets()
2160 					avg_latency[rw][i].latency) >> 3;  in throtl_update_latency_buckets()
2162 			td->avg_buckets[rw][i].latency = max(latency[rw],  in throtl_update_latency_buckets()
2164 			td->avg_buckets[rw][i].valid = true;  in throtl_update_latency_buckets()
2165 			last_latency[rw] = td->avg_buckets[rw][i].latency;  in throtl_update_latency_buckets()
2170 		throtl_log(&td->service_queue,  in throtl_update_latency_buckets()
2171 			"Latency bucket %d: read latency=%ld, read valid=%d, "  in throtl_update_latency_buckets()
2172 			"write latency=%ld, write valid=%d", i,  in throtl_update_latency_buckets()
2173 			td->avg_buckets[READ][i].latency,  in throtl_update_latency_buckets()
2174 			td->avg_buckets[READ][i].valid,  in throtl_update_latency_buckets()
2175 			td->avg_buckets[WRITE][i].latency,  in throtl_update_latency_buckets()
2176 			td->avg_buckets[WRITE][i].valid);  in throtl_update_latency_buckets()
2186 	struct blkcg_gq *blkg = bio->bi_blkg;  in blk_throtl_charge_bio_split()
2192 		if (!parent->has_rules[rw])  in blk_throtl_charge_bio_split()
2195 		atomic_inc(&parent->io_split_cnt[rw]);  in blk_throtl_charge_bio_split()
2196 		atomic_inc(&parent->last_io_split_cnt[rw]);  in blk_throtl_charge_bio_split()
2198 		parent_sq = parent->service_queue.parent_sq;  in blk_throtl_charge_bio_split()
2205 	struct request_queue *q = bio->bi_disk->queue;  in blk_throtl_bio()
2206 	struct blkcg_gq *blkg = bio->bi_blkg;  in blk_throtl_bio()
2212 	struct throtl_data *td = tg->td;  in blk_throtl_bio()
2221 		blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,  in blk_throtl_bio()
2222 				bio->bi_iter.bi_size);  in blk_throtl_bio()
2223 		blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);  in blk_throtl_bio()
2226 	if (!tg->has_rules[rw])  in blk_throtl_bio()
2229 	spin_lock_irq(&q->queue_lock);  in blk_throtl_bio()
2235 	sq = &tg->service_queue;  in blk_throtl_bio()
2239 		if (tg->last_low_overflow_time[rw] == 0)  in blk_throtl_bio()
2240 			tg->last_low_overflow_time[rw] = jiffies;  in blk_throtl_bio()
2243 		/* throtl is FIFO - if bios are already queued, should queue */  in blk_throtl_bio()
2244 		if (sq->nr_queued[rw])  in blk_throtl_bio()
2249 			tg->last_low_overflow_time[rw] = jiffies;  in blk_throtl_bio()
2266 		 * low rate and newly queued IO gets a really long dispatch  in blk_throtl_bio()
2278 		qn = &tg->qnode_on_parent[rw];  in blk_throtl_bio()
2279 		sq = sq->parent_sq;  in blk_throtl_bio()
2285 	/* out-of-limit, queue to @tg */  in blk_throtl_bio()
2288 		   tg->bytes_disp[rw], bio->bi_iter.bi_size,  in blk_throtl_bio()
2290 		   tg->io_disp[rw], tg_iops_limit(tg, rw),  in blk_throtl_bio()
2291 		   sq->nr_queued[READ], sq->nr_queued[WRITE]);  in blk_throtl_bio()
2293 	tg->last_low_overflow_time[rw] = jiffies;  in blk_throtl_bio()
2295 	td->nr_queued[rw]++;  in blk_throtl_bio()
2305 	if (tg->flags & THROTL_TG_WAS_EMPTY) {  in blk_throtl_bio()
2307 		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);  in blk_throtl_bio()
2311 	spin_unlock_irq(&q->queue_lock);  in blk_throtl_bio()
2316 	if (throttled || !td->track_bio_latency)  in blk_throtl_bio()
2317 		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;  in blk_throtl_bio()
2327 	struct latency_bucket *latency;  in throtl_track_latency()  local
2330 	if (!td || td->limit_index != LIMIT_LOW ||  in throtl_track_latency()
2332 	    !blk_queue_nonrot(td->queue))  in throtl_track_latency()
2337 	latency = get_cpu_ptr(td->latency_buckets[op]);  in throtl_track_latency()
2338 	latency[index].total_latency += time;  in throtl_track_latency()
2339 	latency[index].samples++;  in throtl_track_latency()
2340 	put_cpu_ptr(td->latency_buckets[op]);  in throtl_track_latency()
2345 	struct request_queue *q = rq->q;  in blk_throtl_stat_add()
2346 	struct throtl_data *td = q->td;  in blk_throtl_stat_add()
2362 	blkg = bio->bi_blkg;  in blk_throtl_bio_endio()
2366 	if (!tg->td->limit_valid[LIMIT_LOW])  in blk_throtl_bio_endio()
2370 	tg->last_finish_time = finish_time_ns >> 10;  in blk_throtl_bio_endio()
2372 	start_time = bio_issue_time(&bio->bi_issue) >> 10;  in blk_throtl_bio_endio()
2377 	lat = finish_time - start_time;  in blk_throtl_bio_endio()
2379 	if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))  in blk_throtl_bio_endio()
2380 		throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),  in blk_throtl_bio_endio()
2383 	if (tg->latency_target && lat >= tg->td->filtered_latency) {  in blk_throtl_bio_endio()
2387 		bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));  in blk_throtl_bio_endio()
2388 		threshold = tg->td->avg_buckets[rw][bucket].latency +  in blk_throtl_bio_endio()
2389 			tg->latency_target;  in blk_throtl_bio_endio()
2391 			tg->bad_bio_cnt++;  in blk_throtl_bio_endio()
2396 		tg->bio_cnt++;  in blk_throtl_bio_endio()
2399 	if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {  in blk_throtl_bio_endio()
2400 		tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;  in blk_throtl_bio_endio()
2401 		tg->bio_cnt /= 2;  in blk_throtl_bio_endio()
2402 		tg->bad_bio_cnt /= 2;  in blk_throtl_bio_endio()
2412 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);  in blk_throtl_init()
2414 		return -ENOMEM;  in blk_throtl_init()
2415 	td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *  in blk_throtl_init()
2417 	if (!td->latency_buckets[READ]) {  in blk_throtl_init()
2419 		return -ENOMEM;  in blk_throtl_init()
2421 	td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *  in blk_throtl_init()
2423 	if (!td->latency_buckets[WRITE]) {  in blk_throtl_init()
2424 		free_percpu(td->latency_buckets[READ]);  in blk_throtl_init()
2426 		return -ENOMEM;  in blk_throtl_init()
2429 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);  in blk_throtl_init()
2430 	throtl_service_queue_init(&td->service_queue);  in blk_throtl_init()
2432 	q->td = td;  in blk_throtl_init()
2433 	td->queue = q;  in blk_throtl_init()
2435 	td->limit_valid[LIMIT_MAX] = true;  in blk_throtl_init()
2436 	td->limit_index = LIMIT_MAX;  in blk_throtl_init()
2437 	td->low_upgrade_time = jiffies;  in blk_throtl_init()
2438 	td->low_downgrade_time = jiffies;  in blk_throtl_init()
2443 		free_percpu(td->latency_buckets[READ]);  in blk_throtl_init()
2444 		free_percpu(td->latency_buckets[WRITE]);  in blk_throtl_init()
2452 	BUG_ON(!q->td);  in blk_throtl_exit()
2453 	del_timer_sync(&q->td->service_queue.pending_timer);  in blk_throtl_exit()
2456 	free_percpu(q->td->latency_buckets[READ]);  in blk_throtl_exit()
2457 	free_percpu(q->td->latency_buckets[WRITE]);  in blk_throtl_exit()
2458 	kfree(q->td);  in blk_throtl_exit()
2466 	td = q->td;  in blk_throtl_register_queue()
2470 		td->throtl_slice = DFL_THROTL_SLICE_SSD;  in blk_throtl_register_queue()
2471 		td->filtered_latency = LATENCY_FILTERED_SSD;  in blk_throtl_register_queue()
2473 		td->throtl_slice = DFL_THROTL_SLICE_HD;  in blk_throtl_register_queue()
2474 		td->filtered_latency = LATENCY_FILTERED_HD;  in blk_throtl_register_queue()
2476 			td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;  in blk_throtl_register_queue()
2477 			td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;  in blk_throtl_register_queue()
2481 	/* if no low limit, use previous default */  in blk_throtl_register_queue()
2482 	td->throtl_slice = DFL_THROTL_SLICE_HD;  in blk_throtl_register_queue()
2485 	td->track_bio_latency = !queue_is_mq(q);  in blk_throtl_register_queue()
2486 	if (!td->track_bio_latency)  in blk_throtl_register_queue()
2493 	if (!q->td)  in blk_throtl_sample_time_show()
2494 		return -EINVAL;  in blk_throtl_sample_time_show()
2495 	return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));  in blk_throtl_sample_time_show()
2504 	if (!q->td)  in blk_throtl_sample_time_store()
2505 		return -EINVAL;  in blk_throtl_sample_time_store()
2507 		return -EINVAL;  in blk_throtl_sample_time_store()
2510 		return -EINVAL;  in blk_throtl_sample_time_store()
2511 	q->td->throtl_slice = t;  in blk_throtl_sample_time_store()