Lines Matching +full:fine +full:- +full:granular
1 /* SPDX-License-Identifier: GPL-2.0
17 * useless for the purpose of IO capacity distribution. While on-device
19 * non-queued rotational devices, this is no longer viable with modern
27 * implement a reasonable work-conserving proportional IO resource
37 * Currently, there's only one builtin cost model - linear. Each IO is
47 * device-specific coefficients.
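The linear model sketched above reduces to a fixed per-IO component, chosen by whether the IO looks sequential or random, plus a size-proportional per-page component. Below is a minimal userspace illustration of that arithmetic; the coefficient values, the RANDIO_PAGES threshold and the linear_cost() helper are invented for the example and stand in for the device-specific coefficients the controller derives.

#include <stdint.h>
#include <stdio.h>

/* illustrative coefficients in vtime units; the real ones are per-device */
#define COEF_SEQIO      10000ULL        /* base cost of one sequential IO */
#define COEF_RANDIO     80000ULL        /* base cost of one random IO */
#define COEF_PAGE       2000ULL         /* cost per 4k page transferred */
#define RANDIO_PAGES    4096ULL         /* seeks past this many pages count as random */

static uint64_t linear_cost(uint64_t seek_pages, uint64_t nr_pages)
{
        uint64_t cost = seek_pages > RANDIO_PAGES ? COEF_RANDIO : COEF_SEQIO;

        return cost + nr_pages * COEF_PAGE;     /* size-proportional part */
}

int main(void)
{
        /* a 64k (16 page) IO issued sequentially vs after a long seek */
        printf("seq  64k: %llu\n", (unsigned long long)linear_cost(1, 16));
        printf("rand 64k: %llu\n", (unsigned long long)linear_cost(1 << 20, 16));
        return 0;
}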
54 * 2-1. Vtime Distribution
75 * against the device vtime - an IO which takes 10ms on the underlying
84 * 2-2. Vrate Adjustment
97 * To slow down, we lower the vrate - the rate at which the device vtime
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
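Reading the 75% example concretely: with the vrate lowered to 75%, device vtime advances only 750us for each 1ms of wall time, so the budgets handed out per wall-clock second add up to 750ms worth of cost. A tiny standalone sketch of that relationship (the helper name and figures are illustrative):

#include <stdint.h>
#include <stdio.h>

/* device vtime elapsed over a wall-clock span at a given vrate percentage */
static uint64_t vtime_elapsed_us(uint64_t wall_us, unsigned int vrate_pct)
{
        return wall_us * vrate_pct / 100;
}

int main(void)
{
        /* at 100% the device clock tracks the wall clock ... */
        printf("1s @ 100%%: %llu us of device vtime\n",
               (unsigned long long)vtime_elapsed_us(1000000, 100));
        /* ... at 75% only 750ms worth of cost can be issued per second */
        printf("1s @  75%%: %llu us of device vtime\n",
               (unsigned long long)vtime_elapsed_us(1000000, 75));
        return 0;
}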
102  * Device busyness is determined using two criteria - rq wait and
105 * When a device gets saturated, the on-device and then the request queues
121 * service. There is an inherent trade-off - the tighter the latency QoS,
125 * 2-3. Work Conservation
133 * compared to free-for-all competition. This is too high a cost to pay
156 * controller uses a drgn based monitoring script -
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
171 * - del_ms : Deferred issuer delay induction level and duration
172 * - usages : Usage history
181 #include <linux/blk-cgroup.h>
184 #include "blk-rq-qos.h"
185 #include "blk-stat.h"
186 #include "blk-wbt.h"
190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
220 * iocg->vtime is targeted at 50% behind the device vtime, which
233 /* 1/64k is granular enough and can easily be handled w/ u32 */
244 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
245 * granularity and days of wrap-around time even at extreme vrates.
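The numbers behind that claim, taking the 2^37-per-second scale stated above at face value: that is roughly 137 vtime units per nanosecond, and a 64-bit counter then runs for about 2^27 seconds (over four years) before wrapping at the nominal rate, which still leaves days even if the vrate were inflated a hundredfold. A quick standalone check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t vtime_per_sec = 1ULL << 37;    /* 1s worth of vtime */

        printf("vtime units per ns : %llu\n",
               (unsigned long long)(vtime_per_sec / 1000000000ULL));
        printf("wrap at 1x vrate   : %llu days\n",
               (unsigned long long)(UINT64_MAX / vtime_per_sec / 86400));
        printf("wrap at 100x vrate : %llu days\n",
               (unsigned long long)(UINT64_MAX / (100 * vtime_per_sec) / 86400));
        return 0;
}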
266 * The effect of delay is indirect and non-linear and a huge amount of
281 * cache, the kernel doesn't have well-defined back-pressure propagation
302 * size-proportional components of cost calculation in closer
303 * numbers of digits to per-IO cost components.
307 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
319 /* io.cost.qos controls including per-dev enable of the whole controller */
459 /* per device-cgroup pair */
465  * An iocg can get its weight from two sources - an explicit
466 * per-device-cgroup configuration or the default weight of the
467 * cgroup. `cfg_weight` is the explicit per-device-cgroup
500 * currently in-flight IOs.
648 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
673 return kobject_name(q->kobj.parent); in q_name()
680 return q_name(ioc->rqos.q); in ioc_name()
695 return pd_to_blkg(&iocg->pd); in iocg_to_blkg()
726 bio->bi_iocost_cost = cost; in iocg_commit_bio()
727 atomic64_add(cost, &iocg->vtime); in iocg_commit_bio()
729 gcs = get_cpu_ptr(iocg->pcpu_stat); in iocg_commit_bio()
730 local64_add(abs_cost, &gcs->abs_vusage); in iocg_commit_bio()
737 spin_lock_irqsave(&iocg->ioc->lock, *flags); in iocg_lock()
738 spin_lock(&iocg->waitq.lock); in iocg_lock()
740 spin_lock_irqsave(&iocg->waitq.lock, *flags); in iocg_lock()
747 spin_unlock(&iocg->waitq.lock); in iocg_unlock()
748 spin_unlock_irqrestore(&iocg->ioc->lock, *flags); in iocg_unlock()
750 spin_unlock_irqrestore(&iocg->waitq.lock, *flags); in iocg_unlock()
759 struct ioc_margins *margins = &ioc->margins; in ioc_refresh_margins()
760 u32 period_us = ioc->period_us; in ioc_refresh_margins()
761 u64 vrate = ioc->vtime_base_rate; in ioc_refresh_margins()
763 margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; in ioc_refresh_margins()
764 margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; in ioc_refresh_margins()
765 margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; in ioc_refresh_margins()
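Each margin above is simply a fixed percentage of the control period converted into device vtime, so at a 10ms period and nominal vrate a 50% target margin means an active iocg aims to sit about 5ms worth of vtime behind the device clock, matching the "50% behind" note earlier. A quick check, with the percentage values and VTIME_PER_USEC treated as illustrative stand-ins:

#include <stdint.h>
#include <stdio.h>

#define VTIME_PER_USEC  ((1ULL << 37) / 1000000)        /* vtime per usec at 100% */

/* illustrative percentages; the kernel defines its own MARGIN_*_PCT values */
#define MARGIN_MIN_PCT          10
#define MARGIN_TARGET_PCT       50

int main(void)
{
        uint64_t period_us = 10000;             /* 10ms control period */
        uint64_t vrate = VTIME_PER_USEC;        /* nominal device speed */

        /* each margin is a fixed fraction of the period, expressed in vtime */
        printf("min margin    = %llu vtime (%llu us of headroom)\n",
               (unsigned long long)(period_us * MARGIN_MIN_PCT / 100 * vrate),
               (unsigned long long)(period_us * MARGIN_MIN_PCT / 100));
        printf("target margin = %llu vtime (%llu us of headroom)\n",
               (unsigned long long)(period_us * MARGIN_TARGET_PCT / 100 * vrate),
               (unsigned long long)(period_us * MARGIN_TARGET_PCT / 100));
        return 0;
}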
773 lockdep_assert_held(&ioc->lock); in ioc_refresh_period_us()
776 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { in ioc_refresh_period_us()
777 ppm = ioc->params.qos[QOS_RPPM]; in ioc_refresh_period_us()
778 lat = ioc->params.qos[QOS_RLAT]; in ioc_refresh_period_us()
780 ppm = ioc->params.qos[QOS_WPPM]; in ioc_refresh_period_us()
781 lat = ioc->params.qos[QOS_WLAT]; in ioc_refresh_period_us()
786 * of IOs while short enough for granular control. Define it as a in ioc_refresh_period_us()
793 multi = max_t(u32, (MILLION - ppm) / 50000, 2); in ioc_refresh_period_us()
800 ioc->period_us = period_us; in ioc_refresh_period_us()
801 ioc->timer_slack_ns = div64_u64( in ioc_refresh_period_us()
809 int idx = ioc->autop_idx; in ioc_autop_idx()
815 if (!blk_queue_nonrot(ioc->rqos.q)) in ioc_autop_idx()
819 if (blk_queue_depth(ioc->rqos.q) == 1) in ioc_autop_idx()
827 if (ioc->user_qos_params || ioc->user_cost_model) in ioc_autop_idx()
831 vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); in ioc_autop_idx()
834 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { in ioc_autop_idx()
835 if (!ioc->autop_too_fast_at) in ioc_autop_idx()
836 ioc->autop_too_fast_at = now_ns; in ioc_autop_idx()
837 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) in ioc_autop_idx()
840 ioc->autop_too_fast_at = 0; in ioc_autop_idx()
843 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { in ioc_autop_idx()
844 if (!ioc->autop_too_slow_at) in ioc_autop_idx()
845 ioc->autop_too_slow_at = now_ns; in ioc_autop_idx()
846 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) in ioc_autop_idx()
847 return idx - 1; in ioc_autop_idx()
849 ioc->autop_too_slow_at = 0; in ioc_autop_idx()
864 * *@page per-page cost 1s / (@bps / 4096)
865 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
866  * *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
882 *seqio = v - *page; in calc_lcoefs()
888 *randio = v - *page; in calc_lcoefs()
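Plugging sample device parameters into the three formulas above makes the relationship easier to see; the throughput and IOPS figures here are arbitrary illustrations, not measured coefficients:

#include <stdint.h>
#include <stdio.h>

#define VTIME_PER_SEC   (1ULL << 37)    /* 1s worth of vtime, as above */

static void calc_lcoefs_demo(uint64_t bps, uint64_t seqiops, uint64_t randiops)
{
        uint64_t page, seqio = 0, randio = 0, v;

        /* *@page = 1s / (@bps / 4096) */
        page = VTIME_PER_SEC / (bps / 4096);

        /* *@seqio = max((1s / @seqiops) - *@page, 0) */
        v = VTIME_PER_SEC / seqiops;
        if (v > page)
                seqio = v - page;

        /* *@randio = max((1s / @randiops) - *@page, 0) */
        v = VTIME_PER_SEC / randiops;
        if (v > page)
                randio = v - page;

        printf("page=%llu seqio=%llu randio=%llu\n",
               (unsigned long long)page, (unsigned long long)seqio,
               (unsigned long long)randio);
}

int main(void)
{
        /* e.g. 400 MB/s sequential, 100k seq 4k iops, 20k rand 4k iops */
        calc_lcoefs_demo(400ULL << 20, 100000, 20000);
        return 0;
}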
894 u64 *u = ioc->params.i_lcoefs; in ioc_refresh_lcoefs()
895 u64 *c = ioc->params.lcoefs; in ioc_refresh_lcoefs()
908 lockdep_assert_held(&ioc->lock); in ioc_refresh_params()
913 if (idx == ioc->autop_idx && !force) in ioc_refresh_params()
916 if (idx != ioc->autop_idx) in ioc_refresh_params()
917 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); in ioc_refresh_params()
919 ioc->autop_idx = idx; in ioc_refresh_params()
920 ioc->autop_too_fast_at = 0; in ioc_refresh_params()
921 ioc->autop_too_slow_at = 0; in ioc_refresh_params()
923 if (!ioc->user_qos_params) in ioc_refresh_params()
924 memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); in ioc_refresh_params()
925 if (!ioc->user_cost_model) in ioc_refresh_params()
926 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); in ioc_refresh_params()
931 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * in ioc_refresh_params()
933 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] * in ioc_refresh_params()
948 s64 pleft = ioc->period_at + ioc->period_us - now->now; in ioc_refresh_vrate()
949 s64 vperiod = ioc->period_us * ioc->vtime_base_rate; in ioc_refresh_vrate()
952 lockdep_assert_held(&ioc->lock); in ioc_refresh_vrate()
963 vcomp = -div64_s64(ioc->vtime_err, pleft); in ioc_refresh_vrate()
964 vcomp_min = -(ioc->vtime_base_rate >> 1); in ioc_refresh_vrate()
965 vcomp_max = ioc->vtime_base_rate; in ioc_refresh_vrate()
968 ioc->vtime_err += vcomp * pleft; in ioc_refresh_vrate()
970 atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); in ioc_refresh_vrate()
973 ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); in ioc_refresh_vrate()
981 now->now_ns = ktime_get(); in ioc_now()
982 now->now = ktime_to_us(now->now_ns); in ioc_now()
983 now->vrate = atomic64_read(&ioc->vtime_rate); in ioc_now()
994 seq = read_seqcount_begin(&ioc->period_seqcount); in ioc_now()
995 now->vnow = ioc->period_at_vtime + in ioc_now()
996 (now->now - ioc->period_at) * now->vrate; in ioc_now()
997 } while (read_seqcount_retry(&ioc->period_seqcount, seq)); in ioc_now()
1002 WARN_ON_ONCE(ioc->running != IOC_RUNNING); in ioc_start_period()
1004 write_seqcount_begin(&ioc->period_seqcount); in ioc_start_period()
1005 ioc->period_at = now->now; in ioc_start_period()
1006 ioc->period_at_vtime = now->vnow; in ioc_start_period()
1007 write_seqcount_end(&ioc->period_seqcount); in ioc_start_period()
1009 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); in ioc_start_period()
1010 add_timer(&ioc->timer); in ioc_start_period()
1016 * is saved to be used as reference for later inuse in-period adjustments.
1021 struct ioc *ioc = iocg->ioc; in __propagate_weights()
1024 lockdep_assert_held(&ioc->lock); in __propagate_weights()
1031 if (list_empty(&iocg->active_list) && iocg->child_active_sum) { in __propagate_weights()
1032 inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, in __propagate_weights()
1033 iocg->child_active_sum); in __propagate_weights()
1038 iocg->last_inuse = iocg->inuse; in __propagate_weights()
1040 iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); in __propagate_weights()
1042 if (active == iocg->active && inuse == iocg->inuse) in __propagate_weights()
1045 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in __propagate_weights()
1046 struct ioc_gq *parent = iocg->ancestors[lvl]; in __propagate_weights()
1047 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in __propagate_weights()
1051 parent->child_active_sum += (s32)(active - child->active); in __propagate_weights()
1052 parent->child_inuse_sum += (s32)(inuse - child->inuse); in __propagate_weights()
1054 child->active = active; in __propagate_weights()
1055 child->inuse = inuse; in __propagate_weights()
1062 if (parent->child_active_sum) { in __propagate_weights()
1063 parent_active = parent->weight; in __propagate_weights()
1065 parent_active * parent->child_inuse_sum, in __propagate_weights()
1066 parent->child_active_sum); in __propagate_weights()
1070 if (parent_active == parent->active && in __propagate_weights()
1071 parent_inuse == parent->inuse) in __propagate_weights()
1078 ioc->weights_updated = true; in __propagate_weights()
1083 lockdep_assert_held(&ioc->lock); in commit_weights()
1085 if (ioc->weights_updated) { in commit_weights()
1088 atomic_inc(&ioc->hweight_gen); in commit_weights()
1089 ioc->weights_updated = false; in commit_weights()
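commit_weights() invalidates every cached hierarchical weight in one step by bumping a global generation counter; readers compare their cached generation (see current_hweight() below) and recompute only when it is stale. A minimal userspace sketch of that invalidation pattern, with all names invented for the example:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int hweight_gen = 1;      /* bumped whenever any weight changes */

struct node {
        int cached_gen;                 /* generation the cache was computed at */
        int cached_hweight;
};

static int node_hweight(struct node *n)
{
        int gen = atomic_load(&hweight_gen);

        if (n->cached_gen != gen) {     /* stale - recompute and restamp */
                n->cached_hweight = 42; /* stand-in for the real tree walk */
                n->cached_gen = gen;
        }
        return n->cached_hweight;
}

int main(void)
{
        struct node n = { 0, 0 };

        node_hweight(&n);                       /* recomputes */
        node_hweight(&n);                       /* served from cache */
        atomic_fetch_add(&hweight_gen, 1);      /* commit_weights() analogue */
        printf("%d\n", node_hweight(&n));       /* recomputes again */
        return 0;
}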
1097 commit_weights(iocg->ioc); in propagate_weights()
1102 struct ioc *ioc = iocg->ioc; in current_hweight()
1107 /* hot path - if uptodate, use cached */ in current_hweight()
1108 ioc_gen = atomic_read(&ioc->hweight_gen); in current_hweight()
1109 if (ioc_gen == iocg->hweight_gen) in current_hweight()
1125 for (lvl = 0; lvl <= iocg->level - 1; lvl++) { in current_hweight()
1126 struct ioc_gq *parent = iocg->ancestors[lvl]; in current_hweight()
1127 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in current_hweight()
1128 u64 active_sum = READ_ONCE(parent->child_active_sum); in current_hweight()
1129 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); in current_hweight()
1130 u32 active = READ_ONCE(child->active); in current_hweight()
1131 u32 inuse = READ_ONCE(child->inuse); in current_hweight()
1144 iocg->hweight_active = max_t(u32, hwa, 1); in current_hweight()
1145 iocg->hweight_inuse = max_t(u32, hwi, 1); in current_hweight()
1146 iocg->hweight_gen = ioc_gen; in current_hweight()
1149 *hw_activep = iocg->hweight_active; in current_hweight()
1151 *hw_inusep = iocg->hweight_inuse; in current_hweight()
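The walk above computes the hierarchical weight as the product of each level's share of its parent's child sum, carried in fixed point. For instance, a leaf holding 100 of its parent's 400 while the parent holds 300 of the root's 600 ends up with 12.5% of the device. A hedged standalone version of that walk, taking WEIGHT_ONE as the 1/64k granularity noted earlier:

#include <stdint.h>
#include <stdio.h>

#define WEIGHT_ONE      (1U << 16)      /* 1/64k fixed-point granularity */

/* one entry per level: this node's active weight and the parent's child sum */
struct level { uint32_t active, active_sum; };

static uint32_t hweight_active(const struct level *lvls, int nr)
{
        uint64_t hwa = WEIGHT_ONE;
        int i;

        for (i = 0; i < nr; i++)
                hwa = hwa * lvls[i].active / lvls[i].active_sum;

        return hwa ? (uint32_t)hwa : 1;         /* never report zero */
}

int main(void)
{
        /* root's child holds 300/600, the leaf holds 100/400 under it */
        struct level path[] = { { 300, 600 }, { 100, 400 } };
        uint32_t hwa = hweight_active(path, 2);

        printf("hweight_active = %u/%u (%.1f%%)\n",
               (unsigned)hwa, (unsigned)WEIGHT_ONE, 100.0 * hwa / WEIGHT_ONE);
        return 0;
}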
1161 u32 inuse = iocg->active; in current_hweight_max()
1165 lockdep_assert_held(&iocg->ioc->lock); in current_hweight_max()
1167 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in current_hweight_max()
1168 struct ioc_gq *parent = iocg->ancestors[lvl]; in current_hweight_max()
1169 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in current_hweight_max()
1171 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; in current_hweight_max()
1173 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, in current_hweight_max()
1174 parent->child_active_sum); in current_hweight_max()
1182 struct ioc *ioc = iocg->ioc; in weight_updated()
1184 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); in weight_updated()
1187 lockdep_assert_held(&ioc->lock); in weight_updated()
1189 weight = iocg->cfg_weight ?: iocc->dfl_weight; in weight_updated()
1190 if (weight != iocg->weight && iocg->active) in weight_updated()
1191 propagate_weights(iocg, weight, iocg->inuse, true, now); in weight_updated()
1192 iocg->weight = weight; in weight_updated()
1197 struct ioc *ioc = iocg->ioc; in iocg_activate()
1206 if (!list_empty(&iocg->active_list)) { in iocg_activate()
1208 cur_period = atomic64_read(&ioc->cur_period); in iocg_activate()
1209 if (atomic64_read(&iocg->active_period) != cur_period) in iocg_activate()
1210 atomic64_set(&iocg->active_period, cur_period); in iocg_activate()
1215 if (iocg->child_active_sum) in iocg_activate()
1218 spin_lock_irq(&ioc->lock); in iocg_activate()
1223 cur_period = atomic64_read(&ioc->cur_period); in iocg_activate()
1224 last_period = atomic64_read(&iocg->active_period); in iocg_activate()
1225 atomic64_set(&iocg->active_period, cur_period); in iocg_activate()
1227 /* already activated or breaking leaf-only constraint? */ in iocg_activate()
1228 if (!list_empty(&iocg->active_list)) in iocg_activate()
1230 for (i = iocg->level - 1; i > 0; i--) in iocg_activate()
1231 if (!list_empty(&iocg->ancestors[i]->active_list)) in iocg_activate()
1234 if (iocg->child_active_sum) in iocg_activate()
1241 vtarget = now->vnow - ioc->margins.target; in iocg_activate()
1242 vtime = atomic64_read(&iocg->vtime); in iocg_activate()
1244 atomic64_add(vtarget - vtime, &iocg->vtime); in iocg_activate()
1245 atomic64_add(vtarget - vtime, &iocg->done_vtime); in iocg_activate()
1253 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; in iocg_activate()
1254 list_add(&iocg->active_list, &ioc->active_iocgs); in iocg_activate()
1256 propagate_weights(iocg, iocg->weight, in iocg_activate()
1257 iocg->last_inuse ?: iocg->weight, true, now); in iocg_activate()
1262 iocg->activated_at = now->now; in iocg_activate()
1264 if (ioc->running == IOC_IDLE) { in iocg_activate()
1265 ioc->running = IOC_RUNNING; in iocg_activate()
1266 ioc->dfgv_period_at = now->now; in iocg_activate()
1267 ioc->dfgv_period_rem = 0; in iocg_activate()
1272 spin_unlock_irq(&ioc->lock); in iocg_activate()
1276 spin_unlock_irq(&ioc->lock); in iocg_activate()
1282 struct ioc *ioc = iocg->ioc; in iocg_kick_delay()
1288 lockdep_assert_held(&iocg->waitq.lock); in iocg_kick_delay()
1290 /* calculate the current delay in effect - 1/2 every second */ in iocg_kick_delay()
1291 tdelta = now->now - iocg->delay_at; in iocg_kick_delay()
1292 if (iocg->delay) in iocg_kick_delay()
1293 delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); in iocg_kick_delay()
1299 vover = atomic64_read(&iocg->vtime) + in iocg_kick_delay()
1300 abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; in iocg_kick_delay()
1302 ioc->period_us * ioc->vtime_base_rate); in iocg_kick_delay()
1310 div_u64((MAX_DELAY - MIN_DELAY) * in iocg_kick_delay()
1311 (vover_pct - MIN_DELAY_THR_PCT), in iocg_kick_delay()
1312 MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); in iocg_kick_delay()
1316 iocg->delay = new_delay; in iocg_kick_delay()
1317 iocg->delay_at = now->now; in iocg_kick_delay()
1322 if (!iocg->indelay_since) in iocg_kick_delay()
1323 iocg->indelay_since = now->now; in iocg_kick_delay()
1327 if (iocg->indelay_since) { in iocg_kick_delay()
1328 iocg->local_stat.indelay_us += now->now - iocg->indelay_since; in iocg_kick_delay()
1329 iocg->indelay_since = 0; in iocg_kick_delay()
1331 iocg->delay = 0; in iocg_kick_delay()
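Two pieces of arithmetic above are worth spelling out: the existing delay is halved for every elapsed second (the right shift by the number of seconds), and the new delay maps how far vtime has overrun its budget onto a range - nothing below a low threshold, the maximum past a high one, and straight-line interpolation in between. A standalone sketch of the interpolation with placeholder thresholds (the kernel uses its own MIN/MAX_DELAY* constants defined elsewhere in this file):

#include <stdint.h>
#include <stdio.h>

/* placeholder bounds standing in for the kernel's MIN/MAX_DELAY* constants */
#define MIN_DELAY_THR_PCT       500     /* below 5x over budget: no delay */
#define MAX_DELAY_THR_PCT       25000   /* above 250x over: maximum delay */
#define MIN_DELAY               250     /* usecs */
#define MAX_DELAY               250000  /* usecs */

static uint64_t delay_for_overage(int64_t vover_pct)
{
        if (vover_pct <= MIN_DELAY_THR_PCT)
                return 0;
        if (vover_pct >= MAX_DELAY_THR_PCT)
                return MAX_DELAY;
        /* straight-line interpolation between the two thresholds */
        return MIN_DELAY + (uint64_t)(MAX_DELAY - MIN_DELAY) *
                (vover_pct - MIN_DELAY_THR_PCT) /
                (MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
}

int main(void)
{
        printf("slightly over : %llu us\n", (unsigned long long)delay_for_overage(400));
        printf("halfway       : %llu us\n", (unsigned long long)delay_for_overage(12750));
        printf("way over      : %llu us\n", (unsigned long long)delay_for_overage(30000));
        return 0;
}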
1342 lockdep_assert_held(&iocg->ioc->lock); in iocg_incur_debt()
1343 lockdep_assert_held(&iocg->waitq.lock); in iocg_incur_debt()
1344 WARN_ON_ONCE(list_empty(&iocg->active_list)); in iocg_incur_debt()
1350 if (!iocg->abs_vdebt && abs_cost) { in iocg_incur_debt()
1351 iocg->indebt_since = now->now; in iocg_incur_debt()
1352 propagate_weights(iocg, iocg->active, 0, false, now); in iocg_incur_debt()
1355 iocg->abs_vdebt += abs_cost; in iocg_incur_debt()
1357 gcs = get_cpu_ptr(iocg->pcpu_stat); in iocg_incur_debt()
1358 local64_add(abs_cost, &gcs->abs_vusage); in iocg_incur_debt()
1365 lockdep_assert_held(&iocg->ioc->lock); in iocg_pay_debt()
1366 lockdep_assert_held(&iocg->waitq.lock); in iocg_pay_debt()
1369 WARN_ON_ONCE(list_empty(&iocg->active_list)); in iocg_pay_debt()
1370 WARN_ON_ONCE(iocg->inuse > 1); in iocg_pay_debt()
1372 iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); in iocg_pay_debt()
1375 if (!iocg->abs_vdebt) { in iocg_pay_debt()
1376 iocg->local_stat.indebt_us += now->now - iocg->indebt_since; in iocg_pay_debt()
1377 iocg->indebt_since = 0; in iocg_pay_debt()
1379 propagate_weights(iocg, iocg->active, iocg->last_inuse, in iocg_pay_debt()
1389 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); in iocg_wake_fn()
1391 ctx->vbudget -= cost; in iocg_wake_fn()
1393 if (ctx->vbudget < 0) in iocg_wake_fn()
1394 return -1; in iocg_wake_fn()
1396 iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); in iocg_wake_fn()
1397 wait->committed = true; in iocg_wake_fn()
1407 list_del_init_careful(&wq_entry->entry); in iocg_wake_fn()
1413 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1414 * addition to iocg->waitq.lock.
1419 struct ioc *ioc = iocg->ioc; in iocg_kick_waitq()
1425 lockdep_assert_held(&iocg->waitq.lock); in iocg_kick_waitq()
1428 vbudget = now->vnow - atomic64_read(&iocg->vtime); in iocg_kick_waitq()
1431 if (pay_debt && iocg->abs_vdebt && vbudget > 0) { in iocg_kick_waitq()
1433 u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); in iocg_kick_waitq()
1436 lockdep_assert_held(&ioc->lock); in iocg_kick_waitq()
1438 atomic64_add(vpay, &iocg->vtime); in iocg_kick_waitq()
1439 atomic64_add(vpay, &iocg->done_vtime); in iocg_kick_waitq()
1441 vbudget -= vpay; in iocg_kick_waitq()
1444 if (iocg->abs_vdebt || iocg->delay) in iocg_kick_waitq()
1453 if (iocg->abs_vdebt) { in iocg_kick_waitq()
1454 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); in iocg_kick_waitq()
1455 vbudget = min_t(s64, 0, vbudget - vdebt); in iocg_kick_waitq()
1466 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); in iocg_kick_waitq()
1468 if (!waitqueue_active(&iocg->waitq)) { in iocg_kick_waitq()
1469 if (iocg->wait_since) { in iocg_kick_waitq()
1470 iocg->local_stat.wait_us += now->now - iocg->wait_since; in iocg_kick_waitq()
1471 iocg->wait_since = 0; in iocg_kick_waitq()
1476 if (!iocg->wait_since) in iocg_kick_waitq()
1477 iocg->wait_since = now->now; in iocg_kick_waitq()
1483 vshortage = -ctx.vbudget; in iocg_kick_waitq()
1484 expires = now->now_ns + in iocg_kick_waitq()
1485 DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * in iocg_kick_waitq()
1487 expires += ioc->timer_slack_ns; in iocg_kick_waitq()
1490 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); in iocg_kick_waitq()
1491 if (hrtimer_is_queued(&iocg->waitq_timer) && in iocg_kick_waitq()
1492 abs(oexpires - expires) <= ioc->timer_slack_ns) in iocg_kick_waitq()
1495 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), in iocg_kick_waitq()
1496 ioc->timer_slack_ns, HRTIMER_MODE_ABS); in iocg_kick_waitq()
1502 bool pay_debt = READ_ONCE(iocg->abs_vdebt); in iocg_waitq_timer_fn()
1506 ioc_now(iocg->ioc, &now); in iocg_waitq_timer_fn()
1523 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); in ioc_lat_stat()
1527 u32 this_met = local_read(&stat->missed[rw].nr_met); in ioc_lat_stat()
1528 u32 this_missed = local_read(&stat->missed[rw].nr_missed); in ioc_lat_stat()
1530 nr_met[rw] += this_met - stat->missed[rw].last_met; in ioc_lat_stat()
1531 nr_missed[rw] += this_missed - stat->missed[rw].last_missed; in ioc_lat_stat()
1532 stat->missed[rw].last_met = this_met; in ioc_lat_stat()
1533 stat->missed[rw].last_missed = this_missed; in ioc_lat_stat()
1536 this_rq_wait_ns = local64_read(&stat->rq_wait_ns); in ioc_lat_stat()
1537 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; in ioc_lat_stat()
1538 stat->last_rq_wait_ns = this_rq_wait_ns; in ioc_lat_stat()
1551 ioc->period_us * NSEC_PER_USEC); in ioc_lat_stat()
1557 struct ioc *ioc = iocg->ioc; in iocg_is_idle()
1560 if (atomic64_read(&iocg->active_period) == in iocg_is_idle()
1561 atomic64_read(&ioc->cur_period)) in iocg_is_idle()
1565 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) in iocg_is_idle()
1572 * Call this function on the target leaf @iocg's to build pre-order traversal
1574 * ->walk_list and the caller is responsible for dissolving the list after use.
1581 WARN_ON_ONCE(!list_empty(&iocg->walk_list)); in iocg_build_inner_walk()
1584 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in iocg_build_inner_walk()
1585 if (!list_empty(&iocg->ancestors[lvl]->walk_list)) in iocg_build_inner_walk()
1589 /* walk down and visit the inner nodes to get pre-order traversal */ in iocg_build_inner_walk()
1590 while (++lvl <= iocg->level - 1) { in iocg_build_inner_walk()
1591 struct ioc_gq *inner = iocg->ancestors[lvl]; in iocg_build_inner_walk()
1594 list_add_tail(&inner->walk_list, inner_walk); in iocg_build_inner_walk()
1598 /* collect per-cpu counters and propagate the deltas to the parent */
1601 struct ioc *ioc = iocg->ioc; in iocg_flush_stat_one()
1607 lockdep_assert_held(&iocg->ioc->lock); in iocg_flush_stat_one()
1609 /* collect per-cpu counters */ in iocg_flush_stat_one()
1612 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); in iocg_flush_stat_one()
1614 vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; in iocg_flush_stat_one()
1615 iocg->last_stat_abs_vusage = abs_vusage; in iocg_flush_stat_one()
1617 iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); in iocg_flush_stat_one()
1618 iocg->local_stat.usage_us += iocg->usage_delta_us; in iocg_flush_stat_one()
1622 iocg->local_stat.usage_us + iocg->desc_stat.usage_us; in iocg_flush_stat_one()
1624 iocg->local_stat.wait_us + iocg->desc_stat.wait_us; in iocg_flush_stat_one()
1626 iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us; in iocg_flush_stat_one()
1628 iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us; in iocg_flush_stat_one()
1631 if (iocg->level > 0) { in iocg_flush_stat_one()
1633 &iocg->ancestors[iocg->level - 1]->desc_stat; in iocg_flush_stat_one()
1635 parent_stat->usage_us += in iocg_flush_stat_one()
1636 new_stat.usage_us - iocg->last_stat.usage_us; in iocg_flush_stat_one()
1637 parent_stat->wait_us += in iocg_flush_stat_one()
1638 new_stat.wait_us - iocg->last_stat.wait_us; in iocg_flush_stat_one()
1639 parent_stat->indebt_us += in iocg_flush_stat_one()
1640 new_stat.indebt_us - iocg->last_stat.indebt_us; in iocg_flush_stat_one()
1641 parent_stat->indelay_us += in iocg_flush_stat_one()
1642 new_stat.indelay_us - iocg->last_stat.indelay_us; in iocg_flush_stat_one()
1645 iocg->last_stat = new_stat; in iocg_flush_stat_one()
1663 list_del_init(&iocg->walk_list); in iocg_flush_stat()
1675 struct ioc *ioc = iocg->ioc; in hweight_after_donation()
1676 u64 vtime = atomic64_read(&iocg->vtime); in hweight_after_donation()
1680 if (iocg->abs_vdebt) in hweight_after_donation()
1684 if (waitqueue_active(&iocg->waitq) || in hweight_after_donation()
1685 time_after64(vtime, now->vnow - ioc->margins.min)) in hweight_after_donation()
1689 excess = now->vnow - vtime - ioc->margins.target; in hweight_after_donation()
1691 atomic64_add(excess, &iocg->vtime); in hweight_after_donation()
1692 atomic64_add(excess, &iocg->done_vtime); in hweight_after_donation()
1694 ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); in hweight_after_donation()
1704 * new budget (1 - MARGIN_TARGET) and the leftover from the last period in hweight_after_donation()
1707 * usage = (1 - MARGIN_TARGET + delta) * new_hwi in hweight_after_donation()
1711 * new_hwi = usage / (1 - MARGIN_TARGET + delta) in hweight_after_donation()
1713 delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), in hweight_after_donation()
1714 now->vnow - ioc->period_at_vtime); in hweight_after_donation()
1716 new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); in hweight_after_donation()
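With concrete (purely illustrative) numbers for the formula above: if the iocg consumed 30% of the device over the period, the target margin is half a period and the leftover budget delta is 20% of a period, then new_hwi = 0.30 / (1 - 0.50 + 0.20), roughly 42.9%. A small sketch mirroring the fixed-point form used here:

#include <stdint.h>
#include <stdio.h>

#define WEIGHT_ONE      (1U << 16)

/*
 * new_hwi = usage / (1 - MARGIN_TARGET + delta), with every term expressed
 * as a fraction of WEIGHT_ONE, mirroring the comment above.
 */
static uint32_t calc_new_hwi(uint32_t usage, uint32_t target, uint32_t delta)
{
        return (uint64_t)WEIGHT_ONE * usage / (WEIGHT_ONE - target + delta);
}

int main(void)
{
        uint32_t usage = WEIGHT_ONE * 30 / 100;  /* consumed 30% of the device */
        uint32_t target = WEIGHT_ONE * 50 / 100; /* target margin: half a period */
        uint32_t delta = WEIGHT_ONE * 20 / 100;  /* 20% of a period left over */
        uint32_t hwi = calc_new_hwi(usage, target, delta);

        printf("new_hwi = %.1f%%\n", 100.0 * hwi / WEIGHT_ONE);
        return 0;
}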
1722 * For work-conservation, an iocg which isn't using all of its share should
1723 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1744 * Given the weights and target after-donation hweight_inuse values, Andy's
1746 * sibling level to maintain the relative relationship between all non-donating
1748 * non-donating parts, calculates global donation rate which is used to
1749 * determine the target hweight_inuse for each node, and then derives per-level
1756 * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1766 * f is the sum of the absolute budgets of non-donating nodes in the subtree.
1769 * w_f is the non-donating portion of w. w_f = w * f / b
1772 * s_f and s_t are the non-donating and donating portions of s.
1774 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1798 after_sum += iocg->hweight_after_donation; in transfer_surpluses()
1800 if (iocg->hweight_after_donation > hwa) { in transfer_surpluses()
1801 over_sum += iocg->hweight_after_donation; in transfer_surpluses()
1802 list_add(&iocg->walk_list, &over_hwa); in transfer_surpluses()
1811 u32 over_delta = after_sum - (WEIGHT_ONE - 1); in transfer_surpluses()
1813 over_target = over_sum - over_delta; in transfer_surpluses()
1820 iocg->hweight_after_donation = in transfer_surpluses()
1821 div_u64((u64)iocg->hweight_after_donation * in transfer_surpluses()
1823 list_del_init(&iocg->walk_list); in transfer_surpluses()
1827 * Build pre-order inner node walk list and prepare for donation in transfer_surpluses()
1835 WARN_ON_ONCE(root_iocg->level > 0); in transfer_surpluses()
1838 iocg->child_adjusted_sum = 0; in transfer_surpluses()
1839 iocg->hweight_donating = 0; in transfer_surpluses()
1840 iocg->hweight_after_donation = 0; in transfer_surpluses()
1848 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1850 parent->hweight_donating += iocg->hweight_donating; in transfer_surpluses()
1851 parent->hweight_after_donation += iocg->hweight_after_donation; in transfer_surpluses()
1855 if (iocg->level > 0) { in transfer_surpluses()
1856 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1858 parent->hweight_donating += iocg->hweight_donating; in transfer_surpluses()
1859 parent->hweight_after_donation += iocg->hweight_after_donation; in transfer_surpluses()
1869 if (iocg->level) { in transfer_surpluses()
1870 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1872 iocg->hweight_active = DIV64_U64_ROUND_UP( in transfer_surpluses()
1873 (u64)parent->hweight_active * iocg->active, in transfer_surpluses()
1874 parent->child_active_sum); in transfer_surpluses()
1878 iocg->hweight_donating = min(iocg->hweight_donating, in transfer_surpluses()
1879 iocg->hweight_active); in transfer_surpluses()
1880 iocg->hweight_after_donation = min(iocg->hweight_after_donation, in transfer_surpluses()
1881 iocg->hweight_donating - 1); in transfer_surpluses()
1882 if (WARN_ON_ONCE(iocg->hweight_active <= 1 || in transfer_surpluses()
1883 iocg->hweight_donating <= 1 || in transfer_surpluses()
1884 iocg->hweight_after_donation == 0)) { in transfer_surpluses()
1886 pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); in transfer_surpluses()
1888 iocg->hweight_active, iocg->hweight_donating, in transfer_surpluses()
1889 iocg->hweight_after_donation); in transfer_surpluses()
1894 * Calculate the global donation rate (gamma) - the rate to adjust in transfer_surpluses()
1895 * non-donating budgets by. in transfer_surpluses()
1901 * hweights can't be whole; however, due to the round-ups during hweight in transfer_surpluses()
1902 * calculations, root_iocg->hweight_donating might still end up equal to in transfer_surpluses()
1905 * gamma = (1 - t_r') / (1 - t_r) in transfer_surpluses()
1908 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, in transfer_surpluses()
1909 WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); in transfer_surpluses()
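A concrete reading of the gamma computation above, with illustrative percentages: if the donors together hold 30% of the device at the root (t_r = 0.30) but will keep only 10% after donating (t_r' = 0.10), the non-donating side grows from 70% to 90%, so gamma = (1 - 0.10) / (1 - 0.30), about 1.286, and every non-donating budget is scaled up by roughly 28.6%:

#include <stdio.h>

int main(void)
{
        double t_r = 0.30;              /* donating hweight at the root */
        double t_r_prime = 0.10;        /* what the donors keep afterwards */
        double gamma = (1.0 - t_r_prime) / (1.0 - t_r);

        printf("gamma = %.3f (non-donating budgets scaled up by %.1f%%)\n",
               gamma, (gamma - 1.0) * 100.0);
        return 0;
}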
1920 if (iocg->level == 0) { in transfer_surpluses()
1922 iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( in transfer_surpluses()
1923 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), in transfer_surpluses()
1924 WEIGHT_ONE - iocg->hweight_after_donation); in transfer_surpluses()
1928 parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1931 iocg->hweight_inuse = DIV64_U64_ROUND_UP( in transfer_surpluses()
1932 (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), in transfer_surpluses()
1933 WEIGHT_ONE) + iocg->hweight_after_donation; in transfer_surpluses()
1937 (u64)parent->child_adjusted_sum * iocg->hweight_inuse, in transfer_surpluses()
1938 parent->hweight_inuse); in transfer_surpluses()
1942 iocg->child_active_sum * iocg->hweight_donating, in transfer_surpluses()
1943 iocg->hweight_active); in transfer_surpluses()
1944 sf = iocg->child_active_sum - st; in transfer_surpluses()
1946 (u64)iocg->active * iocg->hweight_donating, in transfer_surpluses()
1947 iocg->hweight_active); in transfer_surpluses()
1949 (u64)inuse * iocg->hweight_after_donation, in transfer_surpluses()
1950 iocg->hweight_inuse); in transfer_surpluses()
1952 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); in transfer_surpluses()
1956 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and in transfer_surpluses()
1960 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1964 * In-debt iocgs participated in the donation calculation with in transfer_surpluses()
1966 * accordingly would work fine but debt handling expects in transfer_surpluses()
1967  * @iocg->inuse to stay at the minimum and we don't want to in transfer_surpluses()
1970 if (iocg->abs_vdebt) { in transfer_surpluses()
1971 WARN_ON_ONCE(iocg->inuse > 1); in transfer_surpluses()
1977 parent->child_adjusted_sum * iocg->hweight_after_donation, in transfer_surpluses()
1978 parent->hweight_inuse); in transfer_surpluses()
1981 iocg->inuse, inuse, in transfer_surpluses()
1982 iocg->hweight_inuse, in transfer_surpluses()
1983 iocg->hweight_after_donation); in transfer_surpluses()
1985 __propagate_weights(iocg, iocg->active, inuse, true, now); in transfer_surpluses()
1990 list_del_init(&iocg->walk_list); in transfer_surpluses()
1997 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2012 ioc->dfgv_period_at = now->now; in ioc_forgive_debts()
2013 ioc->dfgv_period_rem = 0; in ioc_forgive_debts()
2014 ioc->dfgv_usage_us_sum = 0; in ioc_forgive_debts()
2024 if (ioc->busy_level > 0) in ioc_forgive_debts()
2025 usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); in ioc_forgive_debts()
2027 ioc->dfgv_usage_us_sum += usage_us_sum; in ioc_forgive_debts()
2028 if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) in ioc_forgive_debts()
2035 dur = now->now - ioc->dfgv_period_at; in ioc_forgive_debts()
2036 usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); in ioc_forgive_debts()
2038 ioc->dfgv_period_at = now->now; in ioc_forgive_debts()
2039 ioc->dfgv_usage_us_sum = 0; in ioc_forgive_debts()
2043 ioc->dfgv_period_rem = 0; in ioc_forgive_debts()
2052 * run and carrying over the left-over duration in @ioc->dfgv_period_rem in ioc_forgive_debts()
2053 * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive in ioc_forgive_debts()
2056 nr_cycles = dur + ioc->dfgv_period_rem; in ioc_forgive_debts()
2057 ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); in ioc_forgive_debts()
2059 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { in ioc_forgive_debts()
2062 if (!iocg->abs_vdebt && !iocg->delay) in ioc_forgive_debts()
2065 spin_lock(&iocg->waitq.lock); in ioc_forgive_debts()
2067 old_debt = iocg->abs_vdebt; in ioc_forgive_debts()
2068 old_delay = iocg->delay; in ioc_forgive_debts()
2070 if (iocg->abs_vdebt) in ioc_forgive_debts()
2071 iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1; in ioc_forgive_debts()
2072 if (iocg->delay) in ioc_forgive_debts()
2073 iocg->delay = iocg->delay >> nr_cycles ?: 1; in ioc_forgive_debts()
2078 old_debt, iocg->abs_vdebt, in ioc_forgive_debts()
2079 old_delay, iocg->delay); in ioc_forgive_debts()
2081 spin_unlock(&iocg->waitq.lock); in ioc_forgive_debts()
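The forgiveness pass above halves both the debt and the delay once per elapsed forgiveness cycle (the right shift by nr_cycles) while never letting a non-zero value fall below 1, and the remainder handling earlier in the function carries partial cycles across timer runs. A standalone sketch of that halving with remainder carry-over; the period value and names are illustrative:

#include <stdint.h>
#include <stdio.h>

#define DFGV_PERIOD_US  250000ULL       /* illustrative forgiveness period */

static uint64_t period_rem;             /* leftover duration carried between runs */

/* number of halvings to apply for a timer run that covered @dur_us */
static unsigned int forgive_cycles(uint64_t dur_us)
{
        uint64_t total = dur_us + period_rem;

        period_rem = total % DFGV_PERIOD_US;
        return total / DFGV_PERIOD_US;
}

int main(void)
{
        uint64_t debt = 1 << 20;
        unsigned int i;

        /* timer period at 75% of the forgiveness period, as in the comment */
        for (i = 0; i < 4; i++) {
                unsigned int nr = forgive_cycles(187500);

                if (nr && debt) {
                        debt >>= nr;            /* halve once per cycle */
                        if (!debt)
                                debt = 1;       /* floor at 1 so it stays tracked */
                }
                printf("run %u: %u cycle(s), debt=%llu\n",
                       i, nr, (unsigned long long)debt);
        }
        return 0;
}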
2093 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; in ioc_timer_fn()
2094 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; in ioc_timer_fn()
2103 spin_lock_irq(&ioc->lock); in ioc_timer_fn()
2107 period_vtime = now.vnow - ioc->period_at_vtime; in ioc_timer_fn()
2109 spin_unlock_irq(&ioc->lock); in ioc_timer_fn()
2119 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { in ioc_timer_fn()
2120 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && in ioc_timer_fn()
2121 !iocg->delay && !iocg_is_idle(iocg)) in ioc_timer_fn()
2124 spin_lock(&iocg->waitq.lock); in ioc_timer_fn()
2127 if (iocg->wait_since) { in ioc_timer_fn()
2128 iocg->local_stat.wait_us += now.now - iocg->wait_since; in ioc_timer_fn()
2129 iocg->wait_since = now.now; in ioc_timer_fn()
2131 if (iocg->indebt_since) { in ioc_timer_fn()
2132 iocg->local_stat.indebt_us += in ioc_timer_fn()
2133 now.now - iocg->indebt_since; in ioc_timer_fn()
2134 iocg->indebt_since = now.now; in ioc_timer_fn()
2136 if (iocg->indelay_since) { in ioc_timer_fn()
2137 iocg->local_stat.indelay_us += in ioc_timer_fn()
2138 now.now - iocg->indelay_since; in ioc_timer_fn()
2139 iocg->indelay_since = now.now; in ioc_timer_fn()
2142 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || in ioc_timer_fn()
2143 iocg->delay) { in ioc_timer_fn()
2146 if (iocg->abs_vdebt || iocg->delay) in ioc_timer_fn()
2150 u64 vtime = atomic64_read(&iocg->vtime); in ioc_timer_fn()
2159 excess = now.vnow - vtime - ioc->margins.target; in ioc_timer_fn()
2164 ioc->vtime_err -= div64_u64(excess * old_hwi, in ioc_timer_fn()
2169 list_del_init(&iocg->active_list); in ioc_timer_fn()
2172 spin_unlock(&iocg->waitq.lock); in ioc_timer_fn()
2178 * below needs updated usage stat. Let's bring stat up-to-date. in ioc_timer_fn()
2180 iocg_flush_stat(&ioc->active_iocgs, &now); in ioc_timer_fn()
2183 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { in ioc_timer_fn()
2191 vdone = atomic64_read(&iocg->done_vtime); in ioc_timer_fn()
2192 vtime = atomic64_read(&iocg->vtime); in ioc_timer_fn()
2197 * in-flight for longer than a period. Detect them by in ioc_timer_fn()
2202 !atomic_read(&iocg_to_blkg(iocg)->use_delay) && in ioc_timer_fn()
2204 time_after64(vtime, now.vnow - in ioc_timer_fn()
2206 time_before64(vdone, now.vnow - period_vtime)) in ioc_timer_fn()
2210 * Determine absolute usage factoring in in-flight IOs to avoid in ioc_timer_fn()
2211 * high-latency completions appearing as idle. in ioc_timer_fn()
2213 usage_us = iocg->usage_delta_us; in ioc_timer_fn()
2218 cost_to_abs_cost(vtime - vdone, hw_inuse), in ioc_timer_fn()
2219 ioc->vtime_base_rate); in ioc_timer_fn()
2224 if (time_after64(iocg->activated_at, ioc->period_at)) in ioc_timer_fn()
2225 usage_dur = max_t(u64, now.now - iocg->activated_at, 1); in ioc_timer_fn()
2227 usage_dur = max_t(u64, now.now - ioc->period_at, 1); in ioc_timer_fn()
2235 WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); in ioc_timer_fn()
2237 (!waitqueue_active(&iocg->waitq) && in ioc_timer_fn()
2238 time_before64(vtime, now.vnow - ioc->margins.low))) { in ioc_timer_fn()
2257 iocg->hweight_donating = hwa; in ioc_timer_fn()
2258 iocg->hweight_after_donation = new_hwi; in ioc_timer_fn()
2259 list_add(&iocg->surplus_list, &surpluses); in ioc_timer_fn()
2260 } else if (!iocg->abs_vdebt) { in ioc_timer_fn()
2272 iocg->inuse, iocg->active, in ioc_timer_fn()
2273 iocg->hweight_inuse, new_hwi); in ioc_timer_fn()
2275 __propagate_weights(iocg, iocg->active, in ioc_timer_fn()
2276 iocg->active, true, &now); in ioc_timer_fn()
2292 list_del_init(&iocg->surplus_list); in ioc_timer_fn()
2300 prev_busy_level = ioc->busy_level; in ioc_timer_fn()
2305 ioc->busy_level = max(ioc->busy_level, 0); in ioc_timer_fn()
2306 ioc->busy_level++; in ioc_timer_fn()
2316 ioc->busy_level = min(ioc->busy_level, 0); in ioc_timer_fn()
2323 ioc->busy_level--; in ioc_timer_fn()
2331 ioc->busy_level = 0; in ioc_timer_fn()
2335 ioc->busy_level = 0; in ioc_timer_fn()
2338 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); in ioc_timer_fn()
2340 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { in ioc_timer_fn()
2341 u64 vrate = ioc->vtime_base_rate; in ioc_timer_fn()
2342 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; in ioc_timer_fn()
2358 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), in ioc_timer_fn()
2362 int idx = min_t(int, abs(ioc->busy_level), in ioc_timer_fn()
2363 ARRAY_SIZE(vrate_adj_pct) - 1); in ioc_timer_fn()
2366 if (ioc->busy_level > 0) in ioc_timer_fn()
2367 adj_pct = 100 - adj_pct; in ioc_timer_fn()
2378 ioc->vtime_base_rate = vrate; in ioc_timer_fn()
2380 } else if (ioc->busy_level != prev_busy_level || nr_lagging) { in ioc_timer_fn()
2381 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), in ioc_timer_fn()
2394 atomic64_inc(&ioc->cur_period); in ioc_timer_fn()
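The vrate adjustment above picks a step size from vrate_adj_pct[] based on how long the device has been continuously busy or idle, turns it into a multiplier below or above 100%, and clamps the result between the configured vrate_min and vrate_max. With the illustrative table below, three consecutive busy periods take a nominal-rate device to roughly 92% of its base speed:

#include <stdint.h>
#include <stdio.h>

/* illustrative step table indexed by |busy_level|; the kernel has its own */
static const unsigned int adj_pct_tbl[] = { 0, 2, 2, 4, 4, 8 };
#define TBL_MAX (sizeof(adj_pct_tbl) / sizeof(adj_pct_tbl[0]) - 1)

static uint64_t adjust_vrate(uint64_t vrate, int busy_level,
                             uint64_t vrate_min, uint64_t vrate_max)
{
        unsigned int idx = busy_level < 0 ? -busy_level : busy_level;
        unsigned int adj = adj_pct_tbl[idx > TBL_MAX ? TBL_MAX : idx];
        /* busy: scale the device vtime rate down; idle: scale it back up */
        unsigned int pct = busy_level > 0 ? 100 - adj : 100 + adj;

        vrate = vrate * pct / 100;
        if (vrate < vrate_min)
                vrate = vrate_min;
        if (vrate > vrate_max)
                vrate = vrate_max;
        return vrate;
}

int main(void)
{
        uint64_t vrate = 1000000;       /* nominal rate in arbitrary units */
        int level;

        for (level = 1; level <= 3; level++) {
                vrate = adjust_vrate(vrate, level, 250000, 1000000);
                printf("busy_level %d: vrate=%llu\n",
                       level, (unsigned long long)vrate);
        }
        return 0;
}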
2396 if (ioc->running != IOC_STOP) { in ioc_timer_fn()
2397 if (!list_empty(&ioc->active_iocgs)) { in ioc_timer_fn()
2400 ioc->busy_level = 0; in ioc_timer_fn()
2401 ioc->vtime_err = 0; in ioc_timer_fn()
2402 ioc->running = IOC_IDLE; in ioc_timer_fn()
2408 spin_unlock_irq(&ioc->lock); in ioc_timer_fn()
2414 struct ioc *ioc = iocg->ioc; in adjust_inuse_and_calc_cost()
2415 struct ioc_margins *margins = &ioc->margins; in adjust_inuse_and_calc_cost()
2416 u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; in adjust_inuse_and_calc_cost()
2424 margin = now->vnow - vtime - cost; in adjust_inuse_and_calc_cost()
2427 if (iocg->abs_vdebt) in adjust_inuse_and_calc_cost()
2434 if (margin >= iocg->saved_margin || margin >= margins->low || in adjust_inuse_and_calc_cost()
2435 iocg->inuse == iocg->active) in adjust_inuse_and_calc_cost()
2438 spin_lock_irq(&ioc->lock); in adjust_inuse_and_calc_cost()
2441 if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { in adjust_inuse_and_calc_cost()
2442 spin_unlock_irq(&ioc->lock); in adjust_inuse_and_calc_cost()
2448 * adj_step must be determined after acquiring ioc->lock - we might in adjust_inuse_and_calc_cost()
2450 * be reading 0 iocg->active before ioc->lock which will lead to in adjust_inuse_and_calc_cost()
2453 new_inuse = iocg->inuse; in adjust_inuse_and_calc_cost()
2454 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); in adjust_inuse_and_calc_cost()
2457 propagate_weights(iocg, iocg->active, new_inuse, true, now); in adjust_inuse_and_calc_cost()
2460 } while (time_after64(vtime + cost, now->vnow) && in adjust_inuse_and_calc_cost()
2461 iocg->inuse != iocg->active); in adjust_inuse_and_calc_cost()
2463 spin_unlock_irq(&ioc->lock); in adjust_inuse_and_calc_cost()
2466 old_inuse, iocg->inuse, old_hwi, hwi); in adjust_inuse_and_calc_cost()
2474 struct ioc *ioc = iocg->ioc; in calc_vtime_cost_builtin()
2482 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; in calc_vtime_cost_builtin()
2483 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; in calc_vtime_cost_builtin()
2484 coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; in calc_vtime_cost_builtin()
2487 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; in calc_vtime_cost_builtin()
2488 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; in calc_vtime_cost_builtin()
2489 coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; in calc_vtime_cost_builtin()
2495 if (iocg->cursor) { in calc_vtime_cost_builtin()
2496 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); in calc_vtime_cost_builtin()
2527 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; in calc_size_vtime_cost_builtin()
2530 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; in calc_size_vtime_cost_builtin()
2547 struct blkcg_gq *blkg = bio->bi_blkg; in ioc_rqos_throttle()
2557 if (!ioc->enabled || !iocg || !iocg->level) in ioc_rqos_throttle()
2568 iocg->cursor = bio_end_sector(bio); in ioc_rqos_throttle()
2569 vtime = atomic64_read(&iocg->vtime); in ioc_rqos_throttle()
2574 * tests are racy but the races aren't systemic - we only miss once in ioc_rqos_throttle()
2575 * in a while which is fine. in ioc_rqos_throttle()
2577 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && in ioc_rqos_throttle()
2585 * cause priority inversions are punted to @ioc->aux_iocg and charged as in ioc_rqos_throttle()
2586 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling in ioc_rqos_throttle()
2587 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine in ioc_rqos_throttle()
2591 ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); in ioc_rqos_throttle()
2597 * is synchronized against both ioc->lock and waitq.lock and we won't in ioc_rqos_throttle()
2602 if (unlikely(list_empty(&iocg->active_list))) { in ioc_rqos_throttle()
2628 blkcg_schedule_throttle(rqos->q, in ioc_rqos_throttle()
2629 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); in ioc_rqos_throttle()
2635 if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { in ioc_rqos_throttle()
2641 propagate_weights(iocg, iocg->active, iocg->active, true, in ioc_rqos_throttle()
2650 * waiting for to allow re-evaluation using a custom wait entry. in ioc_rqos_throttle()
2655 * All waiters are on iocg->waitq and the wait states are in ioc_rqos_throttle()
2664 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); in ioc_rqos_throttle()
2677 finish_wait(&iocg->waitq, &wait.wait); in ioc_rqos_throttle()
2683 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); in ioc_rqos_merge()
2691 if (!ioc->enabled || !iocg || !iocg->level) in ioc_rqos_merge()
2700 vtime = atomic64_read(&iocg->vtime); in ioc_rqos_merge()
2705 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) in ioc_rqos_merge()
2706 iocg->cursor = bio_end; in ioc_rqos_merge()
2712 if (rq->bio && rq->bio->bi_iocost_cost && in ioc_rqos_merge()
2713 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { in ioc_rqos_merge()
2723 spin_lock_irqsave(&ioc->lock, flags); in ioc_rqos_merge()
2724 spin_lock(&iocg->waitq.lock); in ioc_rqos_merge()
2726 if (likely(!list_empty(&iocg->active_list))) { in ioc_rqos_merge()
2729 blkcg_schedule_throttle(rqos->q, in ioc_rqos_merge()
2730 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); in ioc_rqos_merge()
2735 spin_unlock(&iocg->waitq.lock); in ioc_rqos_merge()
2736 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_rqos_merge()
2741 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); in ioc_rqos_done_bio()
2743 if (iocg && bio->bi_iocost_cost) in ioc_rqos_done_bio()
2744 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); in ioc_rqos_done_bio()
2754 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) in ioc_rqos_done()
2770 on_q_ns = ktime_get_ns() - rq->alloc_time_ns; in ioc_rqos_done()
2771 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; in ioc_rqos_done()
2774 ccs = get_cpu_ptr(ioc->pcpu_stat); in ioc_rqos_done()
2777 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) in ioc_rqos_done()
2778 local_inc(&ccs->missed[rw].nr_met); in ioc_rqos_done()
2780 local_inc(&ccs->missed[rw].nr_missed); in ioc_rqos_done()
2782 local64_add(rq_wait_ns, &ccs->rq_wait_ns); in ioc_rqos_done()
2791 spin_lock_irq(&ioc->lock); in ioc_rqos_queue_depth_changed()
2793 spin_unlock_irq(&ioc->lock); in ioc_rqos_queue_depth_changed()
2800 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost); in ioc_rqos_exit()
2802 spin_lock_irq(&ioc->lock); in ioc_rqos_exit()
2803 ioc->running = IOC_STOP; in ioc_rqos_exit()
2804 spin_unlock_irq(&ioc->lock); in ioc_rqos_exit()
2806 del_timer_sync(&ioc->timer); in ioc_rqos_exit()
2807 free_percpu(ioc->pcpu_stat); in ioc_rqos_exit()
2828 return -ENOMEM; in blk_iocost_init()
2830 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); in blk_iocost_init()
2831 if (!ioc->pcpu_stat) { in blk_iocost_init()
2833 return -ENOMEM; in blk_iocost_init()
2837 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); in blk_iocost_init()
2839 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { in blk_iocost_init()
2840 local_set(&ccs->missed[i].nr_met, 0); in blk_iocost_init()
2841 local_set(&ccs->missed[i].nr_missed, 0); in blk_iocost_init()
2843 local64_set(&ccs->rq_wait_ns, 0); in blk_iocost_init()
2846 rqos = &ioc->rqos; in blk_iocost_init()
2847 rqos->id = RQ_QOS_COST; in blk_iocost_init()
2848 rqos->ops = &ioc_rqos_ops; in blk_iocost_init()
2849 rqos->q = q; in blk_iocost_init()
2851 spin_lock_init(&ioc->lock); in blk_iocost_init()
2852 timer_setup(&ioc->timer, ioc_timer_fn, 0); in blk_iocost_init()
2853 INIT_LIST_HEAD(&ioc->active_iocgs); in blk_iocost_init()
2855 ioc->running = IOC_IDLE; in blk_iocost_init()
2856 ioc->vtime_base_rate = VTIME_PER_USEC; in blk_iocost_init()
2857 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); in blk_iocost_init()
2858 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); in blk_iocost_init()
2859 ioc->period_at = ktime_to_us(ktime_get()); in blk_iocost_init()
2860 atomic64_set(&ioc->cur_period, 0); in blk_iocost_init()
2861 atomic_set(&ioc->hweight_gen, 0); in blk_iocost_init()
2863 spin_lock_irq(&ioc->lock); in blk_iocost_init()
2864 ioc->autop_idx = AUTOP_INVALID; in blk_iocost_init()
2866 spin_unlock_irq(&ioc->lock); in blk_iocost_init()
2878 free_percpu(ioc->pcpu_stat); in blk_iocost_init()
2893 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; in ioc_cpd_alloc()
2894 return &iocc->cpd; in ioc_cpd_alloc()
2905 int levels = blkcg->css.cgroup->level + 1; in ioc_pd_alloc()
2908 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); in ioc_pd_alloc()
2912 iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); in ioc_pd_alloc()
2913 if (!iocg->pcpu_stat) { in ioc_pd_alloc()
2918 return &iocg->pd; in ioc_pd_alloc()
2924 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); in ioc_pd_init()
2925 struct ioc *ioc = q_to_ioc(blkg->q); in ioc_pd_init()
2932 iocg->ioc = ioc; in ioc_pd_init()
2933 atomic64_set(&iocg->vtime, now.vnow); in ioc_pd_init()
2934 atomic64_set(&iocg->done_vtime, now.vnow); in ioc_pd_init()
2935 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); in ioc_pd_init()
2936 INIT_LIST_HEAD(&iocg->active_list); in ioc_pd_init()
2937 INIT_LIST_HEAD(&iocg->walk_list); in ioc_pd_init()
2938 INIT_LIST_HEAD(&iocg->surplus_list); in ioc_pd_init()
2939 iocg->hweight_active = WEIGHT_ONE; in ioc_pd_init()
2940 iocg->hweight_inuse = WEIGHT_ONE; in ioc_pd_init()
2942 init_waitqueue_head(&iocg->waitq); in ioc_pd_init()
2943 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); in ioc_pd_init()
2944 iocg->waitq_timer.function = iocg_waitq_timer_fn; in ioc_pd_init()
2946 iocg->level = blkg->blkcg->css.cgroup->level; in ioc_pd_init()
2948 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { in ioc_pd_init()
2950 iocg->ancestors[tiocg->level] = tiocg; in ioc_pd_init()
2953 spin_lock_irqsave(&ioc->lock, flags); in ioc_pd_init()
2955 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_pd_init()
2961 struct ioc *ioc = iocg->ioc; in ioc_pd_free()
2965 spin_lock_irqsave(&ioc->lock, flags); in ioc_pd_free()
2967 if (!list_empty(&iocg->active_list)) { in ioc_pd_free()
2972 list_del_init(&iocg->active_list); in ioc_pd_free()
2975 WARN_ON_ONCE(!list_empty(&iocg->walk_list)); in ioc_pd_free()
2976 WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); in ioc_pd_free()
2978 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_pd_free()
2980 hrtimer_cancel(&iocg->waitq_timer); in ioc_pd_free()
2982 free_percpu(iocg->pcpu_stat); in ioc_pd_free()
2989 struct ioc *ioc = iocg->ioc; in ioc_pd_stat()
2992 if (!ioc->enabled) in ioc_pd_stat()
2995 if (iocg->level == 0) { in ioc_pd_stat()
2997 ioc->vtime_base_rate * 10000, in ioc_pd_stat()
2999 pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", in ioc_pd_stat()
3003 pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", in ioc_pd_stat()
3004 iocg->last_stat.usage_us); in ioc_pd_stat()
3007 pos += scnprintf(buf + pos, size - pos, in ioc_pd_stat()
3009 iocg->last_stat.wait_us, in ioc_pd_stat()
3010 iocg->last_stat.indebt_us, in ioc_pd_stat()
3011 iocg->last_stat.indelay_us); in ioc_pd_stat()
3019 const char *dname = blkg_dev_name(pd->blkg); in ioc_weight_prfill()
3022 if (dname && iocg->cfg_weight) in ioc_weight_prfill()
3023 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); in ioc_weight_prfill()
3033 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); in ioc_weight_show()
3035 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_weight_show()
3054 return -EINVAL; in ioc_weight_write()
3057 return -EINVAL; in ioc_weight_write()
3059 spin_lock_irq(&blkcg->lock); in ioc_weight_write()
3060 iocc->dfl_weight = v * WEIGHT_ONE; in ioc_weight_write()
3061 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { in ioc_weight_write()
3065 spin_lock(&iocg->ioc->lock); in ioc_weight_write()
3066 ioc_now(iocg->ioc, &now); in ioc_weight_write()
3068 spin_unlock(&iocg->ioc->lock); in ioc_weight_write()
3071 spin_unlock_irq(&blkcg->lock); in ioc_weight_write()
3091 spin_lock(&iocg->ioc->lock); in ioc_weight_write()
3092 iocg->cfg_weight = v * WEIGHT_ONE; in ioc_weight_write()
3093 ioc_now(iocg->ioc, &now); in ioc_weight_write()
3095 spin_unlock(&iocg->ioc->lock); in ioc_weight_write()
3102 return -EINVAL; in ioc_weight_write()
3108 const char *dname = blkg_dev_name(pd->blkg); in ioc_qos_prfill()
3109 struct ioc *ioc = pd_to_iocg(pd)->ioc; in ioc_qos_prfill()
3115 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", in ioc_qos_prfill()
3116 ioc->params.qos[QOS_RPPM] / 10000, in ioc_qos_prfill()
3117 ioc->params.qos[QOS_RPPM] % 10000 / 100, in ioc_qos_prfill()
3118 ioc->params.qos[QOS_RLAT], in ioc_qos_prfill()
3119 ioc->params.qos[QOS_WPPM] / 10000, in ioc_qos_prfill()
3120 ioc->params.qos[QOS_WPPM] % 10000 / 100, in ioc_qos_prfill()
3121 ioc->params.qos[QOS_WLAT], in ioc_qos_prfill()
3122 ioc->params.qos[QOS_MIN] / 10000, in ioc_qos_prfill()
3123 ioc->params.qos[QOS_MIN] % 10000 / 100, in ioc_qos_prfill()
3124 ioc->params.qos[QOS_MAX] / 10000, in ioc_qos_prfill()
3125 ioc->params.qos[QOS_MAX] % 10000 / 100); in ioc_qos_prfill()
3134 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_qos_show()
3168 ioc = q_to_ioc(disk->queue); in ioc_qos_write()
3170 ret = blk_iocost_init(disk->queue); in ioc_qos_write()
3173 ioc = q_to_ioc(disk->queue); in ioc_qos_write()
3176 spin_lock_irq(&ioc->lock); in ioc_qos_write()
3177 memcpy(qos, ioc->params.qos, sizeof(qos)); in ioc_qos_write()
3178 enable = ioc->enabled; in ioc_qos_write()
3179 user = ioc->user_qos_params; in ioc_qos_write()
3180 spin_unlock_irq(&ioc->lock); in ioc_qos_write()
3247 spin_lock_irq(&ioc->lock); in ioc_qos_write()
3250 blk_stat_enable_accounting(ioc->rqos.q); in ioc_qos_write()
3251 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); in ioc_qos_write()
3252 ioc->enabled = true; in ioc_qos_write()
3254 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); in ioc_qos_write()
3255 ioc->enabled = false; in ioc_qos_write()
3259 memcpy(ioc->params.qos, qos, sizeof(qos)); in ioc_qos_write()
3260 ioc->user_qos_params = true; in ioc_qos_write()
3262 ioc->user_qos_params = false; in ioc_qos_write()
3266 spin_unlock_irq(&ioc->lock); in ioc_qos_write()
3271 ret = -EINVAL; in ioc_qos_write()
3280 const char *dname = blkg_dev_name(pd->blkg); in ioc_cost_model_prfill()
3281 struct ioc *ioc = pd_to_iocg(pd)->ioc; in ioc_cost_model_prfill()
3282 u64 *u = ioc->params.i_lcoefs; in ioc_cost_model_prfill()
3290 dname, ioc->user_cost_model ? "user" : "auto", in ioc_cost_model_prfill()
3301 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_cost_model_show()
3335 ioc = q_to_ioc(disk->queue); in ioc_cost_model_write()
3337 ret = blk_iocost_init(disk->queue); in ioc_cost_model_write()
3340 ioc = q_to_ioc(disk->queue); in ioc_cost_model_write()
3343 spin_lock_irq(&ioc->lock); in ioc_cost_model_write()
3344 memcpy(u, ioc->params.i_lcoefs, sizeof(u)); in ioc_cost_model_write()
3345 user = ioc->user_cost_model; in ioc_cost_model_write()
3346 spin_unlock_irq(&ioc->lock); in ioc_cost_model_write()
3383 spin_lock_irq(&ioc->lock); in ioc_cost_model_write()
3385 memcpy(ioc->params.i_lcoefs, u, sizeof(u)); in ioc_cost_model_write()
3386 ioc->user_cost_model = true; in ioc_cost_model_write()
3388 ioc->user_cost_model = false; in ioc_cost_model_write()
3391 spin_unlock_irq(&ioc->lock); in ioc_cost_model_write()
3397 ret = -EINVAL; in ioc_cost_model_write()