/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#include <linux/sched/mm.h>
#include <linux/stop_machine.h>

#include "display/intel_display_types.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_irq.h"
#include "intel_breadcrumbs.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_reset.h"

#include "uc/intel_guc.h"
#include "uc/intel_guc_submission.h"

#define RESET_MAX_RETRIES 3

/* XXX How to handle concurrent GGTT updates using tiling registers? */
#define RESET_UNDER_STOP_MACHINE 0

static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
{
	intel_uncore_rmw_fw(uncore, reg, 0, set);
}

static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
{
	intel_uncore_rmw_fw(uncore, reg, clr, 0);
}

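/*
 * Cancel the remaining requests queued on this engine that belong to the
 * same context as the hung request, so the guilty context does not keep
 * executing stale work after the reset.
 */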
static void engine_skip_context(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct intel_context *hung_ctx = rq->context;

	if (!i915_request_is_active(rq))
		return;

	lockdep_assert_held(&engine->active.lock);
	list_for_each_entry_continue(rq, &engine->active.requests, sched.link)
		if (rq->context == hung_ctx) {
			i915_request_set_error_once(rq, -EIO);
			__i915_request_skip(rq);
		}
}

static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
{
	struct drm_i915_file_private *file_priv = ctx->file_priv;
	unsigned long prev_hang;
	unsigned int score;

	if (IS_ERR_OR_NULL(file_priv))
		return;

	score = 0;
	if (banned)
		score = I915_CLIENT_SCORE_CONTEXT_BAN;

	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		drm_dbg(&ctx->i915->drm,
			"client %s: gained %u ban score, now %u\n",
			ctx->name, score,
			atomic_read(&file_priv->ban_score));
	}
}

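/*
 * Blame the GEM context that owns the hung request: bump its guilty count,
 * record the hang timestamp and, if the context is not recoverable or hangs
 * in rapid succession, ban it. Returns true if the context was banned.
 */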
static bool mark_guilty(struct i915_request *rq)
{
	struct i915_gem_context *ctx;
	unsigned long prev_hang;
	bool banned;
	int i;

	if (intel_context_is_closed(rq->context)) {
		intel_context_set_banned(rq->context);
		return true;
	}

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx && !kref_get_unless_zero(&ctx->ref))
		ctx = NULL;
	rcu_read_unlock();
	if (!ctx)
		return intel_context_is_banned(rq->context);

	atomic_inc(&ctx->guilty_count);

	/* Cool contexts are too cool to be banned! (Used for reset testing.) */
	if (!i915_gem_context_is_bannable(ctx)) {
		banned = false;
		goto out;
	}

	drm_notice(&ctx->i915->drm,
		   "%s context reset due to GPU hang\n",
		   ctx->name);

	/* Record the timestamp for the last N hangs */
	prev_hang = ctx->hang_timestamp[0];
	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
	ctx->hang_timestamp[i] = jiffies;

	/* If we have hung N+1 times in rapid succession, we ban the context! */
	banned = !i915_gem_context_is_recoverable(ctx);
	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
		banned = true;
	if (banned) {
		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
			ctx->name, atomic_read(&ctx->guilty_count));
		intel_context_set_banned(rq->context);
	}

	client_mark_guilty(ctx, banned);

out:
	i915_gem_context_put(ctx);
	return banned;
}

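/*
 * The context was merely caught running on the engine at reset time; note
 * that it was active but do not penalise it.
 */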
static void mark_innocent(struct i915_request *rq)
{
	struct i915_gem_context *ctx;

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx)
		atomic_inc(&ctx->active_count);
	rcu_read_unlock();
}

void __i915_request_reset(struct i915_request *rq, bool guilty)
{
	RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));

	GEM_BUG_ON(i915_request_completed(rq));

	rcu_read_lock(); /* protect the GEM context */
	if (guilty) {
		i915_request_set_error_once(rq, -EIO);
		__i915_request_skip(rq);
		if (mark_guilty(rq))
			engine_skip_context(rq);
	} else {
		i915_request_set_error_once(rq, -EAGAIN);
		mark_innocent(rq);
	}
	rcu_read_unlock();
}

static bool i915_in_reset(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return gdrst & GRDOM_RESET_STATUS;
}

static int i915_do_reset(struct intel_gt *gt,
			 intel_engine_mask_t engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;
	int err;

	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = wait_for_atomic(i915_in_reset(pdev), 50);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	if (!err)
		err = wait_for_atomic(!i915_in_reset(pdev), 50);

	return err;
}

static bool g4x_reset_complete(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return (gdrst & GRDOM_RESET_ENABLE) == 0;
}

static int g33_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return wait_for_atomic(g4x_reset_complete(pdev), 50);
}

static int g4x_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	return ret;
}

static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for render reset failed\n");
		goto out;
	}

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for media reset failed\n");
		goto out;
	}

out:
	intel_uncore_write_fw(uncore, ILK_GDSR, 0);
	intel_uncore_posting_read_fw(uncore, ILK_GDSR);
	return ret;
}

/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
{
	struct intel_uncore *uncore = gt->uncore;
	int err;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);

	/* Wait for the device to ack the reset requests */
	err = __intel_wait_for_register_fw(uncore,
					   GEN6_GDRST, hw_domain_mask, 0,
					   500, 0,
					   NULL);
	if (err)
		drm_dbg(&gt->i915->drm,
			"Wait for 0x%08x engines reset failed\n",
			hw_domain_mask);

	return err;
}

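/*
 * Translate the requested engine mask into GDRST hardware domain bits (or
 * request a full soft reset for ALL_ENGINES) and trigger the reset.
 */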
static int gen6_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	static const u32 hw_engine_mask[] = {
		[RCS0]  = GEN6_GRDOM_RENDER,
		[BCS0]  = GEN6_GRDOM_BLT,
		[VCS0]  = GEN6_GRDOM_MEDIA,
		[VCS1]  = GEN8_GRDOM_MEDIA2,
		[VECS0] = GEN6_GRDOM_VECS,
	};
	struct intel_engine_cs *engine;
	u32 hw_mask;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN6_GRDOM_FULL;
	} else {
		intel_engine_mask_t tmp;

		hw_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
			hw_mask |= hw_engine_mask[engine->id];
		}
	}

	return gen6_hw_domain_reset(gt, hw_mask);
}

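/*
 * If this engine shares a Scaler and Format Converter (SFC) and currently
 * holds it, force-lock the SFC and add its reset bit to *hw_mask so the SFC
 * is reset together with the engine.
 */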
static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
	i915_reg_t sfc_usage;
	u32 sfc_usage_bit;
	u32 sfc_reset_bit;
	int ret;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
		sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
		break;

	default:
		return 0;
	}

	/*
	 * If the engine is using a SFC, tell the engine that a software reset
	 * is going to happen. The engine will then try to force lock the SFC.
	 * If SFC ends up being locked to the engine we want to reset, we have
	 * to reset it as well (we will unlock it once the reset sequence is
	 * completed).
	 */
	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
		return 0;

	rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);

	ret = __intel_wait_for_register_fw(uncore,
					   sfc_forced_lock_ack,
					   sfc_forced_lock_ack_bit,
					   sfc_forced_lock_ack_bit,
					   1000, 0, NULL);

	/* Was the SFC released while we were trying to lock it? */
	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
		return 0;

	if (ret) {
		drm_dbg(&engine->i915->drm,
			"Wait for SFC forced lock ack failed\n");
		return ret;
	}

	*hw_mask |= sfc_reset_bit;
	return 0;
}

static void gen11_unlock_sfc(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	i915_reg_t sfc_forced_lock;
	u32 sfc_forced_lock_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
		break;

	default:
		return;
	}

	rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
}

static int gen11_reset_engines(struct intel_gt *gt,
			       intel_engine_mask_t engine_mask,
			       unsigned int retry)
{
	static const u32 hw_engine_mask[] = {
		[RCS0]  = GEN11_GRDOM_RENDER,
		[BCS0]  = GEN11_GRDOM_BLT,
		[VCS0]  = GEN11_GRDOM_MEDIA,
		[VCS1]  = GEN11_GRDOM_MEDIA2,
		[VCS2]  = GEN11_GRDOM_MEDIA3,
		[VCS3]  = GEN11_GRDOM_MEDIA4,
		[VECS0] = GEN11_GRDOM_VECS,
		[VECS1] = GEN11_GRDOM_VECS2,
	};
	struct intel_engine_cs *engine;
	intel_engine_mask_t tmp;
	u32 hw_mask;
	int ret;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN11_GRDOM_FULL;
	} else {
		hw_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
			hw_mask |= hw_engine_mask[engine->id];
			ret = gen11_lock_sfc(engine, &hw_mask);
			if (ret)
				goto sfc_unlock;
		}
	}

	ret = gen6_hw_domain_reset(gt, hw_mask);

sfc_unlock:
	/*
	 * We unlock the SFC based on the lock status and not the result of
	 * gen11_lock_sfc to make sure that we clean properly if something
	 * wrong happened during the lock (e.g. lock acquired after timeout
	 * expiration).
	 */
	if (engine_mask != ALL_ENGINES)
		for_each_engine_masked(engine, gt, engine_mask, tmp)
			gen11_unlock_sfc(engine);

	return ret;
}

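/*
 * Request the ready-for-reset handshake via RING_RESET_CTL and wait for the
 * engine to acknowledge it. Catastrophic errors bypass the handshake, as the
 * hardware clears those itself.
 */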
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
	u32 request, mask, ack;
	int ret;

	ack = intel_uncore_read_fw(uncore, reg);
	if (ack & RESET_CTL_CAT_ERROR) {
		/*
		 * For catastrophic errors, ready-for-reset sequence
		 * needs to be bypassed: HAS#396813
		 */
		request = RESET_CTL_CAT_ERROR;
		mask = RESET_CTL_CAT_ERROR;

		/* Catastrophic errors need to be cleared by HW */
		ack = 0;
	} else if (!(ack & RESET_CTL_READY_TO_RESET)) {
		request = RESET_CTL_REQUEST_RESET;
		mask = RESET_CTL_READY_TO_RESET;
		ack = RESET_CTL_READY_TO_RESET;
	} else {
		return 0;
	}

	intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
	ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
					   700, 0, NULL);
	if (ret)
		drm_err(&engine->i915->drm,
			"%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
			engine->name, request,
			intel_uncore_read_fw(uncore, reg));

	return ret;
}

static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	intel_uncore_write_fw(engine->uncore,
			      RING_RESET_CTL(engine->mmio_base),
			      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}

static int gen8_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	intel_engine_mask_t tmp;
	int ret;

	for_each_engine_masked(engine, gt, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and with
		 * some gens (kbl), possible system hang if reset
		 * happens during active bb execution.
		 *
		 * We rather take context corruption instead of
		 * failed reset with a wedged driver/gpu. And
		 * active bb execution case should be covered by
		 * stop_engines() we have before the reset.
		 */
	}

	if (INTEL_GEN(gt->i915) >= 11)
		ret = gen11_reset_engines(gt, engine_mask, retry);
	else
		ret = gen6_reset_engines(gt, engine_mask, retry);

skip_reset:
	for_each_engine_masked(engine, gt, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	return ret;
}

static int mock_reset(struct intel_gt *gt,
		      intel_engine_mask_t mask,
		      unsigned int retry)
{
	return 0;
}

typedef int (*reset_func)(struct intel_gt *,
			  intel_engine_mask_t engine_mask,
			  unsigned int retry);

static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;

	if (is_mock_gt(gt))
		return mock_reset;
	else if (INTEL_GEN(i915) >= 8)
		return gen8_reset_engines;
	else if (INTEL_GEN(i915) >= 6)
		return gen6_reset_engines;
	else if (INTEL_GEN(i915) >= 5)
		return ilk_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (INTEL_GEN(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}

int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(gt);
	if (!reset)
		return -ENODEV;

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
		preempt_disable();
		ret = reset(gt, engine_mask, retry);
		preempt_enable();
	}
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

bool intel_has_gpu_reset(const struct intel_gt *gt)
{
	if (!gt->i915->params.reset)
		return false;

	return intel_get_gpu_reset(gt);
}

bool intel_has_reset_engine(const struct intel_gt *gt)
{
	if (gt->i915->params.reset < 2)
		return false;

	return INTEL_INFO(gt->i915)->has_reset_engine;
}

int intel_reset_guc(struct intel_gt *gt)
{
	u32 guc_domain =
		INTEL_GEN(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
	int ret;

	GEM_BUG_ON(!HAS_GT_UC(gt->i915));

	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	ret = gen6_hw_domain_reset(gt, guc_domain);
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

/*
 * Ensure irq handler finishes, and not run again.
 * Also return the active request so that we only search for it once.
 */
static void reset_prepare_engine(struct intel_engine_cs *engine)
{
	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
	if (engine->reset.prepare)
		engine->reset.prepare(engine);
}

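/*
 * Zap all userspace mmaps that depend on a GGTT fence register, forcing
 * fresh faults (and fence reacquisition) once the reset has completed.
 */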
static void revoke_mmaps(struct intel_gt *gt)
{
	int i;

	for (i = 0; i < gt->ggtt->num_fences; i++) {
		struct drm_vma_offset_node *node;
		struct i915_vma *vma;
		u64 vma_offset;

		vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
		if (!vma)
			continue;

		if (!i915_vma_has_userfault(vma))
			continue;

		GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);

		if (!vma->mmo)
			continue;

		node = &vma->mmo->vma_node;
		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;

		unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
				    drm_vma_node_offset_addr(node) + vma_offset,
				    vma->size,
				    1);
	}
}

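/*
 * Quiesce every engine before the reset: take a power reference on each
 * awake engine (returned as a mask so it can be released in reset_finish)
 * and let the backends flush their submission state.
 */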
static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake = 0;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (intel_engine_pm_get_if_awake(engine))
			awake |= engine->mask;
		reset_prepare_engine(engine);
	}

	intel_uc_reset_prepare(&gt->uc);

	return awake;
}

static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}

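/*
 * Post-reset recovery: bring the GGTT back up, clean up each engine's state
 * (treating engines in stalled_mask as holding a guilty request), then
 * restore the fence registers.
 */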
static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(gt->i915);
	if (err)
		return err;

	for_each_engine(engine, gt, id)
		__intel_engine_reset(engine, stalled_mask & engine->mask);

	intel_ggtt_restore_fences(gt->ggtt);

	return err;
}

static void reset_finish_engine(struct intel_engine_cs *engine)
{
	if (engine->reset.finish)
		engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);

	intel_engine_signal_breadcrumbs(engine);
}

static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		reset_finish_engine(engine);
		if (awake & engine->mask)
			intel_engine_pm_put(engine);
	}
}

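/*
 * Replacement submit_request used once the GPU is wedged: complete the
 * request immediately with -EIO instead of handing it to the hardware.
 */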
static void nop_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	RQ_TRACE(request, "-EIO\n");
	i915_request_set_error_once(request, -EIO);

	spin_lock_irqsave(&engine->active.lock, flags);
	__i915_request_submit(request);
	i915_request_mark_complete(request);
	spin_unlock_irqrestore(&engine->active.lock, flags);

	intel_engine_signal_breadcrumbs(engine);
}

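/*
 * Declare the GPU wedged: stop all submission, reset the engines to clear
 * any inflight work, switch to nop_submit_request so every request completes
 * with -EIO, and cancel whatever was already executing.
 */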
static void __intel_gt_set_wedged(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake;
	enum intel_engine_id id;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	GT_TRACE(gt, "start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	awake = reset_prepare(gt);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		__intel_gt_reset(gt, ALL_ENGINES);

	for_each_engine(engine, gt, id)
		engine->submit_request = nop_submit_request;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu_expedited();
	set_bit(I915_WEDGED, &gt->reset.flags);

	/* Mark all executing requests as skipped */
	for_each_engine(engine, gt, id)
		if (engine->reset.cancel)
			engine->reset.cancel(engine);

	reset_finish(gt, awake);

	GT_TRACE(gt, "end\n");
}

void intel_gt_set_wedged(struct intel_gt *gt)
{
	intel_wakeref_t wakeref;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
	mutex_lock(&gt->reset.mutex);

	if (GEM_SHOW_DEBUG()) {
		struct drm_printer p = drm_debug_printer(__func__);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
		for_each_engine(engine, gt, id) {
			if (intel_engine_is_idle(engine))
				continue;

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
	}

	__intel_gt_set_wedged(gt);

	mutex_unlock(&gt->reset.mutex);
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

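/*
 * Attempt to recover from the wedged state: wait for all outstanding fences,
 * perform a full GPU reset if one will not disturb the display, and restore
 * the default submission backends. Returns false if recovery is impossible.
 */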
static bool __intel_gt_unset_wedged(struct intel_gt *gt)
{
	struct intel_gt_timelines *timelines = &gt->timelines;
	struct intel_timeline *tl;
	bool ok;

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		return true;

	/* Never fully initialised, recovery impossible */
	if (intel_gt_has_unrecoverable_error(gt))
		return false;

	GT_TRACE(gt, "start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	spin_lock(&timelines->lock);
	list_for_each_entry(tl, &timelines->active_list, link) {
		struct dma_fence *fence;

		fence = i915_active_fence_get(&tl->last_request);
		if (!fence)
			continue;

		spin_unlock(&timelines->lock);

		/*
		 * All internal dependencies (i915_requests) will have
		 * been flushed by the set-wedge, but we may be stuck waiting
		 * for external fences. These should all be capped to 10s
		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
		 * in the worst case.
		 */
		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
		dma_fence_put(fence);
		/* Restart iteration after dropping lock */
		spin_lock(&timelines->lock);
		tl = list_entry(&timelines->active_list, typeof(*tl), link);
	}
	spin_unlock(&timelines->lock);

	/* We must reset pending GPU events before restoring our submission */
	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
	if (!ok) {
		/*
		 * Warn CI about the unrecoverable wedged condition.
		 * Time for a reboot.
		 */
		add_taint_for_CI(gt->i915, TAINT_WARN);
		return false;
	}

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(gt);

	GT_TRACE(gt, "end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &gt->reset.flags);

	return true;
}

bool intel_gt_unset_wedged(struct intel_gt *gt)
{
	bool result;

	mutex_lock(&gt->reset.mutex);
	result = __intel_gt_unset_wedged(gt);
	mutex_unlock(&gt->reset.mutex);

	return result;
}

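/*
 * Revoke userspace mmaps, then attempt the hardware reset, retrying with an
 * increasing back-off before giving up, and finish with the post-reset
 * recovery in gt_reset().
 */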
do_reset(struct intel_gt * gt,intel_engine_mask_t stalled_mask)968*4882a593Smuzhiyun static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
969*4882a593Smuzhiyun {
970*4882a593Smuzhiyun int err, i;
971*4882a593Smuzhiyun
972*4882a593Smuzhiyun gt_revoke(gt);
973*4882a593Smuzhiyun
974*4882a593Smuzhiyun err = __intel_gt_reset(gt, ALL_ENGINES);
975*4882a593Smuzhiyun for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
976*4882a593Smuzhiyun msleep(10 * (i + 1));
977*4882a593Smuzhiyun err = __intel_gt_reset(gt, ALL_ENGINES);
978*4882a593Smuzhiyun }
979*4882a593Smuzhiyun if (err)
980*4882a593Smuzhiyun return err;
981*4882a593Smuzhiyun
982*4882a593Smuzhiyun return gt_reset(gt, stalled_mask);
983*4882a593Smuzhiyun }
984*4882a593Smuzhiyun
resume(struct intel_gt * gt)985*4882a593Smuzhiyun static int resume(struct intel_gt *gt)
986*4882a593Smuzhiyun {
987*4882a593Smuzhiyun struct intel_engine_cs *engine;
988*4882a593Smuzhiyun enum intel_engine_id id;
989*4882a593Smuzhiyun int ret;
990*4882a593Smuzhiyun
991*4882a593Smuzhiyun for_each_engine(engine, gt, id) {
992*4882a593Smuzhiyun ret = intel_engine_resume(engine);
993*4882a593Smuzhiyun if (ret)
994*4882a593Smuzhiyun return ret;
995*4882a593Smuzhiyun }
996*4882a593Smuzhiyun
997*4882a593Smuzhiyun return 0;
998*4882a593Smuzhiyun }
999*4882a593Smuzhiyun
1000*4882a593Smuzhiyun /**
1001*4882a593Smuzhiyun * intel_gt_reset - reset chip after a hang
1002*4882a593Smuzhiyun * @gt: #intel_gt to reset
1003*4882a593Smuzhiyun * @stalled_mask: mask of the stalled engines with the guilty requests
1004*4882a593Smuzhiyun * @reason: user error message for why we are resetting
1005*4882a593Smuzhiyun *
1006*4882a593Smuzhiyun * Reset the chip. Useful if a hang is detected. Marks the device as wedged
1007*4882a593Smuzhiyun * on failure.
1008*4882a593Smuzhiyun *
1009*4882a593Smuzhiyun * Procedure is fairly simple:
1010*4882a593Smuzhiyun * - reset the chip using the reset reg
1011*4882a593Smuzhiyun * - re-init context state
1012*4882a593Smuzhiyun * - re-init hardware status page
1013*4882a593Smuzhiyun * - re-init ring buffer
1014*4882a593Smuzhiyun * - re-init interrupt state
1015*4882a593Smuzhiyun * - re-init display
1016*4882a593Smuzhiyun */
intel_gt_reset(struct intel_gt * gt,intel_engine_mask_t stalled_mask,const char * reason)1017*4882a593Smuzhiyun void intel_gt_reset(struct intel_gt *gt,
1018*4882a593Smuzhiyun intel_engine_mask_t stalled_mask,
1019*4882a593Smuzhiyun const char *reason)
1020*4882a593Smuzhiyun {
1021*4882a593Smuzhiyun intel_engine_mask_t awake;
1022*4882a593Smuzhiyun int ret;
1023*4882a593Smuzhiyun
1024*4882a593Smuzhiyun GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);
1025*4882a593Smuzhiyun
1026*4882a593Smuzhiyun might_sleep();
1027*4882a593Smuzhiyun GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, >->reset.flags));
1028*4882a593Smuzhiyun mutex_lock(>->reset.mutex);
1029*4882a593Smuzhiyun
1030*4882a593Smuzhiyun /* Clear any previous failed attempts at recovery. Time to try again. */
1031*4882a593Smuzhiyun if (!__intel_gt_unset_wedged(gt))
1032*4882a593Smuzhiyun goto unlock;
1033*4882a593Smuzhiyun
1034*4882a593Smuzhiyun if (reason)
1035*4882a593Smuzhiyun drm_notice(>->i915->drm,
1036*4882a593Smuzhiyun "Resetting chip for %s\n", reason);
1037*4882a593Smuzhiyun atomic_inc(>->i915->gpu_error.reset_count);
1038*4882a593Smuzhiyun
1039*4882a593Smuzhiyun awake = reset_prepare(gt);
1040*4882a593Smuzhiyun
1041*4882a593Smuzhiyun if (!intel_has_gpu_reset(gt)) {
1042*4882a593Smuzhiyun if (gt->i915->params.reset)
1043*4882a593Smuzhiyun drm_err(>->i915->drm, "GPU reset not supported\n");
1044*4882a593Smuzhiyun else
1045*4882a593Smuzhiyun drm_dbg(>->i915->drm, "GPU reset disabled\n");
1046*4882a593Smuzhiyun goto error;
1047*4882a593Smuzhiyun }
1048*4882a593Smuzhiyun
1049*4882a593Smuzhiyun if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1050*4882a593Smuzhiyun intel_runtime_pm_disable_interrupts(gt->i915);
1051*4882a593Smuzhiyun
1052*4882a593Smuzhiyun if (do_reset(gt, stalled_mask)) {
1053*4882a593Smuzhiyun drm_err(>->i915->drm, "Failed to reset chip\n");
1054*4882a593Smuzhiyun goto taint;
1055*4882a593Smuzhiyun }
1056*4882a593Smuzhiyun
1057*4882a593Smuzhiyun if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1058*4882a593Smuzhiyun intel_runtime_pm_enable_interrupts(gt->i915);
1059*4882a593Smuzhiyun
1060*4882a593Smuzhiyun intel_overlay_reset(gt->i915);
1061*4882a593Smuzhiyun
1062*4882a593Smuzhiyun /*
1063*4882a593Smuzhiyun * Next we need to restore the context, but we don't use those
1064*4882a593Smuzhiyun * yet either...
1065*4882a593Smuzhiyun *
1066*4882a593Smuzhiyun * Ring buffer needs to be re-initialized in the KMS case, or if X
1067*4882a593Smuzhiyun * was running at the time of the reset (i.e. we weren't VT
1068*4882a593Smuzhiyun * switched away).
1069*4882a593Smuzhiyun */
1070*4882a593Smuzhiyun ret = intel_gt_init_hw(gt);
1071*4882a593Smuzhiyun if (ret) {
1072*4882a593Smuzhiyun drm_err(>->i915->drm,
1073*4882a593Smuzhiyun "Failed to initialise HW following reset (%d)\n",
1074*4882a593Smuzhiyun ret);
1075*4882a593Smuzhiyun goto taint;
1076*4882a593Smuzhiyun }
1077*4882a593Smuzhiyun
1078*4882a593Smuzhiyun ret = resume(gt);
1079*4882a593Smuzhiyun if (ret)
1080*4882a593Smuzhiyun goto taint;
1081*4882a593Smuzhiyun
1082*4882a593Smuzhiyun finish:
1083*4882a593Smuzhiyun reset_finish(gt, awake);
1084*4882a593Smuzhiyun unlock:
1085*4882a593Smuzhiyun mutex_unlock(>->reset.mutex);
1086*4882a593Smuzhiyun return;
1087*4882a593Smuzhiyun
1088*4882a593Smuzhiyun taint:
1089*4882a593Smuzhiyun /*
1090*4882a593Smuzhiyun * History tells us that if we cannot reset the GPU now, we
1091*4882a593Smuzhiyun * never will. This then impacts everything that is run
1092*4882a593Smuzhiyun * subsequently. On failing the reset, we mark the driver
1093*4882a593Smuzhiyun * as wedged, preventing further execution on the GPU.
1094*4882a593Smuzhiyun * We also want to go one step further and add a taint to the
1095*4882a593Smuzhiyun * kernel so that any subsequent faults can be traced back to
1096*4882a593Smuzhiyun * this failure. This is important for CI, where if the
1097*4882a593Smuzhiyun * GPU/driver fails we would like to reboot and restart testing
1098*4882a593Smuzhiyun * rather than continue on into oblivion. For everyone else,
1099*4882a593Smuzhiyun * the system should still plod along, but they have been warned!
1100*4882a593Smuzhiyun */
1101*4882a593Smuzhiyun add_taint_for_CI(gt->i915, TAINT_WARN);
1102*4882a593Smuzhiyun error:
1103*4882a593Smuzhiyun __intel_gt_set_wedged(gt);
1104*4882a593Smuzhiyun goto finish;
1105*4882a593Smuzhiyun }
1106*4882a593Smuzhiyun
intel_gt_reset_engine(struct intel_engine_cs * engine)1107*4882a593Smuzhiyun static inline int intel_gt_reset_engine(struct intel_engine_cs *engine)
1108*4882a593Smuzhiyun {
1109*4882a593Smuzhiyun return __intel_gt_reset(engine->gt, engine->mask);
1110*4882a593Smuzhiyun }
1111*4882a593Smuzhiyun
1112*4882a593Smuzhiyun /**
1113*4882a593Smuzhiyun * intel_engine_reset - reset GPU engine to recover from a hang
1114*4882a593Smuzhiyun * @engine: engine to reset
1115*4882a593Smuzhiyun * @msg: reason for GPU reset; or NULL for no drm_notice()
1116*4882a593Smuzhiyun *
1117*4882a593Smuzhiyun * Reset a specific GPU engine. Useful if a hang is detected.
1118*4882a593Smuzhiyun * Returns zero on successful reset or otherwise an error code.
1119*4882a593Smuzhiyun *
1120*4882a593Smuzhiyun * Procedure is:
1121*4882a593Smuzhiyun * - identifies the request that caused the hang and it is dropped
1122*4882a593Smuzhiyun * - reset engine (which will force the engine to idle)
1123*4882a593Smuzhiyun * - re-init/configure engine
1124*4882a593Smuzhiyun */
int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
{
	struct intel_gt *gt = engine->gt;
	bool uses_guc = intel_engine_in_guc_submission_mode(engine);
	int ret;

	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	reset_prepare_engine(engine);

	if (msg)
		drm_notice(&engine->i915->drm,
			   "Resetting %s for %s\n", engine->name, msg);
	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);

	if (!uses_guc)
		ret = intel_gt_reset_engine(engine);
	else
		ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine);
	if (ret) {
		/* If we fail here, we expect to fall back to a global reset */
		drm_dbg(&gt->i915->drm, "%sFailed to reset %s, ret=%d\n",
			uses_guc ? "GuC " : "", engine->name, ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp; we know the
	 * active request and can drop it, then adjust the ring head to skip
	 * the offending request and resume executing the remaining requests
	 * in the queue.
	 */
	__intel_engine_reset(engine, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = intel_engine_resume(engine);

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	intel_engine_pm_put_async(engine);
	return ret;
}
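
/*
 * Illustrative sketch only: a minimal caller of intel_engine_reset() showing
 * the locking convention it expects.  example_try_engine_reset() is a
 * hypothetical helper, not part of the driver.  The per-engine
 * I915_RESET_ENGINE bit in gt->reset.flags must be owned by the caller for
 * the duration of the reset, exactly as intel_gt_handle_error() does below.
 */
static int __maybe_unused
example_try_engine_reset(struct intel_engine_cs *engine)
{
	struct intel_gt *gt = engine->gt;
	int err;

	/* Serialise against any other reset of this engine. */
	if (test_and_set_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags))
		return -EBUSY;

	err = intel_engine_reset(engine, "example");

	clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags);

	/* On failure, callers are expected to escalate to a full GT reset. */
	return err;
}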

static void intel_gt_reset_global(struct intel_gt *gt,
				  u32 engine_mask,
				  const char *reason)
{
	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct intel_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	drm_dbg(&gt->i915->drm, "resetting chip, engines=%x\n", engine_mask);
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
		intel_prepare_reset(gt->i915);

		/* Flush everyone using a resource about to be clobbered */
		synchronize_srcu_expedited(&gt->reset.backoff_srcu);

		intel_gt_reset(gt, engine_mask, reason);

		intel_finish_reset(gt->i915);
	}

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}

/**
 * intel_gt_handle_error - handle a gpu error
 * @gt: the intel_gt
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog. Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void intel_gt_handle_error(struct intel_gt *gt,
			   intel_engine_mask_t engine_mask,
			   unsigned long flags,
			   const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	intel_engine_mask_t tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	engine_mask &= gt->info.engine_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(gt->i915);
		intel_gt_clear_error_registers(gt, engine_mask);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &gt->reset.flags))
				continue;

			if (intel_engine_reset(engine, msg) == 0)
				engine_mask &= ~engine->mask;

			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
					      &gt->reset.flags);
		}
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex; stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		wait_event(gt->reset.queue,
			   !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
		goto out; /* piggy-back on the other reset */
	}

	/* Make sure intel_gt_reset_trylock() sees the I915_RESET_BACKOFF */
	synchronize_rcu_expedited();

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, gt, tmp) {
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&gt->reset.flags))
			wait_on_bit(&gt->reset.flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	intel_gt_reset_global(gt, engine_mask, msg);

	for_each_engine(engine, gt, tmp)
		clear_bit_unlock(I915_RESET_ENGINE + engine->id,
				 &gt->reset.flags);
	clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
	smp_mb__after_atomic();
	wake_up_all(&gt->reset.queue);

out:
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}
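
/*
 * Illustrative sketch only: how a hang detector might report a hung engine.
 * example_report_hang() is a hypothetical helper, not part of the driver.
 * I915_ERROR_CAPTURE asks for an error-state capture before any reset is
 * attempted, and the format string becomes the reason printed by
 * drm_notice() in intel_engine_reset().
 */
static void __maybe_unused
example_report_hang(struct intel_engine_cs *engine)
{
	intel_gt_handle_error(engine->gt, engine->mask, I915_ERROR_CAPTURE,
			      "hang detected on %s", engine->name);
}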

int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
{
	might_lock(&gt->reset.backoff_srcu);
	might_sleep();

	rcu_read_lock();
	while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		rcu_read_unlock();

		if (wait_event_interruptible(gt->reset.queue,
					     !test_bit(I915_RESET_BACKOFF,
						       &gt->reset.flags)))
			return -EINTR;

		rcu_read_lock();
	}
	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
	rcu_read_unlock();

	return 0;
}

void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
__releases(&gt->reset.backoff_srcu)
{
	srcu_read_unlock(&gt->reset.backoff_srcu, tag);
}
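
/*
 * Illustrative sketch only: the expected pairing of intel_gt_reset_trylock()
 * and intel_gt_reset_unlock().  example_reset_critical_section() is a
 * hypothetical helper, not part of the driver.  The SRCU tag returned by the
 * trylock must be handed back to the unlock; holding it keeps a full GT
 * reset from starting while the critical section runs.
 */
static int __maybe_unused
example_reset_critical_section(struct intel_gt *gt)
{
	int srcu, err;

	err = intel_gt_reset_trylock(gt, &srcu);
	if (err)
		return err; /* -EINTR: interrupted while a reset was pending */

	/* ... touch state that must not be clobbered by a GT reset ... */

	intel_gt_reset_unlock(gt, srcu);
	return 0;
}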

int intel_gt_terminally_wedged(struct intel_gt *gt)
{
	might_sleep();

	if (!intel_gt_is_wedged(gt))
		return 0;

	if (intel_gt_has_unrecoverable_error(gt))
		return -EIO;

	/* Reset still in progress? Maybe we will recover? */
	if (wait_event_interruptible(gt->reset.queue,
				     !test_bit(I915_RESET_BACKOFF,
					       &gt->reset.flags)))
		return -EINTR;

	return intel_gt_is_wedged(gt) ? -EIO : 0;
}

void intel_gt_set_wedged_on_init(struct intel_gt *gt)
{
	BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
		     I915_WEDGED_ON_INIT);
	intel_gt_set_wedged(gt);
	set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);

	/* Wedged on init is non-recoverable */
	add_taint_for_CI(gt->i915, TAINT_WARN);
}

void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
{
	intel_gt_set_wedged(gt);
	set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
}

void intel_gt_init_reset(struct intel_gt *gt)
{
	init_waitqueue_head(&gt->reset.queue);
	mutex_init(&gt->reset.mutex);
	init_srcu_struct(&gt->reset.backoff_srcu);

	/* no GPU until we are ready! */
	__set_bit(I915_WEDGED, &gt->reset.flags);
}

void intel_gt_fini_reset(struct intel_gt *gt)
{
	cleanup_srcu_struct(&gt->reset.backoff_srcu);
}

static void intel_wedge_me(struct work_struct *work)
{
	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);

	drm_err(&w->gt->i915->drm,
		"%s timed out, cancelling all in-flight rendering.\n",
		w->name);
	intel_gt_set_wedged(w->gt);
}

void __intel_init_wedge(struct intel_wedge_me *w,
			struct intel_gt *gt,
			long timeout,
			const char *name)
{
	w->gt = gt;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

void __intel_fini_wedge(struct intel_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->gt = NULL;
}
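
/*
 * Illustrative sketch only: bounding a potentially stuck operation with the
 * wedge watchdog.  example_bounded_operation() is a hypothetical helper, not
 * part of the driver.  If the body does not finish before the timeout,
 * intel_wedge_me() fires from the delayed work and wedges the GT, as the
 * intel_wedge_on_timeout() block in intel_gt_reset_global() does above.
 */
static void __maybe_unused
example_bounded_operation(struct intel_gt *gt)
{
	struct intel_wedge_me w;

	__intel_init_wedge(&w, gt, 5 * HZ, "example operation");

	/* ... work that must complete before the watchdog expires ... */

	__intel_fini_wedge(&w);
}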

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_reset.c"
#include "selftest_hangcheck.c"
#endif