/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#include <linux/sched/mm.h>
#include <linux/stop_machine.h>

#include "display/intel_display_types.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_irq.h"
#include "intel_breadcrumbs.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_reset.h"

#include "uc/intel_guc.h"
#include "uc/intel_guc_submission.h"

#define RESET_MAX_RETRIES 3

/* XXX How to handle concurrent GGTT updates using tiling registers? */
#define RESET_UNDER_STOP_MACHINE 0

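/*
 * Read-modify-write helpers for the raw (_fw) register accessors; the
 * caller is expected to hold any forcewake domains that are required.
 */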
static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
{
	intel_uncore_rmw_fw(uncore, reg, 0, set);
}

static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
{
	intel_uncore_rmw_fw(uncore, reg, clr, 0);
}

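/*
 * Skip (and flag with -EIO) every later request from the hung context that
 * is still on this engine's active list, so it is not replayed after reset.
 */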
static void engine_skip_context(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct intel_context *hung_ctx = rq->context;

	if (!i915_request_is_active(rq))
		return;

	lockdep_assert_held(&engine->active.lock);
	list_for_each_entry_continue(rq, &engine->active.requests, sched.link)
		if (rq->context == hung_ctx) {
			i915_request_set_error_once(rq, -EIO);
			__i915_request_skip(rq);
		}
}

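/*
 * Charge the hang to the client (file) owning the context: bans and rapid
 * repeat offences accumulate in the per-file ban score.
 */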
static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
{
	struct drm_i915_file_private *file_priv = ctx->file_priv;
	unsigned long prev_hang;
	unsigned int score;

	if (IS_ERR_OR_NULL(file_priv))
		return;

	score = 0;
	if (banned)
		score = I915_CLIENT_SCORE_CONTEXT_BAN;

	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		drm_dbg(&ctx->i915->drm,
			"client %s: gained %u ban score, now %u\n",
			ctx->name, score,
			atomic_read(&file_priv->ban_score));
	}
}

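/*
 * Mark the context that owned the hung request as guilty and decide whether
 * it should be banned: closed, non-recoverable or rapidly re-offending
 * contexts are banned outright.
 */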
static bool mark_guilty(struct i915_request *rq)
{
	struct i915_gem_context *ctx;
	unsigned long prev_hang;
	bool banned;
	int i;

	if (intel_context_is_closed(rq->context)) {
		intel_context_set_banned(rq->context);
		return true;
	}

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx && !kref_get_unless_zero(&ctx->ref))
		ctx = NULL;
	rcu_read_unlock();
	if (!ctx)
		return intel_context_is_banned(rq->context);

	atomic_inc(&ctx->guilty_count);

	/* Cool contexts are too cool to be banned! (Used for reset testing.) */
	if (!i915_gem_context_is_bannable(ctx)) {
		banned = false;
		goto out;
	}

	drm_notice(&ctx->i915->drm,
		   "%s context reset due to GPU hang\n",
		   ctx->name);

	/* Record the timestamp for the last N hangs */
	prev_hang = ctx->hang_timestamp[0];
	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
	ctx->hang_timestamp[i] = jiffies;

	/* If we have hung N+1 times in rapid succession, we ban the context! */
	banned = !i915_gem_context_is_recoverable(ctx);
	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
		banned = true;
	if (banned) {
		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
			ctx->name, atomic_read(&ctx->guilty_count));
		intel_context_set_banned(rq->context);
	}

	client_mark_guilty(ctx, banned);

out:
	i915_gem_context_put(ctx);
	return banned;
}

static void mark_innocent(struct i915_request *rq)
{
	struct i915_gem_context *ctx;

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx)
		atomic_inc(&ctx->active_count);
	rcu_read_unlock();
}

void __i915_request_reset(struct i915_request *rq, bool guilty)
{
	RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));

	GEM_BUG_ON(i915_request_completed(rq));

	rcu_read_lock(); /* protect the GEM context */
	if (guilty) {
		i915_request_set_error_once(rq, -EIO);
		__i915_request_skip(rq);
		if (mark_guilty(rq))
			engine_skip_context(rq);
	} else {
		i915_request_set_error_once(rq, -EAGAIN);
		mark_innocent(rq);
	}
	rcu_read_unlock();
}

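/* Pre-ILK resets go through the I915_GDRST PCI config register. */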
static bool i915_in_reset(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return gdrst & GRDOM_RESET_STATUS;
}

static int i915_do_reset(struct intel_gt *gt,
			 intel_engine_mask_t engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;
	int err;

	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = wait_for_atomic(i915_in_reset(pdev), 50);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	if (!err)
		err = wait_for_atomic(!i915_in_reset(pdev), 50);

	return err;
}

static bool g4x_reset_complete(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return (gdrst & GRDOM_RESET_ENABLE) == 0;
}

static int g33_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return wait_for_atomic(g4x_reset_complete(pdev), 50);
}

static int g4x_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	return ret;
}

static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for render reset failed\n");
		goto out;
	}

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for media reset failed\n");
		goto out;
	}

out:
	intel_uncore_write_fw(uncore, ILK_GDSR, 0);
	intel_uncore_posting_read_fw(uncore, ILK_GDSR);
	return ret;
}

/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
{
	struct intel_uncore *uncore = gt->uncore;
	int err;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);

	/* Wait for the device to ack the reset requests */
	err = __intel_wait_for_register_fw(uncore,
					   GEN6_GDRST, hw_domain_mask, 0,
					   500, 0,
					   NULL);
	if (err)
		drm_dbg(&gt->i915->drm,
			"Wait for 0x%08x engines reset failed\n",
			hw_domain_mask);

	return err;
}

static int gen6_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	static const u32 hw_engine_mask[] = {
		[RCS0]  = GEN6_GRDOM_RENDER,
		[BCS0]  = GEN6_GRDOM_BLT,
		[VCS0]  = GEN6_GRDOM_MEDIA,
		[VCS1]  = GEN8_GRDOM_MEDIA2,
		[VECS0] = GEN6_GRDOM_VECS,
	};
	struct intel_engine_cs *engine;
	u32 hw_mask;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN6_GRDOM_FULL;
	} else {
		intel_engine_mask_t tmp;

		hw_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
			hw_mask |= hw_engine_mask[engine->id];
		}
	}

	return gen6_hw_domain_reset(gt, hw_mask);
}

static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
	i915_reg_t sfc_usage;
	u32 sfc_usage_bit;
	u32 sfc_reset_bit;
	int ret;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
		sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
		break;

	default:
		return 0;
	}

	/*
	 * If the engine is using a SFC, tell the engine that a software reset
	 * is going to happen. The engine will then try to force lock the SFC.
	 * If SFC ends up being locked to the engine we want to reset, we have
	 * to reset it as well (we will unlock it once the reset sequence is
	 * completed).
	 */
	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
		return 0;

	rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);

	ret = __intel_wait_for_register_fw(uncore,
					   sfc_forced_lock_ack,
					   sfc_forced_lock_ack_bit,
					   sfc_forced_lock_ack_bit,
					   1000, 0, NULL);

	/* Was the SFC released while we were trying to lock it? */
	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
		return 0;

	if (ret) {
		drm_dbg(&engine->i915->drm,
			"Wait for SFC forced lock ack failed\n");
		return ret;
	}

	*hw_mask |= sfc_reset_bit;
	return 0;
}

static void gen11_unlock_sfc(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	i915_reg_t sfc_forced_lock;
	u32 sfc_forced_lock_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
		break;

	default:
		return;
	}

	rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
}

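/*
 * Gen11+ engine reset: in addition to the per-engine reset domains, a shared
 * SFC unit may need to be reset if the engine being reset holds its lock
 * (see gen11_lock_sfc/gen11_unlock_sfc).
 */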
static int gen11_reset_engines(struct intel_gt *gt,
			       intel_engine_mask_t engine_mask,
			       unsigned int retry)
{
	static const u32 hw_engine_mask[] = {
		[RCS0]  = GEN11_GRDOM_RENDER,
		[BCS0]  = GEN11_GRDOM_BLT,
		[VCS0]  = GEN11_GRDOM_MEDIA,
		[VCS1]  = GEN11_GRDOM_MEDIA2,
		[VCS2]  = GEN11_GRDOM_MEDIA3,
		[VCS3]  = GEN11_GRDOM_MEDIA4,
		[VECS0] = GEN11_GRDOM_VECS,
		[VECS1] = GEN11_GRDOM_VECS2,
	};
	struct intel_engine_cs *engine;
	intel_engine_mask_t tmp;
	u32 hw_mask;
	int ret;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN11_GRDOM_FULL;
	} else {
		hw_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
			hw_mask |= hw_engine_mask[engine->id];
			ret = gen11_lock_sfc(engine, &hw_mask);
			if (ret)
				goto sfc_unlock;
		}
	}

	ret = gen6_hw_domain_reset(gt, hw_mask);

sfc_unlock:
	/*
	 * We unlock the SFC based on the lock status and not the result of
	 * gen11_lock_sfc to make sure that we clean up properly if something
	 * went wrong during the lock (e.g. the lock was acquired after the
	 * timeout expired).
	 */
	if (engine_mask != ALL_ENGINES)
		for_each_engine_masked(engine, gt, engine_mask, tmp)
			gen11_unlock_sfc(engine);

	return ret;
}

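/*
 * Gen8+ ready-for-reset handshake: ask the engine to quiesce and wait for it
 * to report ready, except for catastrophic errors where the handshake is
 * bypassed and the error is left for the hardware reset to clear.
 */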
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
	u32 request, mask, ack;
	int ret;

	ack = intel_uncore_read_fw(uncore, reg);
	if (ack & RESET_CTL_CAT_ERROR) {
		/*
		 * For catastrophic errors, ready-for-reset sequence
		 * needs to be bypassed: HAS#396813
		 */
		request = RESET_CTL_CAT_ERROR;
		mask = RESET_CTL_CAT_ERROR;

		/* Catastrophic errors need to be cleared by HW */
		ack = 0;
	} else if (!(ack & RESET_CTL_READY_TO_RESET)) {
		request = RESET_CTL_REQUEST_RESET;
		mask = RESET_CTL_READY_TO_RESET;
		ack = RESET_CTL_READY_TO_RESET;
	} else {
		return 0;
	}

	intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
	ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
					   700, 0, NULL);
	if (ret)
		drm_err(&engine->i915->drm,
			"%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
			engine->name, request,
			intel_uncore_read_fw(uncore, reg));

	return ret;
}

static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	intel_uncore_write_fw(engine->uncore,
			      RING_RESET_CTL(engine->mmio_base),
			      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}

static int gen8_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	intel_engine_mask_t tmp;
	int ret;

	for_each_engine_masked(engine, gt, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and with
		 * some gens (kbl), possible system hang if reset
		 * happens during active bb execution.
		 *
		 * We would rather take context corruption than a failed
		 * reset with a wedged driver/gpu. And the active bb
		 * execution case should be covered by the stop_engines()
		 * we have before the reset.
		 */
	}

	if (INTEL_GEN(gt->i915) >= 11)
		ret = gen11_reset_engines(gt, engine_mask, retry);
	else
		ret = gen6_reset_engines(gt, engine_mask, retry);

skip_reset:
	for_each_engine_masked(engine, gt, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	return ret;
}

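/* No-op reset used by the mock GT in selftests. */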
static int mock_reset(struct intel_gt *gt,
		      intel_engine_mask_t mask,
		      unsigned int retry)
{
	return 0;
}

typedef int (*reset_func)(struct intel_gt *,
			  intel_engine_mask_t engine_mask,
			  unsigned int retry);

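/* Pick the reset routine for this platform; NULL if reset is unsupported. */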
static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;

	if (is_mock_gt(gt))
		return mock_reset;
	else if (INTEL_GEN(i915) >= 8)
		return gen8_reset_engines;
	else if (INTEL_GEN(i915) >= 6)
		return gen6_reset_engines;
	else if (INTEL_GEN(i915) >= 5)
		return ilk_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (INTEL_GEN(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}

int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(gt);
	if (!reset)
		return -ENODEV;

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
		preempt_disable();
		ret = reset(gt, engine_mask, retry);
		preempt_enable();
	}
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

bool intel_has_gpu_reset(const struct intel_gt *gt)
{
	if (!gt->i915->params.reset)
		return false;

	return intel_get_gpu_reset(gt);
}

bool intel_has_reset_engine(const struct intel_gt *gt)
{
	if (gt->i915->params.reset < 2)
		return false;

	return INTEL_INFO(gt->i915)->has_reset_engine;
}

int intel_reset_guc(struct intel_gt *gt)
{
	u32 guc_domain =
		INTEL_GEN(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
	int ret;

	GEM_BUG_ON(!HAS_GT_UC(gt->i915));

	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	ret = gen6_hw_domain_reset(gt, guc_domain);
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

/*
 * Ensure the irq handler finishes, and is not run again.
 * Also return the active request so that we only search for it once.
 */
static void reset_prepare_engine(struct intel_engine_cs *engine)
{
	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
	if (engine->reset.prepare)
		engine->reset.prepare(engine);
}

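/*
 * Zap userspace PTEs for objects mapped through the GGTT fence registers so
 * that any access after the reset refaults and revalidates the fence.
 */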
static void revoke_mmaps(struct intel_gt *gt)
{
	int i;

	for (i = 0; i < gt->ggtt->num_fences; i++) {
		struct drm_vma_offset_node *node;
		struct i915_vma *vma;
		u64 vma_offset;

		vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
		if (!vma)
			continue;

		if (!i915_vma_has_userfault(vma))
			continue;

		GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);

		if (!vma->mmo)
			continue;

		node = &vma->mmo->vma_node;
		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;

		unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
				    drm_vma_node_offset_addr(node) + vma_offset,
				    vma->size,
				    1);
	}
}

static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake = 0;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (intel_engine_pm_get_if_awake(engine))
			awake |= engine->mask;
		reset_prepare_engine(engine);
	}

	intel_uc_reset_prepare(&gt->uc);

	return awake;
}

static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}

static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(gt->i915);
	if (err)
		return err;

	for_each_engine(engine, gt, id)
		__intel_engine_reset(engine, stalled_mask & engine->mask);

	intel_ggtt_restore_fences(gt->ggtt);

	return err;
}

static void reset_finish_engine(struct intel_engine_cs *engine)
{
	if (engine->reset.finish)
		engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);

	intel_engine_signal_breadcrumbs(engine);
}

static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		reset_finish_engine(engine);
		if (awake & engine->mask)
			intel_engine_pm_put(engine);
	}
}

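/*
 * Once wedged, every submitted request is immediately completed with -EIO
 * and its breadcrumb signalled.
 */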
static void nop_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	RQ_TRACE(request, "-EIO\n");
	i915_request_set_error_once(request, -EIO);

	spin_lock_irqsave(&engine->active.lock, flags);
	__i915_request_submit(request);
	i915_request_mark_complete(request);
	spin_unlock_irqrestore(&engine->active.lock, flags);

	intel_engine_signal_breadcrumbs(engine);
}

static void __intel_gt_set_wedged(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake;
	enum intel_engine_id id;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	GT_TRACE(gt, "start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	awake = reset_prepare(gt);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		__intel_gt_reset(gt, ALL_ENGINES);

	for_each_engine(engine, gt, id)
		engine->submit_request = nop_submit_request;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu_expedited();
	set_bit(I915_WEDGED, &gt->reset.flags);

	/* Mark all executing requests as skipped */
	for_each_engine(engine, gt, id)
		if (engine->reset.cancel)
			engine->reset.cancel(engine);

	reset_finish(gt, awake);

	GT_TRACE(gt, "end\n");
}

void intel_gt_set_wedged(struct intel_gt *gt)
{
	intel_wakeref_t wakeref;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
	mutex_lock(&gt->reset.mutex);

	if (GEM_SHOW_DEBUG()) {
		struct drm_printer p = drm_debug_printer(__func__);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
		for_each_engine(engine, gt, id) {
			if (intel_engine_is_idle(engine))
				continue;

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
	}

	__intel_gt_set_wedged(gt);

	mutex_unlock(&gt->reset.mutex);
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

static bool __intel_gt_unset_wedged(struct intel_gt *gt)
{
	struct intel_gt_timelines *timelines = &gt->timelines;
	struct intel_timeline *tl;
	bool ok;

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		return true;

	/* Never fully initialised, recovery impossible */
	if (intel_gt_has_unrecoverable_error(gt))
		return false;

	GT_TRACE(gt, "start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	spin_lock(&timelines->lock);
	list_for_each_entry(tl, &timelines->active_list, link) {
		struct dma_fence *fence;

		fence = i915_active_fence_get(&tl->last_request);
		if (!fence)
			continue;

		spin_unlock(&timelines->lock);

		/*
		 * All internal dependencies (i915_requests) will have
		 * been flushed by the set-wedge, but we may be stuck waiting
		 * for external fences. These should all be capped to 10s
		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
		 * in the worst case.
		 */
		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
		dma_fence_put(fence);

		/* Restart iteration after dropping the lock */
		spin_lock(&timelines->lock);
		tl = list_entry(&timelines->active_list, typeof(*tl), link);
	}
	spin_unlock(&timelines->lock);

	/* We must reset pending GPU events before restoring our submission */
	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
	if (!ok) {
		/*
		 * Warn CI about the unrecoverable wedged condition.
		 * Time for a reboot.
		 */
		add_taint_for_CI(gt->i915, TAINT_WARN);
		return false;
	}

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(gt);

	GT_TRACE(gt, "end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &gt->reset.flags);

	return true;
}

bool intel_gt_unset_wedged(struct intel_gt *gt)
{
	bool result;

	mutex_lock(&gt->reset.mutex);
	result = __intel_gt_unset_wedged(gt);
	mutex_unlock(&gt->reset.mutex);

	return result;
}

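/*
 * Revoke userspace mmaps and retry the full-chip reset a few times before
 * giving up.
 */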
static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	int err, i;

	gt_revoke(gt);

	err = __intel_gt_reset(gt, ALL_ENGINES);
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		msleep(10 * (i + 1));
		err = __intel_gt_reset(gt, ALL_ENGINES);
	}
	if (err)
		return err;

	return gt_reset(gt, stalled_mask);
}

static int resume(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int ret;

	for_each_engine(engine, gt, id) {
		ret = intel_engine_resume(engine);
		if (ret)
			return ret;
	}

	return 0;
}

/**
 * intel_gt_reset - reset chip after a hang
 * @gt: #intel_gt to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Procedure is fairly simple:
 *   - reset the chip using the reset reg
 *   - re-init context state
 *   - re-init hardware status page
 *   - re-init ring buffer
 *   - re-init interrupt state
 *   - re-init display
 */
void intel_gt_reset(struct intel_gt *gt,
		    intel_engine_mask_t stalled_mask,
		    const char *reason)
{
	intel_engine_mask_t awake;
	int ret;

	GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);

	might_sleep();
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
	mutex_lock(&gt->reset.mutex);

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!__intel_gt_unset_wedged(gt))
		goto unlock;

	if (reason)
		drm_notice(&gt->i915->drm,
			   "Resetting chip for %s\n", reason);
	atomic_inc(&gt->i915->gpu_error.reset_count);

	awake = reset_prepare(gt);

	if (!intel_has_gpu_reset(gt)) {
		if (gt->i915->params.reset)
			drm_err(&gt->i915->drm, "GPU reset not supported\n");
		else
			drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
		goto error;
	}

	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_disable_interrupts(gt->i915);

	if (do_reset(gt, stalled_mask)) {
		drm_err(&gt->i915->drm, "Failed to reset chip\n");
		goto taint;
	}

	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_enable_interrupts(gt->i915);

	intel_overlay_reset(gt->i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = intel_gt_init_hw(gt);
	if (ret) {
		drm_err(&gt->i915->drm,
			"Failed to initialise HW following reset (%d)\n",
			ret);
		goto taint;
	}

	ret = resume(gt);
	if (ret)
		goto taint;

finish:
	reset_finish(gt, awake);
unlock:
	mutex_unlock(&gt->reset.mutex);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint_for_CI(gt->i915, TAINT_WARN);
error:
	__intel_gt_set_wedged(gt);
	goto finish;
}

static inline int intel_gt_reset_engine(struct intel_engine_cs *engine)
{
	return __intel_gt_reset(engine->gt, engine->mask);
}

/**
 * intel_engine_reset - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no drm_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Procedure is:
 *  - identify the request that caused the hang and drop it
1122*4882a593Smuzhiyun  *  - reset engine (which will force the engine to idle)
1123*4882a593Smuzhiyun  *  - re-init/configure engine
1124*4882a593Smuzhiyun  */
intel_engine_reset(struct intel_engine_cs * engine,const char * msg)1125*4882a593Smuzhiyun int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
1126*4882a593Smuzhiyun {
1127*4882a593Smuzhiyun 	struct intel_gt *gt = engine->gt;
1128*4882a593Smuzhiyun 	bool uses_guc = intel_engine_in_guc_submission_mode(engine);
1129*4882a593Smuzhiyun 	int ret;
1130*4882a593Smuzhiyun 
1131*4882a593Smuzhiyun 	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
1132*4882a593Smuzhiyun 	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));
1133*4882a593Smuzhiyun 
1134*4882a593Smuzhiyun 	if (!intel_engine_pm_get_if_awake(engine))
1135*4882a593Smuzhiyun 		return 0;
1136*4882a593Smuzhiyun 
1137*4882a593Smuzhiyun 	reset_prepare_engine(engine);
1138*4882a593Smuzhiyun 
1139*4882a593Smuzhiyun 	if (msg)
1140*4882a593Smuzhiyun 		drm_notice(&engine->i915->drm,
1141*4882a593Smuzhiyun 			   "Resetting %s for %s\n", engine->name, msg);
1142*4882a593Smuzhiyun 	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
1143*4882a593Smuzhiyun 
1144*4882a593Smuzhiyun 	if (!uses_guc)
1145*4882a593Smuzhiyun 		ret = intel_gt_reset_engine(engine);
1146*4882a593Smuzhiyun 	else
1147*4882a593Smuzhiyun 		ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine);
1148*4882a593Smuzhiyun 	if (ret) {
1149*4882a593Smuzhiyun 	/* If we fail here, we expect to fall back to a global reset */
1150*4882a593Smuzhiyun 		drm_dbg(&gt->i915->drm, "%sFailed to reset %s, ret=%d\n",
1151*4882a593Smuzhiyun 			uses_guc ? "GuC " : "", engine->name, ret);
1152*4882a593Smuzhiyun 		goto out;
1153*4882a593Smuzhiyun 	}
1154*4882a593Smuzhiyun 
1155*4882a593Smuzhiyun 	/*
1156*4882a593Smuzhiyun 	 * The request that caused the hang is stuck on elsp; we know the
1157*4882a593Smuzhiyun 	 * active request and can drop it, then adjust the ring head to skip
1158*4882a593Smuzhiyun 	 * the offending request and resume executing the remaining queue.
1159*4882a593Smuzhiyun 	 */
1160*4882a593Smuzhiyun 	__intel_engine_reset(engine, true);
1161*4882a593Smuzhiyun 
1162*4882a593Smuzhiyun 	/*
1163*4882a593Smuzhiyun 	 * The engine and its registers (and workarounds in case of render)
1164*4882a593Smuzhiyun 	 * have been reset to their default values. Follow the init_ring
1165*4882a593Smuzhiyun 	 * process to program RING_MODE, HWSP and re-enable submission.
1166*4882a593Smuzhiyun 	 */
1167*4882a593Smuzhiyun 	ret = intel_engine_resume(engine);
1168*4882a593Smuzhiyun 
1169*4882a593Smuzhiyun out:
1170*4882a593Smuzhiyun 	intel_engine_cancel_stop_cs(engine);
1171*4882a593Smuzhiyun 	reset_finish_engine(engine);
1172*4882a593Smuzhiyun 	intel_engine_pm_put_async(engine);
1173*4882a593Smuzhiyun 	return ret;
1174*4882a593Smuzhiyun }
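/*
 * Illustrative sketch (not part of the upstream file, kept out of the
 * build): one way a hang detector could drive the single-engine reset
 * documented above. It claims the per-engine reset bit first, exactly as
 * intel_gt_handle_error() does below; the function name is hypothetical.
 */
#if 0
static int example_reset_hung_engine(struct intel_engine_cs *engine)
{
	struct intel_gt *gt = engine->gt;
	int err = -EBUSY;

	/* Serialise against other reset paths targeting this engine */
	if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
			      &gt->reset.flags)) {
		err = intel_engine_reset(engine, "example: hang detected");
		clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
				      &gt->reset.flags);
	}

	return err;
}
#endif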
1175*4882a593Smuzhiyun 
1176*4882a593Smuzhiyun static void intel_gt_reset_global(struct intel_gt *gt,
1177*4882a593Smuzhiyun 				  u32 engine_mask,
1178*4882a593Smuzhiyun 				  const char *reason)
1179*4882a593Smuzhiyun {
1180*4882a593Smuzhiyun 	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
1181*4882a593Smuzhiyun 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1182*4882a593Smuzhiyun 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1183*4882a593Smuzhiyun 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1184*4882a593Smuzhiyun 	struct intel_wedge_me w;
1185*4882a593Smuzhiyun 
1186*4882a593Smuzhiyun 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1187*4882a593Smuzhiyun 
1188*4882a593Smuzhiyun 	drm_dbg(&gt->i915->drm, "resetting chip, engines=%x\n", engine_mask);
1189*4882a593Smuzhiyun 	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1190*4882a593Smuzhiyun 
1191*4882a593Smuzhiyun 	/* Use a watchdog to ensure that our reset completes */
1192*4882a593Smuzhiyun 	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
1193*4882a593Smuzhiyun 		intel_prepare_reset(gt->i915);
1194*4882a593Smuzhiyun 
1195*4882a593Smuzhiyun 		/* Flush everyone using a resource about to be clobbered */
1196*4882a593Smuzhiyun 		synchronize_srcu_expedited(&gt->reset.backoff_srcu);
1197*4882a593Smuzhiyun 
1198*4882a593Smuzhiyun 		intel_gt_reset(gt, engine_mask, reason);
1199*4882a593Smuzhiyun 
1200*4882a593Smuzhiyun 		intel_finish_reset(gt->i915);
1201*4882a593Smuzhiyun 	}
1202*4882a593Smuzhiyun 
1203*4882a593Smuzhiyun 	if (!test_bit(I915_WEDGED, &gt->reset.flags))
1204*4882a593Smuzhiyun 		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1205*4882a593Smuzhiyun }
1206*4882a593Smuzhiyun 
1207*4882a593Smuzhiyun /**
1208*4882a593Smuzhiyun  * intel_gt_handle_error - handle a gpu error
1209*4882a593Smuzhiyun  * @gt: the intel_gt
1210*4882a593Smuzhiyun  * @engine_mask: mask representing engines that are hung
1211*4882a593Smuzhiyun  * @flags: control flags
1212*4882a593Smuzhiyun  * @fmt: Error message format string
1213*4882a593Smuzhiyun  *
1214*4882a593Smuzhiyun  * Do some basic checking of register state at error time and
1215*4882a593Smuzhiyun  * dump it to the syslog.  Also call i915_capture_error_state() to make
1216*4882a593Smuzhiyun  * sure we get a record and make it available in debugfs.  Fire a uevent
1217*4882a593Smuzhiyun  * so userspace knows something bad happened (should trigger collection
1218*4882a593Smuzhiyun  * of a ring dump etc.).
1219*4882a593Smuzhiyun  */
1220*4882a593Smuzhiyun void intel_gt_handle_error(struct intel_gt *gt,
1221*4882a593Smuzhiyun 			   intel_engine_mask_t engine_mask,
1222*4882a593Smuzhiyun 			   unsigned long flags,
1223*4882a593Smuzhiyun 			   const char *fmt, ...)
1224*4882a593Smuzhiyun {
1225*4882a593Smuzhiyun 	struct intel_engine_cs *engine;
1226*4882a593Smuzhiyun 	intel_wakeref_t wakeref;
1227*4882a593Smuzhiyun 	intel_engine_mask_t tmp;
1228*4882a593Smuzhiyun 	char error_msg[80];
1229*4882a593Smuzhiyun 	char *msg = NULL;
1230*4882a593Smuzhiyun 
1231*4882a593Smuzhiyun 	if (fmt) {
1232*4882a593Smuzhiyun 		va_list args;
1233*4882a593Smuzhiyun 
1234*4882a593Smuzhiyun 		va_start(args, fmt);
1235*4882a593Smuzhiyun 		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1236*4882a593Smuzhiyun 		va_end(args);
1237*4882a593Smuzhiyun 
1238*4882a593Smuzhiyun 		msg = error_msg;
1239*4882a593Smuzhiyun 	}
1240*4882a593Smuzhiyun 
1241*4882a593Smuzhiyun 	/*
1242*4882a593Smuzhiyun 	 * In most cases it's guaranteed that we get here with an RPM
1243*4882a593Smuzhiyun 	 * reference held, for example because there is a pending GPU
1244*4882a593Smuzhiyun 	 * request that won't finish until the reset is done. This
1245*4882a593Smuzhiyun 	 * isn't the case at least when we get here by doing a
1246*4882a593Smuzhiyun 	 * simulated reset via debugfs, so get an RPM reference.
1247*4882a593Smuzhiyun 	 */
1248*4882a593Smuzhiyun 	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1249*4882a593Smuzhiyun 
1250*4882a593Smuzhiyun 	engine_mask &= gt->info.engine_mask;
1251*4882a593Smuzhiyun 
1252*4882a593Smuzhiyun 	if (flags & I915_ERROR_CAPTURE) {
1253*4882a593Smuzhiyun 		i915_capture_error_state(gt->i915);
1254*4882a593Smuzhiyun 		intel_gt_clear_error_registers(gt, engine_mask);
1255*4882a593Smuzhiyun 	}
1256*4882a593Smuzhiyun 
1257*4882a593Smuzhiyun 	/*
1258*4882a593Smuzhiyun 	 * Try engine reset when available. We fall back to full reset if
1259*4882a593Smuzhiyun 	 * single reset fails.
1260*4882a593Smuzhiyun 	 */
1261*4882a593Smuzhiyun 	if (intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
1262*4882a593Smuzhiyun 		for_each_engine_masked(engine, gt, engine_mask, tmp) {
1263*4882a593Smuzhiyun 			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1264*4882a593Smuzhiyun 			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1265*4882a593Smuzhiyun 					     &gt->reset.flags))
1266*4882a593Smuzhiyun 				continue;
1267*4882a593Smuzhiyun 
1268*4882a593Smuzhiyun 			if (intel_engine_reset(engine, msg) == 0)
1269*4882a593Smuzhiyun 				engine_mask &= ~engine->mask;
1270*4882a593Smuzhiyun 
1271*4882a593Smuzhiyun 			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
1272*4882a593Smuzhiyun 					      &gt->reset.flags);
1273*4882a593Smuzhiyun 		}
1274*4882a593Smuzhiyun 	}
1275*4882a593Smuzhiyun 
1276*4882a593Smuzhiyun 	if (!engine_mask)
1277*4882a593Smuzhiyun 		goto out;
1278*4882a593Smuzhiyun 
1279*4882a593Smuzhiyun 	/* Full reset needs the mutex, stop any other user trying to do so. */
1280*4882a593Smuzhiyun 	if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1281*4882a593Smuzhiyun 		wait_event(gt->reset.queue,
1282*4882a593Smuzhiyun 			   !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1283*4882a593Smuzhiyun 		goto out; /* piggy-back on the other reset */
1284*4882a593Smuzhiyun 	}
1285*4882a593Smuzhiyun 
1286*4882a593Smuzhiyun 	/* Make sure intel_gt_reset_trylock() sees the I915_RESET_BACKOFF */
1287*4882a593Smuzhiyun 	synchronize_rcu_expedited();
1288*4882a593Smuzhiyun 
1289*4882a593Smuzhiyun 	/* Prevent any other reset-engine attempt. */
1290*4882a593Smuzhiyun 	for_each_engine(engine, gt, tmp) {
1291*4882a593Smuzhiyun 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1292*4882a593Smuzhiyun 					&gt->reset.flags))
1293*4882a593Smuzhiyun 			wait_on_bit(&gt->reset.flags,
1294*4882a593Smuzhiyun 				    I915_RESET_ENGINE + engine->id,
1295*4882a593Smuzhiyun 				    TASK_UNINTERRUPTIBLE);
1296*4882a593Smuzhiyun 	}
1297*4882a593Smuzhiyun 
1298*4882a593Smuzhiyun 	intel_gt_reset_global(gt, engine_mask, msg);
1299*4882a593Smuzhiyun 
1300*4882a593Smuzhiyun 	for_each_engine(engine, gt, tmp)
1301*4882a593Smuzhiyun 		clear_bit_unlock(I915_RESET_ENGINE + engine->id,
1302*4882a593Smuzhiyun 				 &gt->reset.flags);
1303*4882a593Smuzhiyun 	clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
1304*4882a593Smuzhiyun 	smp_mb__after_atomic();
1305*4882a593Smuzhiyun 	wake_up_all(&gt->reset.queue);
1306*4882a593Smuzhiyun 
1307*4882a593Smuzhiyun out:
1308*4882a593Smuzhiyun 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1309*4882a593Smuzhiyun }
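/*
 * Illustrative sketch (not part of the upstream file, kept out of the
 * build): reporting a hung engine to intel_gt_handle_error() above.
 * I915_ERROR_CAPTURE requests an error-state capture before the reset is
 * attempted, and the format string becomes the drm_notice() reason passed
 * down to intel_engine_reset(); the function name is hypothetical.
 */
#if 0
static void example_report_hang(struct intel_engine_cs *engine)
{
	intel_gt_handle_error(engine->gt, engine->mask, I915_ERROR_CAPTURE,
			      "%s appears to be hung", engine->name);
}
#endif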
1310*4882a593Smuzhiyun 
1311*4882a593Smuzhiyun int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
1312*4882a593Smuzhiyun {
1313*4882a593Smuzhiyun 	might_lock(&gt->reset.backoff_srcu);
1314*4882a593Smuzhiyun 	might_sleep();
1315*4882a593Smuzhiyun 
1316*4882a593Smuzhiyun 	rcu_read_lock();
1317*4882a593Smuzhiyun 	while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1318*4882a593Smuzhiyun 		rcu_read_unlock();
1319*4882a593Smuzhiyun 
1320*4882a593Smuzhiyun 		if (wait_event_interruptible(gt->reset.queue,
1321*4882a593Smuzhiyun 					     !test_bit(I915_RESET_BACKOFF,
1322*4882a593Smuzhiyun 						       &gt->reset.flags)))
1323*4882a593Smuzhiyun 			return -EINTR;
1324*4882a593Smuzhiyun 
1325*4882a593Smuzhiyun 		rcu_read_lock();
1326*4882a593Smuzhiyun 	}
1327*4882a593Smuzhiyun 	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
1328*4882a593Smuzhiyun 	rcu_read_unlock();
1329*4882a593Smuzhiyun 
1330*4882a593Smuzhiyun 	return 0;
1331*4882a593Smuzhiyun }
1332*4882a593Smuzhiyun 
1333*4882a593Smuzhiyun void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
1334*4882a593Smuzhiyun __releases(&gt->reset.backoff_srcu)
1335*4882a593Smuzhiyun {
1336*4882a593Smuzhiyun 	srcu_read_unlock(&gt->reset.backoff_srcu, tag);
1337*4882a593Smuzhiyun }
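/*
 * Illustrative sketch (not part of the upstream file, kept out of the
 * build): the trylock/unlock pair above brackets work that must not run
 * concurrently with a full GPU reset. While the SRCU read lock returned in
 * *srcu is held, intel_gt_reset_global() waits in its
 * synchronize_srcu_expedited() call; the function name and the elided body
 * are hypothetical.
 */
#if 0
static int example_work_outside_reset(struct intel_gt *gt)
{
	int srcu, err;

	err = intel_gt_reset_trylock(gt, &srcu);
	if (err)
		return err; /* -EINTR while backing off for a pending reset */

	/* ... touch state that a concurrent reset would clobber ... */

	intel_gt_reset_unlock(gt, srcu);
	return 0;
}
#endif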
1338*4882a593Smuzhiyun 
1339*4882a593Smuzhiyun int intel_gt_terminally_wedged(struct intel_gt *gt)
1340*4882a593Smuzhiyun {
1341*4882a593Smuzhiyun 	might_sleep();
1342*4882a593Smuzhiyun 
1343*4882a593Smuzhiyun 	if (!intel_gt_is_wedged(gt))
1344*4882a593Smuzhiyun 		return 0;
1345*4882a593Smuzhiyun 
1346*4882a593Smuzhiyun 	if (intel_gt_has_unrecoverable_error(gt))
1347*4882a593Smuzhiyun 		return -EIO;
1348*4882a593Smuzhiyun 
1349*4882a593Smuzhiyun 	/* Reset still in progress? Maybe we will recover? */
1350*4882a593Smuzhiyun 	if (wait_event_interruptible(gt->reset.queue,
1351*4882a593Smuzhiyun 				     !test_bit(I915_RESET_BACKOFF,
1352*4882a593Smuzhiyun 					       &gt->reset.flags)))
1353*4882a593Smuzhiyun 		return -EINTR;
1354*4882a593Smuzhiyun 
1355*4882a593Smuzhiyun 	return intel_gt_is_wedged(gt) ? -EIO : 0;
1356*4882a593Smuzhiyun }
1357*4882a593Smuzhiyun 
1358*4882a593Smuzhiyun void intel_gt_set_wedged_on_init(struct intel_gt *gt)
1359*4882a593Smuzhiyun {
1360*4882a593Smuzhiyun 	BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
1361*4882a593Smuzhiyun 		     I915_WEDGED_ON_INIT);
1362*4882a593Smuzhiyun 	intel_gt_set_wedged(gt);
1363*4882a593Smuzhiyun 	set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);
1364*4882a593Smuzhiyun 
1365*4882a593Smuzhiyun 	/* Wedged on init is non-recoverable */
1366*4882a593Smuzhiyun 	add_taint_for_CI(gt->i915, TAINT_WARN);
1367*4882a593Smuzhiyun }
1368*4882a593Smuzhiyun 
1369*4882a593Smuzhiyun void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
1370*4882a593Smuzhiyun {
1371*4882a593Smuzhiyun 	intel_gt_set_wedged(gt);
1372*4882a593Smuzhiyun 	set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
1373*4882a593Smuzhiyun }
1374*4882a593Smuzhiyun 
1375*4882a593Smuzhiyun void intel_gt_init_reset(struct intel_gt *gt)
1376*4882a593Smuzhiyun {
1377*4882a593Smuzhiyun 	init_waitqueue_head(&gt->reset.queue);
1378*4882a593Smuzhiyun 	mutex_init(&gt->reset.mutex);
1379*4882a593Smuzhiyun 	init_srcu_struct(&gt->reset.backoff_srcu);
1380*4882a593Smuzhiyun 
1381*4882a593Smuzhiyun 	/* no GPU until we are ready! */
1382*4882a593Smuzhiyun 	__set_bit(I915_WEDGED, &gt->reset.flags);
1383*4882a593Smuzhiyun }
1384*4882a593Smuzhiyun 
1385*4882a593Smuzhiyun void intel_gt_fini_reset(struct intel_gt *gt)
1386*4882a593Smuzhiyun {
1387*4882a593Smuzhiyun 	cleanup_srcu_struct(&gt->reset.backoff_srcu);
1388*4882a593Smuzhiyun }
1389*4882a593Smuzhiyun 
1390*4882a593Smuzhiyun static void intel_wedge_me(struct work_struct *work)
1391*4882a593Smuzhiyun {
1392*4882a593Smuzhiyun 	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
1393*4882a593Smuzhiyun 
1394*4882a593Smuzhiyun 	drm_err(&w->gt->i915->drm,
1395*4882a593Smuzhiyun 		"%s timed out, cancelling all in-flight rendering.\n",
1396*4882a593Smuzhiyun 		w->name);
1397*4882a593Smuzhiyun 	intel_gt_set_wedged(w->gt);
1398*4882a593Smuzhiyun }
1399*4882a593Smuzhiyun 
1400*4882a593Smuzhiyun void __intel_init_wedge(struct intel_wedge_me *w,
1401*4882a593Smuzhiyun 			struct intel_gt *gt,
1402*4882a593Smuzhiyun 			long timeout,
1403*4882a593Smuzhiyun 			const char *name)
1404*4882a593Smuzhiyun {
1405*4882a593Smuzhiyun 	w->gt = gt;
1406*4882a593Smuzhiyun 	w->name = name;
1407*4882a593Smuzhiyun 
1408*4882a593Smuzhiyun 	INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
1409*4882a593Smuzhiyun 	schedule_delayed_work(&w->work, timeout);
1410*4882a593Smuzhiyun }
1411*4882a593Smuzhiyun 
1412*4882a593Smuzhiyun void __intel_fini_wedge(struct intel_wedge_me *w)
1413*4882a593Smuzhiyun {
1414*4882a593Smuzhiyun 	cancel_delayed_work_sync(&w->work);
1415*4882a593Smuzhiyun 	destroy_delayed_work_on_stack(&w->work);
1416*4882a593Smuzhiyun 	w->gt = NULL;
1417*4882a593Smuzhiyun }
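/*
 * Illustrative sketch (not part of the upstream file, kept out of the
 * build): the init/fini pair above is normally used through the
 * intel_wedge_on_timeout() helper, as in intel_gt_reset_global(); if the
 * body does not complete within the timeout, intel_wedge_me() runs and
 * wedges the GT. The function name and the elided wait are hypothetical.
 */
#if 0
static void example_bounded_wait(struct intel_gt *gt)
{
	struct intel_wedge_me w;

	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
		/* ... wait that might never finish if the GPU is dead ... */
	}
}
#endif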
1418*4882a593Smuzhiyun 
1419*4882a593Smuzhiyun #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1420*4882a593Smuzhiyun #include "selftest_reset.c"
1421*4882a593Smuzhiyun #include "selftest_hangcheck.c"
1422*4882a593Smuzhiyun #endif
1423