// SPDX-License-Identifier: MIT
/*
 * Copyright © 2018 Intel Corporation
 */

#include <linux/crc32.h>

#include "gem/i915_gem_stolen.h"

#include "i915_memcpy.h"
#include "i915_selftest.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"

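/*
 * __igt_reset_stolen() - check that a GPU reset leaves stolen memory intact.
 *
 * CRC every page of the data stolen memory (DSM) region before and after
 * resetting the engines in @mask, and flag any unused page that changed.
 * Pages already handed out by the stolen allocator are ignored; corruption
 * above I915_GEM_STOLEN_BIAS is reported as an error.
 */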
static int
__igt_reset_stolen(struct intel_gt *gt,
		   intel_engine_mask_t mask,
		   const char *msg)
{
	struct i915_ggtt *ggtt = &gt->i915->ggtt;
	const struct resource *dsm = &gt->i915->dsm;
	resource_size_t num_pages, page;
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	enum intel_engine_id id;
	struct igt_spinner spin;
	long max, count;
	void *tmp;
	u32 *crc;
	int err;

	if (!drm_mm_node_allocated(&ggtt->error_capture))
		return 0;

	num_pages = resource_size(dsm) >> PAGE_SHIFT;
	if (!num_pages)
		return 0;

	crc = kmalloc_array(num_pages, sizeof(u32), GFP_KERNEL);
	if (!crc)
		return -ENOMEM;

	tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!tmp) {
		err = -ENOMEM;
		goto err_crc;
	}

	igt_global_reset_lock(gt);
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = igt_spinner_init(&spin, gt);
	if (err)
		goto err_lock;

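	/*
	 * Keep each engine under test busy with a spinning request so that
	 * the reset interrupts live work rather than hitting idle engines.
	 */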
	for_each_engine(engine, gt, id) {
		struct intel_context *ce;
		struct i915_request *rq;

		if (!(mask & engine->mask))
			continue;

		if (!intel_engine_can_store_dword(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto err_spin;
		}
		rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
		intel_context_put(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err_spin;
		}
		i915_request_add(rq);
	}

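	/*
	 * Record a CRC of every stolen page before the reset. Pages that are
	 * not currently allocated from stolen are first filled with a known
	 * pattern (STACK_MAGIC) so that any later modification is visible.
	 */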
	for (page = 0; page < num_pages; page++) {
		dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT);
		void __iomem *s;
		void *in;

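		/*
		 * Stolen memory is not directly CPU mappable, so borrow the
		 * reserved error-capture GGTT slot to map one page at a time
		 * through the aperture.
		 */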
		ggtt->vm.insert_page(&ggtt->vm, dma,
				     ggtt->error_capture.start,
				     I915_CACHE_NONE, 0);
		mb();

		s = io_mapping_map_wc(&ggtt->iomap,
				      ggtt->error_capture.start,
				      PAGE_SIZE);

		if (!__drm_mm_interval_first(&gt->i915->mm.stolen,
					     page << PAGE_SHIFT,
					     ((page + 1) << PAGE_SHIFT) - 1))
			memset32(s, STACK_MAGIC, PAGE_SIZE / sizeof(u32));

		in = s;
		if (i915_memcpy_from_wc(tmp, s, PAGE_SIZE))
			in = tmp;
		crc[page] = crc32_le(0, in, PAGE_SIZE);

		io_mapping_unmap(s);
	}
	mb();
	ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE);

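	/* Trigger the reset under test: either a full-GT or per-engine reset. */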
	if (mask == ALL_ENGINES) {
		intel_gt_reset(gt, mask, NULL);
	} else {
		for_each_engine(engine, gt, id) {
			if (mask & engine->mask)
				intel_engine_reset(engine, NULL);
		}
	}

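	/*
	 * Re-read every page and compare against the recorded CRCs. Only
	 * pages not allocated from stolen are checked; track how many were
	 * modified and the highest page index affected.
	 */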
	max = -1;
	count = 0;
	for (page = 0; page < num_pages; page++) {
		dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT);
		void __iomem *s;
		void *in;
		u32 x;

		ggtt->vm.insert_page(&ggtt->vm, dma,
				     ggtt->error_capture.start,
				     I915_CACHE_NONE, 0);
		mb();

		s = io_mapping_map_wc(&ggtt->iomap,
				      ggtt->error_capture.start,
				      PAGE_SIZE);

		in = s;
		if (i915_memcpy_from_wc(tmp, s, PAGE_SIZE))
			in = tmp;
		x = crc32_le(0, in, PAGE_SIZE);

		if (x != crc[page] &&
		    !__drm_mm_interval_first(&gt->i915->mm.stolen,
					     page << PAGE_SHIFT,
					     ((page + 1) << PAGE_SHIFT) - 1)) {
			pr_debug("unused stolen page %pa modified by GPU reset\n",
				 &page);
			if (count++ == 0)
				igt_hexdump(in, PAGE_SIZE);
			max = page;
		}

		io_mapping_unmap(s);
	}
	mb();
	ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE);

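	/*
	 * Clobbering within the low I915_GEM_STOLEN_BIAS bytes is tolerated,
	 * as the stolen allocator keeps clear of that range; corruption above
	 * it lands in memory that may be in active use, so fail the test.
	 */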
	if (count > 0) {
		pr_info("%s reset clobbered %ld pages of stolen, last clobber at page %ld\n",
			msg, count, max);
	}
	if (max >= I915_GEM_STOLEN_BIAS >> PAGE_SHIFT) {
		pr_err("%s reset clobbered unreserved area [above %x] of stolen; may cause severe faults\n",
		       msg, I915_GEM_STOLEN_BIAS);
		err = -EINVAL;
	}

err_spin:
	igt_spinner_fini(&spin);

err_lock:
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
	igt_global_reset_unlock(gt);

	kfree(tmp);
err_crc:
	kfree(crc);
	return err;
}

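/* Exercise stolen memory preservation across a full device reset. */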
static int igt_reset_device_stolen(void *arg)
{
	return __igt_reset_stolen(arg, ALL_ENGINES, "device");
}

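/* Repeat the stolen check for each individual engine reset, where supported. */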
static int igt_reset_engines_stolen(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		err = __igt_reset_stolen(gt, engine->mask, engine->name);
		if (err)
			return err;
	}

	return 0;
}

static int igt_global_reset(void *arg)
{
	struct intel_gt *gt = arg;
	unsigned int reset_count;
	intel_wakeref_t wakeref;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	igt_global_reset_lock(gt);
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	reset_count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, ALL_ENGINES, NULL);

	if (i915_reset_count(&gt->i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	return err;
}

static int igt_wedged_reset(void *arg)
{
	struct intel_gt *gt = arg;
	intel_wakeref_t wakeref;

	/* Check that we can recover a wedged device with a GPU reset */

	igt_global_reset_lock(gt);
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	intel_gt_set_wedged(gt);

	GEM_BUG_ON(!intel_gt_is_wedged(gt));
	intel_gt_reset(gt, ALL_ENGINES, NULL);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
	igt_global_reset_unlock(gt);

	return intel_gt_is_wedged(gt) ? -EIO : 0;
}

static int igt_atomic_reset(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the resets are usable from atomic context */

	intel_gt_pm_get(gt);
	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		intel_engine_mask_t awake;

		GEM_TRACE("__intel_gt_reset under %s\n", p->name);

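		/*
		 * Bracket the raw reset with the same prepare/finish steps as
		 * the real reset path, and perform it inside each simulated
		 * atomic section (irqs off, preemption off, etc.).
		 */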
		awake = reset_prepare(gt);
		p->critical_section_begin();

		err = __intel_gt_reset(gt, ALL_ENGINES);

		p->critical_section_end();
		reset_finish(gt, awake);

		if (err) {
			pr_err("__intel_gt_reset failed under %s\n", p->name);
			break;
		}
	}

	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

unlock:
	igt_global_reset_unlock(gt);
	intel_gt_pm_put(gt);

	return err;
}

static int igt_atomic_engine_reset(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/* Check that the resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	intel_gt_pm_get(gt);
	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto out_unlock;

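	/*
	 * For each engine, keep it powered and its submission tasklet
	 * disabled while an engine reset is attempted from every simulated
	 * atomic phase.
	 */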
	for_each_engine(engine, gt, id) {
		tasklet_disable(&engine->execlists.tasklet);
		intel_engine_pm_get(engine);

		for (p = igt_atomic_phases; p->name; p++) {
			GEM_TRACE("intel_engine_reset(%s) under %s\n",
				  engine->name, p->name);

			p->critical_section_begin();
			err = intel_engine_reset(engine, NULL);
			p->critical_section_end();

			if (err) {
				pr_err("intel_engine_reset(%s) failed under %s\n",
				       engine->name, p->name);
				break;
			}
		}

		intel_engine_pm_put(engine);
		tasklet_enable(&engine->execlists.tasklet);
		if (err)
			break;
	}

	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

out_unlock:
	igt_global_reset_unlock(gt);
	intel_gt_pm_put(gt);

	return err;
}

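/*
 * Entry point for the live reset selftests. The whole suite is skipped if the
 * device cannot be reset at all, and aborted if the GT is already wedged.
 */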
int intel_reset_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_reset_device_stolen),
		SUBTEST(igt_reset_engines_stolen),
		SUBTEST(igt_wedged_reset),
		SUBTEST(igt_atomic_reset),
		SUBTEST(igt_atomic_engine_reset),
	};
	struct intel_gt *gt = &i915->gt;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	return intel_gt_live_subtests(tests, gt);
}