xref: /OK3568_Linux_fs/kernel/drivers/gpu/arm/midgard/mali_kbase_debug_job_fault.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/*
 *
 * (C) COPYRIGHT 2012-2016 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
 * Foundation, and any use by you of this program is subject to the terms
 * of such GNU licence.
 *
 * A copy of the licence is included with the program, and can also be obtained
 * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA  02110-1301, USA.
 *
 */

#include <mali_kbase.h>
#include <linux/spinlock.h>
#include <mali_kbase_hwaccess_jm.h>

#ifdef CONFIG_DEBUG_FS

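/* Check, under the event lock, whether any job fault event is queued on
 * the device-wide list.
 */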
static bool kbase_is_job_fault_event_pending(struct kbase_device *kbdev)
{
	struct list_head *event_list = &kbdev->job_fault_event_list;
	unsigned long    flags;
	bool             ret;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	ret = !list_empty(event_list);
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	return ret;
}

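/* Return true if no queued job fault event belongs to @kctx, i.e. the
 * dump for this context has completed or never started.
 */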
static bool kbase_ctx_has_no_event_pending(struct kbase_context *kctx)
{
	struct kbase_device *kbdev = kctx->kbdev;
	struct list_head *event_list = &kctx->kbdev->job_fault_event_list;
	struct base_job_fault_event *event;
	unsigned long               flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	if (list_empty(event_list)) {
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		return true;
	}
	list_for_each_entry(event, event_list, head) {
		if (event->katom->kctx == kctx) {
			spin_unlock_irqrestore(&kbdev->job_fault_event_lock,
					flags);
			return false;
		}
	}
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
	return true;
}

/* Wait until a fault happens and copy the event */
static int kbase_job_fault_event_wait(struct kbase_device *kbdev,
		struct base_job_fault_event *event)
{
	struct list_head            *event_list = &kbdev->job_fault_event_list;
	struct base_job_fault_event *event_in;
	unsigned long               flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	if (list_empty(event_list)) {
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		if (wait_event_interruptible(kbdev->job_fault_wq,
				 kbase_is_job_fault_event_pending(kbdev)))
			return -ERESTARTSYS;
		spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	}

	event_in = list_entry(event_list->next,
			struct base_job_fault_event, head);
	event->event_code = event_in->event_code;
	event->katom = event_in->katom;

	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	return 0;
}

/* Remove the event at the head of the queue and return it */
static struct base_job_fault_event *kbase_job_fault_event_dequeue(
		struct kbase_device *kbdev, struct list_head *event_list)
{
	struct base_job_fault_event *event;

	event = list_entry(event_list->next,
			struct base_job_fault_event, head);
	list_del(event_list->next);

	return event;
}

/* Remove all atoms that followed the failed atom in the same context and
 * run the postponed bottom half of job done for each of them.
 * After that, this context can be rescheduled.
 */
static void kbase_job_fault_resume_event_cleanup(struct kbase_context *kctx)
{
	struct list_head *event_list = &kctx->job_fault_resume_event_list;

	while (!list_empty(event_list)) {
		struct base_job_fault_event *event;

		event = kbase_job_fault_event_dequeue(kctx->kbdev,
				&kctx->job_fault_resume_event_list);
		kbase_jd_done_worker(&event->katom->work);
	}
}

/* Remove all the failed atoms, across all contexts, from the device queue
 * and resume every context that was suspended due to a failed job.
 */
static void kbase_job_fault_event_cleanup(struct kbase_device *kbdev)
{
	struct list_head *event_list = &kbdev->job_fault_event_list;
	unsigned long    flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	while (!list_empty(event_list)) {
		kbase_job_fault_event_dequeue(kbdev, event_list);
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		wake_up(&kbdev->job_fault_resume_wq);
		spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	}
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
}

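/* Deferred work item: block until the failed atom for this context has been
 * dumped through debugfs, then run the postponed job-done bottom half and
 * release any atoms that were queued up behind it.
 */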
static void kbase_job_fault_resume_worker(struct work_struct *data)
{
	struct base_job_fault_event *event = container_of(data,
			struct base_job_fault_event, job_fault_work);
	struct kbase_context *kctx;
	struct kbase_jd_atom *katom;

	katom = event->katom;
	kctx = katom->kctx;

	dev_info(kctx->kbdev->dev, "Job dumping wait\n");

	/* When woken up, check whether the queue is empty or the failed
	 * atom at its head belongs to a different context. Either case
	 * means the failed job has been dumped. Note that the
	 * job_fault_event_list should never hold two atoms that belong
	 * to the same context.
	 */
	wait_event(kctx->kbdev->job_fault_resume_wq,
			 kbase_ctx_has_no_event_pending(kctx));

	atomic_set(&kctx->job_fault_count, 0);
	kbase_jd_done_worker(&katom->work);

	/* Atoms scheduled during the failed job dump had their
	 * job_done_worker held back; rerun it for them now that the dump
	 * has finished.
	 */
	kbase_job_fault_resume_event_cleanup(kctx);

	dev_info(kctx->kbdev->dev, "Job dumping finish, resume scheduler\n");
}

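/* Fill in the atom's embedded fault event and add it to the tail of
 * @event_list.
 */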
static struct base_job_fault_event *kbase_job_fault_event_queue(
		struct list_head *event_list,
		struct kbase_jd_atom *atom,
		u32 completion_code)
{
	struct base_job_fault_event *event;

	event = &atom->fault_event;

	event->katom = atom;
	event->event_code = completion_code;

	list_add_tail(&event->head, event_list);

	return event;
}

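/* Queue the failed atom on the device-wide fault list, wake any debugfs
 * reader waiting for a fault, and schedule the resume worker that keeps the
 * context suspended until the dump completes.
 */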
static void kbase_job_fault_event_post(struct kbase_device *kbdev,
		struct kbase_jd_atom *katom, u32 completion_code)
{
	struct base_job_fault_event *event;
	unsigned long flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	event = kbase_job_fault_event_queue(&kbdev->job_fault_event_list,
				katom, completion_code);
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	wake_up_interruptible(&kbdev->job_fault_wq);

	INIT_WORK(&event->job_fault_work, kbase_job_fault_resume_worker);
	queue_work(kbdev->job_fault_resume_workq, &event->job_fault_work);

	dev_info(katom->kctx->kbdev->dev, "Job fault happen, start dump: %d_%d",
			katom->kctx->tgid, katom->kctx->id);
}

/*
 * Process a job fault:
 * - take a snapshot of the registers,
 * - post the failed job dump event,
 * - queue the resume worker that waits until the job dump finishes.
 */
bool kbase_debug_job_fault_process(struct kbase_jd_atom *katom,
		u32 completion_code)
{
	struct kbase_context *kctx = katom->kctx;

	/* Check whether a dump is already in progress: only one atom per
	 * context can be dumped at a time. An atom that belongs to a
	 * different context can still be dumped.
	 */
	if (atomic_read(&kctx->job_fault_count) > 0) {
		kbase_job_fault_event_queue(
				&kctx->job_fault_resume_event_list,
				katom, completion_code);
		dev_info(kctx->kbdev->dev, "queue:%d\n",
				kbase_jd_atom_id(kctx, katom));
		return true;
	}

	if (kctx->kbdev->job_fault_debug == true) {

		if (completion_code != BASE_JD_EVENT_DONE) {

			if (kbase_job_fault_get_reg_snapshot(kctx) == false) {
				dev_warn(kctx->kbdev->dev, "get reg dump failed\n");
				return false;
			}

			kbase_job_fault_event_post(kctx->kbdev, katom,
					completion_code);
			atomic_inc(&kctx->job_fault_count);
			dev_info(kctx->kbdev->dev, "post:%d\n",
					kbase_jd_atom_id(kctx, katom));
			return true;

		}
	}
	return false;
}

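/* seq_file show(): print a batch of "address: value" register pairs for the
 * current fault event, up to 50 pairs per call, stopping at the termination
 * flag.
 */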
static int debug_job_fault_show(struct seq_file *m, void *v)
{
	struct kbase_device *kbdev = m->private;
	struct base_job_fault_event *event = (struct base_job_fault_event *)v;
	struct kbase_context *kctx = event->katom->kctx;
	int i;

	dev_info(kbdev->dev, "debug job fault seq show:%d_%d, %d",
			kctx->tgid, kctx->id, event->reg_offset);

	if (kctx->reg_dump == NULL) {
		dev_warn(kbdev->dev, "reg dump is NULL");
		return -1;
	}

	if (kctx->reg_dump[event->reg_offset] ==
			REGISTER_DUMP_TERMINATION_FLAG) {
		/* Return an error here to stop the read; next() will then
		 * not be called again. stop() can pick up the real event
		 * resource and release it.
		 */
		return -1;
	}

	if (event->reg_offset == 0)
		seq_printf(m, "%d_%d\n", kctx->tgid, kctx->id);

	for (i = 0; i < 50; i++) {
		if (kctx->reg_dump[event->reg_offset] ==
				REGISTER_DUMP_TERMINATION_FLAG) {
			break;
		}
		seq_printf(m, "%08x: %08x\n",
				kctx->reg_dump[event->reg_offset],
				kctx->reg_dump[1+event->reg_offset]);
		event->reg_offset += 2;
	}

	return 0;
}
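
/* seq_file next(): the same event is returned so that show() keeps being
 * called until the register dump for that event has been fully printed.
 */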
static void *debug_job_fault_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct kbase_device *kbdev = m->private;
	struct base_job_fault_event *event = (struct base_job_fault_event *)v;

	dev_info(kbdev->dev, "debug job fault seq next:%d, %d",
			event->reg_offset, (int)*pos);

	return event;
}

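/* seq_file start(): on the first read, block until a job fault event is
 * available, copy it into a locally allocated event and clean the GPU
 * caches so that the memory dump taken by the debug daemon is coherent.
 */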
static void *debug_job_fault_start(struct seq_file *m, loff_t *pos)
{
	struct kbase_device *kbdev = m->private;
	struct base_job_fault_event *event;

	dev_info(kbdev->dev, "fault job seq start:%d", (int)*pos);

	/* The condition here is tricky. It needs to make sure that either
	 * the fault hasn't happened and the dumping hasn't started, or
	 * that the dumping has already finished.
	 */
	if (*pos == 0) {
		event = kmalloc(sizeof(*event), GFP_KERNEL);
		if (!event)
			return NULL;
		event->reg_offset = 0;
		if (kbase_job_fault_event_wait(kbdev, event)) {
			kfree(event);
			return NULL;
		}

		/* The cache flush workaround is normally called in the
		 * bottom half of job done, but that has been delayed here.
		 * Clean the cache now so that the GPU memory dump is
		 * correct.
		 */
		kbase_backend_cacheclean(kbdev, event->katom);
	} else
		return NULL;

	return event;
}

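/* seq_file stop(): free the iterator copy when one is passed in; when the
 * read has ended (v == NULL), dequeue the real event from the device list
 * and wake the resume worker so the context can be rescheduled.
 */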
static void debug_job_fault_stop(struct seq_file *m, void *v)
{
	struct kbase_device *kbdev = m->private;

	/* We only wake up kbase_jd_done_worker after stop: the debug daemon
	 * needs to take the memory dump before the register dump is
	 * released, otherwise the memory dump may be incorrect.
	 */

	if (v != NULL) {
		kfree(v);
		dev_info(kbdev->dev, "debug job fault seq stop stage 1");

	} else {
		unsigned long flags;

		spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
		if (!list_empty(&kbdev->job_fault_event_list)) {
			kbase_job_fault_event_dequeue(kbdev,
				&kbdev->job_fault_event_list);
			wake_up(&kbdev->job_fault_resume_wq);
		}
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		dev_info(kbdev->dev, "debug job fault seq stop stage 2");
	}
}

static const struct seq_operations ops = {
	.start = debug_job_fault_start,
	.next = debug_job_fault_next,
	.stop = debug_job_fault_stop,
	.show = debug_job_fault_show,
};

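/* Open handler for the "job_fault" debugfs file: set up the seq_file
 * iterator and enable job fault debugging on the device.
 */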
static int debug_job_fault_open(struct inode *in, struct file *file)
{
	struct kbase_device *kbdev = in->i_private;

	seq_open(file, &ops);

	((struct seq_file *)file->private_data)->private = kbdev;
	dev_info(kbdev->dev, "debug job fault seq open");

	kbdev->job_fault_debug = true;

	return 0;
}

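/* Release handler for the "job_fault" debugfs file: disable job fault
 * debugging and drain any unprocessed fault events so that suspended
 * contexts can be rescheduled.
 */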
static int debug_job_fault_release(struct inode *in, struct file *file)
{
	struct kbase_device *kbdev = in->i_private;

	seq_release(in, file);

	kbdev->job_fault_debug = false;

	/* Clean up the unprocessed job fault events. After that, all the
	 * suspended contexts can be rescheduled.
	 */
	kbase_job_fault_event_cleanup(kbdev);

	dev_info(kbdev->dev, "debug job fault seq close");

	return 0;
}

static const struct file_operations kbasep_debug_job_fault_fops = {
	.open = debug_job_fault_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = debug_job_fault_release,
};

/*
 *  Initialize debugfs entry for job fault dump
 */
void kbase_debug_job_fault_debugfs_init(struct kbase_device *kbdev)
{
	debugfs_create_file("job_fault", S_IRUGO,
			kbdev->mali_debugfs_directory, kbdev,
			&kbasep_debug_job_fault_fops);
}

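/*
 *  Initialize the relevant resources per device: the fault event list, the
 *  wait queues, the event lock and the single-threaded resume workqueue
 */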
int kbase_debug_job_fault_dev_init(struct kbase_device *kbdev)
{
	INIT_LIST_HEAD(&kbdev->job_fault_event_list);

	init_waitqueue_head(&(kbdev->job_fault_wq));
	init_waitqueue_head(&(kbdev->job_fault_resume_wq));
	spin_lock_init(&kbdev->job_fault_event_lock);

	kbdev->job_fault_resume_workq = alloc_workqueue(
			"kbase_job_fault_resume_work_queue", WQ_MEM_RECLAIM, 1);
	if (!kbdev->job_fault_resume_workq)
		return -ENOMEM;

	kbdev->job_fault_debug = false;

	return 0;
}

/*
 *  Release the relevant resource per device
 */
void kbase_debug_job_fault_dev_term(struct kbase_device *kbdev)
{
	destroy_workqueue(kbdev->job_fault_resume_workq);
}

/*
 *  Initialize the relevant data structure per context
 */
void kbase_debug_job_fault_context_init(struct kbase_context *kctx)
{
	INIT_LIST_HEAD(&kctx->job_fault_resume_event_list);
	atomic_set(&kctx->job_fault_count, 0);

	/* Allocate double the register range, because this memory keeps
	 * both the register address and its value.
	 */
	kctx->reg_dump = vmalloc(0x4000 * 2);
	if (kctx->reg_dump == NULL)
		return;

	if (kbase_debug_job_fault_reg_snapshot_init(kctx, 0x4000) == false) {
		vfree(kctx->reg_dump);
		kctx->reg_dump = NULL;
	}
}

/*
 *  Release the relevant resource per context
 */
void kbase_debug_job_fault_context_term(struct kbase_context *kctx)
{
	vfree(kctx->reg_dump);
}

#else /* CONFIG_DEBUG_FS */

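/* Stub implementations when debugfs is not available: job fault debugging
 * stays disabled and there is nothing to set up or tear down.
 */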
int kbase_debug_job_fault_dev_init(struct kbase_device *kbdev)
{
	kbdev->job_fault_debug = false;

	return 0;
}

void kbase_debug_job_fault_dev_term(struct kbase_device *kbdev)
{
}

#endif /* CONFIG_DEBUG_FS */