xref: /OK3568_Linux_fs/kernel/drivers/gpu/arm/bifrost/mali_kbase_debug_job_fault.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
 *
 * (C) COPYRIGHT 2012-2016, 2018-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
 * Foundation, and any use by you of this program is subject to the terms
 * of such GNU license.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 */

#include <mali_kbase.h>
#include <linux/spinlock.h>
#include <mali_kbase_hwaccess_jm.h>

#if IS_ENABLED(CONFIG_DEBUG_FS)

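/* Check whether any job fault event is waiting to be consumed by the debugfs
 * client, taking the event list lock so the list can be inspected safely from
 * a wait_event condition.
 */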
static bool kbase_is_job_fault_event_pending(struct kbase_device *kbdev)
{
	struct list_head *event_list = &kbdev->job_fault_event_list;
	unsigned long    flags;
	bool             ret;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	ret = !list_empty(event_list);
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	return ret;
}

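/* Remove the pending fault event (if any) that belongs to @kctx from the
 * device-wide event list, wake the resume waitqueue and flush the event's
 * resume worker so it has completed before the context is torn down.
 */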
static void kbase_ctx_remove_pending_event(struct kbase_context *kctx)
{
	struct list_head *event_list = &kctx->kbdev->job_fault_event_list;
	struct base_job_fault_event *event;
	unsigned long flags;

	spin_lock_irqsave(&kctx->kbdev->job_fault_event_lock, flags);
	list_for_each_entry(event, event_list, head) {
		if (event->katom->kctx == kctx) {
			list_del(&event->head);
			spin_unlock_irqrestore(&kctx->kbdev->job_fault_event_lock, flags);

			wake_up(&kctx->kbdev->job_fault_resume_wq);
			flush_work(&event->job_fault_work);

			/* job_fault_event_list can only have a single atom for
			 * each context.
			 */
			return;
		}
	}
	spin_unlock_irqrestore(&kctx->kbdev->job_fault_event_lock, flags);
}

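/* Return true if no fault event on the device-wide list belongs to @kctx,
 * i.e. the dump for this context's failed atom has been consumed (or never
 * existed). Used as the wake-up condition for the resume worker below.
 */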
static bool kbase_ctx_has_no_event_pending(struct kbase_context *kctx)
{
	struct kbase_device *kbdev = kctx->kbdev;
	struct list_head *event_list = &kctx->kbdev->job_fault_event_list;
	struct base_job_fault_event *event;
	unsigned long               flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	if (list_empty(event_list)) {
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		return true;
	}
	list_for_each_entry(event, event_list, head) {
		if (event->katom->kctx == kctx) {
			spin_unlock_irqrestore(&kbdev->job_fault_event_lock,
					flags);
			return false;
		}
	}
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
	return true;
}

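/* Block until a fault event is queued. On kernels older than 4.15 the wait
 * uses a 2 s timeout and returns -EAGAIN on expiry so the debugfs read can be
 * retried; on newer kernels it waits interruptibly without a timeout.
 */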
static int wait_for_job_fault(struct kbase_device *kbdev)
{
#if KERNEL_VERSION(4, 15, 0) > LINUX_VERSION_CODE
	int ret = wait_event_interruptible_timeout(kbdev->job_fault_wq,
			kbase_is_job_fault_event_pending(kbdev),
			msecs_to_jiffies(2000));
	if (ret == 0)
		return -EAGAIN;
	else if (ret > 0)
		return 0;
	else
		return ret;
#else
	return wait_event_interruptible(kbdev->job_fault_wq,
			kbase_is_job_fault_event_pending(kbdev));
#endif
}

/* Wait until a fault happens and copy the event */
static int kbase_job_fault_event_wait(struct kbase_device *kbdev,
		struct base_job_fault_event *event)
{
	struct list_head            *event_list = &kbdev->job_fault_event_list;
	struct base_job_fault_event *event_in;
	unsigned long               flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	while (list_empty(event_list)) {
		int err;

		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

		err = wait_for_job_fault(kbdev);
		if (err)
			return err;

		spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	}

	event_in = list_entry(event_list->next,
			struct base_job_fault_event, head);
	event->event_code = event_in->event_code;
	event->katom = event_in->katom;

	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	return 0;
}

/* Remove and return the event at the head of the queue */
static struct base_job_fault_event *kbase_job_fault_event_dequeue(
		struct kbase_device *kbdev, struct list_head *event_list)
{
	struct base_job_fault_event *event;

	event = list_entry(event_list->next,
			struct base_job_fault_event, head);
	list_del(event_list->next);

	return event;
}

/* Remove all the atoms that were queued after the failed atom in the same
 * context and call the postponed bottom half of job done for each of them.
 * The context can then be rescheduled.
 */
static void kbase_job_fault_resume_event_cleanup(struct kbase_context *kctx)
{
	struct list_head *event_list = &kctx->job_fault_resume_event_list;

	while (!list_empty(event_list)) {
		struct base_job_fault_event *event;

		event = kbase_job_fault_event_dequeue(kctx->kbdev,
				&kctx->job_fault_resume_event_list);
		kbase_jd_done_worker(&event->katom->work);
	}
}

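/* Work item queued for every posted fault event. It holds back completion of
 * the faulting atom until the debugfs client has dumped it, then runs the
 * postponed job-done bottom halves so the context can be rescheduled.
 */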
static void kbase_job_fault_resume_worker(struct work_struct *data)
{
	struct base_job_fault_event *event = container_of(data,
			struct base_job_fault_event, job_fault_work);
	struct kbase_context *kctx;
	struct kbase_jd_atom *katom;

	katom = event->katom;
	kctx = katom->kctx;

	dev_info(kctx->kbdev->dev, "Job dumping wait\n");

	/* When woken up, check whether the queue is empty or the pending
	 * failed atom belongs to a different context. Either case means the
	 * failed job of this context has been dumped. Note that the
	 * job_fault_event_list can never hold two atoms belonging to the
	 * same context.
	 */
	wait_event(kctx->kbdev->job_fault_resume_wq,
			 kbase_ctx_has_no_event_pending(kctx));

	atomic_set(&kctx->job_fault_count, 0);
	kbase_jd_done_worker(&katom->work);

	/* Atoms that were submitted while the failed job was being dumped had
	 * their job_done_worker postponed. Rerun it for them now that the
	 * dump has finished.
	 */
	kbase_job_fault_resume_event_cleanup(kctx);

	dev_info(kctx->kbdev->dev, "Job dumping finish, resume scheduler\n");
}

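/* Fill in the fault event embedded in @atom and append it to @event_list.
 * This is used both for the device-wide fault list and for a context's
 * per-context resume list.
 */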
static struct base_job_fault_event *kbase_job_fault_event_queue(
		struct list_head *event_list,
		struct kbase_jd_atom *atom,
		u32 completion_code)
{
	struct base_job_fault_event *event;

	event = &atom->fault_event;

	event->katom = atom;
	event->event_code = completion_code;

	list_add_tail(&event->head, event_list);

	return event;
}

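/* Queue the fault event on the device list, wake any debugfs reader waiting
 * in wait_for_job_fault() and schedule the resume worker that will hold the
 * faulting context until the dump has been read.
 */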
static void kbase_job_fault_event_post(struct kbase_device *kbdev,
		struct kbase_jd_atom *katom, u32 completion_code)
{
	struct base_job_fault_event *event;
	unsigned long flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	event = kbase_job_fault_event_queue(&kbdev->job_fault_event_list,
				katom, completion_code);
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	wake_up_interruptible(&kbdev->job_fault_wq);

	INIT_WORK(&event->job_fault_work, kbase_job_fault_resume_worker);
	queue_work(kbdev->job_fault_resume_workq, &event->job_fault_work);

	dev_info(katom->kctx->kbdev->dev, "Job fault happen, start dump: %d_%d",
			katom->kctx->tgid, katom->kctx->id);
}

/*
 * This function processes the job fault:
 * - take a copy of the registers,
 * - post the failed job dump event,
 * - queue work that waits until the job dump has finished.
 */
bool kbase_debug_job_fault_process(struct kbase_jd_atom *katom,
		u32 completion_code)
{
	struct kbase_context *kctx = katom->kctx;

	/* Check whether dumping is already in progress. Only one atom of each
	 * context can be dumped at a time; an atom that belongs to a
	 * different context can still be dumped.
	 */
	if (atomic_read(&kctx->job_fault_count) > 0) {
		kbase_job_fault_event_queue(
				&kctx->job_fault_resume_event_list,
				katom, completion_code);
		dev_info(kctx->kbdev->dev, "queue:%d\n",
				kbase_jd_atom_id(kctx, katom));
		return true;
	}

	if (kbase_ctx_flag(kctx, KCTX_DYING))
		return false;

	if (atomic_read(&kctx->kbdev->job_fault_debug) > 0) {

		if (completion_code != BASE_JD_EVENT_DONE) {

			if (kbase_job_fault_get_reg_snapshot(kctx) == false) {
				dev_warn(kctx->kbdev->dev, "get reg dump failed\n");
				return false;
			}

			kbase_job_fault_event_post(kctx->kbdev, katom,
					completion_code);
			atomic_inc(&kctx->job_fault_count);
			dev_info(kctx->kbdev->dev, "post:%d\n",
					kbase_jd_atom_id(kctx, katom));
			return true;
		}
	}
	return false;
}

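/* seq_file .show: print up to 50 register address/value pairs of the current
 * event's snapshot per iteration, or return an error once the termination
 * flag is reached so that the read stops and stop() can release the event.
 */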
static int debug_job_fault_show(struct seq_file *m, void *v)
{
	struct kbase_device *kbdev = m->private;
	struct base_job_fault_event *event = (struct base_job_fault_event *)v;
	struct kbase_context *kctx = event->katom->kctx;
	int i;

	dev_info(kbdev->dev, "debug job fault seq show:%d_%d, %d",
			kctx->tgid, kctx->id, event->reg_offset);

	if (kctx->reg_dump == NULL) {
		dev_warn(kbdev->dev, "reg dump is NULL");
		return -1;
	}

	if (kctx->reg_dump[event->reg_offset] ==
			REGISTER_DUMP_TERMINATION_FLAG) {
		/* Return an error here to stop the read; next() will then not
		 * be called and stop() can fetch the real event resource and
		 * release it.
		 */
		return -1;
	}

	if (event->reg_offset == 0)
		seq_printf(m, "%d_%d\n", kctx->tgid, kctx->id);

	for (i = 0; i < 50; i++) {
		if (kctx->reg_dump[event->reg_offset] ==
				REGISTER_DUMP_TERMINATION_FLAG) {
			break;
		}
		seq_printf(m, "%08x: %08x\n",
				kctx->reg_dump[event->reg_offset],
				kctx->reg_dump[1+event->reg_offset]);
		event->reg_offset += 2;
	}

	return 0;
}
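
/* seq_file .next: the same event object is returned for every iteration;
 * show() advances reg_offset through the snapshot and ends the sequence
 * itself once it reaches the termination flag.
 */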
static void *debug_job_fault_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct kbase_device *kbdev = m->private;
	struct base_job_fault_event *event = (struct base_job_fault_event *)v;

	dev_info(kbdev->dev, "debug job fault seq next:%d, %d",
			event->reg_offset, (int)*pos);

	return event;
}

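/* seq_file .start: on the first read (*pos == 0) allocate a cursor event,
 * block until a job fault has been posted and copy it; on subsequent starts
 * return NULL so the read terminates.
 */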
static void *debug_job_fault_start(struct seq_file *m, loff_t *pos)
{
	struct kbase_device *kbdev = m->private;
	struct base_job_fault_event *event;

	dev_info(kbdev->dev, "fault job seq start:%d", (int)*pos);

	/* The condition here is tricky: it must make sure that either the
	 * fault hasn't happened and the dumping hasn't started yet, or that
	 * the dumping has finished.
	 */
	if (*pos == 0) {
		event = kmalloc(sizeof(*event), GFP_KERNEL);
		if (!event)
			return NULL;
		event->reg_offset = 0;
		if (kbase_job_fault_event_wait(kbdev, event)) {
			kfree(event);
			return NULL;
		}

		/* The cache flush workaround is normally run in the bottom
		 * half of job done, but that has been delayed. Clean the
		 * cache now, before the dump, so that the GPU memory dump is
		 * correct.
		 */
		kbase_backend_cache_clean(kbdev, event->katom);
	} else
		return NULL;

	return event;
}

static void debug_job_fault_stop(struct seq_file *m, void *v)
{
	struct kbase_device *kbdev = m->private;

	/* We wake up kbase_jd_done_worker only after stop, because the debug
	 * daemon needs to take the memory dump before the register dump;
	 * otherwise the memory dump may be incorrect.
	 */

	if (v != NULL) {
		kfree(v);
		dev_info(kbdev->dev, "debug job fault seq stop stage 1");

	} else {
		unsigned long flags;

		spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
		if (!list_empty(&kbdev->job_fault_event_list)) {
			kbase_job_fault_event_dequeue(kbdev,
				&kbdev->job_fault_event_list);
			wake_up(&kbdev->job_fault_resume_wq);
		}
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		dev_info(kbdev->dev, "debug job fault seq stop stage 2");
	}
}

static const struct seq_operations ops = {
	.start = debug_job_fault_start,
	.next = debug_job_fault_next,
	.stop = debug_job_fault_stop,
	.show = debug_job_fault_show,
};

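/* Open handler for the "job_fault" debugfs file. Only a single client may
 * have the file open at a time; job fault dumping stays enabled for as long
 * as job_fault_debug is non-zero.
 */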
static int debug_job_fault_open(struct inode *in, struct file *file)
{
	struct kbase_device *kbdev = in->i_private;

	if (atomic_cmpxchg(&kbdev->job_fault_debug, 0, 1) == 1) {
		dev_warn(kbdev->dev, "debug job fault is busy, only a single client is allowed");
		return -EBUSY;
	}

	seq_open(file, &ops);

	((struct seq_file *)file->private_data)->private = kbdev;
	dev_info(kbdev->dev, "debug job fault seq open");

	return 0;
}

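/* Release handler: disable dumping, drain any events that were never read
 * and wake their resume workers so that blocked contexts can run again.
 */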
static int debug_job_fault_release(struct inode *in, struct file *file)
{
	struct kbase_device *kbdev = in->i_private;
	struct list_head *event_list = &kbdev->job_fault_event_list;
	unsigned long    flags;

	seq_release(in, file);

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);

	/* Disable job fault dumping. This will let kbase run jobs as normal,
	 * without blocking waiting for a job_fault client to read failed jobs.
	 *
	 * After this a new client may open the file, and may re-enable job
	 * fault dumping, but the job_fault_event_lock we hold here will block
	 * that from interfering until after we've completed the cleanup.
	 */
	atomic_dec(&kbdev->job_fault_debug);

	/* Clean up the unprocessed job faults so that all the suspended
	 * contexts can be rescheduled: remove all the failed atoms, which
	 * belong to different contexts, and resume every context that was
	 * suspended due to a failed job.
	 */
	while (!list_empty(event_list)) {
		kbase_job_fault_event_dequeue(kbdev, event_list);
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		wake_up(&kbdev->job_fault_resume_wq);
		spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	}

	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	dev_info(kbdev->dev, "debug job fault seq close");

	return 0;
}

static const struct file_operations kbasep_debug_job_fault_fops = {
	.owner = THIS_MODULE,
	.open = debug_job_fault_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = debug_job_fault_release,
};

/*
 *  Initialize debugfs entry for job fault dump
 */
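/* A single user-space client consumes the dump through this debugfs file,
 * e.g. (the exact path is an assumption; it depends on the name of the Mali
 * debugfs directory created for the probed device instance):
 *
 *   cat /sys/kernel/debug/mali0/job_fault
 *
 * The read blocks until a job fails, then yields a "tgid_ctxid" header
 * followed by "address: value" register pairs from the snapshot.
 */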
void kbase_debug_job_fault_debugfs_init(struct kbase_device *kbdev)
{
	debugfs_create_file("job_fault", 0400,
			kbdev->mali_debugfs_directory, kbdev,
			&kbasep_debug_job_fault_fops);
}

int kbase_debug_job_fault_dev_init(struct kbase_device *kbdev)
{
	INIT_LIST_HEAD(&kbdev->job_fault_event_list);

	init_waitqueue_head(&(kbdev->job_fault_wq));
	init_waitqueue_head(&(kbdev->job_fault_resume_wq));
	spin_lock_init(&kbdev->job_fault_event_lock);

	kbdev->job_fault_resume_workq = alloc_workqueue(
			"kbase_job_fault_resume_work_queue", WQ_MEM_RECLAIM, 1);
	if (!kbdev->job_fault_resume_workq)
		return -ENOMEM;

	atomic_set(&kbdev->job_fault_debug, 0);

	return 0;
}

/*
 * Release the relevant resources per device
 */
void kbase_debug_job_fault_dev_term(struct kbase_device *kbdev)
{
	destroy_workqueue(kbdev->job_fault_resume_workq);
}

/*
 *  Initialize the relevant data structure per context
 */
int kbase_debug_job_fault_context_init(struct kbase_context *kctx)
{
	/* We need to allocate double the size of the register range because
	 * this memory holds both the register address and its value.
	 */
	kctx->reg_dump = vmalloc(0x4000 * 2);
	if (kctx->reg_dump != NULL) {
		if (kbase_debug_job_fault_reg_snapshot_init(kctx, 0x4000) ==
		    false) {
			vfree(kctx->reg_dump);
			kctx->reg_dump = NULL;
		}
		INIT_LIST_HEAD(&kctx->job_fault_resume_event_list);
		atomic_set(&kctx->job_fault_count, 0);
	}

	return 0;
}

/*
 *  Release the relevant resources per context
 */
void kbase_debug_job_fault_context_term(struct kbase_context *kctx)
{
	vfree(kctx->reg_dump);
}

void kbase_debug_job_fault_kctx_unblock(struct kbase_context *kctx)
{
	WARN_ON(!kbase_ctx_flag(kctx, KCTX_DYING));

	/* Return early if the job fault part of the kbase_device is not
	 * initialized yet. An error can happen during the device probe after
	 * the privileged Kbase context was created for the HW counter dumping
	 * but before the job fault part is initialized.
	 */
	if (!kctx->kbdev->job_fault_resume_workq)
		return;

	kbase_ctx_remove_pending_event(kctx);
}

#else /* CONFIG_DEBUG_FS */

int kbase_debug_job_fault_dev_init(struct kbase_device *kbdev)
{
	return 0;
}

void kbase_debug_job_fault_dev_term(struct kbase_device *kbdev)
{
}

#endif /* CONFIG_DEBUG_FS */