// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
 *
 * (C) COPYRIGHT 2012-2016, 2018-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
 * Foundation, and any use by you of this program is subject to the terms
 * of such GNU license.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 */

#include <mali_kbase.h>
#include <linux/spinlock.h>
#include <mali_kbase_hwaccess_jm.h>

#if IS_ENABLED(CONFIG_DEBUG_FS)

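/* Check, under the event lock, whether any job fault event is queued. */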
static bool kbase_is_job_fault_event_pending(struct kbase_device *kbdev)
{
	struct list_head *event_list = &kbdev->job_fault_event_list;
	unsigned long flags;
	bool ret;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	ret = !list_empty(event_list);
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	return ret;
}

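/* Remove the pending fault event that belongs to kctx (there is at most one
 * per context) from the device-wide list, wake the resume waitqueue and wait
 * for the event's resume worker to finish.
 */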
static void kbase_ctx_remove_pending_event(struct kbase_context *kctx)
{
	struct list_head *event_list = &kctx->kbdev->job_fault_event_list;
	struct base_job_fault_event *event;
	unsigned long flags;

	spin_lock_irqsave(&kctx->kbdev->job_fault_event_lock, flags);
	list_for_each_entry(event, event_list, head) {
		if (event->katom->kctx == kctx) {
			list_del(&event->head);
			spin_unlock_irqrestore(&kctx->kbdev->job_fault_event_lock, flags);

			wake_up(&kctx->kbdev->job_fault_resume_wq);
			flush_work(&event->job_fault_work);

			/* job_fault_event_list can only have a single atom for
			 * each context.
			 */
			return;
		}
	}
	spin_unlock_irqrestore(&kctx->kbdev->job_fault_event_lock, flags);
}

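/* Return true if no queued job fault event belongs to kctx. */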
static bool kbase_ctx_has_no_event_pending(struct kbase_context *kctx)
{
	struct kbase_device *kbdev = kctx->kbdev;
	struct list_head *event_list = &kctx->kbdev->job_fault_event_list;
	struct base_job_fault_event *event;
	unsigned long flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	if (list_empty(event_list)) {
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		return true;
	}
	list_for_each_entry(event, event_list, head) {
		if (event->katom->kctx == kctx) {
			spin_unlock_irqrestore(&kbdev->job_fault_event_lock,
					flags);
			return false;
		}
	}
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
	return true;
}

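/* Block until a job fault event is queued. On kernels older than 4.15 the
 * wait times out after two seconds and returns -EAGAIN so the caller can
 * retry; otherwise it waits interruptibly without a timeout.
 */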
static int wait_for_job_fault(struct kbase_device *kbdev)
{
#if KERNEL_VERSION(4, 15, 0) > LINUX_VERSION_CODE
	int ret = wait_event_interruptible_timeout(kbdev->job_fault_wq,
			kbase_is_job_fault_event_pending(kbdev),
			msecs_to_jiffies(2000));
	if (ret == 0)
		return -EAGAIN;
	else if (ret > 0)
		return 0;
	else
		return ret;
#else
	return wait_event_interruptible(kbdev->job_fault_wq,
			kbase_is_job_fault_event_pending(kbdev));
#endif
}

/* Wait until a fault happens and copy the event */
static int kbase_job_fault_event_wait(struct kbase_device *kbdev,
		struct base_job_fault_event *event)
{
	struct list_head *event_list = &kbdev->job_fault_event_list;
	struct base_job_fault_event *event_in;
	unsigned long flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	while (list_empty(event_list)) {
		int err;

		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

		err = wait_for_job_fault(kbdev);
		if (err)
			return err;

		spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	}

	event_in = list_entry(event_list->next,
			struct base_job_fault_event, head);
	event->event_code = event_in->event_code;
	event->katom = event_in->katom;

	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	return 0;
}

/* Remove the event from the queue */
static struct base_job_fault_event *kbase_job_fault_event_dequeue(
		struct kbase_device *kbdev, struct list_head *event_list)
{
	struct base_job_fault_event *event;

	event = list_entry(event_list->next,
			struct base_job_fault_event, head);
	list_del(event_list->next);

	return event;
}

/* Remove all atoms that were queued after the failed atom in the same
 * context and call the postponed bottom half of job done for each of them.
 * After that, the context can be rescheduled.
 */
static void kbase_job_fault_resume_event_cleanup(struct kbase_context *kctx)
{
	struct list_head *event_list = &kctx->job_fault_resume_event_list;

	while (!list_empty(event_list)) {
		struct base_job_fault_event *event;

		event = kbase_job_fault_event_dequeue(kctx->kbdev,
				&kctx->job_fault_resume_event_list);
		kbase_jd_done_worker(&event->katom->work);
	}
}

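/* Worker that parks a faulting atom while user space dumps it: wait until the
 * fault has been consumed by the debugfs reader, then run the postponed job
 * done bottom half and release any atoms that were queued behind it.
 */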
static void kbase_job_fault_resume_worker(struct work_struct *data)
{
	struct base_job_fault_event *event = container_of(data,
			struct base_job_fault_event, job_fault_work);
	struct kbase_context *kctx;
	struct kbase_jd_atom *katom;

	katom = event->katom;
	kctx = katom->kctx;

	dev_info(kctx->kbdev->dev, "Job dumping wait\n");

	/* When woken up, check whether the queue is empty or whether the
	 * failed atom belongs to a different context. Either case means the
	 * failed job has been dumped. Note that job_fault_event_list can
	 * never hold two atoms that belong to the same context.
	 */
	wait_event(kctx->kbdev->job_fault_resume_wq,
			kbase_ctx_has_no_event_pending(kctx));

	atomic_set(&kctx->job_fault_count, 0);
	kbase_jd_done_worker(&katom->work);

	/* Atoms that were scheduled during the failed job dump had their
	 * job done worker held back. Rerun it now that the dump has
	 * finished.
	 */
	kbase_job_fault_resume_event_cleanup(kctx);

	dev_info(kctx->kbdev->dev, "Job dumping finish, resume scheduler\n");
}

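/* Record the fault in the atom's fault_event and append it to event_list. */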
static struct base_job_fault_event *kbase_job_fault_event_queue(
		struct list_head *event_list,
		struct kbase_jd_atom *atom,
		u32 completion_code)
{
	struct base_job_fault_event *event;

	event = &atom->fault_event;

	event->katom = atom;
	event->event_code = completion_code;

	list_add_tail(&event->head, event_list);

	return event;
}

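/* Queue a fault event on the device list, wake any debugfs reader waiting in
 * kbase_job_fault_event_wait() and schedule the resume worker for the atom.
 */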
static void kbase_job_fault_event_post(struct kbase_device *kbdev,
		struct kbase_jd_atom *katom, u32 completion_code)
{
	struct base_job_fault_event *event;
	unsigned long flags;

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	event = kbase_job_fault_event_queue(&kbdev->job_fault_event_list,
			katom, completion_code);
	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	wake_up_interruptible(&kbdev->job_fault_wq);

	INIT_WORK(&event->job_fault_work, kbase_job_fault_resume_worker);
	queue_work(kbdev->job_fault_resume_workq, &event->job_fault_work);

	dev_info(katom->kctx->kbdev->dev, "Job fault happen, start dump: %d_%d",
			katom->kctx->tgid, katom->kctx->id);
}

/*
 * This function processes a job fault:
 * take the register snapshot,
 * post the failed job dump event,
 * and queue the work that waits until the job dump has finished.
 */
bool kbase_debug_job_fault_process(struct kbase_jd_atom *katom,
		u32 completion_code)
{
	struct kbase_context *kctx = katom->kctx;

	/* Check whether a dump is already in progress: only one atom per
	 * context can be dumped at a time. An atom that belongs to a
	 * different context can still be dumped.
	 */
	if (atomic_read(&kctx->job_fault_count) > 0) {
		kbase_job_fault_event_queue(
				&kctx->job_fault_resume_event_list,
				katom, completion_code);
		dev_info(kctx->kbdev->dev, "queue:%d\n",
				kbase_jd_atom_id(kctx, katom));
		return true;
	}

	if (kbase_ctx_flag(kctx, KCTX_DYING))
		return false;

	if (atomic_read(&kctx->kbdev->job_fault_debug) > 0) {
		if (completion_code != BASE_JD_EVENT_DONE) {
			if (kbase_job_fault_get_reg_snapshot(kctx) == false) {
				dev_warn(kctx->kbdev->dev, "get reg dump failed\n");
				return false;
			}

			kbase_job_fault_event_post(kctx->kbdev, katom,
					completion_code);
			atomic_inc(&kctx->job_fault_count);
			dev_info(kctx->kbdev->dev, "post:%d\n",
					kbase_jd_atom_id(kctx, katom));
			return true;
		}
	}
	return false;
}

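/* seq_file show(): print the "tgid_id" header on the first call, then up to
 * 50 register address/value pairs per call; return an error once the
 * termination flag is reached so that the read stops and stop() can release
 * the event.
 */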
static int debug_job_fault_show(struct seq_file *m, void *v)
{
	struct kbase_device *kbdev = m->private;
	struct base_job_fault_event *event = (struct base_job_fault_event *)v;
	struct kbase_context *kctx = event->katom->kctx;
	int i;

	dev_info(kbdev->dev, "debug job fault seq show:%d_%d, %d",
			kctx->tgid, kctx->id, event->reg_offset);

	if (kctx->reg_dump == NULL) {
		dev_warn(kbdev->dev, "reg dump is NULL");
		return -1;
	}

	if (kctx->reg_dump[event->reg_offset] ==
			REGISTER_DUMP_TERMINATION_FLAG) {
		/* Return an error here to stop the read, so the following
		 * next() will not be called. stop() can then get the real
		 * event resource and release it.
		 */
		return -1;
	}

	if (event->reg_offset == 0)
		seq_printf(m, "%d_%d\n", kctx->tgid, kctx->id);

	for (i = 0; i < 50; i++) {
		if (kctx->reg_dump[event->reg_offset] ==
				REGISTER_DUMP_TERMINATION_FLAG) {
			break;
		}
		seq_printf(m, "%08x: %08x\n",
				kctx->reg_dump[event->reg_offset],
				kctx->reg_dump[1 + event->reg_offset]);
		event->reg_offset += 2;
	}

	return 0;
}
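
/* seq_file next(): the iterator is not advanced; the same event is returned
 * so that show() keeps printing until it hits the termination flag.
 */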
static void *debug_job_fault_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct kbase_device *kbdev = m->private;
	struct base_job_fault_event *event = (struct base_job_fault_event *)v;

	dev_info(kbdev->dev, "debug job fault seq next:%d, %d",
			event->reg_offset, (int)*pos);

	return event;
}

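/* seq_file start(): on the first read allocate an iterator, wait for a job
 * fault event and clean the GPU cache so the subsequent memory dump is
 * coherent; any later position ends the sequence.
 */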
static void *debug_job_fault_start(struct seq_file *m, loff_t *pos)
{
	struct kbase_device *kbdev = m->private;
	struct base_job_fault_event *event;

	dev_info(kbdev->dev, "fault job seq start:%d", (int)*pos);

	/* The condition is tricky here. We need to make sure either that the
	 * fault hasn't happened and the dump hasn't started, or that the
	 * dump has finished.
	 */
	if (*pos == 0) {
		event = kmalloc(sizeof(*event), GFP_KERNEL);
		if (!event)
			return NULL;
		event->reg_offset = 0;
		if (kbase_job_fault_event_wait(kbdev, event)) {
			kfree(event);
			return NULL;
		}

		/* The cache flush workaround is normally run in the bottom
		 * half of job done, but we delayed it. Clean the cache now,
		 * before the dump, so that the GPU memory dump is correct.
		 */
		kbase_backend_cache_clean(kbdev, event->katom);
	} else
		return NULL;

	return event;
}

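/* seq_file stop(): free the iterator when one is passed in; otherwise
 * dequeue the pending event (if any) and wake the resume waitqueue so the
 * dumped job can complete.
 */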
static void debug_job_fault_stop(struct seq_file *m, void *v)
{
	struct kbase_device *kbdev = m->private;

	/* We wake up the kbase_jd_done_worker after stop. The debug daemon
	 * needs to take the memory dump before the register dump, otherwise
	 * the memory dump may be incorrect.
	 */

	if (v != NULL) {
		kfree(v);
		dev_info(kbdev->dev, "debug job fault seq stop stage 1");
	} else {
		unsigned long flags;

		spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
		if (!list_empty(&kbdev->job_fault_event_list)) {
			kbase_job_fault_event_dequeue(kbdev,
					&kbdev->job_fault_event_list);
			wake_up(&kbdev->job_fault_resume_wq);
		}
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		dev_info(kbdev->dev, "debug job fault seq stop stage 2");
	}
}

static const struct seq_operations ops = {
	.start = debug_job_fault_start,
	.next = debug_job_fault_next,
	.stop = debug_job_fault_stop,
	.show = debug_job_fault_show,
};

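/* Open the debugfs file. Only a single job fault client is allowed at a
 * time; job_fault_debug is used as the busy flag.
 */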
static int debug_job_fault_open(struct inode *in, struct file *file)
{
	struct kbase_device *kbdev = in->i_private;

	if (atomic_cmpxchg(&kbdev->job_fault_debug, 0, 1) == 1) {
		dev_warn(kbdev->dev, "debug job fault is busy, only a single client is allowed");
		return -EBUSY;
	}

	seq_open(file, &ops);

	((struct seq_file *)file->private_data)->private = kbdev;
	dev_info(kbdev->dev, "debug job fault seq open");

	return 0;
}

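/* Release the debugfs file: drop the busy flag, then drain any unprocessed
 * fault events and wake their waiters so suspended contexts can resume.
 */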
static int debug_job_fault_release(struct inode *in, struct file *file)
{
	struct kbase_device *kbdev = in->i_private;
	struct list_head *event_list = &kbdev->job_fault_event_list;
	unsigned long flags;

	seq_release(in, file);

	spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);

	/* Disable job fault dumping. This will let kbase run jobs as normal,
	 * without blocking waiting for a job_fault client to read failed jobs.
	 *
	 * After this a new client may open the file, and may re-enable job
	 * fault dumping, but the job_fault_event_lock we hold here will block
	 * that from interfering until after we've completed the cleanup.
	 */
	atomic_dec(&kbdev->job_fault_debug);

	/* Clean up the unprocessed job faults so that all the suspended
	 * contexts can be rescheduled: remove all the failed atoms, whichever
	 * context they belong to, and resume every context that was suspended
	 * because of a failed job.
	 */
	while (!list_empty(event_list)) {
		kbase_job_fault_event_dequeue(kbdev, event_list);
		spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);
		wake_up(&kbdev->job_fault_resume_wq);
		spin_lock_irqsave(&kbdev->job_fault_event_lock, flags);
	}

	spin_unlock_irqrestore(&kbdev->job_fault_event_lock, flags);

	dev_info(kbdev->dev, "debug job fault seq close");

	return 0;
}

static const struct file_operations kbasep_debug_job_fault_fops = {
	.owner = THIS_MODULE,
	.open = debug_job_fault_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = debug_job_fault_release,
};

/*
 * Initialize debugfs entry for job fault dump
 */
void kbase_debug_job_fault_debugfs_init(struct kbase_device *kbdev)
{
	debugfs_create_file("job_fault", 0400,
			kbdev->mali_debugfs_directory, kbdev,
			&kbasep_debug_job_fault_fops);
}

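/* Set up the per-device job fault dumping state: event list, wait queues,
 * event lock and the resume workqueue.
 */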
int kbase_debug_job_fault_dev_init(struct kbase_device *kbdev)
{
	INIT_LIST_HEAD(&kbdev->job_fault_event_list);

	init_waitqueue_head(&(kbdev->job_fault_wq));
	init_waitqueue_head(&(kbdev->job_fault_resume_wq));
	spin_lock_init(&kbdev->job_fault_event_lock);

	kbdev->job_fault_resume_workq = alloc_workqueue(
			"kbase_job_fault_resume_work_queue", WQ_MEM_RECLAIM, 1);
	if (!kbdev->job_fault_resume_workq)
		return -ENOMEM;

	atomic_set(&kbdev->job_fault_debug, 0);

	return 0;
}

/*
 * Release the relevant resource per device
 */
void kbase_debug_job_fault_dev_term(struct kbase_device *kbdev)
{
	destroy_workqueue(kbdev->job_fault_resume_workq);
}

/*
 * Initialize the relevant data structure per context
 */
int kbase_debug_job_fault_context_init(struct kbase_context *kctx)
{
	/* We need to allocate double the register range size because this
	 * memory keeps both the register addresses and their values.
	 */
	kctx->reg_dump = vmalloc(0x4000 * 2);
	if (kctx->reg_dump != NULL) {
		if (kbase_debug_job_fault_reg_snapshot_init(kctx, 0x4000) ==
				false) {
			vfree(kctx->reg_dump);
			kctx->reg_dump = NULL;
		}
		INIT_LIST_HEAD(&kctx->job_fault_resume_event_list);
		atomic_set(&kctx->job_fault_count, 0);
	}

	return 0;
}

/*
 * Release the relevant resource per context
 */
void kbase_debug_job_fault_context_term(struct kbase_context *kctx)
{
	vfree(kctx->reg_dump);
}

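/* Unblock a dying context that would otherwise remain blocked waiting for
 * its pending job fault to be dumped.
 */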
void kbase_debug_job_fault_kctx_unblock(struct kbase_context *kctx)
{
	WARN_ON(!kbase_ctx_flag(kctx, KCTX_DYING));

	/* Return early if the job fault part of the kbase_device is not
	 * initialized yet. An error can happen during the device probe after
	 * the privileged Kbase context was created for the HW counter dumping
	 * but before the job fault part is initialized.
	 */
	if (!kctx->kbdev->job_fault_resume_workq)
		return;

	kbase_ctx_remove_pending_event(kctx);
}

#else /* CONFIG_DEBUG_FS */

int kbase_debug_job_fault_dev_init(struct kbase_device *kbdev)
{
	return 0;
}

void kbase_debug_job_fault_dev_term(struct kbase_device *kbdev)
{
}

#endif /* CONFIG_DEBUG_FS */