xref: /OK3568_Linux_fs/kernel/drivers/vfio/vfio_iommu_type1.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  *
12  * We arbitrarily define a Type1 IOMMU as one matching the below code.
13  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
14  * VT-d, but that makes it harder to re-use as theoretically anyone
15  * implementing a similar IOMMU could make use of this.  We expect the
16  * IOMMU to support the IOMMU API and have few to no restrictions around
17  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
18  * optimized for relatively static mappings of a userspace process with
19  * userpsace pages pinned into memory.  We also assume devices and IOMMU
20  * domains are PCI based as the IOMMU API is still centered around a
21  * device/bus interface rather than a group interface.
22  */
23 
24 #include <linux/compat.h>
25 #include <linux/device.h>
26 #include <linux/fs.h>
27 #include <linux/highmem.h>
28 #include <linux/iommu.h>
29 #include <linux/module.h>
30 #include <linux/mm.h>
31 #include <linux/kthread.h>
32 #include <linux/rbtree.h>
33 #include <linux/sched/signal.h>
34 #include <linux/sched/mm.h>
35 #include <linux/slab.h>
36 #include <linux/uaccess.h>
37 #include <linux/vfio.h>
38 #include <linux/workqueue.h>
39 #include <linux/mdev.h>
40 #include <linux/notifier.h>
41 #include <linux/dma-iommu.h>
42 #include <linux/irqdomain.h>
43 
44 #define DRIVER_VERSION  "0.2"
45 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
46 #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
47 
48 static bool allow_unsafe_interrupts;
49 module_param_named(allow_unsafe_interrupts,
50 		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
51 MODULE_PARM_DESC(allow_unsafe_interrupts,
52 		 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
53 
54 static bool disable_hugepages;
55 module_param_named(disable_hugepages,
56 		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
57 MODULE_PARM_DESC(disable_hugepages,
58 		 "Disable VFIO IOMMU support for IOMMU hugepages.");
59 
60 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
61 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
62 MODULE_PARM_DESC(dma_entry_limit,
63 		 "Maximum number of user DMA mappings per container (65535).");
64 
65 struct vfio_iommu {
66 	struct list_head	domain_list;
67 	struct list_head	iova_list;
68 	struct vfio_domain	*external_domain; /* domain for external user */
69 	struct mutex		lock;
70 	struct rb_root		dma_list;
71 	struct blocking_notifier_head notifier;
72 	unsigned int		dma_avail;
73 	uint64_t		pgsize_bitmap;
74 	bool			v2;
75 	bool			nesting;
76 	bool			dirty_page_tracking;
77 	bool			pinned_page_dirty_scope;
78 };
79 
80 struct vfio_domain {
81 	struct iommu_domain	*domain;
82 	struct list_head	next;
83 	struct list_head	group_list;
84 	int			prot;		/* IOMMU_CACHE */
85 	bool			fgsp;		/* Fine-grained super pages */
86 };
87 
88 struct vfio_dma {
89 	struct rb_node		node;
90 	dma_addr_t		iova;		/* Device address */
91 	unsigned long		vaddr;		/* Process virtual addr */
92 	size_t			size;		/* Map size (bytes) */
93 	int			prot;		/* IOMMU_READ/WRITE */
94 	bool			iommu_mapped;
95 	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
96 	struct task_struct	*task;
97 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
98 	unsigned long		*bitmap;
99 };
100 
101 struct vfio_batch {
102 	struct page		**pages;	/* for pin_user_pages_remote */
103 	struct page		*fallback_page; /* if pages alloc fails */
104 	int			capacity;	/* length of pages array */
105 };
106 
107 struct vfio_group {
108 	struct iommu_group	*iommu_group;
109 	struct list_head	next;
110 	bool			mdev_group;	/* An mdev group */
111 	bool			pinned_page_dirty_scope;
112 };
113 
114 struct vfio_iova {
115 	struct list_head	list;
116 	dma_addr_t		start;
117 	dma_addr_t		end;
118 };
119 
120 /*
121  * Guest RAM pinning working set or DMA target
122  */
123 struct vfio_pfn {
124 	struct rb_node		node;
125 	dma_addr_t		iova;		/* Device address */
126 	unsigned long		pfn;		/* Host pfn */
127 	unsigned int		ref_count;
128 };
129 
130 struct vfio_regions {
131 	struct list_head list;
132 	dma_addr_t iova;
133 	phys_addr_t phys;
134 	size_t len;
135 };
136 
137 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
138 					(!list_empty(&iommu->domain_list))
139 
140 #define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
141 
142 /*
143  * Input argument of number of bits to bitmap_set() is unsigned integer, which
144  * further casts to signed integer for unaligned multi-bit operation,
145  * __bitmap_set().
146  * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
147  * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
148  * system.
149  */
150 #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
151 #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
152 
153 static int put_pfn(unsigned long pfn, int prot);
154 
155 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
156 					       struct iommu_group *iommu_group);
157 
158 static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
159 /*
160  * This code handles mapping and unmapping of user data buffers
161  * into DMA'ble space using the IOMMU
162  */
163 
vfio_find_dma(struct vfio_iommu * iommu,dma_addr_t start,size_t size)164 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
165 				      dma_addr_t start, size_t size)
166 {
167 	struct rb_node *node = iommu->dma_list.rb_node;
168 
169 	while (node) {
170 		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
171 
172 		if (start + size <= dma->iova)
173 			node = node->rb_left;
174 		else if (start >= dma->iova + dma->size)
175 			node = node->rb_right;
176 		else
177 			return dma;
178 	}
179 
180 	return NULL;
181 }
182 
vfio_link_dma(struct vfio_iommu * iommu,struct vfio_dma * new)183 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
184 {
185 	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
186 	struct vfio_dma *dma;
187 
188 	while (*link) {
189 		parent = *link;
190 		dma = rb_entry(parent, struct vfio_dma, node);
191 
192 		if (new->iova + new->size <= dma->iova)
193 			link = &(*link)->rb_left;
194 		else
195 			link = &(*link)->rb_right;
196 	}
197 
198 	rb_link_node(&new->node, parent, link);
199 	rb_insert_color(&new->node, &iommu->dma_list);
200 }
201 
vfio_unlink_dma(struct vfio_iommu * iommu,struct vfio_dma * old)202 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
203 {
204 	rb_erase(&old->node, &iommu->dma_list);
205 }
206 
207 
vfio_dma_bitmap_alloc(struct vfio_dma * dma,size_t pgsize)208 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
209 {
210 	uint64_t npages = dma->size / pgsize;
211 
212 	if (npages > DIRTY_BITMAP_PAGES_MAX)
213 		return -EINVAL;
214 
215 	/*
216 	 * Allocate extra 64 bits that are used to calculate shift required for
217 	 * bitmap_shift_left() to manipulate and club unaligned number of pages
218 	 * in adjacent vfio_dma ranges.
219 	 */
220 	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
221 			       GFP_KERNEL);
222 	if (!dma->bitmap)
223 		return -ENOMEM;
224 
225 	return 0;
226 }
227 
vfio_dma_bitmap_free(struct vfio_dma * dma)228 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
229 {
230 	kfree(dma->bitmap);
231 	dma->bitmap = NULL;
232 }
233 
vfio_dma_populate_bitmap(struct vfio_dma * dma,size_t pgsize)234 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
235 {
236 	struct rb_node *p;
237 	unsigned long pgshift = __ffs(pgsize);
238 
239 	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
240 		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
241 
242 		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
243 	}
244 }
245 
vfio_iommu_populate_bitmap_full(struct vfio_iommu * iommu)246 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
247 {
248 	struct rb_node *n;
249 	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
250 
251 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
252 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
253 
254 		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
255 	}
256 }
257 
vfio_dma_bitmap_alloc_all(struct vfio_iommu * iommu,size_t pgsize)258 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
259 {
260 	struct rb_node *n;
261 
262 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
263 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
264 		int ret;
265 
266 		ret = vfio_dma_bitmap_alloc(dma, pgsize);
267 		if (ret) {
268 			struct rb_node *p;
269 
270 			for (p = rb_prev(n); p; p = rb_prev(p)) {
271 				struct vfio_dma *dma = rb_entry(n,
272 							struct vfio_dma, node);
273 
274 				vfio_dma_bitmap_free(dma);
275 			}
276 			return ret;
277 		}
278 		vfio_dma_populate_bitmap(dma, pgsize);
279 	}
280 	return 0;
281 }
282 
vfio_dma_bitmap_free_all(struct vfio_iommu * iommu)283 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
284 {
285 	struct rb_node *n;
286 
287 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
288 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
289 
290 		vfio_dma_bitmap_free(dma);
291 	}
292 }
293 
294 /*
295  * Helper Functions for host iova-pfn list
296  */
vfio_find_vpfn(struct vfio_dma * dma,dma_addr_t iova)297 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
298 {
299 	struct vfio_pfn *vpfn;
300 	struct rb_node *node = dma->pfn_list.rb_node;
301 
302 	while (node) {
303 		vpfn = rb_entry(node, struct vfio_pfn, node);
304 
305 		if (iova < vpfn->iova)
306 			node = node->rb_left;
307 		else if (iova > vpfn->iova)
308 			node = node->rb_right;
309 		else
310 			return vpfn;
311 	}
312 	return NULL;
313 }
314 
vfio_link_pfn(struct vfio_dma * dma,struct vfio_pfn * new)315 static void vfio_link_pfn(struct vfio_dma *dma,
316 			  struct vfio_pfn *new)
317 {
318 	struct rb_node **link, *parent = NULL;
319 	struct vfio_pfn *vpfn;
320 
321 	link = &dma->pfn_list.rb_node;
322 	while (*link) {
323 		parent = *link;
324 		vpfn = rb_entry(parent, struct vfio_pfn, node);
325 
326 		if (new->iova < vpfn->iova)
327 			link = &(*link)->rb_left;
328 		else
329 			link = &(*link)->rb_right;
330 	}
331 
332 	rb_link_node(&new->node, parent, link);
333 	rb_insert_color(&new->node, &dma->pfn_list);
334 }
335 
vfio_unlink_pfn(struct vfio_dma * dma,struct vfio_pfn * old)336 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
337 {
338 	rb_erase(&old->node, &dma->pfn_list);
339 }
340 
vfio_add_to_pfn_list(struct vfio_dma * dma,dma_addr_t iova,unsigned long pfn)341 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
342 				unsigned long pfn)
343 {
344 	struct vfio_pfn *vpfn;
345 
346 	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
347 	if (!vpfn)
348 		return -ENOMEM;
349 
350 	vpfn->iova = iova;
351 	vpfn->pfn = pfn;
352 	vpfn->ref_count = 1;
353 	vfio_link_pfn(dma, vpfn);
354 	return 0;
355 }
356 
vfio_remove_from_pfn_list(struct vfio_dma * dma,struct vfio_pfn * vpfn)357 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
358 				      struct vfio_pfn *vpfn)
359 {
360 	vfio_unlink_pfn(dma, vpfn);
361 	kfree(vpfn);
362 }
363 
vfio_iova_get_vfio_pfn(struct vfio_dma * dma,unsigned long iova)364 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
365 					       unsigned long iova)
366 {
367 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
368 
369 	if (vpfn)
370 		vpfn->ref_count++;
371 	return vpfn;
372 }
373 
vfio_iova_put_vfio_pfn(struct vfio_dma * dma,struct vfio_pfn * vpfn)374 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
375 {
376 	int ret = 0;
377 
378 	vpfn->ref_count--;
379 	if (!vpfn->ref_count) {
380 		ret = put_pfn(vpfn->pfn, dma->prot);
381 		vfio_remove_from_pfn_list(dma, vpfn);
382 	}
383 	return ret;
384 }
385 
vfio_lock_acct(struct vfio_dma * dma,long npage,bool async)386 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
387 {
388 	struct mm_struct *mm;
389 	int ret;
390 
391 	if (!npage)
392 		return 0;
393 
394 	mm = async ? get_task_mm(dma->task) : dma->task->mm;
395 	if (!mm)
396 		return -ESRCH; /* process exited */
397 
398 	ret = mmap_write_lock_killable(mm);
399 	if (!ret) {
400 		ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
401 					  dma->lock_cap);
402 		mmap_write_unlock(mm);
403 	}
404 
405 	if (async)
406 		mmput(mm);
407 
408 	return ret;
409 }
410 
411 /*
412  * Some mappings aren't backed by a struct page, for example an mmap'd
413  * MMIO range for our own or another device.  These use a different
414  * pfn conversion and shouldn't be tracked as locked pages.
415  * For compound pages, any driver that sets the reserved bit in head
416  * page needs to set the reserved bit in all subpages to be safe.
417  */
is_invalid_reserved_pfn(unsigned long pfn)418 static bool is_invalid_reserved_pfn(unsigned long pfn)
419 {
420 	if (pfn_valid(pfn))
421 		return PageReserved(pfn_to_page(pfn));
422 
423 	return true;
424 }
425 
put_pfn(unsigned long pfn,int prot)426 static int put_pfn(unsigned long pfn, int prot)
427 {
428 	if (!is_invalid_reserved_pfn(pfn)) {
429 		struct page *page = pfn_to_page(pfn);
430 
431 		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
432 		return 1;
433 	}
434 	return 0;
435 }
436 
437 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
438 
vfio_batch_init(struct vfio_batch * batch)439 static void vfio_batch_init(struct vfio_batch *batch)
440 {
441 	if (unlikely(disable_hugepages))
442 		goto fallback;
443 
444 	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
445 	if (!batch->pages)
446 		goto fallback;
447 
448 	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
449 	return;
450 
451 fallback:
452 	batch->pages = &batch->fallback_page;
453 	batch->capacity = 1;
454 }
455 
vfio_batch_fini(struct vfio_batch * batch)456 static void vfio_batch_fini(struct vfio_batch *batch)
457 {
458 	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
459 		free_page((unsigned long)batch->pages);
460 }
461 
follow_fault_pfn(struct vm_area_struct * vma,struct mm_struct * mm,unsigned long vaddr,unsigned long * pfn,bool write_fault)462 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
463 			    unsigned long vaddr, unsigned long *pfn,
464 			    bool write_fault)
465 {
466 	pte_t *ptep;
467 	spinlock_t *ptl;
468 	int ret;
469 
470 	ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
471 	if (ret) {
472 		bool unlocked = false;
473 
474 		ret = fixup_user_fault(mm, vaddr,
475 				       FAULT_FLAG_REMOTE |
476 				       (write_fault ?  FAULT_FLAG_WRITE : 0),
477 				       &unlocked);
478 		if (unlocked)
479 			return -EAGAIN;
480 
481 		if (ret)
482 			return ret;
483 
484 		ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
485 		if (ret)
486 			return ret;
487 	}
488 
489 	if (write_fault && !pte_write(*ptep))
490 		ret = -EFAULT;
491 	else
492 		*pfn = pte_pfn(*ptep);
493 
494 	pte_unmap_unlock(ptep, ptl);
495 	return ret;
496 }
497 
498 /*
499  * Returns the positive number of pfns successfully obtained or a negative
500  * error code.
501  */
vaddr_get_pfns(struct mm_struct * mm,unsigned long vaddr,long npages,int prot,unsigned long * pfn,struct page ** pages)502 static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
503 			  long npages, int prot, unsigned long *pfn,
504 			  struct page **pages)
505 {
506 	struct vm_area_struct *vma;
507 	unsigned int flags = 0;
508 	int ret;
509 
510 	if (prot & IOMMU_WRITE)
511 		flags |= FOLL_WRITE;
512 
513 	mmap_read_lock(mm);
514 	ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
515 				    pages, NULL, NULL);
516 	if (ret > 0) {
517 		int i;
518 
519 		/*
520 		 * The zero page is always resident, we don't need to pin it
521 		 * and it falls into our invalid/reserved test so we don't
522 		 * unpin in put_pfn().  Unpin all zero pages in the batch here.
523 		 */
524 		for (i = 0 ; i < ret; i++) {
525 			if (unlikely(is_zero_pfn(page_to_pfn(pages[i]))))
526 				unpin_user_page(pages[i]);
527 		}
528 
529 		*pfn = page_to_pfn(pages[0]);
530 		goto done;
531 	}
532 
533 	vaddr = untagged_addr(vaddr);
534 
535 retry:
536 	vma = find_vma_intersection(mm, vaddr, vaddr + 1);
537 
538 	if (vma && vma->vm_flags & VM_PFNMAP) {
539 		ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
540 		if (ret == -EAGAIN)
541 			goto retry;
542 
543 		if (!ret) {
544 			if (is_invalid_reserved_pfn(*pfn))
545 				ret = 1;
546 			else
547 				ret = -EFAULT;
548 		}
549 	}
550 done:
551 	mmap_read_unlock(mm);
552 	return ret;
553 }
554 
555 /*
556  * Attempt to pin pages.  We really don't want to track all the pfns and
557  * the iommu can only map chunks of consecutive pfns anyway, so get the
558  * first page and all consecutive pages with the same locking.
559  */
vfio_pin_pages_remote(struct vfio_dma * dma,unsigned long vaddr,long npage,unsigned long * pfn_base,unsigned long limit,struct vfio_batch * batch)560 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
561 				  long npage, unsigned long *pfn_base,
562 				  unsigned long limit, struct vfio_batch *batch)
563 {
564 	unsigned long pfn = 0;
565 	long ret, pinned = 0, lock_acct = 0;
566 	bool rsvd;
567 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
568 
569 	/* This code path is only user initiated */
570 	if (!current->mm)
571 		return -ENODEV;
572 
573 	ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, pfn_base,
574 			     batch->pages);
575 	if (ret < 0)
576 		return ret;
577 
578 	pinned++;
579 	rsvd = is_invalid_reserved_pfn(*pfn_base);
580 
581 	/*
582 	 * Reserved pages aren't counted against the user, externally pinned
583 	 * pages are already counted against the user.
584 	 */
585 	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
586 		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
587 			put_pfn(*pfn_base, dma->prot);
588 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
589 					limit << PAGE_SHIFT);
590 			return -ENOMEM;
591 		}
592 		lock_acct++;
593 	}
594 
595 	if (unlikely(disable_hugepages))
596 		goto out;
597 
598 	/* Lock all the consecutive pages from pfn_base */
599 	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
600 	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
601 		ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, &pfn,
602 				     batch->pages);
603 		if (ret < 0)
604 			break;
605 
606 		if (pfn != *pfn_base + pinned ||
607 		    rsvd != is_invalid_reserved_pfn(pfn)) {
608 			put_pfn(pfn, dma->prot);
609 			break;
610 		}
611 
612 		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
613 			if (!dma->lock_cap &&
614 			    current->mm->locked_vm + lock_acct + 1 > limit) {
615 				put_pfn(pfn, dma->prot);
616 				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
617 					__func__, limit << PAGE_SHIFT);
618 				ret = -ENOMEM;
619 				goto unpin_out;
620 			}
621 			lock_acct++;
622 		}
623 	}
624 
625 out:
626 	ret = vfio_lock_acct(dma, lock_acct, false);
627 
628 unpin_out:
629 	if (ret < 0) {
630 		if (!rsvd) {
631 			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
632 				put_pfn(pfn, dma->prot);
633 		}
634 
635 		return ret;
636 	}
637 
638 	return pinned;
639 }
640 
vfio_unpin_pages_remote(struct vfio_dma * dma,dma_addr_t iova,unsigned long pfn,long npage,bool do_accounting)641 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
642 				    unsigned long pfn, long npage,
643 				    bool do_accounting)
644 {
645 	long unlocked = 0, locked = 0;
646 	long i;
647 
648 	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
649 		if (put_pfn(pfn++, dma->prot)) {
650 			unlocked++;
651 			if (vfio_find_vpfn(dma, iova))
652 				locked++;
653 		}
654 	}
655 
656 	if (do_accounting)
657 		vfio_lock_acct(dma, locked - unlocked, true);
658 
659 	return unlocked;
660 }
661 
vfio_pin_page_external(struct vfio_dma * dma,unsigned long vaddr,unsigned long * pfn_base,bool do_accounting)662 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
663 				  unsigned long *pfn_base, bool do_accounting)
664 {
665 	struct page *pages[1];
666 	struct mm_struct *mm;
667 	int ret;
668 
669 	mm = get_task_mm(dma->task);
670 	if (!mm)
671 		return -ENODEV;
672 
673 	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
674 	if (ret != 1)
675 		goto out;
676 
677 	ret = 0;
678 
679 	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
680 		ret = vfio_lock_acct(dma, 1, true);
681 		if (ret) {
682 			put_pfn(*pfn_base, dma->prot);
683 			if (ret == -ENOMEM)
684 				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
685 					"(%ld) exceeded\n", __func__,
686 					dma->task->comm, task_pid_nr(dma->task),
687 					task_rlimit(dma->task, RLIMIT_MEMLOCK));
688 		}
689 	}
690 
691 out:
692 	mmput(mm);
693 	return ret;
694 }
695 
vfio_unpin_page_external(struct vfio_dma * dma,dma_addr_t iova,bool do_accounting)696 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
697 				    bool do_accounting)
698 {
699 	int unlocked;
700 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
701 
702 	if (!vpfn)
703 		return 0;
704 
705 	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
706 
707 	if (do_accounting)
708 		vfio_lock_acct(dma, -unlocked, true);
709 
710 	return unlocked;
711 }
712 
vfio_iommu_type1_pin_pages(void * iommu_data,struct iommu_group * iommu_group,unsigned long * user_pfn,int npage,int prot,unsigned long * phys_pfn)713 static int vfio_iommu_type1_pin_pages(void *iommu_data,
714 				      struct iommu_group *iommu_group,
715 				      unsigned long *user_pfn,
716 				      int npage, int prot,
717 				      unsigned long *phys_pfn)
718 {
719 	struct vfio_iommu *iommu = iommu_data;
720 	struct vfio_group *group;
721 	int i, j, ret;
722 	unsigned long remote_vaddr;
723 	struct vfio_dma *dma;
724 	bool do_accounting;
725 
726 	if (!iommu || !user_pfn || !phys_pfn)
727 		return -EINVAL;
728 
729 	/* Supported for v2 version only */
730 	if (!iommu->v2)
731 		return -EACCES;
732 
733 	mutex_lock(&iommu->lock);
734 
735 	/* Fail if notifier list is empty */
736 	if (!iommu->notifier.head) {
737 		ret = -EINVAL;
738 		goto pin_done;
739 	}
740 
741 	/*
742 	 * If iommu capable domain exist in the container then all pages are
743 	 * already pinned and accounted. Accouting should be done if there is no
744 	 * iommu capable domain in the container.
745 	 */
746 	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
747 
748 	for (i = 0; i < npage; i++) {
749 		dma_addr_t iova;
750 		struct vfio_pfn *vpfn;
751 
752 		iova = user_pfn[i] << PAGE_SHIFT;
753 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
754 		if (!dma) {
755 			ret = -EINVAL;
756 			goto pin_unwind;
757 		}
758 
759 		if ((dma->prot & prot) != prot) {
760 			ret = -EPERM;
761 			goto pin_unwind;
762 		}
763 
764 		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
765 		if (vpfn) {
766 			phys_pfn[i] = vpfn->pfn;
767 			continue;
768 		}
769 
770 		remote_vaddr = dma->vaddr + (iova - dma->iova);
771 		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
772 					     do_accounting);
773 		if (ret)
774 			goto pin_unwind;
775 
776 		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
777 		if (ret) {
778 			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
779 				vfio_lock_acct(dma, -1, true);
780 			goto pin_unwind;
781 		}
782 
783 		if (iommu->dirty_page_tracking) {
784 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
785 
786 			/*
787 			 * Bitmap populated with the smallest supported page
788 			 * size
789 			 */
790 			bitmap_set(dma->bitmap,
791 				   (iova - dma->iova) >> pgshift, 1);
792 		}
793 	}
794 	ret = i;
795 
796 	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
797 	if (!group->pinned_page_dirty_scope) {
798 		group->pinned_page_dirty_scope = true;
799 		update_pinned_page_dirty_scope(iommu);
800 	}
801 
802 	goto pin_done;
803 
804 pin_unwind:
805 	phys_pfn[i] = 0;
806 	for (j = 0; j < i; j++) {
807 		dma_addr_t iova;
808 
809 		iova = user_pfn[j] << PAGE_SHIFT;
810 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
811 		vfio_unpin_page_external(dma, iova, do_accounting);
812 		phys_pfn[j] = 0;
813 	}
814 pin_done:
815 	mutex_unlock(&iommu->lock);
816 	return ret;
817 }
818 
vfio_iommu_type1_unpin_pages(void * iommu_data,unsigned long * user_pfn,int npage)819 static int vfio_iommu_type1_unpin_pages(void *iommu_data,
820 					unsigned long *user_pfn,
821 					int npage)
822 {
823 	struct vfio_iommu *iommu = iommu_data;
824 	bool do_accounting;
825 	int i;
826 
827 	if (!iommu || !user_pfn)
828 		return -EINVAL;
829 
830 	/* Supported for v2 version only */
831 	if (!iommu->v2)
832 		return -EACCES;
833 
834 	mutex_lock(&iommu->lock);
835 
836 	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
837 	for (i = 0; i < npage; i++) {
838 		struct vfio_dma *dma;
839 		dma_addr_t iova;
840 
841 		iova = user_pfn[i] << PAGE_SHIFT;
842 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
843 		if (!dma)
844 			goto unpin_exit;
845 		vfio_unpin_page_external(dma, iova, do_accounting);
846 	}
847 
848 unpin_exit:
849 	mutex_unlock(&iommu->lock);
850 	return i > npage ? npage : (i > 0 ? i : -EINVAL);
851 }
852 
vfio_sync_unpin(struct vfio_dma * dma,struct vfio_domain * domain,struct list_head * regions,struct iommu_iotlb_gather * iotlb_gather)853 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
854 			    struct list_head *regions,
855 			    struct iommu_iotlb_gather *iotlb_gather)
856 {
857 	long unlocked = 0;
858 	struct vfio_regions *entry, *next;
859 
860 	iommu_iotlb_sync(domain->domain, iotlb_gather);
861 
862 	list_for_each_entry_safe(entry, next, regions, list) {
863 		unlocked += vfio_unpin_pages_remote(dma,
864 						    entry->iova,
865 						    entry->phys >> PAGE_SHIFT,
866 						    entry->len >> PAGE_SHIFT,
867 						    false);
868 		list_del(&entry->list);
869 		kfree(entry);
870 	}
871 
872 	cond_resched();
873 
874 	return unlocked;
875 }
876 
877 /*
878  * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
879  * Therefore, when using IOTLB flush sync interface, VFIO need to keep track
880  * of these regions (currently using a list).
881  *
882  * This value specifies maximum number of regions for each IOTLB flush sync.
883  */
884 #define VFIO_IOMMU_TLB_SYNC_MAX		512
885 
unmap_unpin_fast(struct vfio_domain * domain,struct vfio_dma * dma,dma_addr_t * iova,size_t len,phys_addr_t phys,long * unlocked,struct list_head * unmapped_list,int * unmapped_cnt,struct iommu_iotlb_gather * iotlb_gather)886 static size_t unmap_unpin_fast(struct vfio_domain *domain,
887 			       struct vfio_dma *dma, dma_addr_t *iova,
888 			       size_t len, phys_addr_t phys, long *unlocked,
889 			       struct list_head *unmapped_list,
890 			       int *unmapped_cnt,
891 			       struct iommu_iotlb_gather *iotlb_gather)
892 {
893 	size_t unmapped = 0;
894 	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
895 
896 	if (entry) {
897 		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
898 					    iotlb_gather);
899 
900 		if (!unmapped) {
901 			kfree(entry);
902 		} else {
903 			entry->iova = *iova;
904 			entry->phys = phys;
905 			entry->len  = unmapped;
906 			list_add_tail(&entry->list, unmapped_list);
907 
908 			*iova += unmapped;
909 			(*unmapped_cnt)++;
910 		}
911 	}
912 
913 	/*
914 	 * Sync if the number of fast-unmap regions hits the limit
915 	 * or in case of errors.
916 	 */
917 	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
918 		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
919 					     iotlb_gather);
920 		*unmapped_cnt = 0;
921 	}
922 
923 	return unmapped;
924 }
925 
unmap_unpin_slow(struct vfio_domain * domain,struct vfio_dma * dma,dma_addr_t * iova,size_t len,phys_addr_t phys,long * unlocked)926 static size_t unmap_unpin_slow(struct vfio_domain *domain,
927 			       struct vfio_dma *dma, dma_addr_t *iova,
928 			       size_t len, phys_addr_t phys,
929 			       long *unlocked)
930 {
931 	size_t unmapped = iommu_unmap(domain->domain, *iova, len);
932 
933 	if (unmapped) {
934 		*unlocked += vfio_unpin_pages_remote(dma, *iova,
935 						     phys >> PAGE_SHIFT,
936 						     unmapped >> PAGE_SHIFT,
937 						     false);
938 		*iova += unmapped;
939 		cond_resched();
940 	}
941 	return unmapped;
942 }
943 
vfio_unmap_unpin(struct vfio_iommu * iommu,struct vfio_dma * dma,bool do_accounting)944 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
945 			     bool do_accounting)
946 {
947 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
948 	struct vfio_domain *domain, *d;
949 	LIST_HEAD(unmapped_region_list);
950 	struct iommu_iotlb_gather iotlb_gather;
951 	int unmapped_region_cnt = 0;
952 	long unlocked = 0;
953 
954 	if (!dma->size)
955 		return 0;
956 
957 	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
958 		return 0;
959 
960 	/*
961 	 * We use the IOMMU to track the physical addresses, otherwise we'd
962 	 * need a much more complicated tracking system.  Unfortunately that
963 	 * means we need to use one of the iommu domains to figure out the
964 	 * pfns to unpin.  The rest need to be unmapped in advance so we have
965 	 * no iommu translations remaining when the pages are unpinned.
966 	 */
967 	domain = d = list_first_entry(&iommu->domain_list,
968 				      struct vfio_domain, next);
969 
970 	list_for_each_entry_continue(d, &iommu->domain_list, next) {
971 		iommu_unmap(d->domain, dma->iova, dma->size);
972 		cond_resched();
973 	}
974 
975 	iommu_iotlb_gather_init(&iotlb_gather);
976 	while (iova < end) {
977 		size_t unmapped, len;
978 		phys_addr_t phys, next;
979 
980 		phys = iommu_iova_to_phys(domain->domain, iova);
981 		if (WARN_ON(!phys)) {
982 			iova += PAGE_SIZE;
983 			continue;
984 		}
985 
986 		/*
987 		 * To optimize for fewer iommu_unmap() calls, each of which
988 		 * may require hardware cache flushing, try to find the
989 		 * largest contiguous physical memory chunk to unmap.
990 		 */
991 		for (len = PAGE_SIZE;
992 		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
993 			next = iommu_iova_to_phys(domain->domain, iova + len);
994 			if (next != phys + len)
995 				break;
996 		}
997 
998 		/*
999 		 * First, try to use fast unmap/unpin. In case of failure,
1000 		 * switch to slow unmap/unpin path.
1001 		 */
1002 		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
1003 					    &unlocked, &unmapped_region_list,
1004 					    &unmapped_region_cnt,
1005 					    &iotlb_gather);
1006 		if (!unmapped) {
1007 			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
1008 						    phys, &unlocked);
1009 			if (WARN_ON(!unmapped))
1010 				break;
1011 		}
1012 	}
1013 
1014 	dma->iommu_mapped = false;
1015 
1016 	if (unmapped_region_cnt) {
1017 		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
1018 					    &iotlb_gather);
1019 	}
1020 
1021 	if (do_accounting) {
1022 		vfio_lock_acct(dma, -unlocked, true);
1023 		return 0;
1024 	}
1025 	return unlocked;
1026 }
1027 
vfio_remove_dma(struct vfio_iommu * iommu,struct vfio_dma * dma)1028 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
1029 {
1030 	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
1031 	vfio_unmap_unpin(iommu, dma, true);
1032 	vfio_unlink_dma(iommu, dma);
1033 	put_task_struct(dma->task);
1034 	vfio_dma_bitmap_free(dma);
1035 	kfree(dma);
1036 	iommu->dma_avail++;
1037 }
1038 
vfio_update_pgsize_bitmap(struct vfio_iommu * iommu)1039 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1040 {
1041 	struct vfio_domain *domain;
1042 
1043 	iommu->pgsize_bitmap = ULONG_MAX;
1044 
1045 	list_for_each_entry(domain, &iommu->domain_list, next)
1046 		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1047 
1048 	/*
1049 	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
1050 	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
1051 	 * That way the user will be able to map/unmap buffers whose size/
1052 	 * start address is aligned with PAGE_SIZE. Pinning code uses that
1053 	 * granularity while iommu driver can use the sub-PAGE_SIZE size
1054 	 * to map the buffer.
1055 	 */
1056 	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1057 		iommu->pgsize_bitmap &= PAGE_MASK;
1058 		iommu->pgsize_bitmap |= PAGE_SIZE;
1059 	}
1060 }
1061 
update_user_bitmap(u64 __user * bitmap,struct vfio_iommu * iommu,struct vfio_dma * dma,dma_addr_t base_iova,size_t pgsize)1062 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1063 			      struct vfio_dma *dma, dma_addr_t base_iova,
1064 			      size_t pgsize)
1065 {
1066 	unsigned long pgshift = __ffs(pgsize);
1067 	unsigned long nbits = dma->size >> pgshift;
1068 	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
1069 	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
1070 	unsigned long shift = bit_offset % BITS_PER_LONG;
1071 	unsigned long leftover;
1072 
1073 	/*
1074 	 * mark all pages dirty if any IOMMU capable device is not able
1075 	 * to report dirty pages and all pages are pinned and mapped.
1076 	 */
1077 	if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped)
1078 		bitmap_set(dma->bitmap, 0, nbits);
1079 
1080 	if (shift) {
1081 		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
1082 				  nbits + shift);
1083 
1084 		if (copy_from_user(&leftover,
1085 				   (void __user *)(bitmap + copy_offset),
1086 				   sizeof(leftover)))
1087 			return -EFAULT;
1088 
1089 		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1090 	}
1091 
1092 	if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1093 			 DIRTY_BITMAP_BYTES(nbits + shift)))
1094 		return -EFAULT;
1095 
1096 	return 0;
1097 }
1098 
vfio_iova_dirty_bitmap(u64 __user * bitmap,struct vfio_iommu * iommu,dma_addr_t iova,size_t size,size_t pgsize)1099 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1100 				  dma_addr_t iova, size_t size, size_t pgsize)
1101 {
1102 	struct vfio_dma *dma;
1103 	struct rb_node *n;
1104 	unsigned long pgshift = __ffs(pgsize);
1105 	int ret;
1106 
1107 	/*
1108 	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
1109 	 * vfio_dma mappings may be clubbed by specifying large ranges, but
1110 	 * there must not be any previous mappings bisected by the range.
1111 	 * An error will be returned if these conditions are not met.
1112 	 */
1113 	dma = vfio_find_dma(iommu, iova, 1);
1114 	if (dma && dma->iova != iova)
1115 		return -EINVAL;
1116 
1117 	dma = vfio_find_dma(iommu, iova + size - 1, 0);
1118 	if (dma && dma->iova + dma->size != iova + size)
1119 		return -EINVAL;
1120 
1121 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1122 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1123 
1124 		if (dma->iova < iova)
1125 			continue;
1126 
1127 		if (dma->iova > iova + size - 1)
1128 			break;
1129 
1130 		ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1131 		if (ret)
1132 			return ret;
1133 
1134 		/*
1135 		 * Re-populate bitmap to include all pinned pages which are
1136 		 * considered as dirty but exclude pages which are unpinned and
1137 		 * pages which are marked dirty by vfio_dma_rw()
1138 		 */
1139 		bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1140 		vfio_dma_populate_bitmap(dma, pgsize);
1141 	}
1142 	return 0;
1143 }
1144 
verify_bitmap_size(uint64_t npages,uint64_t bitmap_size)1145 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1146 {
1147 	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1148 	    (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1149 		return -EINVAL;
1150 
1151 	return 0;
1152 }
1153 
vfio_dma_do_unmap(struct vfio_iommu * iommu,struct vfio_iommu_type1_dma_unmap * unmap,struct vfio_bitmap * bitmap)1154 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1155 			     struct vfio_iommu_type1_dma_unmap *unmap,
1156 			     struct vfio_bitmap *bitmap)
1157 {
1158 	struct vfio_dma *dma, *dma_last = NULL;
1159 	size_t unmapped = 0, pgsize;
1160 	int ret = 0, retries = 0;
1161 	unsigned long pgshift;
1162 
1163 	mutex_lock(&iommu->lock);
1164 
1165 	pgshift = __ffs(iommu->pgsize_bitmap);
1166 	pgsize = (size_t)1 << pgshift;
1167 
1168 	if (unmap->iova & (pgsize - 1)) {
1169 		ret = -EINVAL;
1170 		goto unlock;
1171 	}
1172 
1173 	if (!unmap->size || unmap->size & (pgsize - 1)) {
1174 		ret = -EINVAL;
1175 		goto unlock;
1176 	}
1177 
1178 	if (unmap->iova + unmap->size - 1 < unmap->iova ||
1179 	    unmap->size > SIZE_MAX) {
1180 		ret = -EINVAL;
1181 		goto unlock;
1182 	}
1183 
1184 	/* When dirty tracking is enabled, allow only min supported pgsize */
1185 	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1186 	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1187 		ret = -EINVAL;
1188 		goto unlock;
1189 	}
1190 
1191 	WARN_ON((pgsize - 1) & PAGE_MASK);
1192 again:
1193 	/*
1194 	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
1195 	 * avoid tracking individual mappings.  This means that the granularity
1196 	 * of the original mapping was lost and the user was allowed to attempt
1197 	 * to unmap any range.  Depending on the contiguousness of physical
1198 	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
1199 	 * or may not have worked.  We only guaranteed unmap granularity
1200 	 * matching the original mapping; even though it was untracked here,
1201 	 * the original mappings are reflected in IOMMU mappings.  This
1202 	 * resulted in a couple unusual behaviors.  First, if a range is not
1203 	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
1204 	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1205 	 * a zero sized unmap.  Also, if an unmap request overlaps the first
1206 	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
1207 	 * This also returns success and the returned unmap size reflects the
1208 	 * actual size unmapped.
1209 	 *
1210 	 * We attempt to maintain compatibility with this "v1" interface, but
1211 	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
1212 	 * request offset from the beginning of the original mapping will
1213 	 * return success with zero sized unmap.  And an unmap request covering
1214 	 * the first iova of mapping will unmap the entire range.
1215 	 *
1216 	 * The v2 version of this interface intends to be more deterministic.
1217 	 * Unmap requests must fully cover previous mappings.  Multiple
1218 	 * mappings may still be unmaped by specifying large ranges, but there
1219 	 * must not be any previous mappings bisected by the range.  An error
1220 	 * will be returned if these conditions are not met.  The v2 interface
1221 	 * will only return success and a size of zero if there were no
1222 	 * mappings within the range.
1223 	 */
1224 	if (iommu->v2) {
1225 		dma = vfio_find_dma(iommu, unmap->iova, 1);
1226 		if (dma && dma->iova != unmap->iova) {
1227 			ret = -EINVAL;
1228 			goto unlock;
1229 		}
1230 		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
1231 		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
1232 			ret = -EINVAL;
1233 			goto unlock;
1234 		}
1235 	}
1236 
1237 	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
1238 		if (!iommu->v2 && unmap->iova > dma->iova)
1239 			break;
1240 		/*
1241 		 * Task with same address space who mapped this iova range is
1242 		 * allowed to unmap the iova range.
1243 		 */
1244 		if (dma->task->mm != current->mm)
1245 			break;
1246 
1247 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1248 			struct vfio_iommu_type1_dma_unmap nb_unmap;
1249 
1250 			if (dma_last == dma) {
1251 				BUG_ON(++retries > 10);
1252 			} else {
1253 				dma_last = dma;
1254 				retries = 0;
1255 			}
1256 
1257 			nb_unmap.iova = dma->iova;
1258 			nb_unmap.size = dma->size;
1259 
1260 			/*
1261 			 * Notify anyone (mdev vendor drivers) to invalidate and
1262 			 * unmap iovas within the range we're about to unmap.
1263 			 * Vendor drivers MUST unpin pages in response to an
1264 			 * invalidation.
1265 			 */
1266 			mutex_unlock(&iommu->lock);
1267 			blocking_notifier_call_chain(&iommu->notifier,
1268 						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
1269 						    &nb_unmap);
1270 			mutex_lock(&iommu->lock);
1271 			goto again;
1272 		}
1273 
1274 		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1275 			ret = update_user_bitmap(bitmap->data, iommu, dma,
1276 						 unmap->iova, pgsize);
1277 			if (ret)
1278 				break;
1279 		}
1280 
1281 		unmapped += dma->size;
1282 		vfio_remove_dma(iommu, dma);
1283 	}
1284 
1285 unlock:
1286 	mutex_unlock(&iommu->lock);
1287 
1288 	/* Report how much was unmapped */
1289 	unmap->size = unmapped;
1290 
1291 	return ret;
1292 }
1293 
vfio_iommu_map(struct vfio_iommu * iommu,dma_addr_t iova,unsigned long pfn,long npage,int prot)1294 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1295 			  unsigned long pfn, long npage, int prot)
1296 {
1297 	struct vfio_domain *d;
1298 	int ret;
1299 
1300 	list_for_each_entry(d, &iommu->domain_list, next) {
1301 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1302 				npage << PAGE_SHIFT, prot | d->prot);
1303 		if (ret)
1304 			goto unwind;
1305 
1306 		cond_resched();
1307 	}
1308 
1309 	return 0;
1310 
1311 unwind:
1312 	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1313 		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1314 		cond_resched();
1315 	}
1316 
1317 	return ret;
1318 }
1319 
vfio_pin_map_dma(struct vfio_iommu * iommu,struct vfio_dma * dma,size_t map_size)1320 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1321 			    size_t map_size)
1322 {
1323 	dma_addr_t iova = dma->iova;
1324 	unsigned long vaddr = dma->vaddr;
1325 	struct vfio_batch batch;
1326 	size_t size = map_size;
1327 	long npage;
1328 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1329 	int ret = 0;
1330 
1331 	vfio_batch_init(&batch);
1332 
1333 	while (size) {
1334 		/* Pin a contiguous chunk of memory */
1335 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1336 					      size >> PAGE_SHIFT, &pfn, limit,
1337 					      &batch);
1338 		if (npage <= 0) {
1339 			WARN_ON(!npage);
1340 			ret = (int)npage;
1341 			break;
1342 		}
1343 
1344 		/* Map it! */
1345 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1346 				     dma->prot);
1347 		if (ret) {
1348 			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1349 						npage, true);
1350 			break;
1351 		}
1352 
1353 		size -= npage << PAGE_SHIFT;
1354 		dma->size += npage << PAGE_SHIFT;
1355 	}
1356 
1357 	vfio_batch_fini(&batch);
1358 	dma->iommu_mapped = true;
1359 
1360 	if (ret)
1361 		vfio_remove_dma(iommu, dma);
1362 
1363 	return ret;
1364 }
1365 
1366 /*
1367  * Check dma map request is within a valid iova range
1368  */
vfio_iommu_iova_dma_valid(struct vfio_iommu * iommu,dma_addr_t start,dma_addr_t end)1369 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1370 				      dma_addr_t start, dma_addr_t end)
1371 {
1372 	struct list_head *iova = &iommu->iova_list;
1373 	struct vfio_iova *node;
1374 
1375 	list_for_each_entry(node, iova, list) {
1376 		if (start >= node->start && end <= node->end)
1377 			return true;
1378 	}
1379 
1380 	/*
1381 	 * Check for list_empty() as well since a container with
1382 	 * a single mdev device will have an empty list.
1383 	 */
1384 	return list_empty(iova);
1385 }
1386 
vfio_dma_do_map(struct vfio_iommu * iommu,struct vfio_iommu_type1_dma_map * map)1387 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1388 			   struct vfio_iommu_type1_dma_map *map)
1389 {
1390 	dma_addr_t iova = map->iova;
1391 	unsigned long vaddr = map->vaddr;
1392 	size_t size = map->size;
1393 	int ret = 0, prot = 0;
1394 	size_t pgsize;
1395 	struct vfio_dma *dma;
1396 
1397 	/* Verify that none of our __u64 fields overflow */
1398 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1399 		return -EINVAL;
1400 
1401 	/* READ/WRITE from device perspective */
1402 	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1403 		prot |= IOMMU_WRITE;
1404 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1405 		prot |= IOMMU_READ;
1406 
1407 	mutex_lock(&iommu->lock);
1408 
1409 	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1410 
1411 	WARN_ON((pgsize - 1) & PAGE_MASK);
1412 
1413 	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) {
1414 		ret = -EINVAL;
1415 		goto out_unlock;
1416 	}
1417 
1418 	/* Don't allow IOVA or virtual address wrap */
1419 	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1420 		ret = -EINVAL;
1421 		goto out_unlock;
1422 	}
1423 
1424 	if (vfio_find_dma(iommu, iova, size)) {
1425 		ret = -EEXIST;
1426 		goto out_unlock;
1427 	}
1428 
1429 	if (!iommu->dma_avail) {
1430 		ret = -ENOSPC;
1431 		goto out_unlock;
1432 	}
1433 
1434 	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1435 		ret = -EINVAL;
1436 		goto out_unlock;
1437 	}
1438 
1439 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1440 	if (!dma) {
1441 		ret = -ENOMEM;
1442 		goto out_unlock;
1443 	}
1444 
1445 	iommu->dma_avail--;
1446 	dma->iova = iova;
1447 	dma->vaddr = vaddr;
1448 	dma->prot = prot;
1449 
1450 	/*
1451 	 * We need to be able to both add to a task's locked memory and test
1452 	 * against the locked memory limit and we need to be able to do both
1453 	 * outside of this call path as pinning can be asynchronous via the
1454 	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
1455 	 * task_struct and VM locked pages requires an mm_struct, however
1456 	 * holding an indefinite mm reference is not recommended, therefore we
1457 	 * only hold a reference to a task.  We could hold a reference to
1458 	 * current, however QEMU uses this call path through vCPU threads,
1459 	 * which can be killed resulting in a NULL mm and failure in the unmap
1460 	 * path when called via a different thread.  Avoid this problem by
1461 	 * using the group_leader as threads within the same group require
1462 	 * both CLONE_THREAD and CLONE_VM and will therefore use the same
1463 	 * mm_struct.
1464 	 *
1465 	 * Previously we also used the task for testing CAP_IPC_LOCK at the
1466 	 * time of pinning and accounting, however has_capability() makes use
1467 	 * of real_cred, a copy-on-write field, so we can't guarantee that it
1468 	 * matches group_leader, or in fact that it might not change by the
1469 	 * time it's evaluated.  If a process were to call MAP_DMA with
1470 	 * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
1471 	 * possibly see different results for an iommu_mapped vfio_dma vs
1472 	 * externally mapped.  Therefore track CAP_IPC_LOCK in vfio_dma at the
1473 	 * time of calling MAP_DMA.
1474 	 */
1475 	get_task_struct(current->group_leader);
1476 	dma->task = current->group_leader;
1477 	dma->lock_cap = capable(CAP_IPC_LOCK);
1478 
1479 	dma->pfn_list = RB_ROOT;
1480 
1481 	/* Insert zero-sized and grow as we map chunks of it */
1482 	vfio_link_dma(iommu, dma);
1483 
1484 	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
1485 	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1486 		dma->size = size;
1487 	else
1488 		ret = vfio_pin_map_dma(iommu, dma, size);
1489 
1490 	if (!ret && iommu->dirty_page_tracking) {
1491 		ret = vfio_dma_bitmap_alloc(dma, pgsize);
1492 		if (ret)
1493 			vfio_remove_dma(iommu, dma);
1494 	}
1495 
1496 out_unlock:
1497 	mutex_unlock(&iommu->lock);
1498 	return ret;
1499 }
1500 
vfio_bus_type(struct device * dev,void * data)1501 static int vfio_bus_type(struct device *dev, void *data)
1502 {
1503 	struct bus_type **bus = data;
1504 
1505 	if (*bus && *bus != dev->bus)
1506 		return -EINVAL;
1507 
1508 	*bus = dev->bus;
1509 
1510 	return 0;
1511 }
1512 
vfio_iommu_replay(struct vfio_iommu * iommu,struct vfio_domain * domain)1513 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1514 			     struct vfio_domain *domain)
1515 {
1516 	struct vfio_batch batch;
1517 	struct vfio_domain *d = NULL;
1518 	struct rb_node *n;
1519 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1520 	int ret;
1521 
1522 	/* Arbitrarily pick the first domain in the list for lookups */
1523 	if (!list_empty(&iommu->domain_list))
1524 		d = list_first_entry(&iommu->domain_list,
1525 				     struct vfio_domain, next);
1526 
1527 	vfio_batch_init(&batch);
1528 
1529 	n = rb_first(&iommu->dma_list);
1530 
1531 	for (; n; n = rb_next(n)) {
1532 		struct vfio_dma *dma;
1533 		dma_addr_t iova;
1534 
1535 		dma = rb_entry(n, struct vfio_dma, node);
1536 		iova = dma->iova;
1537 
1538 		while (iova < dma->iova + dma->size) {
1539 			phys_addr_t phys;
1540 			size_t size;
1541 
1542 			if (dma->iommu_mapped) {
1543 				phys_addr_t p;
1544 				dma_addr_t i;
1545 
1546 				if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1547 					ret = -EINVAL;
1548 					goto unwind;
1549 				}
1550 
1551 				phys = iommu_iova_to_phys(d->domain, iova);
1552 
1553 				if (WARN_ON(!phys)) {
1554 					iova += PAGE_SIZE;
1555 					continue;
1556 				}
1557 
1558 				size = PAGE_SIZE;
1559 				p = phys + size;
1560 				i = iova + size;
1561 				while (i < dma->iova + dma->size &&
1562 				       p == iommu_iova_to_phys(d->domain, i)) {
1563 					size += PAGE_SIZE;
1564 					p += PAGE_SIZE;
1565 					i += PAGE_SIZE;
1566 				}
1567 			} else {
1568 				unsigned long pfn;
1569 				unsigned long vaddr = dma->vaddr +
1570 						     (iova - dma->iova);
1571 				size_t n = dma->iova + dma->size - iova;
1572 				long npage;
1573 
1574 				npage = vfio_pin_pages_remote(dma, vaddr,
1575 							      n >> PAGE_SHIFT,
1576 							      &pfn, limit,
1577 							      &batch);
1578 				if (npage <= 0) {
1579 					WARN_ON(!npage);
1580 					ret = (int)npage;
1581 					goto unwind;
1582 				}
1583 
1584 				phys = pfn << PAGE_SHIFT;
1585 				size = npage << PAGE_SHIFT;
1586 			}
1587 
1588 			ret = iommu_map(domain->domain, iova, phys,
1589 					size, dma->prot | domain->prot);
1590 			if (ret) {
1591 				if (!dma->iommu_mapped)
1592 					vfio_unpin_pages_remote(dma, iova,
1593 							phys >> PAGE_SHIFT,
1594 							size >> PAGE_SHIFT,
1595 							true);
1596 				goto unwind;
1597 			}
1598 
1599 			iova += size;
1600 		}
1601 	}
1602 
1603 	/* All dmas are now mapped, defer to second tree walk for unwind */
1604 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1605 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1606 
1607 		dma->iommu_mapped = true;
1608 	}
1609 
1610 	vfio_batch_fini(&batch);
1611 	return 0;
1612 
1613 unwind:
1614 	for (; n; n = rb_prev(n)) {
1615 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1616 		dma_addr_t iova;
1617 
1618 		if (dma->iommu_mapped) {
1619 			iommu_unmap(domain->domain, dma->iova, dma->size);
1620 			continue;
1621 		}
1622 
1623 		iova = dma->iova;
1624 		while (iova < dma->iova + dma->size) {
1625 			phys_addr_t phys, p;
1626 			size_t size;
1627 			dma_addr_t i;
1628 
1629 			phys = iommu_iova_to_phys(domain->domain, iova);
1630 			if (!phys) {
1631 				iova += PAGE_SIZE;
1632 				continue;
1633 			}
1634 
1635 			size = PAGE_SIZE;
1636 			p = phys + size;
1637 			i = iova + size;
1638 			while (i < dma->iova + dma->size &&
1639 			       p == iommu_iova_to_phys(domain->domain, i)) {
1640 				size += PAGE_SIZE;
1641 				p += PAGE_SIZE;
1642 				i += PAGE_SIZE;
1643 			}
1644 
1645 			iommu_unmap(domain->domain, iova, size);
1646 			vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1647 						size >> PAGE_SHIFT, true);
1648 		}
1649 	}
1650 
1651 	vfio_batch_fini(&batch);
1652 	return ret;
1653 }
1654 
1655 /*
1656  * We change our unmap behavior slightly depending on whether the IOMMU
1657  * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
1658  * for practically any contiguous power-of-two mapping we give it.  This means
1659  * we don't need to look for contiguous chunks ourselves to make unmapping
1660  * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
1661  * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1662  * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1663  * hugetlbfs is in use.
1664  */
vfio_test_domain_fgsp(struct vfio_domain * domain)1665 static void vfio_test_domain_fgsp(struct vfio_domain *domain)
1666 {
1667 	struct page *pages;
1668 	int ret, order = get_order(PAGE_SIZE * 2);
1669 
1670 	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1671 	if (!pages)
1672 		return;
1673 
1674 	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
1675 			IOMMU_READ | IOMMU_WRITE | domain->prot);
1676 	if (!ret) {
1677 		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
1678 
1679 		if (unmapped == PAGE_SIZE)
1680 			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
1681 		else
1682 			domain->fgsp = true;
1683 	}
1684 
1685 	__free_pages(pages, order);
1686 }
1687 
find_iommu_group(struct vfio_domain * domain,struct iommu_group * iommu_group)1688 static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
1689 					   struct iommu_group *iommu_group)
1690 {
1691 	struct vfio_group *g;
1692 
1693 	list_for_each_entry(g, &domain->group_list, next) {
1694 		if (g->iommu_group == iommu_group)
1695 			return g;
1696 	}
1697 
1698 	return NULL;
1699 }
1700 
vfio_iommu_find_iommu_group(struct vfio_iommu * iommu,struct iommu_group * iommu_group)1701 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1702 					       struct iommu_group *iommu_group)
1703 {
1704 	struct vfio_domain *domain;
1705 	struct vfio_group *group = NULL;
1706 
1707 	list_for_each_entry(domain, &iommu->domain_list, next) {
1708 		group = find_iommu_group(domain, iommu_group);
1709 		if (group)
1710 			return group;
1711 	}
1712 
1713 	if (iommu->external_domain)
1714 		group = find_iommu_group(iommu->external_domain, iommu_group);
1715 
1716 	return group;
1717 }
1718 
update_pinned_page_dirty_scope(struct vfio_iommu * iommu)1719 static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
1720 {
1721 	struct vfio_domain *domain;
1722 	struct vfio_group *group;
1723 
1724 	list_for_each_entry(domain, &iommu->domain_list, next) {
1725 		list_for_each_entry(group, &domain->group_list, next) {
1726 			if (!group->pinned_page_dirty_scope) {
1727 				iommu->pinned_page_dirty_scope = false;
1728 				return;
1729 			}
1730 		}
1731 	}
1732 
1733 	if (iommu->external_domain) {
1734 		domain = iommu->external_domain;
1735 		list_for_each_entry(group, &domain->group_list, next) {
1736 			if (!group->pinned_page_dirty_scope) {
1737 				iommu->pinned_page_dirty_scope = false;
1738 				return;
1739 			}
1740 		}
1741 	}
1742 
1743 	iommu->pinned_page_dirty_scope = true;
1744 }
1745 
vfio_iommu_has_sw_msi(struct list_head * group_resv_regions,phys_addr_t * base)1746 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1747 				  phys_addr_t *base)
1748 {
1749 	struct iommu_resv_region *region;
1750 	bool ret = false;
1751 
1752 	list_for_each_entry(region, group_resv_regions, list) {
1753 		/*
1754 		 * The presence of any 'real' MSI regions should take
1755 		 * precedence over the software-managed one if the
1756 		 * IOMMU driver happens to advertise both types.
1757 		 */
1758 		if (region->type == IOMMU_RESV_MSI) {
1759 			ret = false;
1760 			break;
1761 		}
1762 
1763 		if (region->type == IOMMU_RESV_SW_MSI) {
1764 			*base = region->start;
1765 			ret = true;
1766 		}
1767 	}
1768 
1769 	return ret;
1770 }
1771 
vfio_mdev_get_iommu_device(struct device * dev)1772 static struct device *vfio_mdev_get_iommu_device(struct device *dev)
1773 {
1774 	struct device *(*fn)(struct device *dev);
1775 	struct device *iommu_device;
1776 
1777 	fn = symbol_get(mdev_get_iommu_device);
1778 	if (fn) {
1779 		iommu_device = fn(dev);
1780 		symbol_put(mdev_get_iommu_device);
1781 
1782 		return iommu_device;
1783 	}
1784 
1785 	return NULL;
1786 }
1787 
vfio_mdev_attach_domain(struct device * dev,void * data)1788 static int vfio_mdev_attach_domain(struct device *dev, void *data)
1789 {
1790 	struct iommu_domain *domain = data;
1791 	struct device *iommu_device;
1792 
1793 	iommu_device = vfio_mdev_get_iommu_device(dev);
1794 	if (iommu_device) {
1795 		if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1796 			return iommu_aux_attach_device(domain, iommu_device);
1797 		else
1798 			return iommu_attach_device(domain, iommu_device);
1799 	}
1800 
1801 	return -EINVAL;
1802 }
1803 
vfio_mdev_detach_domain(struct device * dev,void * data)1804 static int vfio_mdev_detach_domain(struct device *dev, void *data)
1805 {
1806 	struct iommu_domain *domain = data;
1807 	struct device *iommu_device;
1808 
1809 	iommu_device = vfio_mdev_get_iommu_device(dev);
1810 	if (iommu_device) {
1811 		if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1812 			iommu_aux_detach_device(domain, iommu_device);
1813 		else
1814 			iommu_detach_device(domain, iommu_device);
1815 	}
1816 
1817 	return 0;
1818 }
1819 
vfio_iommu_attach_group(struct vfio_domain * domain,struct vfio_group * group)1820 static int vfio_iommu_attach_group(struct vfio_domain *domain,
1821 				   struct vfio_group *group)
1822 {
1823 	if (group->mdev_group)
1824 		return iommu_group_for_each_dev(group->iommu_group,
1825 						domain->domain,
1826 						vfio_mdev_attach_domain);
1827 	else
1828 		return iommu_attach_group(domain->domain, group->iommu_group);
1829 }
1830 
vfio_iommu_detach_group(struct vfio_domain * domain,struct vfio_group * group)1831 static void vfio_iommu_detach_group(struct vfio_domain *domain,
1832 				    struct vfio_group *group)
1833 {
1834 	if (group->mdev_group)
1835 		iommu_group_for_each_dev(group->iommu_group, domain->domain,
1836 					 vfio_mdev_detach_domain);
1837 	else
1838 		iommu_detach_group(domain->domain, group->iommu_group);
1839 }
1840 
vfio_bus_is_mdev(struct bus_type * bus)1841 static bool vfio_bus_is_mdev(struct bus_type *bus)
1842 {
1843 	struct bus_type *mdev_bus;
1844 	bool ret = false;
1845 
1846 	mdev_bus = symbol_get(mdev_bus_type);
1847 	if (mdev_bus) {
1848 		ret = (bus == mdev_bus);
1849 		symbol_put(mdev_bus_type);
1850 	}
1851 
1852 	return ret;
1853 }
1854 
vfio_mdev_iommu_device(struct device * dev,void * data)1855 static int vfio_mdev_iommu_device(struct device *dev, void *data)
1856 {
1857 	struct device **old = data, *new;
1858 
1859 	new = vfio_mdev_get_iommu_device(dev);
1860 	if (!new || (*old && *old != new))
1861 		return -EINVAL;
1862 
1863 	*old = new;
1864 
1865 	return 0;
1866 }
1867 
1868 /*
1869  * This is a helper function to insert an address range to iova list.
1870  * The list is initially created with a single entry corresponding to
1871  * the IOMMU domain geometry to which the device group is attached.
1872  * The list aperture gets modified when a new domain is added to the
1873  * container if the new aperture doesn't conflict with the current one
1874  * or with any existing dma mappings. The list is also modified to
1875  * exclude any reserved regions associated with the device group.
1876  */
vfio_iommu_iova_insert(struct list_head * head,dma_addr_t start,dma_addr_t end)1877 static int vfio_iommu_iova_insert(struct list_head *head,
1878 				  dma_addr_t start, dma_addr_t end)
1879 {
1880 	struct vfio_iova *region;
1881 
1882 	region = kmalloc(sizeof(*region), GFP_KERNEL);
1883 	if (!region)
1884 		return -ENOMEM;
1885 
1886 	INIT_LIST_HEAD(&region->list);
1887 	region->start = start;
1888 	region->end = end;
1889 
1890 	list_add_tail(&region->list, head);
1891 	return 0;
1892 }
1893 
1894 /*
1895  * Check the new iommu aperture conflicts with existing aper or with any
1896  * existing dma mappings.
1897  */
vfio_iommu_aper_conflict(struct vfio_iommu * iommu,dma_addr_t start,dma_addr_t end)1898 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1899 				     dma_addr_t start, dma_addr_t end)
1900 {
1901 	struct vfio_iova *first, *last;
1902 	struct list_head *iova = &iommu->iova_list;
1903 
1904 	if (list_empty(iova))
1905 		return false;
1906 
1907 	/* Disjoint sets, return conflict */
1908 	first = list_first_entry(iova, struct vfio_iova, list);
1909 	last = list_last_entry(iova, struct vfio_iova, list);
1910 	if (start > last->end || end < first->start)
1911 		return true;
1912 
1913 	/* Check for any existing dma mappings below the new start */
1914 	if (start > first->start) {
1915 		if (vfio_find_dma(iommu, first->start, start - first->start))
1916 			return true;
1917 	}
1918 
1919 	/* Check for any existing dma mappings beyond the new end */
1920 	if (end < last->end) {
1921 		if (vfio_find_dma(iommu, end + 1, last->end - end))
1922 			return true;
1923 	}
1924 
1925 	return false;
1926 }
1927 
1928 /*
1929  * Resize iommu iova aperture window. This is called only if the new
1930  * aperture has no conflict with existing aperture and dma mappings.
1931  */
vfio_iommu_aper_resize(struct list_head * iova,dma_addr_t start,dma_addr_t end)1932 static int vfio_iommu_aper_resize(struct list_head *iova,
1933 				  dma_addr_t start, dma_addr_t end)
1934 {
1935 	struct vfio_iova *node, *next;
1936 
1937 	if (list_empty(iova))
1938 		return vfio_iommu_iova_insert(iova, start, end);
1939 
1940 	/* Adjust iova list start */
1941 	list_for_each_entry_safe(node, next, iova, list) {
1942 		if (start < node->start)
1943 			break;
1944 		if (start >= node->start && start < node->end) {
1945 			node->start = start;
1946 			break;
1947 		}
1948 		/* Delete nodes before new start */
1949 		list_del(&node->list);
1950 		kfree(node);
1951 	}
1952 
1953 	/* Adjust iova list end */
1954 	list_for_each_entry_safe(node, next, iova, list) {
1955 		if (end > node->end)
1956 			continue;
1957 		if (end > node->start && end <= node->end) {
1958 			node->end = end;
1959 			continue;
1960 		}
1961 		/* Delete nodes after new end */
1962 		list_del(&node->list);
1963 		kfree(node);
1964 	}
1965 
1966 	return 0;
1967 }
1968 
1969 /*
1970  * Check reserved region conflicts with existing dma mappings
1971  */
vfio_iommu_resv_conflict(struct vfio_iommu * iommu,struct list_head * resv_regions)1972 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
1973 				     struct list_head *resv_regions)
1974 {
1975 	struct iommu_resv_region *region;
1976 
1977 	/* Check for conflict with existing dma mappings */
1978 	list_for_each_entry(region, resv_regions, list) {
1979 		if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
1980 			continue;
1981 
1982 		if (vfio_find_dma(iommu, region->start, region->length))
1983 			return true;
1984 	}
1985 
1986 	return false;
1987 }
1988 
1989 /*
1990  * Check iova region overlap with  reserved regions and
1991  * exclude them from the iommu iova range
1992  */
vfio_iommu_resv_exclude(struct list_head * iova,struct list_head * resv_regions)1993 static int vfio_iommu_resv_exclude(struct list_head *iova,
1994 				   struct list_head *resv_regions)
1995 {
1996 	struct iommu_resv_region *resv;
1997 	struct vfio_iova *n, *next;
1998 
1999 	list_for_each_entry(resv, resv_regions, list) {
2000 		phys_addr_t start, end;
2001 
2002 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2003 			continue;
2004 
2005 		start = resv->start;
2006 		end = resv->start + resv->length - 1;
2007 
2008 		list_for_each_entry_safe(n, next, iova, list) {
2009 			int ret = 0;
2010 
2011 			/* No overlap */
2012 			if (start > n->end || end < n->start)
2013 				continue;
2014 			/*
2015 			 * Insert a new node if current node overlaps with the
2016 			 * reserve region to exlude that from valid iova range.
2017 			 * Note that, new node is inserted before the current
2018 			 * node and finally the current node is deleted keeping
2019 			 * the list updated and sorted.
2020 			 */
2021 			if (start > n->start)
2022 				ret = vfio_iommu_iova_insert(&n->list, n->start,
2023 							     start - 1);
2024 			if (!ret && end < n->end)
2025 				ret = vfio_iommu_iova_insert(&n->list, end + 1,
2026 							     n->end);
2027 			if (ret)
2028 				return ret;
2029 
2030 			list_del(&n->list);
2031 			kfree(n);
2032 		}
2033 	}
2034 
2035 	if (list_empty(iova))
2036 		return -EINVAL;
2037 
2038 	return 0;
2039 }
2040 
vfio_iommu_resv_free(struct list_head * resv_regions)2041 static void vfio_iommu_resv_free(struct list_head *resv_regions)
2042 {
2043 	struct iommu_resv_region *n, *next;
2044 
2045 	list_for_each_entry_safe(n, next, resv_regions, list) {
2046 		list_del(&n->list);
2047 		kfree(n);
2048 	}
2049 }
2050 
vfio_iommu_iova_free(struct list_head * iova)2051 static void vfio_iommu_iova_free(struct list_head *iova)
2052 {
2053 	struct vfio_iova *n, *next;
2054 
2055 	list_for_each_entry_safe(n, next, iova, list) {
2056 		list_del(&n->list);
2057 		kfree(n);
2058 	}
2059 }
2060 
vfio_iommu_iova_get_copy(struct vfio_iommu * iommu,struct list_head * iova_copy)2061 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2062 				    struct list_head *iova_copy)
2063 {
2064 	struct list_head *iova = &iommu->iova_list;
2065 	struct vfio_iova *n;
2066 	int ret;
2067 
2068 	list_for_each_entry(n, iova, list) {
2069 		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2070 		if (ret)
2071 			goto out_free;
2072 	}
2073 
2074 	return 0;
2075 
2076 out_free:
2077 	vfio_iommu_iova_free(iova_copy);
2078 	return ret;
2079 }
2080 
vfio_iommu_iova_insert_copy(struct vfio_iommu * iommu,struct list_head * iova_copy)2081 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2082 					struct list_head *iova_copy)
2083 {
2084 	struct list_head *iova = &iommu->iova_list;
2085 
2086 	vfio_iommu_iova_free(iova);
2087 
2088 	list_splice_tail(iova_copy, iova);
2089 }
2090 
vfio_iommu_type1_attach_group(void * iommu_data,struct iommu_group * iommu_group)2091 static int vfio_iommu_type1_attach_group(void *iommu_data,
2092 					 struct iommu_group *iommu_group)
2093 {
2094 	struct vfio_iommu *iommu = iommu_data;
2095 	struct vfio_group *group;
2096 	struct vfio_domain *domain, *d;
2097 	struct bus_type *bus = NULL;
2098 	int ret;
2099 	bool resv_msi, msi_remap;
2100 	phys_addr_t resv_msi_base = 0;
2101 	struct iommu_domain_geometry geo;
2102 	LIST_HEAD(iova_copy);
2103 	LIST_HEAD(group_resv_regions);
2104 
2105 	mutex_lock(&iommu->lock);
2106 
2107 	/* Check for duplicates */
2108 	if (vfio_iommu_find_iommu_group(iommu, iommu_group)) {
2109 		mutex_unlock(&iommu->lock);
2110 		return -EINVAL;
2111 	}
2112 
2113 	group = kzalloc(sizeof(*group), GFP_KERNEL);
2114 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2115 	if (!group || !domain) {
2116 		ret = -ENOMEM;
2117 		goto out_free;
2118 	}
2119 
2120 	group->iommu_group = iommu_group;
2121 
2122 	/* Determine bus_type in order to allocate a domain */
2123 	ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
2124 	if (ret)
2125 		goto out_free;
2126 
2127 	if (vfio_bus_is_mdev(bus)) {
2128 		struct device *iommu_device = NULL;
2129 
2130 		group->mdev_group = true;
2131 
2132 		/* Determine the isolation type */
2133 		ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
2134 					       vfio_mdev_iommu_device);
2135 		if (ret || !iommu_device) {
2136 			if (!iommu->external_domain) {
2137 				INIT_LIST_HEAD(&domain->group_list);
2138 				iommu->external_domain = domain;
2139 				vfio_update_pgsize_bitmap(iommu);
2140 			} else {
2141 				kfree(domain);
2142 			}
2143 
2144 			list_add(&group->next,
2145 				 &iommu->external_domain->group_list);
2146 			/*
2147 			 * Non-iommu backed group cannot dirty memory directly,
2148 			 * it can only use interfaces that provide dirty
2149 			 * tracking.
2150 			 * The iommu scope can only be promoted with the
2151 			 * addition of a dirty tracking group.
2152 			 */
2153 			group->pinned_page_dirty_scope = true;
2154 			if (!iommu->pinned_page_dirty_scope)
2155 				update_pinned_page_dirty_scope(iommu);
2156 			mutex_unlock(&iommu->lock);
2157 
2158 			return 0;
2159 		}
2160 
2161 		bus = iommu_device->bus;
2162 	}
2163 
2164 	domain->domain = iommu_domain_alloc(bus);
2165 	if (!domain->domain) {
2166 		ret = -EIO;
2167 		goto out_free;
2168 	}
2169 
2170 	if (iommu->nesting) {
2171 		int attr = 1;
2172 
2173 		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
2174 					    &attr);
2175 		if (ret)
2176 			goto out_domain;
2177 	}
2178 
2179 	ret = vfio_iommu_attach_group(domain, group);
2180 	if (ret)
2181 		goto out_domain;
2182 
2183 	/* Get aperture info */
2184 	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
2185 
2186 	if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
2187 				     geo.aperture_end)) {
2188 		ret = -EINVAL;
2189 		goto out_detach;
2190 	}
2191 
2192 	ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2193 	if (ret)
2194 		goto out_detach;
2195 
2196 	if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2197 		ret = -EINVAL;
2198 		goto out_detach;
2199 	}
2200 
2201 	/*
2202 	 * We don't want to work on the original iova list as the list
2203 	 * gets modified and in case of failure we have to retain the
2204 	 * original list. Get a copy here.
2205 	 */
2206 	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2207 	if (ret)
2208 		goto out_detach;
2209 
2210 	ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
2211 				     geo.aperture_end);
2212 	if (ret)
2213 		goto out_detach;
2214 
2215 	ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2216 	if (ret)
2217 		goto out_detach;
2218 
2219 	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2220 
2221 	INIT_LIST_HEAD(&domain->group_list);
2222 	list_add(&group->next, &domain->group_list);
2223 
2224 	msi_remap = irq_domain_check_msi_remap() ||
2225 		    iommu_capable(bus, IOMMU_CAP_INTR_REMAP);
2226 
2227 	if (!allow_unsafe_interrupts && !msi_remap) {
2228 		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2229 		       __func__);
2230 		ret = -EPERM;
2231 		goto out_detach;
2232 	}
2233 
2234 	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
2235 		domain->prot |= IOMMU_CACHE;
2236 
2237 	/*
2238 	 * Try to match an existing compatible domain.  We don't want to
2239 	 * preclude an IOMMU driver supporting multiple bus_types and being
2240 	 * able to include different bus_types in the same IOMMU domain, so
2241 	 * we test whether the domains use the same iommu_ops rather than
2242 	 * testing if they're on the same bus_type.
2243 	 */
2244 	list_for_each_entry(d, &iommu->domain_list, next) {
2245 		if (d->domain->ops == domain->domain->ops &&
2246 		    d->prot == domain->prot) {
2247 			vfio_iommu_detach_group(domain, group);
2248 			if (!vfio_iommu_attach_group(d, group)) {
2249 				list_add(&group->next, &d->group_list);
2250 				iommu_domain_free(domain->domain);
2251 				kfree(domain);
2252 				goto done;
2253 			}
2254 
2255 			ret = vfio_iommu_attach_group(domain, group);
2256 			if (ret)
2257 				goto out_domain;
2258 		}
2259 	}
2260 
2261 	vfio_test_domain_fgsp(domain);
2262 
2263 	/* replay mappings on new domains */
2264 	ret = vfio_iommu_replay(iommu, domain);
2265 	if (ret)
2266 		goto out_detach;
2267 
2268 	if (resv_msi) {
2269 		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2270 		if (ret && ret != -ENODEV)
2271 			goto out_detach;
2272 	}
2273 
2274 	list_add(&domain->next, &iommu->domain_list);
2275 	vfio_update_pgsize_bitmap(iommu);
2276 done:
2277 	/* Delete the old one and insert new iova list */
2278 	vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2279 
2280 	/*
2281 	 * An iommu backed group can dirty memory directly and therefore
2282 	 * demotes the iommu scope until it declares itself dirty tracking
2283 	 * capable via the page pinning interface.
2284 	 */
2285 	iommu->pinned_page_dirty_scope = false;
2286 	mutex_unlock(&iommu->lock);
2287 	vfio_iommu_resv_free(&group_resv_regions);
2288 
2289 	return 0;
2290 
2291 out_detach:
2292 	vfio_iommu_detach_group(domain, group);
2293 out_domain:
2294 	iommu_domain_free(domain->domain);
2295 	vfio_iommu_iova_free(&iova_copy);
2296 	vfio_iommu_resv_free(&group_resv_regions);
2297 out_free:
2298 	kfree(domain);
2299 	kfree(group);
2300 	mutex_unlock(&iommu->lock);
2301 	return ret;
2302 }
2303 
vfio_iommu_unmap_unpin_all(struct vfio_iommu * iommu)2304 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2305 {
2306 	struct rb_node *node;
2307 
2308 	while ((node = rb_first(&iommu->dma_list)))
2309 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2310 }
2311 
vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu * iommu)2312 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2313 {
2314 	struct rb_node *n, *p;
2315 
2316 	n = rb_first(&iommu->dma_list);
2317 	for (; n; n = rb_next(n)) {
2318 		struct vfio_dma *dma;
2319 		long locked = 0, unlocked = 0;
2320 
2321 		dma = rb_entry(n, struct vfio_dma, node);
2322 		unlocked += vfio_unmap_unpin(iommu, dma, false);
2323 		p = rb_first(&dma->pfn_list);
2324 		for (; p; p = rb_next(p)) {
2325 			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2326 							 node);
2327 
2328 			if (!is_invalid_reserved_pfn(vpfn->pfn))
2329 				locked++;
2330 		}
2331 		vfio_lock_acct(dma, locked - unlocked, true);
2332 	}
2333 }
2334 
2335 /*
2336  * Called when a domain is removed in detach. It is possible that
2337  * the removed domain decided the iova aperture window. Modify the
2338  * iova aperture with the smallest window among existing domains.
2339  */
vfio_iommu_aper_expand(struct vfio_iommu * iommu,struct list_head * iova_copy)2340 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2341 				   struct list_head *iova_copy)
2342 {
2343 	struct vfio_domain *domain;
2344 	struct iommu_domain_geometry geo;
2345 	struct vfio_iova *node;
2346 	dma_addr_t start = 0;
2347 	dma_addr_t end = (dma_addr_t)~0;
2348 
2349 	if (list_empty(iova_copy))
2350 		return;
2351 
2352 	list_for_each_entry(domain, &iommu->domain_list, next) {
2353 		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
2354 				      &geo);
2355 		if (geo.aperture_start > start)
2356 			start = geo.aperture_start;
2357 		if (geo.aperture_end < end)
2358 			end = geo.aperture_end;
2359 	}
2360 
2361 	/* Modify aperture limits. The new aper is either same or bigger */
2362 	node = list_first_entry(iova_copy, struct vfio_iova, list);
2363 	node->start = start;
2364 	node = list_last_entry(iova_copy, struct vfio_iova, list);
2365 	node->end = end;
2366 }
2367 
2368 /*
2369  * Called when a group is detached. The reserved regions for that
2370  * group can be part of valid iova now. But since reserved regions
2371  * may be duplicated among groups, populate the iova valid regions
2372  * list again.
2373  */
vfio_iommu_resv_refresh(struct vfio_iommu * iommu,struct list_head * iova_copy)2374 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2375 				   struct list_head *iova_copy)
2376 {
2377 	struct vfio_domain *d;
2378 	struct vfio_group *g;
2379 	struct vfio_iova *node;
2380 	dma_addr_t start, end;
2381 	LIST_HEAD(resv_regions);
2382 	int ret;
2383 
2384 	if (list_empty(iova_copy))
2385 		return -EINVAL;
2386 
2387 	list_for_each_entry(d, &iommu->domain_list, next) {
2388 		list_for_each_entry(g, &d->group_list, next) {
2389 			ret = iommu_get_group_resv_regions(g->iommu_group,
2390 							   &resv_regions);
2391 			if (ret)
2392 				goto done;
2393 		}
2394 	}
2395 
2396 	node = list_first_entry(iova_copy, struct vfio_iova, list);
2397 	start = node->start;
2398 	node = list_last_entry(iova_copy, struct vfio_iova, list);
2399 	end = node->end;
2400 
2401 	/* purge the iova list and create new one */
2402 	vfio_iommu_iova_free(iova_copy);
2403 
2404 	ret = vfio_iommu_aper_resize(iova_copy, start, end);
2405 	if (ret)
2406 		goto done;
2407 
2408 	/* Exclude current reserved regions from iova ranges */
2409 	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2410 done:
2411 	vfio_iommu_resv_free(&resv_regions);
2412 	return ret;
2413 }
2414 
vfio_iommu_type1_detach_group(void * iommu_data,struct iommu_group * iommu_group)2415 static void vfio_iommu_type1_detach_group(void *iommu_data,
2416 					  struct iommu_group *iommu_group)
2417 {
2418 	struct vfio_iommu *iommu = iommu_data;
2419 	struct vfio_domain *domain;
2420 	struct vfio_group *group;
2421 	bool update_dirty_scope = false;
2422 	LIST_HEAD(iova_copy);
2423 
2424 	mutex_lock(&iommu->lock);
2425 
2426 	if (iommu->external_domain) {
2427 		group = find_iommu_group(iommu->external_domain, iommu_group);
2428 		if (group) {
2429 			update_dirty_scope = !group->pinned_page_dirty_scope;
2430 			list_del(&group->next);
2431 			kfree(group);
2432 
2433 			if (list_empty(&iommu->external_domain->group_list)) {
2434 				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
2435 					WARN_ON(iommu->notifier.head);
2436 					vfio_iommu_unmap_unpin_all(iommu);
2437 				}
2438 
2439 				kfree(iommu->external_domain);
2440 				iommu->external_domain = NULL;
2441 			}
2442 			goto detach_group_done;
2443 		}
2444 	}
2445 
2446 	/*
2447 	 * Get a copy of iova list. This will be used to update
2448 	 * and to replace the current one later. Please note that
2449 	 * we will leave the original list as it is if update fails.
2450 	 */
2451 	vfio_iommu_iova_get_copy(iommu, &iova_copy);
2452 
2453 	list_for_each_entry(domain, &iommu->domain_list, next) {
2454 		group = find_iommu_group(domain, iommu_group);
2455 		if (!group)
2456 			continue;
2457 
2458 		vfio_iommu_detach_group(domain, group);
2459 		update_dirty_scope = !group->pinned_page_dirty_scope;
2460 		list_del(&group->next);
2461 		kfree(group);
2462 		/*
2463 		 * Group ownership provides privilege, if the group list is
2464 		 * empty, the domain goes away. If it's the last domain with
2465 		 * iommu and external domain doesn't exist, then all the
2466 		 * mappings go away too. If it's the last domain with iommu and
2467 		 * external domain exist, update accounting
2468 		 */
2469 		if (list_empty(&domain->group_list)) {
2470 			if (list_is_singular(&iommu->domain_list)) {
2471 				if (!iommu->external_domain) {
2472 					WARN_ON(iommu->notifier.head);
2473 					vfio_iommu_unmap_unpin_all(iommu);
2474 				} else {
2475 					vfio_iommu_unmap_unpin_reaccount(iommu);
2476 				}
2477 			}
2478 			iommu_domain_free(domain->domain);
2479 			list_del(&domain->next);
2480 			kfree(domain);
2481 			vfio_iommu_aper_expand(iommu, &iova_copy);
2482 			vfio_update_pgsize_bitmap(iommu);
2483 		}
2484 		break;
2485 	}
2486 
2487 	if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2488 		vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2489 	else
2490 		vfio_iommu_iova_free(&iova_copy);
2491 
2492 detach_group_done:
2493 	/*
2494 	 * Removal of a group without dirty tracking may allow the iommu scope
2495 	 * to be promoted.
2496 	 */
2497 	if (update_dirty_scope) {
2498 		update_pinned_page_dirty_scope(iommu);
2499 		if (iommu->dirty_page_tracking)
2500 			vfio_iommu_populate_bitmap_full(iommu);
2501 	}
2502 	mutex_unlock(&iommu->lock);
2503 }
2504 
vfio_iommu_type1_open(unsigned long arg)2505 static void *vfio_iommu_type1_open(unsigned long arg)
2506 {
2507 	struct vfio_iommu *iommu;
2508 
2509 	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2510 	if (!iommu)
2511 		return ERR_PTR(-ENOMEM);
2512 
2513 	switch (arg) {
2514 	case VFIO_TYPE1_IOMMU:
2515 		break;
2516 	case VFIO_TYPE1_NESTING_IOMMU:
2517 		iommu->nesting = true;
2518 		fallthrough;
2519 	case VFIO_TYPE1v2_IOMMU:
2520 		iommu->v2 = true;
2521 		break;
2522 	default:
2523 		kfree(iommu);
2524 		return ERR_PTR(-EINVAL);
2525 	}
2526 
2527 	INIT_LIST_HEAD(&iommu->domain_list);
2528 	INIT_LIST_HEAD(&iommu->iova_list);
2529 	iommu->dma_list = RB_ROOT;
2530 	iommu->dma_avail = dma_entry_limit;
2531 	mutex_init(&iommu->lock);
2532 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
2533 
2534 	return iommu;
2535 }
2536 
vfio_release_domain(struct vfio_domain * domain,bool external)2537 static void vfio_release_domain(struct vfio_domain *domain, bool external)
2538 {
2539 	struct vfio_group *group, *group_tmp;
2540 
2541 	list_for_each_entry_safe(group, group_tmp,
2542 				 &domain->group_list, next) {
2543 		if (!external)
2544 			vfio_iommu_detach_group(domain, group);
2545 		list_del(&group->next);
2546 		kfree(group);
2547 	}
2548 
2549 	if (!external)
2550 		iommu_domain_free(domain->domain);
2551 }
2552 
vfio_iommu_type1_release(void * iommu_data)2553 static void vfio_iommu_type1_release(void *iommu_data)
2554 {
2555 	struct vfio_iommu *iommu = iommu_data;
2556 	struct vfio_domain *domain, *domain_tmp;
2557 
2558 	if (iommu->external_domain) {
2559 		vfio_release_domain(iommu->external_domain, true);
2560 		kfree(iommu->external_domain);
2561 	}
2562 
2563 	vfio_iommu_unmap_unpin_all(iommu);
2564 
2565 	list_for_each_entry_safe(domain, domain_tmp,
2566 				 &iommu->domain_list, next) {
2567 		vfio_release_domain(domain, false);
2568 		list_del(&domain->next);
2569 		kfree(domain);
2570 	}
2571 
2572 	vfio_iommu_iova_free(&iommu->iova_list);
2573 
2574 	kfree(iommu);
2575 }
2576 
vfio_domains_have_iommu_cache(struct vfio_iommu * iommu)2577 static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
2578 {
2579 	struct vfio_domain *domain;
2580 	int ret = 1;
2581 
2582 	mutex_lock(&iommu->lock);
2583 	list_for_each_entry(domain, &iommu->domain_list, next) {
2584 		if (!(domain->prot & IOMMU_CACHE)) {
2585 			ret = 0;
2586 			break;
2587 		}
2588 	}
2589 	mutex_unlock(&iommu->lock);
2590 
2591 	return ret;
2592 }
2593 
vfio_iommu_type1_check_extension(struct vfio_iommu * iommu,unsigned long arg)2594 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2595 					    unsigned long arg)
2596 {
2597 	switch (arg) {
2598 	case VFIO_TYPE1_IOMMU:
2599 	case VFIO_TYPE1v2_IOMMU:
2600 	case VFIO_TYPE1_NESTING_IOMMU:
2601 		return 1;
2602 	case VFIO_DMA_CC_IOMMU:
2603 		if (!iommu)
2604 			return 0;
2605 		return vfio_domains_have_iommu_cache(iommu);
2606 	default:
2607 		return 0;
2608 	}
2609 }
2610 
vfio_iommu_iova_add_cap(struct vfio_info_cap * caps,struct vfio_iommu_type1_info_cap_iova_range * cap_iovas,size_t size)2611 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2612 		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2613 		 size_t size)
2614 {
2615 	struct vfio_info_cap_header *header;
2616 	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2617 
2618 	header = vfio_info_cap_add(caps, size,
2619 				   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2620 	if (IS_ERR(header))
2621 		return PTR_ERR(header);
2622 
2623 	iova_cap = container_of(header,
2624 				struct vfio_iommu_type1_info_cap_iova_range,
2625 				header);
2626 	iova_cap->nr_iovas = cap_iovas->nr_iovas;
2627 	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2628 	       cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2629 	return 0;
2630 }
2631 
vfio_iommu_iova_build_caps(struct vfio_iommu * iommu,struct vfio_info_cap * caps)2632 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2633 				      struct vfio_info_cap *caps)
2634 {
2635 	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2636 	struct vfio_iova *iova;
2637 	size_t size;
2638 	int iovas = 0, i = 0, ret;
2639 
2640 	list_for_each_entry(iova, &iommu->iova_list, list)
2641 		iovas++;
2642 
2643 	if (!iovas) {
2644 		/*
2645 		 * Return 0 as a container with a single mdev device
2646 		 * will have an empty list
2647 		 */
2648 		return 0;
2649 	}
2650 
2651 	size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
2652 
2653 	cap_iovas = kzalloc(size, GFP_KERNEL);
2654 	if (!cap_iovas)
2655 		return -ENOMEM;
2656 
2657 	cap_iovas->nr_iovas = iovas;
2658 
2659 	list_for_each_entry(iova, &iommu->iova_list, list) {
2660 		cap_iovas->iova_ranges[i].start = iova->start;
2661 		cap_iovas->iova_ranges[i].end = iova->end;
2662 		i++;
2663 	}
2664 
2665 	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2666 
2667 	kfree(cap_iovas);
2668 	return ret;
2669 }
2670 
vfio_iommu_migration_build_caps(struct vfio_iommu * iommu,struct vfio_info_cap * caps)2671 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2672 					   struct vfio_info_cap *caps)
2673 {
2674 	struct vfio_iommu_type1_info_cap_migration cap_mig;
2675 
2676 	cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2677 	cap_mig.header.version = 1;
2678 
2679 	cap_mig.flags = 0;
2680 	/* support minimum pgsize */
2681 	cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2682 	cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2683 
2684 	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2685 }
2686 
vfio_iommu_dma_avail_build_caps(struct vfio_iommu * iommu,struct vfio_info_cap * caps)2687 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2688 					   struct vfio_info_cap *caps)
2689 {
2690 	struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2691 
2692 	cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2693 	cap_dma_avail.header.version = 1;
2694 
2695 	cap_dma_avail.avail = iommu->dma_avail;
2696 
2697 	return vfio_info_add_capability(caps, &cap_dma_avail.header,
2698 					sizeof(cap_dma_avail));
2699 }
2700 
vfio_iommu_type1_get_info(struct vfio_iommu * iommu,unsigned long arg)2701 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2702 				     unsigned long arg)
2703 {
2704 	struct vfio_iommu_type1_info info;
2705 	unsigned long minsz;
2706 	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2707 	unsigned long capsz;
2708 	int ret;
2709 
2710 	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2711 
2712 	/* For backward compatibility, cannot require this */
2713 	capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
2714 
2715 	if (copy_from_user(&info, (void __user *)arg, minsz))
2716 		return -EFAULT;
2717 
2718 	if (info.argsz < minsz)
2719 		return -EINVAL;
2720 
2721 	if (info.argsz >= capsz) {
2722 		minsz = capsz;
2723 		info.cap_offset = 0; /* output, no-recopy necessary */
2724 	}
2725 
2726 	mutex_lock(&iommu->lock);
2727 	info.flags = VFIO_IOMMU_INFO_PGSIZES;
2728 
2729 	info.iova_pgsizes = iommu->pgsize_bitmap;
2730 
2731 	ret = vfio_iommu_migration_build_caps(iommu, &caps);
2732 
2733 	if (!ret)
2734 		ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2735 
2736 	if (!ret)
2737 		ret = vfio_iommu_iova_build_caps(iommu, &caps);
2738 
2739 	mutex_unlock(&iommu->lock);
2740 
2741 	if (ret)
2742 		return ret;
2743 
2744 	if (caps.size) {
2745 		info.flags |= VFIO_IOMMU_INFO_CAPS;
2746 
2747 		if (info.argsz < sizeof(info) + caps.size) {
2748 			info.argsz = sizeof(info) + caps.size;
2749 		} else {
2750 			vfio_info_cap_shift(&caps, sizeof(info));
2751 			if (copy_to_user((void __user *)arg +
2752 					sizeof(info), caps.buf,
2753 					caps.size)) {
2754 				kfree(caps.buf);
2755 				return -EFAULT;
2756 			}
2757 			info.cap_offset = sizeof(info);
2758 		}
2759 
2760 		kfree(caps.buf);
2761 	}
2762 
2763 	return copy_to_user((void __user *)arg, &info, minsz) ?
2764 			-EFAULT : 0;
2765 }
2766 
vfio_iommu_type1_map_dma(struct vfio_iommu * iommu,unsigned long arg)2767 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2768 				    unsigned long arg)
2769 {
2770 	struct vfio_iommu_type1_dma_map map;
2771 	unsigned long minsz;
2772 	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
2773 
2774 	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2775 
2776 	if (copy_from_user(&map, (void __user *)arg, minsz))
2777 		return -EFAULT;
2778 
2779 	if (map.argsz < minsz || map.flags & ~mask)
2780 		return -EINVAL;
2781 
2782 	return vfio_dma_do_map(iommu, &map);
2783 }
2784 
vfio_iommu_type1_unmap_dma(struct vfio_iommu * iommu,unsigned long arg)2785 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2786 				      unsigned long arg)
2787 {
2788 	struct vfio_iommu_type1_dma_unmap unmap;
2789 	struct vfio_bitmap bitmap = { 0 };
2790 	unsigned long minsz;
2791 	int ret;
2792 
2793 	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2794 
2795 	if (copy_from_user(&unmap, (void __user *)arg, minsz))
2796 		return -EFAULT;
2797 
2798 	if (unmap.argsz < minsz ||
2799 	    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
2800 		return -EINVAL;
2801 
2802 	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2803 		unsigned long pgshift;
2804 
2805 		if (unmap.argsz < (minsz + sizeof(bitmap)))
2806 			return -EINVAL;
2807 
2808 		if (copy_from_user(&bitmap,
2809 				   (void __user *)(arg + minsz),
2810 				   sizeof(bitmap)))
2811 			return -EFAULT;
2812 
2813 		if (!access_ok((void __user *)bitmap.data, bitmap.size))
2814 			return -EINVAL;
2815 
2816 		pgshift = __ffs(bitmap.pgsize);
2817 		ret = verify_bitmap_size(unmap.size >> pgshift,
2818 					 bitmap.size);
2819 		if (ret)
2820 			return ret;
2821 	}
2822 
2823 	ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2824 	if (ret)
2825 		return ret;
2826 
2827 	return copy_to_user((void __user *)arg, &unmap, minsz) ?
2828 			-EFAULT : 0;
2829 }
2830 
vfio_iommu_type1_dirty_pages(struct vfio_iommu * iommu,unsigned long arg)2831 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2832 					unsigned long arg)
2833 {
2834 	struct vfio_iommu_type1_dirty_bitmap dirty;
2835 	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
2836 			VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
2837 			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2838 	unsigned long minsz;
2839 	int ret = 0;
2840 
2841 	if (!iommu->v2)
2842 		return -EACCES;
2843 
2844 	minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2845 
2846 	if (copy_from_user(&dirty, (void __user *)arg, minsz))
2847 		return -EFAULT;
2848 
2849 	if (dirty.argsz < minsz || dirty.flags & ~mask)
2850 		return -EINVAL;
2851 
2852 	/* only one flag should be set at a time */
2853 	if (__ffs(dirty.flags) != __fls(dirty.flags))
2854 		return -EINVAL;
2855 
2856 	if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2857 		size_t pgsize;
2858 
2859 		mutex_lock(&iommu->lock);
2860 		pgsize = 1 << __ffs(iommu->pgsize_bitmap);
2861 		if (!iommu->dirty_page_tracking) {
2862 			ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2863 			if (!ret)
2864 				iommu->dirty_page_tracking = true;
2865 		}
2866 		mutex_unlock(&iommu->lock);
2867 		return ret;
2868 	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2869 		mutex_lock(&iommu->lock);
2870 		if (iommu->dirty_page_tracking) {
2871 			iommu->dirty_page_tracking = false;
2872 			vfio_dma_bitmap_free_all(iommu);
2873 		}
2874 		mutex_unlock(&iommu->lock);
2875 		return 0;
2876 	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2877 		struct vfio_iommu_type1_dirty_bitmap_get range;
2878 		unsigned long pgshift;
2879 		size_t data_size = dirty.argsz - minsz;
2880 		size_t iommu_pgsize;
2881 
2882 		if (!data_size || data_size < sizeof(range))
2883 			return -EINVAL;
2884 
2885 		if (copy_from_user(&range, (void __user *)(arg + minsz),
2886 				   sizeof(range)))
2887 			return -EFAULT;
2888 
2889 		if (range.iova + range.size < range.iova)
2890 			return -EINVAL;
2891 		if (!access_ok((void __user *)range.bitmap.data,
2892 			       range.bitmap.size))
2893 			return -EINVAL;
2894 
2895 		pgshift = __ffs(range.bitmap.pgsize);
2896 		ret = verify_bitmap_size(range.size >> pgshift,
2897 					 range.bitmap.size);
2898 		if (ret)
2899 			return ret;
2900 
2901 		mutex_lock(&iommu->lock);
2902 
2903 		iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2904 
2905 		/* allow only smallest supported pgsize */
2906 		if (range.bitmap.pgsize != iommu_pgsize) {
2907 			ret = -EINVAL;
2908 			goto out_unlock;
2909 		}
2910 		if (range.iova & (iommu_pgsize - 1)) {
2911 			ret = -EINVAL;
2912 			goto out_unlock;
2913 		}
2914 		if (!range.size || range.size & (iommu_pgsize - 1)) {
2915 			ret = -EINVAL;
2916 			goto out_unlock;
2917 		}
2918 
2919 		if (iommu->dirty_page_tracking)
2920 			ret = vfio_iova_dirty_bitmap(range.bitmap.data,
2921 						     iommu, range.iova,
2922 						     range.size,
2923 						     range.bitmap.pgsize);
2924 		else
2925 			ret = -EINVAL;
2926 out_unlock:
2927 		mutex_unlock(&iommu->lock);
2928 
2929 		return ret;
2930 	}
2931 
2932 	return -EINVAL;
2933 }
2934 
vfio_iommu_type1_ioctl(void * iommu_data,unsigned int cmd,unsigned long arg)2935 static long vfio_iommu_type1_ioctl(void *iommu_data,
2936 				   unsigned int cmd, unsigned long arg)
2937 {
2938 	struct vfio_iommu *iommu = iommu_data;
2939 
2940 	switch (cmd) {
2941 	case VFIO_CHECK_EXTENSION:
2942 		return vfio_iommu_type1_check_extension(iommu, arg);
2943 	case VFIO_IOMMU_GET_INFO:
2944 		return vfio_iommu_type1_get_info(iommu, arg);
2945 	case VFIO_IOMMU_MAP_DMA:
2946 		return vfio_iommu_type1_map_dma(iommu, arg);
2947 	case VFIO_IOMMU_UNMAP_DMA:
2948 		return vfio_iommu_type1_unmap_dma(iommu, arg);
2949 	case VFIO_IOMMU_DIRTY_PAGES:
2950 		return vfio_iommu_type1_dirty_pages(iommu, arg);
2951 	default:
2952 		return -ENOTTY;
2953 	}
2954 }
2955 
vfio_iommu_type1_register_notifier(void * iommu_data,unsigned long * events,struct notifier_block * nb)2956 static int vfio_iommu_type1_register_notifier(void *iommu_data,
2957 					      unsigned long *events,
2958 					      struct notifier_block *nb)
2959 {
2960 	struct vfio_iommu *iommu = iommu_data;
2961 
2962 	/* clear known events */
2963 	*events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
2964 
2965 	/* refuse to register if still events remaining */
2966 	if (*events)
2967 		return -EINVAL;
2968 
2969 	return blocking_notifier_chain_register(&iommu->notifier, nb);
2970 }
2971 
vfio_iommu_type1_unregister_notifier(void * iommu_data,struct notifier_block * nb)2972 static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
2973 						struct notifier_block *nb)
2974 {
2975 	struct vfio_iommu *iommu = iommu_data;
2976 
2977 	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
2978 }
2979 
vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu * iommu,dma_addr_t user_iova,void * data,size_t count,bool write,size_t * copied)2980 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
2981 					 dma_addr_t user_iova, void *data,
2982 					 size_t count, bool write,
2983 					 size_t *copied)
2984 {
2985 	struct mm_struct *mm;
2986 	unsigned long vaddr;
2987 	struct vfio_dma *dma;
2988 	bool kthread = current->mm == NULL;
2989 	size_t offset;
2990 
2991 	*copied = 0;
2992 
2993 	dma = vfio_find_dma(iommu, user_iova, 1);
2994 	if (!dma)
2995 		return -EINVAL;
2996 
2997 	if ((write && !(dma->prot & IOMMU_WRITE)) ||
2998 			!(dma->prot & IOMMU_READ))
2999 		return -EPERM;
3000 
3001 	mm = get_task_mm(dma->task);
3002 
3003 	if (!mm)
3004 		return -EPERM;
3005 
3006 	if (kthread)
3007 		kthread_use_mm(mm);
3008 	else if (current->mm != mm)
3009 		goto out;
3010 
3011 	offset = user_iova - dma->iova;
3012 
3013 	if (count > dma->size - offset)
3014 		count = dma->size - offset;
3015 
3016 	vaddr = dma->vaddr + offset;
3017 
3018 	if (write) {
3019 		*copied = copy_to_user((void __user *)vaddr, data,
3020 					 count) ? 0 : count;
3021 		if (*copied && iommu->dirty_page_tracking) {
3022 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3023 			/*
3024 			 * Bitmap populated with the smallest supported page
3025 			 * size
3026 			 */
3027 			bitmap_set(dma->bitmap, offset >> pgshift,
3028 				   ((offset + *copied - 1) >> pgshift) -
3029 				   (offset >> pgshift) + 1);
3030 		}
3031 	} else
3032 		*copied = copy_from_user(data, (void __user *)vaddr,
3033 					   count) ? 0 : count;
3034 	if (kthread)
3035 		kthread_unuse_mm(mm);
3036 out:
3037 	mmput(mm);
3038 	return *copied ? 0 : -EFAULT;
3039 }
3040 
vfio_iommu_type1_dma_rw(void * iommu_data,dma_addr_t user_iova,void * data,size_t count,bool write)3041 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3042 				   void *data, size_t count, bool write)
3043 {
3044 	struct vfio_iommu *iommu = iommu_data;
3045 	int ret = 0;
3046 	size_t done;
3047 
3048 	mutex_lock(&iommu->lock);
3049 	while (count > 0) {
3050 		ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3051 						    count, write, &done);
3052 		if (ret)
3053 			break;
3054 
3055 		count -= done;
3056 		data += done;
3057 		user_iova += done;
3058 	}
3059 
3060 	mutex_unlock(&iommu->lock);
3061 	return ret;
3062 }
3063 
3064 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3065 	.name			= "vfio-iommu-type1",
3066 	.owner			= THIS_MODULE,
3067 	.open			= vfio_iommu_type1_open,
3068 	.release		= vfio_iommu_type1_release,
3069 	.ioctl			= vfio_iommu_type1_ioctl,
3070 	.attach_group		= vfio_iommu_type1_attach_group,
3071 	.detach_group		= vfio_iommu_type1_detach_group,
3072 	.pin_pages		= vfio_iommu_type1_pin_pages,
3073 	.unpin_pages		= vfio_iommu_type1_unpin_pages,
3074 	.register_notifier	= vfio_iommu_type1_register_notifier,
3075 	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
3076 	.dma_rw			= vfio_iommu_type1_dma_rw,
3077 };
3078 
vfio_iommu_type1_init(void)3079 static int __init vfio_iommu_type1_init(void)
3080 {
3081 	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3082 }
3083 
vfio_iommu_type1_cleanup(void)3084 static void __exit vfio_iommu_type1_cleanup(void)
3085 {
3086 	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3087 }
3088 
3089 module_init(vfio_iommu_type1_init);
3090 module_exit(vfio_iommu_type1_cleanup);
3091 
3092 MODULE_VERSION(DRIVER_VERSION);
3093 MODULE_LICENSE("GPL v2");
3094 MODULE_AUTHOR(DRIVER_AUTHOR);
3095 MODULE_DESCRIPTION(DRIVER_DESC);
3096