drivers/vfio/vfio_iommu_type1.c

   1 /*
   2  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   3  *
   4  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5  *     Author: Alex Williamson <alex.williamson@redhat.com>
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License version 2 as
   9  * published by the Free Software Foundation.
  10  *
  11  * Derived from original vfio:
  12  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13  * Author: Tom Lyon, pugs@cisco.com
  14  *
  15  * We arbitrarily define a Type1 IOMMU as one matching the below code.
  16  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
  17  * VT-d, but that makes it harder to re-use as theoretically anyone
  18  * implementing a similar IOMMU could make use of this.  We expect the
  19  * IOMMU to support the IOMMU API and have few to no restrictions around
  20  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  21  * optimized for relatively static mappings of a userspace process with
   22  * userspace pages pinned into memory.  We also assume devices and IOMMU
  23  * domains are PCI based as the IOMMU API is still centered around a
  24  * device/bus interface rather than a group interface.
  25  */
  26
  27 #include <linux/compat.h>
  28 #include <linux/device.h>
  29 #include <linux/fs.h>
  30 #include <linux/iommu.h>
  31 #include <linux/module.h>
  32 #include <linux/mm.h>
  33 #include <linux/pci.h>          /* pci_bus_type */
  34 #include <linux/rbtree.h>
  35 #include <linux/sched.h>
  36 #include <linux/slab.h>
  37 #include <linux/uaccess.h>
  38 #include <linux/vfio.h>
  39 #include <linux/workqueue.h>
  40
/* Module identification strings, consumed by the MODULE_* macros in this file. */
#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"

/*
 * When false (the default), vfio_iommu_type1_open() refuses to hand out a
 * container whose domain lacks IOMMU_CAP_INTR_REMAP.
 */
static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
                   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
                 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");

/*
 * When true, vfio_pin_pages() returns after pinning a single page instead
 * of gathering a run of physically contiguous pages.
 */
static bool disable_hugepages;
module_param_named(disable_hugepages,
                   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
                 "Disable VFIO IOMMU support for IOMMU hugepages.");
  56
/*
 * Per-container state: allocated in vfio_iommu_type1_open() and freed in
 * vfio_iommu_type1_release().
 */
struct vfio_iommu {
        struct iommu_domain     *domain;        /* IOMMU domain groups attach to */
        struct mutex            lock;           /* Taken by map/unmap and attach/detach */
        struct rb_root          dma_list;       /* Tree of struct vfio_dma, ordered by iova */
        struct list_head        group_list;     /* List of struct vfio_group */
        bool                    cache;          /* If set, IOMMU_CACHE is added to map prot */
};
  64
/*
 * One tracked mapping.  A vfio_dma covers a range that is contiguous in
 * both iova and vaddr; physical addresses are not stored here but are
 * recovered from the IOMMU at unmap time.
 */
struct vfio_dma {
        struct rb_node          node;           /* Link in vfio_iommu.dma_list */
        dma_addr_t              iova;           /* Device address */
        unsigned long           vaddr;          /* Process virtual addr */
        size_t                  size;           /* Map size (bytes) */
        int                     prot;           /* IOMMU_READ/WRITE */
};
  72
/* Bookkeeping entry for one iommu_group attached to a container's domain. */
struct vfio_group {
        struct iommu_group      *iommu_group;   /* The attached group */
        struct list_head        next;           /* Link in vfio_iommu.group_list */
};
  77
  78 /*
  79  * This code handles mapping and unmapping of user data buffers
  80  * into DMA'ble space using the IOMMU
  81  */
  82
  83 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
  84                                       dma_addr_t start, size_t size)
  85 {
  86         struct rb_node *node = iommu->dma_list.rb_node;
  87
  88         while (node) {
  89                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
  90
  91                 if (start + size <= dma->iova)
  92                         node = node->rb_left;
  93                 else if (start >= dma->iova + dma->size)
  94                         node = node->rb_right;
  95                 else
  96                         return dma;
  97         }
  98
  99         return NULL;
 100 }
 101
 102 static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 103 {
 104         struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 105         struct vfio_dma *dma;
 106
 107         while (*link) {
 108                 parent = *link;
 109                 dma = rb_entry(parent, struct vfio_dma, node);
 110
 111                 if (new->iova + new->size <= dma->iova)
 112                         link = &(*link)->rb_left;
 113                 else
 114                         link = &(*link)->rb_right;
 115         }
 116
 117         rb_link_node(&new->node, parent, link);
 118         rb_insert_color(&new->node, &iommu->dma_list);
 119 }
 120
/*
 * Unlink @old from the container's mapping tree.  The entry itself is not
 * freed here; that is left to the caller.
 */
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
        rb_erase(&old->node, &iommu->dma_list);
}
 125
/*
 * Deferred locked_vm adjustment, queued by vfio_lock_acct() when mmap_sem
 * could not be taken immediately and consumed by vfio_lock_acct_bg().
 */
struct vwork {
        struct mm_struct        *mm;            /* Reference from get_task_mm(); dropped by the worker */
        long                    npage;          /* Signed page delta to add to mm->locked_vm */
        struct work_struct      work;           /* Work item handed to schedule_work() */
};
 131
 132 /* delayed decrement/increment for locked_vm */
 133 static void vfio_lock_acct_bg(struct work_struct *work)
 134 {
 135         struct vwork *vwork = container_of(work, struct vwork, work);
 136         struct mm_struct *mm;
 137
 138         mm = vwork->mm;
 139         down_write(&mm->mmap_sem);
 140         mm->locked_vm += vwork->npage;
 141         up_write(&mm->mmap_sem);
 142         mmput(mm);
 143         kfree(vwork);
 144 }
 145
 146 static void vfio_lock_acct(long npage)
 147 {
 148         struct vwork *vwork;
 149         struct mm_struct *mm;
 150
 151         if (!current->mm || !npage)
 152                 return; /* process exited or nothing to do */
 153
 154         if (down_write_trylock(&current->mm->mmap_sem)) {
 155                 current->mm->locked_vm += npage;
 156                 up_write(&current->mm->mmap_sem);
 157                 return;
 158         }
 159
 160         /*
 161          * Couldn't get mmap_sem lock, so must setup to update
 162          * mm->locked_vm later. If locked_vm were atomic, we
 163          * wouldn't need this silliness
 164          */
 165         vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
 166         if (!vwork)
 167                 return;
 168         mm = get_task_mm(current);
 169         if (!mm) {
 170                 kfree(vwork);
 171                 return;
 172         }
 173         INIT_WORK(&vwork->work, vfio_lock_acct_bg);
 174         vwork->mm = mm;
 175         vwork->npage = npage;
 176         schedule_work(&vwork->work);
 177 }
 178
/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device.  These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 *
 * Returns true when @pfn fails pfn_valid() or when its page (or, for a
 * tail page, its compound head) is marked reserved; false otherwise.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
        if (pfn_valid(pfn)) {
                bool reserved;
                struct page *tail = pfn_to_page(pfn);
                struct page *head = compound_trans_head(tail);
                /* Sample the head's reserved bit before re-validating below */
                reserved = !!(PageReserved(head));
                if (head != tail) {
                        /*
                         * "head" is not a dangling pointer
                         * (compound_trans_head takes care of that)
                         * but the hugepage may have been split
                         * from under us (and we may not hold a
                         * reference count on the head page so it can
                         * be reused before we run PageReserved), so
                         * we've to check PageTail before returning
                         * what we just read.
                         */
                        smp_rmb();
                        if (PageTail(tail))
                                return reserved;
                }
                /* Not (or no longer) a tail page: judge the page itself */
                return PageReserved(tail);
        }

        /* No valid struct page for this pfn */
        return true;
}
 211
 212 static int put_pfn(unsigned long pfn, int prot)
 213 {
 214         if (!is_invalid_reserved_pfn(pfn)) {
 215                 struct page *page = pfn_to_page(pfn);
 216                 if (prot & IOMMU_WRITE)
 217                         SetPageDirty(page);
 218                 put_page(page);
 219                 return 1;
 220         }
 221         return 0;
 222 }
 223
 224 static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 225 {
 226         struct page *page[1];
 227         struct vm_area_struct *vma;
 228         int ret = -EFAULT;
 229
 230         if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
 231                 *pfn = page_to_pfn(page[0]);
 232                 return 0;
 233         }
 234
 235         down_read(&current->mm->mmap_sem);
 236
 237         vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
 238
 239         if (vma && vma->vm_flags & VM_PFNMAP) {
 240                 *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 241                 if (is_invalid_reserved_pfn(*pfn))
 242                         ret = 0;
 243         }
 244
 245         up_read(&current->mm->mmap_sem);
 246
 247         return ret;
 248 }
 249
 250 /*
 251  * Attempt to pin pages.  We really don't want to track all the pfns and
 252  * the iommu can only map chunks of consecutive pfns anyway, so get the
 253  * first page and all consecutive pages with the same locking.
 254  */
 255 static long vfio_pin_pages(unsigned long vaddr, long npage,
 256                            int prot, unsigned long *pfn_base)
 257 {
 258         unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 259         bool lock_cap = capable(CAP_IPC_LOCK);
 260         long ret, i;
 261
 262         if (!current->mm)
 263                 return -ENODEV;
 264
 265         ret = vaddr_get_pfn(vaddr, prot, pfn_base);
 266         if (ret)
 267                 return ret;
 268
 269         if (is_invalid_reserved_pfn(*pfn_base))
 270                 return 1;
 271
 272         if (!lock_cap && current->mm->locked_vm + 1 > limit) {
 273                 put_pfn(*pfn_base, prot);
 274                 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 275                         limit << PAGE_SHIFT);
 276                 return -ENOMEM;
 277         }
 278
 279         if (unlikely(disable_hugepages)) {
 280                 vfio_lock_acct(1);
 281                 return 1;
 282         }
 283
 284         /* Lock all the consecutive pages from pfn_base */
 285         for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 286                 unsigned long pfn = 0;
 287
 288                 ret = vaddr_get_pfn(vaddr, prot, &pfn);
 289                 if (ret)
 290                         break;
 291
 292                 if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
 293                         put_pfn(pfn, prot);
 294                         break;
 295                 }
 296
 297                 if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
 298                         put_pfn(pfn, prot);
 299                         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 300                                 __func__, limit << PAGE_SHIFT);
 301                         break;
 302                 }
 303         }
 304
 305         vfio_lock_acct(i);
 306
 307         return i;
 308 }
 309
 310 static long vfio_unpin_pages(unsigned long pfn, long npage,
 311                              int prot, bool do_accounting)
 312 {
 313         unsigned long unlocked = 0;
 314         long i;
 315
 316         for (i = 0; i < npage; i++)
 317                 unlocked += put_pfn(pfn++, prot);
 318
 319         if (do_accounting)
 320                 vfio_lock_acct(-unlocked);
 321
 322         return unlocked;
 323 }
 324
 325 static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 326                             dma_addr_t iova, size_t *size)
 327 {
 328         dma_addr_t start = iova, end = iova + *size;
 329         long unlocked = 0;
 330
 331         while (iova < end) {
 332                 size_t unmapped;
 333                 phys_addr_t phys;
 334
 335                 /*
 336                  * We use the IOMMU to track the physical address.  This
 337                  * saves us from having a lot more entries in our mapping
 338                  * tree.  The downside is that we don't track the size
 339                  * used to do the mapping.  We request unmap of a single
 340                  * page, but expect IOMMUs that support large pages to
 341                  * unmap a larger chunk.
 342                  */
 343                 phys = iommu_iova_to_phys(iommu->domain, iova);
 344                 if (WARN_ON(!phys)) {
 345                         iova += PAGE_SIZE;
 346                         continue;
 347                 }
 348
 349                 unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 350                 if (!unmapped)
 351                         break;
 352
 353                 unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
 354                                              unmapped >> PAGE_SHIFT,
 355                                              dma->prot, false);
 356                 iova += unmapped;
 357         }
 358
 359         vfio_lock_acct(-unlocked);
 360
 361         *size = iova - start;
 362
 363         return 0;
 364 }
 365
/*
 * Remove the part of tracked mapping @dma that overlaps the IOVA range
 * [start, start + *size), unmapping and unpinning the pages involved.
 *
 * Four cases are handled in turn: the request covers @dma entirely, it
 * overlaps the low end, it overlaps the high end, or it falls strictly
 * inside @dma (which then has to be split in two).  @dma may be freed,
 * shrunk in place, or re-inserted with a new base.
 *
 * On return *@size is the number of bytes actually removed.  Returns 0 on
 * success, -ENOMEM if the split tracking structure cannot be allocated,
 * or whatever vfio_unmap_unpin() returns on failure.
 */
static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
                                   size_t *size, struct vfio_dma *dma)
{
        size_t offset, overlap, tmp;
        struct vfio_dma *split;
        int ret;

        if (!*size)
                return 0;

        /*
         * Existing dma region is completely covered, unmap all.  This is
         * the likely case since userspace tends to map and unmap buffers
         * in one shot rather than multiple mappings within a buffer.
         */
        if (likely(start <= dma->iova &&
                   start + *size >= dma->iova + dma->size)) {
                *size = dma->size;
                ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
                if (ret)
                        return ret;

                /*
                 * Did we remove more than we have?  Should never happen
                 * since a vfio_dma is contiguous in iova and vaddr.
                 */
                WARN_ON(*size != dma->size);

                vfio_remove_dma(iommu, dma);
                kfree(dma);
                return 0;
        }

        /* Overlap low address of existing range */
        if (start <= dma->iova) {
                overlap = start + *size - dma->iova;
                ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
                if (ret)
                        return ret;

                /* The base iova changes below, so take the node out first */
                vfio_remove_dma(iommu, dma);

                /*
                 * Check, we may have removed to whole vfio_dma.  If not
                 * fixup and re-insert.
                 */
                if (overlap < dma->size) {
                        dma->iova += overlap;
                        dma->vaddr += overlap;
                        dma->size -= overlap;
                        vfio_insert_dma(iommu, dma);
                } else
                        kfree(dma);

                *size = overlap;
                return 0;
        }

        /* Overlap high address of existing range */
        if (start + *size >= dma->iova + dma->size) {
                offset = start - dma->iova;
                overlap = dma->size - offset;

                ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
                if (ret)
                        return ret;

                /* Base iova is unchanged, so the node can be trimmed in place */
                dma->size -= overlap;
                *size = overlap;
                return 0;
        }

        /* Split existing */

        /*
         * Allocate our tracking structure early even though it may not
         * be used.  An Allocation failure later loses track of pages and
         * is more difficult to unwind.
         */
        split = kzalloc(sizeof(*split), GFP_KERNEL);
        if (!split)
                return -ENOMEM;

        offset = start - dma->iova;

        ret = vfio_unmap_unpin(iommu, dma, start, size);
        if (ret || !*size) {
                kfree(split);
                return ret;
        }

        /* Remember the original length; dma->size is about to shrink */
        tmp = dma->size;

        /* Resize the lower vfio_dma in place, before the below insert */
        dma->size = offset;

        /* Insert new for remainder, assuming it didn't all get unmapped */
        if (likely(offset + *size < tmp)) {
                split->size = tmp - offset - *size;
                split->iova = dma->iova + offset + *size;
                split->vaddr = dma->vaddr + offset + *size;
                split->prot = dma->prot;
                vfio_insert_dma(iommu, split);
        } else
                kfree(split);

        return 0;
}
 474
 475 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 476                              struct vfio_iommu_type1_dma_unmap *unmap)
 477 {
 478         uint64_t mask;
 479         struct vfio_dma *dma;
 480         size_t unmapped = 0, size;
 481         int ret = 0;
 482
 483         mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 484
 485         if (unmap->iova & mask)
 486                 return -EINVAL;
 487         if (!unmap->size || unmap->size & mask)
 488                 return -EINVAL;
 489
 490         WARN_ON(mask & PAGE_MASK);
 491
 492         mutex_lock(&iommu->lock);
 493
 494         while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 495                 size = unmap->size;
 496                 ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
 497                 if (ret || !size)
 498                         break;
 499                 unmapped += size;
 500         }
 501
 502         mutex_unlock(&iommu->lock);
 503
 504         /*
 505          * We may unmap more than requested, update the unmap struct so
 506          * userspace can know.
 507          */
 508         unmap->size = unmapped;
 509
 510         return ret;
 511 }
 512
 513 /*
 514  * Turns out AMD IOMMU has a page table bug where it won't map large pages
 515  * to a region that previously mapped smaller pages.  This should be fixed
 516  * soon, so this is just a temporary workaround to break mappings down into
 517  * PAGE_SIZE.  Better to map smaller pages than nothing.
 518  */
 519 static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
 520                           unsigned long pfn, long npage, int prot)
 521 {
 522         long i;
 523         int ret;
 524
 525         for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 526                 ret = iommu_map(iommu->domain, iova,
 527                                 (phys_addr_t)pfn << PAGE_SHIFT,
 528                                 PAGE_SIZE, prot);
 529                 if (ret)
 530                         break;
 531         }
 532
 533         for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
 534                 iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 535
 536         return ret;
 537 }
 538
 539 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 540                            struct vfio_iommu_type1_dma_map *map)
 541 {
 542         dma_addr_t end, iova;
 543         unsigned long vaddr = map->vaddr;
 544         size_t size = map->size;
 545         long npage;
 546         int ret = 0, prot = 0;
 547         uint64_t mask;
 548
 549         end = map->iova + map->size;
 550
 551         mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 552
 553         /* READ/WRITE from device perspective */
 554         if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
 555                 prot |= IOMMU_WRITE;
 556         if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 557                 prot |= IOMMU_READ;
 558
 559         if (!prot)
 560                 return -EINVAL; /* No READ/WRITE? */
 561
 562         if (iommu->cache)
 563                 prot |= IOMMU_CACHE;
 564
 565         if (vaddr & mask)
 566                 return -EINVAL;
 567         if (map->iova & mask)
 568                 return -EINVAL;
 569         if (!map->size || map->size & mask)
 570                 return -EINVAL;
 571
 572         WARN_ON(mask & PAGE_MASK);
 573
 574         /* Don't allow IOVA wrap */
 575         if (end && end < map->iova)
 576                 return -EINVAL;
 577
 578         /* Don't allow virtual address wrap */
 579         if (vaddr + map->size && vaddr + map->size < vaddr)
 580                 return -EINVAL;
 581
 582         mutex_lock(&iommu->lock);
 583
 584         if (vfio_find_dma(iommu, map->iova, map->size)) {
 585                 mutex_unlock(&iommu->lock);
 586                 return -EEXIST;
 587         }
 588
 589         for (iova = map->iova; iova < end; iova += size, vaddr += size) {
 590                 struct vfio_dma *dma = NULL;
 591                 unsigned long pfn;
 592                 long i;
 593
 594                 /* Pin a contiguous chunk of memory */
 595                 npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
 596                                        prot, &pfn);
 597                 if (npage <= 0) {
 598                         WARN_ON(!npage);
 599                         ret = (int)npage;
 600                         break;
 601                 }
 602
 603                 /* Verify pages are not already mapped */
 604                 for (i = 0; i < npage; i++) {
 605                         if (iommu_iova_to_phys(iommu->domain,
 606                                                iova + (i << PAGE_SHIFT))) {
 607                                 vfio_unpin_pages(pfn, npage, prot, true);
 608                                 ret = -EBUSY;
 609                                 break;
 610                         }
 611                 }
 612
 613                 ret = iommu_map(iommu->domain, iova,
 614                                 (phys_addr_t)pfn << PAGE_SHIFT,
 615                                 npage << PAGE_SHIFT, prot);
 616                 if (ret) {
 617                         if (ret != -EBUSY ||
 618                             map_try_harder(iommu, iova, pfn, npage, prot)) {
 619                                 vfio_unpin_pages(pfn, npage, prot, true);
 620                                 break;
 621                         }
 622                 }
 623
 624                 size = npage << PAGE_SHIFT;
 625
 626                 /*
 627                  * Check if we abut a region below - nothing below 0.
 628                  * This is the most likely case when mapping chunks of
 629                  * physically contiguous regions within a virtual address
 630                  * range.  Update the abutting entry in place since iova
 631                  * doesn't change.
 632                  */
 633                 if (likely(iova)) {
 634                         struct vfio_dma *tmp;
 635                         tmp = vfio_find_dma(iommu, iova - 1, 1);
 636                         if (tmp && tmp->prot == prot &&
 637                             tmp->vaddr + tmp->size == vaddr) {
 638                                 tmp->size += size;
 639                                 iova = tmp->iova;
 640                                 size = tmp->size;
 641                                 vaddr = tmp->vaddr;
 642                                 dma = tmp;
 643                         }
 644                 }
 645
 646                 /*
 647                  * Check if we abut a region above - nothing above ~0 + 1.
 648                  * If we abut above and below, remove and free.  If only
 649                  * abut above, remove, modify, reinsert.
 650                  */
 651                 if (likely(iova + size)) {
 652                         struct vfio_dma *tmp;
 653                         tmp = vfio_find_dma(iommu, iova + size, 1);
 654                         if (tmp && tmp->prot == prot &&
 655                             tmp->vaddr == vaddr + size) {
 656                                 vfio_remove_dma(iommu, tmp);
 657                                 if (dma) {
 658                                         dma->size += tmp->size;
 659                                         kfree(tmp);
 660                                 } else {
 661                                         size += tmp->size;
 662                                         tmp->size = size;
 663                                         tmp->iova = iova;
 664                                         tmp->vaddr = vaddr;
 665                                         vfio_insert_dma(iommu, tmp);
 666                                         dma = tmp;
 667                                 }
 668                         }
 669                 }
 670
 671                 if (!dma) {
 672                         dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 673                         if (!dma) {
 674                                 iommu_unmap(iommu->domain, iova, size);
 675                                 vfio_unpin_pages(pfn, npage, prot, true);
 676                                 ret = -ENOMEM;
 677                                 break;
 678                         }
 679
 680                         dma->size = size;
 681                         dma->iova = iova;
 682                         dma->vaddr = vaddr;
 683                         dma->prot = prot;
 684                         vfio_insert_dma(iommu, dma);
 685                 }
 686         }
 687
 688         if (ret) {
 689                 struct vfio_dma *tmp;
 690                 iova = map->iova;
 691                 size = map->size;
 692                 while ((tmp = vfio_find_dma(iommu, iova, size))) {
 693                         int r = vfio_remove_dma_overlap(iommu, iova,
 694                                                         &size, tmp);
 695                         if (WARN_ON(r || !size))
 696                                 break;
 697                 }
 698         }
 699
 700         mutex_unlock(&iommu->lock);
 701         return ret;
 702 }
 703
 704 static int vfio_iommu_type1_attach_group(void *iommu_data,
 705                                          struct iommu_group *iommu_group)
 706 {
 707         struct vfio_iommu *iommu = iommu_data;
 708         struct vfio_group *group, *tmp;
 709         int ret;
 710
 711         group = kzalloc(sizeof(*group), GFP_KERNEL);
 712         if (!group)
 713                 return -ENOMEM;
 714
 715         mutex_lock(&iommu->lock);
 716
 717         list_for_each_entry(tmp, &iommu->group_list, next) {
 718                 if (tmp->iommu_group == iommu_group) {
 719                         mutex_unlock(&iommu->lock);
 720                         kfree(group);
 721                         return -EINVAL;
 722                 }
 723         }
 724
 725         /*
 726          * TODO: Domain have capabilities that might change as we add
 727          * groups (see iommu->cache, currently never set).  Check for
 728          * them and potentially disallow groups to be attached when it
 729          * would change capabilities (ugh).
 730          */
 731         ret = iommu_attach_group(iommu->domain, iommu_group);
 732         if (ret) {
 733                 mutex_unlock(&iommu->lock);
 734                 kfree(group);
 735                 return ret;
 736         }
 737
 738         group->iommu_group = iommu_group;
 739         list_add(&group->next, &iommu->group_list);
 740
 741         mutex_unlock(&iommu->lock);
 742
 743         return 0;
 744 }
 745
 746 static void vfio_iommu_type1_detach_group(void *iommu_data,
 747                                           struct iommu_group *iommu_group)
 748 {
 749         struct vfio_iommu *iommu = iommu_data;
 750         struct vfio_group *group;
 751
 752         mutex_lock(&iommu->lock);
 753
 754         list_for_each_entry(group, &iommu->group_list, next) {
 755                 if (group->iommu_group == iommu_group) {
 756                         iommu_detach_group(iommu->domain, iommu_group);
 757                         list_del(&group->next);
 758                         kfree(group);
 759                         break;
 760                 }
 761         }
 762
 763         mutex_unlock(&iommu->lock);
 764 }
 765
 766 static void *vfio_iommu_type1_open(unsigned long arg)
 767 {
 768         struct vfio_iommu *iommu;
 769
 770         if (arg != VFIO_TYPE1_IOMMU)
 771                 return ERR_PTR(-EINVAL);
 772
 773         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 774         if (!iommu)
 775                 return ERR_PTR(-ENOMEM);
 776
 777         INIT_LIST_HEAD(&iommu->group_list);
 778         iommu->dma_list = RB_ROOT;
 779         mutex_init(&iommu->lock);
 780
 781         /*
 782          * Wish we didn't have to know about bus_type here.
 783          */
 784         iommu->domain = iommu_domain_alloc(&pci_bus_type);
 785         if (!iommu->domain) {
 786                 kfree(iommu);
 787                 return ERR_PTR(-EIO);
 788         }
 789
 790         /*
 791          * Wish we could specify required capabilities rather than create
 792          * a domain, see what comes out and hope it doesn't change along
 793          * the way.  Fortunately we know interrupt remapping is global for
 794          * our iommus.
 795          */
 796         if (!allow_unsafe_interrupts &&
 797             !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
 798                 pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
 799                        __func__);
 800                 iommu_domain_free(iommu->domain);
 801                 kfree(iommu);
 802                 return ERR_PTR(-EPERM);
 803         }
 804
 805         return iommu;
 806 }
 807
 808 static void vfio_iommu_type1_release(void *iommu_data)
 809 {
 810         struct vfio_iommu *iommu = iommu_data;
 811         struct vfio_group *group, *group_tmp;
 812         struct rb_node *node;
 813
 814         list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
 815                 iommu_detach_group(iommu->domain, group->iommu_group);
 816                 list_del(&group->next);
 817                 kfree(group);
 818         }
 819
 820         while ((node = rb_first(&iommu->dma_list))) {
 821                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 822                 size_t size = dma->size;
 823                 vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
 824                 if (WARN_ON(!size))
 825                         break;
 826         }
 827
 828         iommu_domain_free(iommu->domain);
 829         iommu->domain = NULL;
 830         kfree(iommu);
 831 }
 832
 833 static long vfio_iommu_type1_ioctl(void *iommu_data,
 834                                    unsigned int cmd, unsigned long arg)
 835 {
 836         struct vfio_iommu *iommu = iommu_data;
 837         unsigned long minsz;
 838
 839         if (cmd == VFIO_CHECK_EXTENSION) {
 840                 switch (arg) {
 841                 case VFIO_TYPE1_IOMMU:
 842                         return 1;
 843                 default:
 844                         return 0;
 845                 }
 846         } else if (cmd == VFIO_IOMMU_GET_INFO) {
 847                 struct vfio_iommu_type1_info info;
 848
 849                 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 850
 851                 if (copy_from_user(&info, (void __user *)arg, minsz))
 852                         return -EFAULT;
 853
 854                 if (info.argsz < minsz)
 855                         return -EINVAL;
 856
 857                 info.flags = 0;
 858
 859                 info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap;
 860
 861                 return copy_to_user((void __user *)arg, &info, minsz);
 862
 863         } else if (cmd == VFIO_IOMMU_MAP_DMA) {
 864                 struct vfio_iommu_type1_dma_map map;
 865                 uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
 866                                 VFIO_DMA_MAP_FLAG_WRITE;
 867
 868                 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 869
 870                 if (copy_from_user(&map, (void __user *)arg, minsz))
 871                         return -EFAULT;
 872
 873                 if (map.argsz < minsz || map.flags & ~mask)
 874                         return -EINVAL;
 875
 876                 return vfio_dma_do_map(iommu, &map);
 877
 878         } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
 879                 struct vfio_iommu_type1_dma_unmap unmap;
 880                 long ret;
 881
 882                 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 883
 884                 if (copy_from_user(&unmap, (void __user *)arg, minsz))
 885                         return -EFAULT;
 886
 887                 if (unmap.argsz < minsz || unmap.flags)
 888                         return -EINVAL;
 889
 890                 ret = vfio_dma_do_unmap(iommu, &unmap);
 891                 if (ret)
 892                         return ret;
 893
 894                 return copy_to_user((void __user *)arg, &unmap, minsz);
 895         }
 896
 897         return -ENOTTY;
 898 }
 899
/*
 * Callback table for this backend; handed to
 * vfio_register_iommu_driver() at module init.
 */
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
        .name           = "vfio-iommu-type1",
        .owner          = THIS_MODULE,
        .open           = vfio_iommu_type1_open,
        .release        = vfio_iommu_type1_release,
        .ioctl          = vfio_iommu_type1_ioctl,
        .attach_group   = vfio_iommu_type1_attach_group,
        .detach_group   = vfio_iommu_type1_detach_group,
};
 909
 910 static int __init vfio_iommu_type1_init(void)
 911 {
 912         if (!iommu_present(&pci_bus_type))
 913                 return -ENODEV;
 914
 915         return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
 916 }
 917
/* Module exit point: unregister the Type1 backend callback table. */
static void __exit vfio_iommu_type1_cleanup(void)
{
        vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}
 922
/* Hook the init/exit functions into module load and unload. */
module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

/* Module metadata, built from the DRIVER_* strings defined in this file. */
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);