i40e: Add define for interrupt name string len
[linux-2.6/btrfs-unstable.git] / drivers / iommu / intel-iommu.c
blob40dfbc0444c0eaccdeca8cebfcd5f567e3d84ae4
1 /*
2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
47 #include "irq_remapping.h"
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
/*
 * Device classification helpers.  The argument is a pointer expression to a
 * device structure exposing class/vendor/device fields (presumably
 * struct pci_dev).  It is parenthesized everywhere so that compound
 * expressions such as "p + 1" expand as intended.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
/*
 * Largest page frame number / largest address expressible with a guest
 * address width of 'gaw' bits.  'gaw' is parenthesized so that any
 * expression may be passed as the argument.
 */
#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
71 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
74 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
78 /* page table handling */
79 #define LEVEL_STRIDE (9)
80 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
83 * This bitmap is used to advertise the page sizes our hardware support
84 * to the IOMMU core, which will then use this information to split
85 * physically contiguous memory regions it is mapping into page sizes
86 * that we support.
88 * Traditionally the IOMMU core just handed us the mappings directly,
89 * after making sure the size is an order of a 4KiB page and that the
90 * mapping has natural alignment.
92 * To retain this behavior, we currently advertise that we support
93 * all page sizes that are an order of 4KiB.
95 * If at some point we'd like to utilize the IOMMU core's new behavior,
96 * we could change this to advertise the real page sizes we support.
98 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
/* Page-table depth for an AGAW index; index 0 corresponds to a 2-level table. */
static inline int agaw_to_level(int agaw)
{
	const int base_levels = 2;

	return base_levels + agaw;
}
105 static inline int agaw_to_width(int agaw)
107 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
110 static inline int width_to_agaw(int width)
112 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
115 static inline unsigned int level_to_offset_bits(int level)
117 return (level - 1) * LEVEL_STRIDE;
120 static inline int pfn_level_offset(unsigned long pfn, int level)
122 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
/* Mask that clears the PFN bits below the given level's index field. */
static inline unsigned long level_mask(int level)
{
	unsigned int low_bits = level_to_offset_bits(level);

	return -1UL << low_bits;
}
/* Number of PFNs spanned by a single entry at the given level. */
static inline unsigned long level_size(int level)
{
	unsigned int low_bits = level_to_offset_bits(level);

	return 1UL << low_bits;
}
/* Round 'pfn' up to the next boundary of an entry at the given level. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long span = level_size(level);

	return (pfn + span - 1) & level_mask(level);
}
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
142 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
145 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
146 are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
149 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
154 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
/* VT-d page frame number of the first VT-d page inside 'pg'. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number of the page backing kernel virtual address 'p'. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *backing = virt_to_page(p);

	return page_to_dma_pfn(backing);
}
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
172 * set to 1 to panic kernel if can't successfully enable VT-d
173 * (used when kernel is launched w/ TXT)
175 static int force_on = 0;
178 * 0: Present
179 * 1-11: Reserved
180 * 12-63: Context Ptr (12 - (haw-1))
181 * 64-127: Reserved
183 struct root_entry {
184 u64 val;
185 u64 rsvd1;
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 static inline bool root_present(struct root_entry *root)
190 return (root->val & 1);
192 static inline void set_root_present(struct root_entry *root)
194 root->val |= 1;
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
198 root->val &= ~VTD_PAGE_MASK;
199 root->val |= value & VTD_PAGE_MASK;
202 static inline struct context_entry *
203 get_context_addr_from_root(struct root_entry *root)
205 return (struct context_entry *)
206 (root_present(root)?phys_to_virt(
207 root->val & VTD_PAGE_MASK) :
208 NULL);
212 * low 64 bits:
213 * 0: present
214 * 1: fault processing disable
215 * 2-3: translation type
216 * 12-63: address space root
217 * high 64 bits:
218 * 0-2: address width
219 * 3-6: aval
220 * 8-23: domain id
222 struct context_entry {
223 u64 lo;
224 u64 hi;
227 static inline bool context_present(struct context_entry *context)
229 return (context->lo & 1);
231 static inline void context_set_present(struct context_entry *context)
233 context->lo |= 1;
236 static inline void context_set_fault_enable(struct context_entry *context)
238 context->lo &= (((u64)-1) << 2) | 1;
241 static inline void context_set_translation_type(struct context_entry *context,
242 unsigned long value)
244 context->lo &= (((u64)-1) << 4) | 3;
245 context->lo |= (value & 3) << 2;
248 static inline void context_set_address_root(struct context_entry *context,
249 unsigned long value)
251 context->lo &= ~VTD_PAGE_MASK;
252 context->lo |= value & VTD_PAGE_MASK;
255 static inline void context_set_address_width(struct context_entry *context,
256 unsigned long value)
258 context->hi |= value & 7;
261 static inline void context_set_domain_id(struct context_entry *context,
262 unsigned long value)
264 context->hi |= (value & ((1 << 16) - 1)) << 8;
267 static inline void context_clear_entry(struct context_entry *context)
269 context->lo = 0;
270 context->hi = 0;
274 * 0: readable
275 * 1: writable
276 * 2-6: reserved
277 * 7: super page
278 * 8-10: available
279 * 11: snoop behavior
280 * 12-63: Host physical address
282 struct dma_pte {
283 u64 val;
286 static inline void dma_clear_pte(struct dma_pte *pte)
288 pte->val = 0;
291 static inline u64 dma_pte_addr(struct dma_pte *pte)
293 #ifdef CONFIG_64BIT
294 return pte->val & VTD_PAGE_MASK;
295 #else
296 /* Must have a full atomic 64-bit read */
297 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
298 #endif
301 static inline bool dma_pte_present(struct dma_pte *pte)
303 return (pte->val & 3) != 0;
306 static inline bool dma_pte_superpage(struct dma_pte *pte)
308 return (pte->val & DMA_PTE_LARGE_PAGE);
311 static inline int first_pte_in_page(struct dma_pte *pte)
313 return !((unsigned long)pte & ~VTD_PAGE_MASK);
317 * This domain is a static identity-mapping domain.
318 * 1. This domain creates a static 1:1 mapping to all usable memory.
319 * 2. It maps to each iommu if successful.
320 * 3. Each iommu maps to this domain if successful.
322 static struct dmar_domain *si_domain;
323 static int hw_pass_through = 1;
325 /* domain represents a virtual machine, more than one devices
326 * across iommus may be owned in one domain, e.g. kvm guest.
328 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
330 /* si_domain contains multiple devices */
331 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
333 struct dmar_domain {
334 int id; /* domain id */
335 int nid; /* node id */
336 DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
337 /* bitmap of iommus this domain uses*/
339 struct list_head devices; /* all devices' list */
340 struct iova_domain iovad; /* iova's that belong to this domain */
342 struct dma_pte *pgd; /* virtual address */
343 int gaw; /* max guest address width */
345 /* adjusted guest address width, 0 is level 2 30-bit */
346 int agaw;
348 int flags; /* flags to find out type of domain */
350 int iommu_coherency;/* indicate coherency of iommu access */
351 int iommu_snooping; /* indicate snooping control feature*/
352 int iommu_count; /* reference count of iommu */
353 int iommu_superpage;/* Level of superpages supported:
354 0 == 4KiB (no superpages), 1 == 2MiB,
355 2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
356 spinlock_t iommu_lock; /* protect iommu set in domain */
357 u64 max_addr; /* maximum mapped address */
360 /* PCI domain-device relationship */
361 struct device_domain_info {
362 struct list_head link; /* link to domain siblings */
363 struct list_head global; /* link to global list */
364 u8 bus; /* PCI bus number */
365 u8 devfn; /* PCI devfn number */
366 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
367 struct intel_iommu *iommu; /* IOMMU used by this device */
368 struct dmar_domain *domain; /* pointer to domain */
371 struct dmar_rmrr_unit {
372 struct list_head list; /* list of rmrr units */
373 struct acpi_dmar_header *hdr; /* ACPI header */
374 u64 base_address; /* reserved base address*/
375 u64 end_address; /* reserved end address */
376 struct dmar_dev_scope *devices; /* target devices */
377 int devices_cnt; /* target device count */
380 struct dmar_atsr_unit {
381 struct list_head list; /* list of ATSR units */
382 struct acpi_dmar_header *hdr; /* ACPI header */
383 struct dmar_dev_scope *devices; /* target devices */
384 int devices_cnt; /* target device count */
385 u8 include_all:1; /* include all ports */
388 static LIST_HEAD(dmar_atsr_units);
389 static LIST_HEAD(dmar_rmrr_units);
391 #define for_each_rmrr_units(rmrr) \
392 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
394 static void flush_unmaps_timeout(unsigned long data);
396 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
398 #define HIGH_WATER_MARK 250
399 struct deferred_flush_tables {
400 int next;
401 struct iova *iova[HIGH_WATER_MARK];
402 struct dmar_domain *domain[HIGH_WATER_MARK];
403 struct page *freelist[HIGH_WATER_MARK];
406 static struct deferred_flush_tables *deferred_flush;
408 /* bitmap for indexing intel_iommus */
409 static int g_num_of_iommus;
411 static DEFINE_SPINLOCK(async_umap_flush_lock);
412 static LIST_HEAD(unmaps_to_do);
414 static int timer_on;
415 static long list_size;
417 static void domain_exit(struct dmar_domain *domain);
418 static void domain_remove_dev_info(struct dmar_domain *domain);
419 static void domain_remove_one_dev_info(struct dmar_domain *domain,
420 struct device *dev);
421 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
422 struct device *dev);
423 static int domain_detach_iommu(struct dmar_domain *domain,
424 struct intel_iommu *iommu);
426 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
427 int dmar_disabled = 0;
428 #else
429 int dmar_disabled = 1;
430 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
432 int intel_iommu_enabled = 0;
433 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
435 static int dmar_map_gfx = 1;
436 static int dmar_forcedac;
437 static int intel_iommu_strict;
438 static int intel_iommu_superpage = 1;
440 int intel_iommu_gfx_mapped;
441 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
443 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
444 static DEFINE_SPINLOCK(device_domain_lock);
445 static LIST_HEAD(device_domain_list);
447 static const struct iommu_ops intel_iommu_ops;
449 static int __init intel_iommu_setup(char *str)
451 if (!str)
452 return -EINVAL;
453 while (*str) {
454 if (!strncmp(str, "on", 2)) {
455 dmar_disabled = 0;
456 printk(KERN_INFO "Intel-IOMMU: enabled\n");
457 } else if (!strncmp(str, "off", 3)) {
458 dmar_disabled = 1;
459 printk(KERN_INFO "Intel-IOMMU: disabled\n");
460 } else if (!strncmp(str, "igfx_off", 8)) {
461 dmar_map_gfx = 0;
462 printk(KERN_INFO
463 "Intel-IOMMU: disable GFX device mapping\n");
464 } else if (!strncmp(str, "forcedac", 8)) {
465 printk(KERN_INFO
466 "Intel-IOMMU: Forcing DAC for PCI devices\n");
467 dmar_forcedac = 1;
468 } else if (!strncmp(str, "strict", 6)) {
469 printk(KERN_INFO
470 "Intel-IOMMU: disable batched IOTLB flush\n");
471 intel_iommu_strict = 1;
472 } else if (!strncmp(str, "sp_off", 6)) {
473 printk(KERN_INFO
474 "Intel-IOMMU: disable supported super page\n");
475 intel_iommu_superpage = 0;
478 str += strcspn(str, ",");
479 while (*str == ',')
480 str++;
482 return 0;
484 __setup("intel_iommu=", intel_iommu_setup);
486 static struct kmem_cache *iommu_domain_cache;
487 static struct kmem_cache *iommu_devinfo_cache;
488 static struct kmem_cache *iommu_iova_cache;
490 static inline void *alloc_pgtable_page(int node)
492 struct page *page;
493 void *vaddr = NULL;
495 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
496 if (page)
497 vaddr = page_address(page);
498 return vaddr;
/* Release a page obtained from alloc_pgtable_page(), given its virtual address. */
static inline void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
506 static inline void *alloc_domain_mem(void)
508 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 static void free_domain_mem(void *vaddr)
513 kmem_cache_free(iommu_domain_cache, vaddr);
516 static inline void * alloc_devinfo_mem(void)
518 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 static inline void free_devinfo_mem(void *vaddr)
523 kmem_cache_free(iommu_devinfo_cache, vaddr);
526 struct iova *alloc_iova_mem(void)
528 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 void free_iova_mem(struct iova *iova)
533 kmem_cache_free(iommu_iova_cache, iova);
536 static inline int domain_type_is_vm(struct dmar_domain *domain)
538 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
541 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
543 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
544 DOMAIN_FLAG_STATIC_IDENTITY);
547 static inline int domain_pfn_supported(struct dmar_domain *domain,
548 unsigned long pfn)
550 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
552 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
555 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
557 unsigned long sagaw;
558 int agaw = -1;
560 sagaw = cap_sagaw(iommu->cap);
561 for (agaw = width_to_agaw(max_gaw);
562 agaw >= 0; agaw--) {
563 if (test_bit(agaw, &sagaw))
564 break;
567 return agaw;
571 * Calculate max SAGAW for each iommu.
573 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
575 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
579 * calculate agaw for each iommu.
580 * "SAGAW" may be different across iommus, use a default agaw, and
581 * get a supported less agaw for iommus that don't support the default agaw.
583 int iommu_calculate_agaw(struct intel_iommu *iommu)
585 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
588 /* This functionin only returns single iommu in a domain */
589 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
591 int iommu_id;
593 /* si_domain and vm domain should not get here. */
594 BUG_ON(domain_type_is_vm_or_si(domain));
595 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
596 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
597 return NULL;
599 return g_iommus[iommu_id];
602 static void domain_update_iommu_coherency(struct dmar_domain *domain)
604 struct dmar_drhd_unit *drhd;
605 struct intel_iommu *iommu;
606 int i, found = 0;
608 domain->iommu_coherency = 1;
610 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
611 found = 1;
612 if (!ecap_coherent(g_iommus[i]->ecap)) {
613 domain->iommu_coherency = 0;
614 break;
617 if (found)
618 return;
620 /* No hardware attached; use lowest common denominator */
621 rcu_read_lock();
622 for_each_active_iommu(iommu, drhd) {
623 if (!ecap_coherent(iommu->ecap)) {
624 domain->iommu_coherency = 0;
625 break;
628 rcu_read_unlock();
631 static int domain_update_iommu_snooping(struct intel_iommu *skip)
633 struct dmar_drhd_unit *drhd;
634 struct intel_iommu *iommu;
635 int ret = 1;
637 rcu_read_lock();
638 for_each_active_iommu(iommu, drhd) {
639 if (iommu != skip) {
640 if (!ecap_sc_support(iommu->ecap)) {
641 ret = 0;
642 break;
646 rcu_read_unlock();
648 return ret;
651 static int domain_update_iommu_superpage(struct intel_iommu *skip)
653 struct dmar_drhd_unit *drhd;
654 struct intel_iommu *iommu;
655 int mask = 0xf;
657 if (!intel_iommu_superpage) {
658 return 0;
661 /* set iommu_superpage to the smallest common denominator */
662 rcu_read_lock();
663 for_each_active_iommu(iommu, drhd) {
664 if (iommu != skip) {
665 mask &= cap_super_page_val(iommu->cap);
666 if (!mask)
667 break;
670 rcu_read_unlock();
672 return fls(mask);
675 /* Some capabilities may be different across iommus */
676 static void domain_update_iommu_cap(struct dmar_domain *domain)
678 domain_update_iommu_coherency(domain);
679 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
680 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
685 struct dmar_drhd_unit *drhd = NULL;
686 struct intel_iommu *iommu;
687 struct device *tmp;
688 struct pci_dev *ptmp, *pdev = NULL;
689 u16 segment = 0;
690 int i;
692 if (dev_is_pci(dev)) {
693 pdev = to_pci_dev(dev);
694 segment = pci_domain_nr(pdev->bus);
695 } else if (ACPI_COMPANION(dev))
696 dev = &ACPI_COMPANION(dev)->dev;
698 rcu_read_lock();
699 for_each_active_iommu(iommu, drhd) {
700 if (pdev && segment != drhd->segment)
701 continue;
703 for_each_active_dev_scope(drhd->devices,
704 drhd->devices_cnt, i, tmp) {
705 if (tmp == dev) {
706 *bus = drhd->devices[i].bus;
707 *devfn = drhd->devices[i].devfn;
708 goto out;
711 if (!pdev || !dev_is_pci(tmp))
712 continue;
714 ptmp = to_pci_dev(tmp);
715 if (ptmp->subordinate &&
716 ptmp->subordinate->number <= pdev->bus->number &&
717 ptmp->subordinate->busn_res.end >= pdev->bus->number)
718 goto got_pdev;
721 if (pdev && drhd->include_all) {
722 got_pdev:
723 *bus = pdev->bus->number;
724 *devfn = pdev->devfn;
725 goto out;
728 iommu = NULL;
729 out:
730 rcu_read_unlock();
732 return iommu;
735 static void domain_flush_cache(struct dmar_domain *domain,
736 void *addr, int size)
738 if (!domain->iommu_coherency)
739 clflush_cache_range(addr, size);
742 /* Gets context entry for a given bus and devfn */
743 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
744 u8 bus, u8 devfn)
746 struct root_entry *root;
747 struct context_entry *context;
748 unsigned long phy_addr;
749 unsigned long flags;
751 spin_lock_irqsave(&iommu->lock, flags);
752 root = &iommu->root_entry[bus];
753 context = get_context_addr_from_root(root);
754 if (!context) {
755 context = (struct context_entry *)
756 alloc_pgtable_page(iommu->node);
757 if (!context) {
758 spin_unlock_irqrestore(&iommu->lock, flags);
759 return NULL;
761 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
762 phy_addr = virt_to_phys((void *)context);
763 set_root_value(root, phy_addr);
764 set_root_present(root);
765 __iommu_flush_cache(iommu, root, sizeof(*root));
767 spin_unlock_irqrestore(&iommu->lock, flags);
768 return &context[devfn];
771 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
773 struct root_entry *root;
774 struct context_entry *context;
775 int ret;
776 unsigned long flags;
778 spin_lock_irqsave(&iommu->lock, flags);
779 root = &iommu->root_entry[bus];
780 context = get_context_addr_from_root(root);
781 if (!context) {
782 ret = 0;
783 goto out;
785 ret = context_present(&context[devfn]);
786 out:
787 spin_unlock_irqrestore(&iommu->lock, flags);
788 return ret;
791 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
793 struct root_entry *root;
794 struct context_entry *context;
795 unsigned long flags;
797 spin_lock_irqsave(&iommu->lock, flags);
798 root = &iommu->root_entry[bus];
799 context = get_context_addr_from_root(root);
800 if (context) {
801 context_clear_entry(&context[devfn]);
802 __iommu_flush_cache(iommu, &context[devfn], \
803 sizeof(*context));
805 spin_unlock_irqrestore(&iommu->lock, flags);
808 static void free_context_table(struct intel_iommu *iommu)
810 struct root_entry *root;
811 int i;
812 unsigned long flags;
813 struct context_entry *context;
815 spin_lock_irqsave(&iommu->lock, flags);
816 if (!iommu->root_entry) {
817 goto out;
819 for (i = 0; i < ROOT_ENTRY_NR; i++) {
820 root = &iommu->root_entry[i];
821 context = get_context_addr_from_root(root);
822 if (context)
823 free_pgtable_page(context);
825 free_pgtable_page(iommu->root_entry);
826 iommu->root_entry = NULL;
827 out:
828 spin_unlock_irqrestore(&iommu->lock, flags);
831 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
832 unsigned long pfn, int *target_level)
834 struct dma_pte *parent, *pte = NULL;
835 int level = agaw_to_level(domain->agaw);
836 int offset;
838 BUG_ON(!domain->pgd);
840 if (!domain_pfn_supported(domain, pfn))
841 /* Address beyond IOMMU's addressing capabilities. */
842 return NULL;
844 parent = domain->pgd;
846 while (1) {
847 void *tmp_page;
849 offset = pfn_level_offset(pfn, level);
850 pte = &parent[offset];
851 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
852 break;
853 if (level == *target_level)
854 break;
856 if (!dma_pte_present(pte)) {
857 uint64_t pteval;
859 tmp_page = alloc_pgtable_page(domain->nid);
861 if (!tmp_page)
862 return NULL;
864 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
865 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
866 if (cmpxchg64(&pte->val, 0ULL, pteval))
867 /* Someone else set it while we were thinking; use theirs. */
868 free_pgtable_page(tmp_page);
869 else
870 domain_flush_cache(domain, pte, sizeof(*pte));
872 if (level == 1)
873 break;
875 parent = phys_to_virt(dma_pte_addr(pte));
876 level--;
879 if (!*target_level)
880 *target_level = level;
882 return pte;
886 /* return address's pte at specific level */
887 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
888 unsigned long pfn,
889 int level, int *large_page)
891 struct dma_pte *parent, *pte = NULL;
892 int total = agaw_to_level(domain->agaw);
893 int offset;
895 parent = domain->pgd;
896 while (level <= total) {
897 offset = pfn_level_offset(pfn, total);
898 pte = &parent[offset];
899 if (level == total)
900 return pte;
902 if (!dma_pte_present(pte)) {
903 *large_page = total;
904 break;
907 if (dma_pte_superpage(pte)) {
908 *large_page = total;
909 return pte;
912 parent = phys_to_virt(dma_pte_addr(pte));
913 total--;
915 return NULL;
918 /* clear last level pte, a tlb flush should be followed */
919 static void dma_pte_clear_range(struct dmar_domain *domain,
920 unsigned long start_pfn,
921 unsigned long last_pfn)
923 unsigned int large_page = 1;
924 struct dma_pte *first_pte, *pte;
926 BUG_ON(!domain_pfn_supported(domain, start_pfn));
927 BUG_ON(!domain_pfn_supported(domain, last_pfn));
928 BUG_ON(start_pfn > last_pfn);
930 /* we don't need lock here; nobody else touches the iova range */
931 do {
932 large_page = 1;
933 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
934 if (!pte) {
935 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
936 continue;
938 do {
939 dma_clear_pte(pte);
940 start_pfn += lvl_to_nr_pages(large_page);
941 pte++;
942 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
944 domain_flush_cache(domain, first_pte,
945 (void *)pte - (void *)first_pte);
947 } while (start_pfn && start_pfn <= last_pfn);
950 static void dma_pte_free_level(struct dmar_domain *domain, int level,
951 struct dma_pte *pte, unsigned long pfn,
952 unsigned long start_pfn, unsigned long last_pfn)
954 pfn = max(start_pfn, pfn);
955 pte = &pte[pfn_level_offset(pfn, level)];
957 do {
958 unsigned long level_pfn;
959 struct dma_pte *level_pte;
961 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
962 goto next;
964 level_pfn = pfn & level_mask(level - 1);
965 level_pte = phys_to_virt(dma_pte_addr(pte));
967 if (level > 2)
968 dma_pte_free_level(domain, level - 1, level_pte,
969 level_pfn, start_pfn, last_pfn);
971 /* If range covers entire pagetable, free it */
972 if (!(start_pfn > level_pfn ||
973 last_pfn < level_pfn + level_size(level) - 1)) {
974 dma_clear_pte(pte);
975 domain_flush_cache(domain, pte, sizeof(*pte));
976 free_pgtable_page(level_pte);
978 next:
979 pfn += level_size(level);
980 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
983 /* free page table pages. last level pte should already be cleared */
984 static void dma_pte_free_pagetable(struct dmar_domain *domain,
985 unsigned long start_pfn,
986 unsigned long last_pfn)
988 BUG_ON(!domain_pfn_supported(domain, start_pfn));
989 BUG_ON(!domain_pfn_supported(domain, last_pfn));
990 BUG_ON(start_pfn > last_pfn);
992 dma_pte_clear_range(domain, start_pfn, last_pfn);
994 /* We don't need lock here; nobody else touches the iova range */
995 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
996 domain->pgd, 0, start_pfn, last_pfn);
998 /* free pgd */
999 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1000 free_pgtable_page(domain->pgd);
1001 domain->pgd = NULL;
1005 /* When a page at a given level is being unlinked from its parent, we don't
1006 need to *modify* it at all. All we need to do is make a list of all the
1007 pages which can be freed just as soon as we've flushed the IOTLB and we
1008 know the hardware page-walk will no longer touch them.
1009 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1010 be freed. */
1011 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1012 int level, struct dma_pte *pte,
1013 struct page *freelist)
1015 struct page *pg;
1017 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1018 pg->freelist = freelist;
1019 freelist = pg;
1021 if (level == 1)
1022 return freelist;
1024 pte = page_address(pg);
1025 do {
1026 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1027 freelist = dma_pte_list_pagetables(domain, level - 1,
1028 pte, freelist);
1029 pte++;
1030 } while (!first_pte_in_page(pte));
1032 return freelist;
1035 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1036 struct dma_pte *pte, unsigned long pfn,
1037 unsigned long start_pfn,
1038 unsigned long last_pfn,
1039 struct page *freelist)
1041 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1043 pfn = max(start_pfn, pfn);
1044 pte = &pte[pfn_level_offset(pfn, level)];
1046 do {
1047 unsigned long level_pfn;
1049 if (!dma_pte_present(pte))
1050 goto next;
1052 level_pfn = pfn & level_mask(level);
1054 /* If range covers entire pagetable, free it */
1055 if (start_pfn <= level_pfn &&
1056 last_pfn >= level_pfn + level_size(level) - 1) {
1057 /* These suborbinate page tables are going away entirely. Don't
1058 bother to clear them; we're just going to *free* them. */
1059 if (level > 1 && !dma_pte_superpage(pte))
1060 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1062 dma_clear_pte(pte);
1063 if (!first_pte)
1064 first_pte = pte;
1065 last_pte = pte;
1066 } else if (level > 1) {
1067 /* Recurse down into a level that isn't *entirely* obsolete */
1068 freelist = dma_pte_clear_level(domain, level - 1,
1069 phys_to_virt(dma_pte_addr(pte)),
1070 level_pfn, start_pfn, last_pfn,
1071 freelist);
1073 next:
1074 pfn += level_size(level);
1075 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1077 if (first_pte)
1078 domain_flush_cache(domain, first_pte,
1079 (void *)++last_pte - (void *)first_pte);
1081 return freelist;
1084 /* We can't just free the pages because the IOMMU may still be walking
1085 the page tables, and may have cached the intermediate levels. The
1086 pages can only be freed after the IOTLB flush has been done. */
1087 struct page *domain_unmap(struct dmar_domain *domain,
1088 unsigned long start_pfn,
1089 unsigned long last_pfn)
1091 struct page *freelist = NULL;
1093 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1094 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1095 BUG_ON(start_pfn > last_pfn);
1097 /* we don't need lock here; nobody else touches the iova range */
1098 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1099 domain->pgd, 0, start_pfn, last_pfn, NULL);
1101 /* free pgd */
1102 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1103 struct page *pgd_page = virt_to_page(domain->pgd);
1104 pgd_page->freelist = freelist;
1105 freelist = pgd_page;
1107 domain->pgd = NULL;
1110 return freelist;
1113 void dma_free_pagelist(struct page *freelist)
1115 struct page *pg;
1117 while ((pg = freelist)) {
1118 freelist = pg->freelist;
1119 free_pgtable_page(page_address(pg));
1123 /* iommu handling */
1124 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1126 struct root_entry *root;
1127 unsigned long flags;
1129 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1130 if (!root) {
1131 pr_err("IOMMU: allocating root entry for %s failed\n",
1132 iommu->name);
1133 return -ENOMEM;
1136 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1138 spin_lock_irqsave(&iommu->lock, flags);
1139 iommu->root_entry = root;
1140 spin_unlock_irqrestore(&iommu->lock, flags);
1142 return 0;
1145 static void iommu_set_root_entry(struct intel_iommu *iommu)
1147 void *addr;
1148 u32 sts;
1149 unsigned long flag;
1151 addr = iommu->root_entry;
1153 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1154 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1156 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1158 /* Make sure hardware complete it */
1159 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1160 readl, (sts & DMA_GSTS_RTPS), sts);
1162 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1165 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1167 u32 val;
1168 unsigned long flag;
1170 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1171 return;
1173 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1174 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1176 /* Make sure hardware complete it */
1177 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1178 readl, (!(val & DMA_GSTS_WBFS)), val);
1180 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1183 /* return value determine if we need a write buffer flush */
1184 static void __iommu_flush_context(struct intel_iommu *iommu,
1185 u16 did, u16 source_id, u8 function_mask,
1186 u64 type)
1188 u64 val = 0;
1189 unsigned long flag;
1191 switch (type) {
1192 case DMA_CCMD_GLOBAL_INVL:
1193 val = DMA_CCMD_GLOBAL_INVL;
1194 break;
1195 case DMA_CCMD_DOMAIN_INVL:
1196 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1197 break;
1198 case DMA_CCMD_DEVICE_INVL:
1199 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1200 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1201 break;
1202 default:
1203 BUG();
1205 val |= DMA_CCMD_ICC;
1207 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1208 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1210 /* Make sure hardware complete it */
1211 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1212 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1214 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1217 /* return value determine if we need a write buffer flush */
1218 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1219 u64 addr, unsigned int size_order, u64 type)
1221 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1222 u64 val = 0, val_iva = 0;
1223 unsigned long flag;
1225 switch (type) {
1226 case DMA_TLB_GLOBAL_FLUSH:
1227 /* global flush doesn't need set IVA_REG */
1228 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1229 break;
1230 case DMA_TLB_DSI_FLUSH:
1231 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1232 break;
1233 case DMA_TLB_PSI_FLUSH:
1234 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1235 /* IH bit is passed in as part of address */
1236 val_iva = size_order | addr;
1237 break;
1238 default:
1239 BUG();
1241 /* Note: set drain read/write */
1242 #if 0
1244 * This is probably to be super secure.. Looks like we can
1245 * ignore it without any impact.
1247 if (cap_read_drain(iommu->cap))
1248 val |= DMA_TLB_READ_DRAIN;
1249 #endif
1250 if (cap_write_drain(iommu->cap))
1251 val |= DMA_TLB_WRITE_DRAIN;
1253 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1254 /* Note: Only uses first TLB reg currently */
1255 if (val_iva)
1256 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1257 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1259 /* Make sure hardware complete it */
1260 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1261 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1263 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1265 /* check IOTLB invalidation granularity */
1266 if (DMA_TLB_IAIG(val) == 0)
1267 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1268 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1269 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1270 (unsigned long long)DMA_TLB_IIRG(type),
1271 (unsigned long long)DMA_TLB_IAIG(val));
1274 static struct device_domain_info *
1275 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1276 u8 bus, u8 devfn)
1278 int found = 0;
1279 unsigned long flags;
1280 struct device_domain_info *info;
1281 struct pci_dev *pdev;
1283 if (!ecap_dev_iotlb_support(iommu->ecap))
1284 return NULL;
1286 if (!iommu->qi)
1287 return NULL;
1289 spin_lock_irqsave(&device_domain_lock, flags);
1290 list_for_each_entry(info, &domain->devices, link)
1291 if (info->iommu == iommu && info->bus == bus &&
1292 info->devfn == devfn) {
1293 found = 1;
1294 break;
1296 spin_unlock_irqrestore(&device_domain_lock, flags);
1298 if (!found || !info->dev || !dev_is_pci(info->dev))
1299 return NULL;
1301 pdev = to_pci_dev(info->dev);
1303 if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1304 return NULL;
1306 if (!dmar_find_matched_atsr_unit(pdev))
1307 return NULL;
1309 return info;
1312 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1314 if (!info || !dev_is_pci(info->dev))
1315 return;
1317 pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1320 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1322 if (!info->dev || !dev_is_pci(info->dev) ||
1323 !pci_ats_enabled(to_pci_dev(info->dev)))
1324 return;
1326 pci_disable_ats(to_pci_dev(info->dev));
1329 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1330 u64 addr, unsigned mask)
1332 u16 sid, qdep;
1333 unsigned long flags;
1334 struct device_domain_info *info;
1336 spin_lock_irqsave(&device_domain_lock, flags);
1337 list_for_each_entry(info, &domain->devices, link) {
1338 struct pci_dev *pdev;
1339 if (!info->dev || !dev_is_pci(info->dev))
1340 continue;
1342 pdev = to_pci_dev(info->dev);
1343 if (!pci_ats_enabled(pdev))
1344 continue;
1346 sid = info->bus << 8 | info->devfn;
1347 qdep = pci_ats_queue_depth(pdev);
1348 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1350 spin_unlock_irqrestore(&device_domain_lock, flags);
1353 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1354 unsigned long pfn, unsigned int pages, int ih, int map)
1356 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1357 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1359 BUG_ON(pages == 0);
1361 if (ih)
1362 ih = 1 << 6;
1364 * Fallback to domain selective flush if no PSI support or the size is
1365 * too big.
1366 * PSI requires page size to be 2 ^ x, and the base address is naturally
1367 * aligned to the size
1369 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1370 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1371 DMA_TLB_DSI_FLUSH);
1372 else
1373 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1374 DMA_TLB_PSI_FLUSH);
1377 * In caching mode, changes of pages from non-present to present require
1378 * flush. However, device IOTLB doesn't need to be flushed in this case.
1380 if (!cap_caching_mode(iommu->cap) || !map)
1381 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1384 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1386 u32 pmen;
1387 unsigned long flags;
1389 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1390 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1391 pmen &= ~DMA_PMEN_EPM;
1392 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1394 /* wait for the protected region status bit to clear */
1395 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1396 readl, !(pmen & DMA_PMEN_PRS), pmen);
1398 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1401 static void iommu_enable_translation(struct intel_iommu *iommu)
1403 u32 sts;
1404 unsigned long flags;
1406 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1407 iommu->gcmd |= DMA_GCMD_TE;
1408 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1410 /* Make sure hardware complete it */
1411 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1412 readl, (sts & DMA_GSTS_TES), sts);
1414 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1417 static void iommu_disable_translation(struct intel_iommu *iommu)
1419 u32 sts;
1420 unsigned long flag;
1422 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423 iommu->gcmd &= ~DMA_GCMD_TE;
1424 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1426 /* Make sure hardware complete it */
1427 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1428 readl, (!(sts & DMA_GSTS_TES)), sts);
1430 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1434 static int iommu_init_domains(struct intel_iommu *iommu)
1436 unsigned long ndomains;
1437 unsigned long nlongs;
1439 ndomains = cap_ndoms(iommu->cap);
1440 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1441 iommu->seq_id, ndomains);
1442 nlongs = BITS_TO_LONGS(ndomains);
1444 spin_lock_init(&iommu->lock);
1446 /* TBD: there might be 64K domains,
1447 * consider other allocation for future chip
1449 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1450 if (!iommu->domain_ids) {
1451 pr_err("IOMMU%d: allocating domain id array failed\n",
1452 iommu->seq_id);
1453 return -ENOMEM;
1455 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1456 GFP_KERNEL);
1457 if (!iommu->domains) {
1458 pr_err("IOMMU%d: allocating domain array failed\n",
1459 iommu->seq_id);
1460 kfree(iommu->domain_ids);
1461 iommu->domain_ids = NULL;
1462 return -ENOMEM;
1466 * if Caching mode is set, then invalid translations are tagged
1467 * with domainid 0. Hence we need to pre-allocate it.
1469 if (cap_caching_mode(iommu->cap))
1470 set_bit(0, iommu->domain_ids);
1471 return 0;
1474 static void disable_dmar_iommu(struct intel_iommu *iommu)
1476 struct dmar_domain *domain;
1477 int i;
1479 if ((iommu->domains) && (iommu->domain_ids)) {
1480 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1482 * Domain id 0 is reserved for invalid translation
1483 * if hardware supports caching mode.
1485 if (cap_caching_mode(iommu->cap) && i == 0)
1486 continue;
1488 domain = iommu->domains[i];
1489 clear_bit(i, iommu->domain_ids);
1490 if (domain_detach_iommu(domain, iommu) == 0 &&
1491 !domain_type_is_vm(domain))
1492 domain_exit(domain);
1496 if (iommu->gcmd & DMA_GCMD_TE)
1497 iommu_disable_translation(iommu);
1500 static void free_dmar_iommu(struct intel_iommu *iommu)
1502 if ((iommu->domains) && (iommu->domain_ids)) {
1503 kfree(iommu->domains);
1504 kfree(iommu->domain_ids);
1505 iommu->domains = NULL;
1506 iommu->domain_ids = NULL;
1509 g_iommus[iommu->seq_id] = NULL;
1511 /* free context mapping */
1512 free_context_table(iommu);
1515 static struct dmar_domain *alloc_domain(int flags)
1517 /* domain id for virtual machine, it won't be set in context */
1518 static atomic_t vm_domid = ATOMIC_INIT(0);
1519 struct dmar_domain *domain;
1521 domain = alloc_domain_mem();
1522 if (!domain)
1523 return NULL;
1525 memset(domain, 0, sizeof(*domain));
1526 domain->nid = -1;
1527 domain->flags = flags;
1528 spin_lock_init(&domain->iommu_lock);
1529 INIT_LIST_HEAD(&domain->devices);
1530 if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1531 domain->id = atomic_inc_return(&vm_domid);
1533 return domain;
1536 static int __iommu_attach_domain(struct dmar_domain *domain,
1537 struct intel_iommu *iommu)
1539 int num;
1540 unsigned long ndomains;
1542 ndomains = cap_ndoms(iommu->cap);
1543 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1544 if (num < ndomains) {
1545 set_bit(num, iommu->domain_ids);
1546 iommu->domains[num] = domain;
1547 } else {
1548 num = -ENOSPC;
1551 return num;
1554 static int iommu_attach_domain(struct dmar_domain *domain,
1555 struct intel_iommu *iommu)
1557 int num;
1558 unsigned long flags;
1560 spin_lock_irqsave(&iommu->lock, flags);
1561 num = __iommu_attach_domain(domain, iommu);
1562 spin_unlock_irqrestore(&iommu->lock, flags);
1563 if (num < 0)
1564 pr_err("IOMMU: no free domain ids\n");
1566 return num;
1569 static int iommu_attach_vm_domain(struct dmar_domain *domain,
1570 struct intel_iommu *iommu)
1572 int num;
1573 unsigned long ndomains;
1575 ndomains = cap_ndoms(iommu->cap);
1576 for_each_set_bit(num, iommu->domain_ids, ndomains)
1577 if (iommu->domains[num] == domain)
1578 return num;
1580 return __iommu_attach_domain(domain, iommu);
1583 static void iommu_detach_domain(struct dmar_domain *domain,
1584 struct intel_iommu *iommu)
1586 unsigned long flags;
1587 int num, ndomains;
1589 spin_lock_irqsave(&iommu->lock, flags);
1590 if (domain_type_is_vm_or_si(domain)) {
1591 ndomains = cap_ndoms(iommu->cap);
1592 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1593 if (iommu->domains[num] == domain) {
1594 clear_bit(num, iommu->domain_ids);
1595 iommu->domains[num] = NULL;
1596 break;
1599 } else {
1600 clear_bit(domain->id, iommu->domain_ids);
1601 iommu->domains[domain->id] = NULL;
1603 spin_unlock_irqrestore(&iommu->lock, flags);
1606 static void domain_attach_iommu(struct dmar_domain *domain,
1607 struct intel_iommu *iommu)
1609 unsigned long flags;
1611 spin_lock_irqsave(&domain->iommu_lock, flags);
1612 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1613 domain->iommu_count++;
1614 if (domain->iommu_count == 1)
1615 domain->nid = iommu->node;
1616 domain_update_iommu_cap(domain);
1618 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1621 static int domain_detach_iommu(struct dmar_domain *domain,
1622 struct intel_iommu *iommu)
1624 unsigned long flags;
1625 int count = INT_MAX;
1627 spin_lock_irqsave(&domain->iommu_lock, flags);
1628 if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1629 count = --domain->iommu_count;
1630 domain_update_iommu_cap(domain);
1632 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1634 return count;
1637 static struct iova_domain reserved_iova_list;
1638 static struct lock_class_key reserved_rbtree_key;
1640 static int dmar_init_reserved_ranges(void)
1642 struct pci_dev *pdev = NULL;
1643 struct iova *iova;
1644 int i;
1646 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1648 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1649 &reserved_rbtree_key);
1651 /* IOAPIC ranges shouldn't be accessed by DMA */
1652 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1653 IOVA_PFN(IOAPIC_RANGE_END));
1654 if (!iova) {
1655 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1656 return -ENODEV;
1659 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1660 for_each_pci_dev(pdev) {
1661 struct resource *r;
1663 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1664 r = &pdev->resource[i];
1665 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1666 continue;
1667 iova = reserve_iova(&reserved_iova_list,
1668 IOVA_PFN(r->start),
1669 IOVA_PFN(r->end));
1670 if (!iova) {
1671 printk(KERN_ERR "Reserve iova failed\n");
1672 return -ENODEV;
1676 return 0;
1679 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1681 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * k, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int width = rem ? gaw + 9 - rem : gaw;

	return width > 64 ? 64 : width;
}
1698 static int domain_init(struct dmar_domain *domain, int guest_width)
1700 struct intel_iommu *iommu;
1701 int adjust_width, agaw;
1702 unsigned long sagaw;
1704 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1705 domain_reserve_special_ranges(domain);
1707 /* calculate AGAW */
1708 iommu = domain_get_iommu(domain);
1709 if (guest_width > cap_mgaw(iommu->cap))
1710 guest_width = cap_mgaw(iommu->cap);
1711 domain->gaw = guest_width;
1712 adjust_width = guestwidth_to_adjustwidth(guest_width);
1713 agaw = width_to_agaw(adjust_width);
1714 sagaw = cap_sagaw(iommu->cap);
1715 if (!test_bit(agaw, &sagaw)) {
1716 /* hardware doesn't support it, choose a bigger one */
1717 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1718 agaw = find_next_bit(&sagaw, 5, agaw);
1719 if (agaw >= 5)
1720 return -ENODEV;
1722 domain->agaw = agaw;
1724 if (ecap_coherent(iommu->ecap))
1725 domain->iommu_coherency = 1;
1726 else
1727 domain->iommu_coherency = 0;
1729 if (ecap_sc_support(iommu->ecap))
1730 domain->iommu_snooping = 1;
1731 else
1732 domain->iommu_snooping = 0;
1734 if (intel_iommu_superpage)
1735 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1736 else
1737 domain->iommu_superpage = 0;
1739 domain->nid = iommu->node;
1741 /* always allocate the top pgd */
1742 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1743 if (!domain->pgd)
1744 return -ENOMEM;
1745 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1746 return 0;
1749 static void domain_exit(struct dmar_domain *domain)
1751 struct dmar_drhd_unit *drhd;
1752 struct intel_iommu *iommu;
1753 struct page *freelist = NULL;
1755 /* Domain 0 is reserved, so dont process it */
1756 if (!domain)
1757 return;
1759 /* Flush any lazy unmaps that may reference this domain */
1760 if (!intel_iommu_strict)
1761 flush_unmaps_timeout(0);
1763 /* remove associated devices */
1764 domain_remove_dev_info(domain);
1766 /* destroy iovas */
1767 put_iova_domain(&domain->iovad);
1769 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1771 /* clear attached or cached domains */
1772 rcu_read_lock();
1773 for_each_active_iommu(iommu, drhd)
1774 iommu_detach_domain(domain, iommu);
1775 rcu_read_unlock();
1777 dma_free_pagelist(freelist);
1779 free_domain_mem(domain);
/*
 * domain_context_mapping_one - install the context entry for one
 * (bus, devfn) on @iommu so that it refers to @domain.
 *
 * @translation must be CONTEXT_TT_PASS_THROUGH or CONTEXT_TT_MULTI_LEVEL
 * (anything else is a BUG, as is a domain without a page directory).  For
 * a non-pass-through mapping the type written to the entry becomes
 * CONTEXT_TT_DEV_IOTLB when iommu_support_dev_iotlb() finds a usable
 * device, otherwise CONTEXT_TT_MULTI_LEVEL.
 *
 * Return values visible in this function:
 *   0        the entry was programmed, or it was already present
 *   -ENOMEM  no context entry could be obtained, or a page-table level
 *            that had to be skipped was not present
 *   -EFAULT  a VM domain could not get a domain id on this IOMMU
 *
 * After the entry is written it is flushed with domain_flush_cache().  In
 * caching mode a device-selective context invalidation (domain id 0) and a
 * domain-selective IOTLB flush follow; otherwise only the write buffer is
 * flushed.  Finally the device IOTLB is enabled (no-op for NULL info) and
 * the IOMMU is recorded on the domain.
 *
 * Everything between taking and releasing iommu->lock runs with interrupts
 * disabled.
 */
1782 static int domain_context_mapping_one(struct dmar_domain *domain,
1783 struct intel_iommu *iommu,
1784 u8 bus, u8 devfn, int translation)
1786 struct context_entry *context;
1787 unsigned long flags;
1788 struct dma_pte *pgd;
1789 int id;
1790 int agaw;
1791 struct device_domain_info *info = NULL;
1793 pr_debug("Set context mapping for %02x:%02x.%d\n",
1794 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1796 BUG_ON(!domain->pgd);
1797 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1798 translation != CONTEXT_TT_MULTI_LEVEL);
1800 context = device_to_context_entry(iommu, bus, devfn);
1801 if (!context)
1802 return -ENOMEM;
1803 spin_lock_irqsave(&iommu->lock, flags);
/* An entry that is already present is left untouched and reported as success. */
1804 if (context_present(context)) {
1805 spin_unlock_irqrestore(&iommu->lock, flags);
1806 return 0;
1809 id = domain->id;
1810 pgd = domain->pgd;
/* VM domains get a per-IOMMU id here instead of using domain->id. */
1812 if (domain_type_is_vm_or_si(domain)) {
1813 if (domain_type_is_vm(domain)) {
1814 id = iommu_attach_vm_domain(domain, iommu);
1815 if (id < 0) {
1816 spin_unlock_irqrestore(&iommu->lock, flags);
1817 pr_err("IOMMU: no free domain ids\n");
1818 return -EFAULT;
1822 /* Skip top levels of page tables for
1823 * iommu which has less agaw than default.
1824 * Unnecessary for PT mode.
1826 if (translation != CONTEXT_TT_PASS_THROUGH) {
1827 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1828 pgd = phys_to_virt(dma_pte_addr(pgd));
1829 if (!dma_pte_present(pgd)) {
1830 spin_unlock_irqrestore(&iommu->lock, flags);
1831 return -ENOMEM;
1837 context_set_domain_id(context, id);
1839 if (translation != CONTEXT_TT_PASS_THROUGH) {
1840 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1841 translation = info ? CONTEXT_TT_DEV_IOTLB :
1842 CONTEXT_TT_MULTI_LEVEL;
1845 * In pass through mode, AW must be programmed to indicate the largest
1846 * AGAW value supported by hardware. And ASR is ignored by hardware.
1848 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1849 context_set_address_width(context, iommu->msagaw);
1850 else {
1851 context_set_address_root(context, virt_to_phys(pgd));
1852 context_set_address_width(context, iommu->agaw);
1855 context_set_translation_type(context, translation);
1856 context_set_fault_enable(context);
1857 context_set_present(context);
1858 domain_flush_cache(domain, context, sizeof(*context));
1861 * It's a non-present to present mapping. If hardware doesn't cache
1862 * non-present entry we only need to flush the write-buffer. If the
1863 * _does_ cache non-present entries, then it does so in the special
1864 * domain #0, which we have to flush:
1866 if (cap_caching_mode(iommu->cap)) {
1867 iommu->flush.flush_context(iommu, 0,
1868 (((u16)bus) << 8) | devfn,
1869 DMA_CCMD_MASK_NOBIT,
1870 DMA_CCMD_DEVICE_INVL);
1871 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1872 } else {
1873 iommu_flush_write_buffer(iommu);
1875 iommu_enable_dev_iotlb(info);
1876 spin_unlock_irqrestore(&iommu->lock, flags);
1878 domain_attach_iommu(domain, iommu);
1880 return 0;
1883 struct domain_context_mapping_data {
1884 struct dmar_domain *domain;
1885 struct intel_iommu *iommu;
1886 int translation;
1889 static int domain_context_mapping_cb(struct pci_dev *pdev,
1890 u16 alias, void *opaque)
1892 struct domain_context_mapping_data *data = opaque;
1894 return domain_context_mapping_one(data->domain, data->iommu,
1895 PCI_BUS_NUM(alias), alias & 0xff,
1896 data->translation);
1899 static int
1900 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1901 int translation)
1903 struct intel_iommu *iommu;
1904 u8 bus, devfn;
1905 struct domain_context_mapping_data data;
1907 iommu = device_to_iommu(dev, &bus, &devfn);
1908 if (!iommu)
1909 return -ENODEV;
1911 if (!dev_is_pci(dev))
1912 return domain_context_mapping_one(domain, iommu, bus, devfn,
1913 translation);
1915 data.domain = domain;
1916 data.iommu = iommu;
1917 data.translation = translation;
1919 return pci_for_each_dma_alias(to_pci_dev(dev),
1920 &domain_context_mapping_cb, &data);
1923 static int domain_context_mapped_cb(struct pci_dev *pdev,
1924 u16 alias, void *opaque)
1926 struct intel_iommu *iommu = opaque;
1928 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
1931 static int domain_context_mapped(struct device *dev)
1933 struct intel_iommu *iommu;
1934 u8 bus, devfn;
1936 iommu = device_to_iommu(dev, &bus, &devfn);
1937 if (!iommu)
1938 return -ENODEV;
1940 if (!dev_is_pci(dev))
1941 return device_context_mapped(iommu, bus, devfn);
1943 return !pci_for_each_dma_alias(to_pci_dev(dev),
1944 domain_context_mapped_cb, iommu);
1947 /* Returns a number of VTD pages, but aligned to MM page size */
1948 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1949 size_t size)
1951 host_addr &= ~PAGE_MASK;
1952 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1955 /* Return largest possible superpage level for a given mapping */
1956 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1957 unsigned long iov_pfn,
1958 unsigned long phy_pfn,
1959 unsigned long pages)
1961 int support, level = 1;
1962 unsigned long pfnmerge;
1964 support = domain->iommu_superpage;
1966 /* To use a large page, the virtual *and* physical addresses
1967 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1968 of them will mean we have to use smaller pages. So just
1969 merge them and check both at once. */
1970 pfnmerge = iov_pfn | phy_pfn;
1972 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1973 pages >>= VTD_STRIDE_SHIFT;
1974 if (!pages)
1975 break;
1976 pfnmerge >>= VTD_STRIDE_SHIFT;
1977 level++;
1978 support--;
1980 return level;
/*
 * __domain_mapping - write the PTEs that map @nr_pages VT-d pages starting
 * at IOVA page @iov_pfn in @domain.
 *
 * The physical side comes either from the scatterlist @sg or, when @sg is
 * NULL, from the contiguous range starting at @phys_pfn.  In the
 * scatterlist case each entry's dma_address and dma_length are filled in
 * as it is consumed.
 *
 * @prot must contain DMA_PTE_READ and/or DMA_PTE_WRITE; only those two
 * bits and DMA_PTE_SNP are kept.
 *
 * Returns 0 on success, -EINVAL when @prot grants neither read nor write,
 * -ENOMEM when no PTE could be obtained for an IOVA page.
 *
 * Superpages are used when hardware_largepage_caps() allows a level above
 * 1; existing smaller page tables under such a range are freed first.  A
 * PTE that is found already set is reported (KERN_CRIT plus WARN_ON, with
 * at most five mapping dumps) and the loop carries on.
 */
1983 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1984 struct scatterlist *sg, unsigned long phys_pfn,
1985 unsigned long nr_pages, int prot)
1987 struct dma_pte *first_pte = NULL, *pte = NULL;
1988 phys_addr_t uninitialized_var(pteval);
1989 unsigned long sg_res = 0;
1990 unsigned int largepage_lvl = 0;
1991 unsigned long lvl_pages = 0;
1993 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
1995 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1996 return -EINVAL;
1998 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
/* No scatterlist: treat the whole request as one contiguous run. */
2000 if (!sg) {
2001 sg_res = nr_pages;
2002 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2005 while (nr_pages > 0) {
2006 uint64_t tmp;
/* sg_res == 0 means the current scatterlist entry still has to be set up. */
2008 if (!sg_res) {
2009 sg_res = aligned_nrpages(sg->offset, sg->length);
2010 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2011 sg->dma_length = sg->length;
2012 pteval = page_to_phys(sg_page(sg)) | prot;
2013 phys_pfn = pteval >> VTD_PAGE_SHIFT;
/* pte == NULL means the PTE pointer (and page size) must be looked up again. */
2016 if (!pte) {
2017 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2019 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2020 if (!pte)
2021 return -ENOMEM;
2022 /* It is large page*/
2023 if (largepage_lvl > 1) {
2024 pteval |= DMA_PTE_LARGE_PAGE;
2025 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2027 * Ensure that old small page tables are
2028 * removed to make room for superpage,
2029 * if they exist.
2031 dma_pte_free_pagetable(domain, iov_pfn,
2032 iov_pfn + lvl_pages - 1);
2033 } else {
2034 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2038 /* We don't need lock here, nobody else
2039 * touches the iova range
2041 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2042 if (tmp) {
2043 static int dumps = 5;
2044 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2045 iov_pfn, tmp, (unsigned long long)pteval);
2046 if (dumps) {
2047 dumps--;
2048 debug_dma_dump_mappings(NULL);
2050 WARN_ON(1);
2053 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2055 BUG_ON(nr_pages < lvl_pages);
2056 BUG_ON(sg_res < lvl_pages);
2058 nr_pages -= lvl_pages;
2059 iov_pfn += lvl_pages;
2060 phys_pfn += lvl_pages;
2061 pteval += lvl_pages * VTD_PAGE_SIZE;
2062 sg_res -= lvl_pages;
2064 /* If the next PTE would be the first in a new page, then we
2065 need to flush the cache on the entries we've just written.
2066 And then we'll need to recalculate 'pte', so clear it and
2067 let it get set again in the if (!pte) block above.
2069 If we're done (!nr_pages) we need to flush the cache too.
2071 Also if we've been setting superpages, we may need to
2072 recalculate 'pte' and switch back to smaller pages for the
2073 end of the mapping, if the trailing size is not enough to
2074 use another superpage (i.e. sg_res < lvl_pages). */
2075 pte++;
2076 if (!nr_pages || first_pte_in_page(pte) ||
2077 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2078 domain_flush_cache(domain, first_pte,
2079 (void *)pte - (void *)first_pte);
2080 pte = NULL;
2083 if (!sg_res && nr_pages)
2084 sg = sg_next(sg);
2086 return 0;
/*
 * Map the scatterlist @sg into @domain starting at IOVA page @iov_pfn.
 * Thin wrapper around __domain_mapping() with no contiguous physical PFN.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	const unsigned long no_phys_pfn = 0;

	return __domain_mapping(domain, iov_pfn, sg, no_phys_pfn, nr_pages, prot);
}
2096 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2097 unsigned long phys_pfn, unsigned long nr_pages,
2098 int prot)
2100 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2103 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2105 if (!iommu)
2106 return;
2108 clear_context_table(iommu, bus, devfn);
2109 iommu->flush.flush_context(iommu, 0, 0, 0,
2110 DMA_CCMD_GLOBAL_INVL);
2111 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2114 static inline void unlink_domain_info(struct device_domain_info *info)
2116 assert_spin_locked(&device_domain_lock);
2117 list_del(&info->link);
2118 list_del(&info->global);
2119 if (info->dev)
2120 info->dev->archdata.iommu = NULL;
2123 static void domain_remove_dev_info(struct dmar_domain *domain)
2125 struct device_domain_info *info, *tmp;
2126 unsigned long flags;
2128 spin_lock_irqsave(&device_domain_lock, flags);
2129 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2130 unlink_domain_info(info);
2131 spin_unlock_irqrestore(&device_domain_lock, flags);
2133 iommu_disable_dev_iotlb(info);
2134 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2136 if (domain_type_is_vm(domain)) {
2137 iommu_detach_dependent_devices(info->iommu, info->dev);
2138 domain_detach_iommu(domain, info->iommu);
2141 free_devinfo_mem(info);
2142 spin_lock_irqsave(&device_domain_lock, flags);
2144 spin_unlock_irqrestore(&device_domain_lock, flags);
2148 * find_domain
2149 * Note: we use struct device->archdata.iommu stores the info
2151 static struct dmar_domain *find_domain(struct device *dev)
2153 struct device_domain_info *info;
2155 /* No lock here, assumes no domain exit in normal case */
2156 info = dev->archdata.iommu;
2157 if (info)
2158 return info->domain;
2159 return NULL;
2162 static inline struct device_domain_info *
2163 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2165 struct device_domain_info *info;
2167 list_for_each_entry(info, &device_domain_list, global)
2168 if (info->iommu->segment == segment && info->bus == bus &&
2169 info->devfn == devfn)
2170 return info;
2172 return NULL;
2175 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2176 int bus, int devfn,
2177 struct device *dev,
2178 struct dmar_domain *domain)
2180 struct dmar_domain *found = NULL;
2181 struct device_domain_info *info;
2182 unsigned long flags;
2184 info = alloc_devinfo_mem();
2185 if (!info)
2186 return NULL;
2188 info->bus = bus;
2189 info->devfn = devfn;
2190 info->dev = dev;
2191 info->domain = domain;
2192 info->iommu = iommu;
2194 spin_lock_irqsave(&device_domain_lock, flags);
2195 if (dev)
2196 found = find_domain(dev);
2197 else {
2198 struct device_domain_info *info2;
2199 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2200 if (info2)
2201 found = info2->domain;
2203 if (found) {
2204 spin_unlock_irqrestore(&device_domain_lock, flags);
2205 free_devinfo_mem(info);
2206 /* Caller must free the original domain */
2207 return found;
2210 list_add(&info->link, &domain->devices);
2211 list_add(&info->global, &device_domain_list);
2212 if (dev)
2213 dev->archdata.iommu = info;
2214 spin_unlock_irqrestore(&device_domain_lock, flags);
2216 return domain;
2219 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2221 *(u16 *)opaque = alias;
2222 return 0;
/*
 * get_domain_for_dev - return the domain @dev belongs to, creating and
 * initialising one with guest address width @gaw when it has none yet.
 *
 * Steps visible in this function:
 *  - a domain already recorded for the device (find_domain) is returned
 *    as is;
 *  - for a PCI device the last DMA alias is looked up in the global list;
 *    when that alias already has a domain, the device is added to it;
 *  - otherwise a new domain is allocated, given an id on the IOMMU,
 *    attached and initialised, and for PCI devices first registered for
 *    the alias (with a NULL device) and then for the device itself.
 *
 * Whenever dmar_insert_dev_info() returns something other than the domain
 * that was passed in, that domain is torn down with domain_exit() and the
 * returned value (possibly NULL) is used instead.
 *
 * Returns the domain, or NULL when no IOMMU covers the device or any of
 * the allocation/initialisation steps fails.
 */
2225 /* domain is initialized */
2226 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2228 struct dmar_domain *domain, *tmp;
2229 struct intel_iommu *iommu;
2230 struct device_domain_info *info;
2231 u16 dma_alias;
2232 unsigned long flags;
2233 u8 bus, devfn;
2235 domain = find_domain(dev);
2236 if (domain)
2237 return domain;
2239 iommu = device_to_iommu(dev, &bus, &devfn);
2240 if (!iommu)
2241 return NULL;
2243 if (dev_is_pci(dev)) {
2244 struct pci_dev *pdev = to_pci_dev(dev);
/* dma_alias is only written here, so it is only valid for PCI devices. */
2246 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2248 spin_lock_irqsave(&device_domain_lock, flags);
2249 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2250 PCI_BUS_NUM(dma_alias),
2251 dma_alias & 0xff);
2252 if (info) {
2253 iommu = info->iommu;
2254 domain = info->domain;
2256 spin_unlock_irqrestore(&device_domain_lock, flags);
2258 /* DMA alias already has a domain, uses it */
2259 if (info)
2260 goto found_domain;
2263 /* Allocate and initialize new domain for the device */
2264 domain = alloc_domain(0);
2265 if (!domain)
2266 return NULL;
2267 domain->id = iommu_attach_domain(domain, iommu);
/* No id obtained: nothing else was set up, so only the memory is freed. */
2268 if (domain->id < 0) {
2269 free_domain_mem(domain);
2270 return NULL;
2272 domain_attach_iommu(domain, iommu);
2273 if (domain_init(domain, gaw)) {
2274 domain_exit(domain);
2275 return NULL;
2278 /* register PCI DMA alias device */
2279 if (dev_is_pci(dev)) {
2280 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2281 dma_alias & 0xff, NULL, domain);
2283 if (!tmp || tmp != domain) {
2284 domain_exit(domain);
2285 domain = tmp;
2288 if (!domain)
2289 return NULL;
2292 found_domain:
2293 tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2295 if (!tmp || tmp != domain) {
2296 domain_exit(domain);
2297 domain = tmp;
2300 return domain;
/*
 * Identity-mapping selection.  The IDENTMAP_* values are distinct bits
 * (1, 2, 4); presumably they are OR-ed into iommu_identity_mapping --
 * NOTE(review): confirm against the users elsewhere in this file.
 */
static int iommu_identity_mapping;
#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4
2308 static int iommu_domain_identity_map(struct dmar_domain *domain,
2309 unsigned long long start,
2310 unsigned long long end)
2312 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2313 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2315 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2316 dma_to_mm_pfn(last_vpfn))) {
2317 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2318 return -ENOMEM;
2321 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2322 start, end, domain->id);
2324 * RMRR range might have overlap with physical memory range,
2325 * clear it first
2327 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2329 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2330 last_vpfn - first_vpfn + 1,
2331 DMA_PTE_READ|DMA_PTE_WRITE);
2334 static int iommu_prepare_identity_map(struct device *dev,
2335 unsigned long long start,
2336 unsigned long long end)
2338 struct dmar_domain *domain;
2339 int ret;
2341 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2342 if (!domain)
2343 return -ENOMEM;
2345 /* For _hardware_ passthrough, don't bother. But for software
2346 passthrough, we do it anyway -- it may indicate a memory
2347 range which is reserved in E820, so which didn't get set
2348 up to start with in si_domain */
2349 if (domain == si_domain && hw_pass_through) {
2350 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2351 dev_name(dev), start, end);
2352 return 0;
2355 printk(KERN_INFO
2356 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2357 dev_name(dev), start, end);
2359 if (end < start) {
2360 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2361 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2362 dmi_get_system_info(DMI_BIOS_VENDOR),
2363 dmi_get_system_info(DMI_BIOS_VERSION),
2364 dmi_get_system_info(DMI_PRODUCT_VERSION));
2365 ret = -EIO;
2366 goto error;
2369 if (end >> agaw_to_width(domain->agaw)) {
2370 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2371 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2372 agaw_to_width(domain->agaw),
2373 dmi_get_system_info(DMI_BIOS_VENDOR),
2374 dmi_get_system_info(DMI_BIOS_VERSION),
2375 dmi_get_system_info(DMI_PRODUCT_VERSION));
2376 ret = -EIO;
2377 goto error;
2380 ret = iommu_domain_identity_map(domain, start, end);
2381 if (ret)
2382 goto error;
2384 /* context entry init */
2385 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2386 if (ret)
2387 goto error;
2389 return 0;
2391 error:
2392 domain_exit(domain);
2393 return ret;
2396 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2397 struct device *dev)
2399 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2400 return 0;
2401 return iommu_prepare_identity_map(dev, rmrr->base_address,
2402 rmrr->end_address);
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: give the ISA bridge (if one exists) an identity mapping
 * of the first 16MiB.  Failure is logged but otherwise ignored.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *isa_bridge;
	int ret;

	isa_bridge = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!isa_bridge)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(&isa_bridge->dev, 0,
					 16*1024*1024 - 1);
	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

	/* Drop the reference taken by pci_get_class(). */
	pci_dev_put(isa_bridge);
}
#else
/* Workaround not configured: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2431 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2433 static int __init si_domain_init(int hw)
2435 struct dmar_drhd_unit *drhd;
2436 struct intel_iommu *iommu;
2437 int nid, ret = 0;
2438 bool first = true;
2440 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2441 if (!si_domain)
2442 return -EFAULT;
2444 for_each_active_iommu(iommu, drhd) {
2445 ret = iommu_attach_domain(si_domain, iommu);
2446 if (ret < 0) {
2447 domain_exit(si_domain);
2448 return -EFAULT;
2449 } else if (first) {
2450 si_domain->id = ret;
2451 first = false;
2452 } else if (si_domain->id != ret) {
2453 domain_exit(si_domain);
2454 return -EFAULT;
2456 domain_attach_iommu(si_domain, iommu);
2459 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2460 domain_exit(si_domain);
2461 return -EFAULT;
2464 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2465 si_domain->id);
2467 if (hw)
2468 return 0;
2470 for_each_online_node(nid) {
2471 unsigned long start_pfn, end_pfn;
2472 int i;
2474 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2475 ret = iommu_domain_identity_map(si_domain,
2476 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2477 if (ret)
2478 return ret;
2482 return 0;
2485 static int identity_mapping(struct device *dev)
2487 struct device_domain_info *info;
2489 if (likely(!iommu_identity_mapping))
2490 return 0;
2492 info = dev->archdata.iommu;
2493 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2494 return (info->domain == si_domain);
2496 return 0;
2499 static int domain_add_dev_info(struct dmar_domain *domain,
2500 struct device *dev, int translation)
2502 struct dmar_domain *ndomain;
2503 struct intel_iommu *iommu;
2504 u8 bus, devfn;
2505 int ret;
2507 iommu = device_to_iommu(dev, &bus, &devfn);
2508 if (!iommu)
2509 return -ENODEV;
2511 ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2512 if (ndomain != domain)
2513 return -EBUSY;
2515 ret = domain_context_mapping(domain, dev, translation);
2516 if (ret) {
2517 domain_remove_one_dev_info(domain, dev);
2518 return ret;
2521 return 0;
2524 static bool device_has_rmrr(struct device *dev)
2526 struct dmar_rmrr_unit *rmrr;
2527 struct device *tmp;
2528 int i;
2530 rcu_read_lock();
2531 for_each_rmrr_units(rmrr) {
2533 * Return TRUE if this RMRR contains the device that
2534 * is passed in.
2536 for_each_active_dev_scope(rmrr->devices,
2537 rmrr->devices_cnt, i, tmp)
2538 if (tmp == dev) {
2539 rcu_read_unlock();
2540 return true;
2543 rcu_read_unlock();
2544 return false;
2548 * There are a couple cases where we need to restrict the functionality of
2549 * devices associated with RMRRs. The first is when evaluating a device for
2550 * identity mapping because problems exist when devices are moved in and out
2551 * of domains and their respective RMRR information is lost. This means that
2552 * a device with associated RMRRs will never be in a "passthrough" domain.
2553 * The second is use of the device through the IOMMU API. This interface
2554 * expects to have full control of the IOVA space for the device. We cannot
2555 * satisfy both the requirement that RMRR access is maintained and have an
2556 * unencumbered IOVA space. We also have no ability to quiesce the device's
2557 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2558 * We therefore prevent devices associated with an RMRR from participating in
2559 * the IOMMU API, which eliminates them from device assignment.
2561 * In both cases we assume that PCI USB devices with RMRRs have them largely
2562 * for historical reasons and that the RMRR space is not actively used post
2563 * boot. This exclusion may change if vendors begin to abuse it.
2565 static bool device_is_rmrr_locked(struct device *dev)
2567 if (!device_has_rmrr(dev))
2568 return false;
2570 if (dev_is_pci(dev)) {
2571 struct pci_dev *pdev = to_pci_dev(dev);
2573 if ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
2574 return false;
2577 return true;
2580 static int iommu_should_identity_map(struct device *dev, int startup)
2583 if (dev_is_pci(dev)) {
2584 struct pci_dev *pdev = to_pci_dev(dev);
2586 if (device_is_rmrr_locked(dev))
2587 return 0;
2589 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2590 return 1;
2592 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2593 return 1;
2595 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2596 return 0;
2599 * We want to start off with all devices in the 1:1 domain, and
2600 * take them out later if we find they can't access all of memory.
2602 * However, we can't do this for PCI devices behind bridges,
2603 * because all PCI devices behind the same bridge will end up
2604 * with the same source-id on their transactions.
2606 * Practically speaking, we can't change things around for these
2607 * devices at run-time, because we can't be sure there'll be no
2608 * DMA transactions in flight for any of their siblings.
2610 * So PCI devices (unless they're on the root bus) as well as
2611 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2612 * the 1:1 domain, just in _case_ one of their siblings turns out
2613 * not to be able to map all of memory.
2615 if (!pci_is_pcie(pdev)) {
2616 if (!pci_is_root_bus(pdev->bus))
2617 return 0;
2618 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2619 return 0;
2620 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2621 return 0;
2622 } else {
2623 if (device_has_rmrr(dev))
2624 return 0;
2628 * At boot time, we don't yet know if devices will be 64-bit capable.
2629 * Assume that they will — if they turn out not to be, then we can
2630 * take them out of the 1:1 domain later.
2632 if (!startup) {
2634 * If the device's dma_mask is less than the system's memory
2635 * size then this is not a candidate for identity mapping.
2637 u64 dma_mask = *dev->dma_mask;
2639 if (dev->coherent_dma_mask &&
2640 dev->coherent_dma_mask < dma_mask)
2641 dma_mask = dev->coherent_dma_mask;
2643 return dma_mask >= dma_get_required_mask(dev);
2646 return 1;
2649 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2651 int ret;
2653 if (!iommu_should_identity_map(dev, 1))
2654 return 0;
2656 ret = domain_add_dev_info(si_domain, dev,
2657 hw ? CONTEXT_TT_PASS_THROUGH :
2658 CONTEXT_TT_MULTI_LEVEL);
2659 if (!ret)
2660 pr_info("IOMMU: %s identity mapping for device %s\n",
2661 hw ? "hardware" : "software", dev_name(dev));
2662 else if (ret == -ENODEV)
2663 /* device not associated with an iommu */
2664 ret = 0;
2666 return ret;
2670 static int __init iommu_prepare_static_identity_mapping(int hw)
2672 struct pci_dev *pdev = NULL;
2673 struct dmar_drhd_unit *drhd;
2674 struct intel_iommu *iommu;
2675 struct device *dev;
2676 int i;
2677 int ret = 0;
2679 ret = si_domain_init(hw);
2680 if (ret)
2681 return -EFAULT;
2683 for_each_pci_dev(pdev) {
2684 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2685 if (ret)
2686 return ret;
2689 for_each_active_iommu(iommu, drhd)
2690 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2691 struct acpi_device_physical_node *pn;
2692 struct acpi_device *adev;
2694 if (dev->bus != &acpi_bus_type)
2695 continue;
2697 adev= to_acpi_device(dev);
2698 mutex_lock(&adev->physical_node_lock);
2699 list_for_each_entry(pn, &adev->physical_node_list, node) {
2700 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2701 if (ret)
2702 break;
2704 mutex_unlock(&adev->physical_node_lock);
2705 if (ret)
2706 return ret;
2709 return 0;
2712 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2715 * Start from the sane iommu hardware state.
2716 * If the queued invalidation is already initialized by us
2717 * (for example, while enabling interrupt-remapping) then
2718 * we got the things already rolling from a sane state.
2720 if (!iommu->qi) {
2722 * Clear any previous faults.
2724 dmar_fault(-1, iommu);
2726 * Disable queued invalidation if supported and already enabled
2727 * before OS handover.
2729 dmar_disable_qi(iommu);
2732 if (dmar_enable_qi(iommu)) {
2734 * Queued Invalidate not enabled, use Register Based Invalidate
2736 iommu->flush.flush_context = __iommu_flush_context;
2737 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2738 pr_info("IOMMU: %s using Register based invalidation\n",
2739 iommu->name);
2740 } else {
2741 iommu->flush.flush_context = qi_flush_context;
2742 iommu->flush.flush_iotlb = qi_flush_iotlb;
2743 pr_info("IOMMU: %s using Queued invalidation\n", iommu->name);
2747 static int __init init_dmars(void)
2749 struct dmar_drhd_unit *drhd;
2750 struct dmar_rmrr_unit *rmrr;
2751 struct device *dev;
2752 struct intel_iommu *iommu;
2753 int i, ret;
2756 * for each drhd
2757 * allocate root
2758 * initialize and program root entry to not present
2759 * endfor
2761 for_each_drhd_unit(drhd) {
2763 * lock not needed as this is only incremented in the single
2764 * threaded kernel __init code path all other access are read
2765 * only
2767 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2768 g_num_of_iommus++;
2769 continue;
2771 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2772 DMAR_UNITS_SUPPORTED);
2775 /* Preallocate enough resources for IOMMU hot-addition */
2776 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2777 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2779 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2780 GFP_KERNEL);
2781 if (!g_iommus) {
2782 printk(KERN_ERR "Allocating global iommu array failed\n");
2783 ret = -ENOMEM;
2784 goto error;
2787 deferred_flush = kzalloc(g_num_of_iommus *
2788 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2789 if (!deferred_flush) {
2790 ret = -ENOMEM;
2791 goto free_g_iommus;
2794 for_each_active_iommu(iommu, drhd) {
2795 g_iommus[iommu->seq_id] = iommu;
2797 ret = iommu_init_domains(iommu);
2798 if (ret)
2799 goto free_iommu;
2802 * TBD:
2803 * we could share the same root & context tables
2804 * among all IOMMU's. Need to Split it later.
2806 ret = iommu_alloc_root_entry(iommu);
2807 if (ret)
2808 goto free_iommu;
2809 if (!ecap_pass_through(iommu->ecap))
2810 hw_pass_through = 0;
2813 for_each_active_iommu(iommu, drhd)
2814 intel_iommu_init_qi(iommu);
2816 if (iommu_pass_through)
2817 iommu_identity_mapping |= IDENTMAP_ALL;
2819 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2820 iommu_identity_mapping |= IDENTMAP_GFX;
2821 #endif
2823 check_tylersburg_isoch();
2826 * If pass through is not set or not enabled, setup context entries for
2827 * identity mappings for rmrr, gfx, and isa and may fall back to static
2828 * identity mapping if iommu_identity_mapping is set.
2830 if (iommu_identity_mapping) {
2831 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2832 if (ret) {
2833 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2834 goto free_iommu;
2838 * For each rmrr
2839 * for each dev attached to rmrr
2840 * do
2841 * locate drhd for dev, alloc domain for dev
2842 * allocate free domain
2843 * allocate page table entries for rmrr
2844 * if context not allocated for bus
2845 * allocate and init context
2846 * set present in root table for this bus
2847 * init context with domain, translation etc
2848 * endfor
2849 * endfor
2851 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2852 for_each_rmrr_units(rmrr) {
2853 /* some BIOS lists non-exist devices in DMAR table. */
2854 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2855 i, dev) {
2856 ret = iommu_prepare_rmrr_dev(rmrr, dev);
2857 if (ret)
2858 printk(KERN_ERR
2859 "IOMMU: mapping reserved region failed\n");
2863 iommu_prepare_isa();
2866 * for each drhd
2867 * enable fault log
2868 * global invalidate context cache
2869 * global invalidate iotlb
2870 * enable translation
2872 for_each_iommu(iommu, drhd) {
2873 if (drhd->ignored) {
2875 * we always have to disable PMRs or DMA may fail on
2876 * this device
2878 if (force_on)
2879 iommu_disable_protect_mem_regions(iommu);
2880 continue;
2883 iommu_flush_write_buffer(iommu);
2885 ret = dmar_set_interrupt(iommu);
2886 if (ret)
2887 goto free_iommu;
2889 iommu_set_root_entry(iommu);
2891 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2892 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2893 iommu_enable_translation(iommu);
2894 iommu_disable_protect_mem_regions(iommu);
2897 return 0;
2899 free_iommu:
2900 for_each_active_iommu(iommu, drhd) {
2901 disable_dmar_iommu(iommu);
2902 free_dmar_iommu(iommu);
2904 kfree(deferred_flush);
2905 free_g_iommus:
2906 kfree(g_iommus);
2907 error:
2908 return ret;
2911 /* This takes a number of _MM_ pages, not VTD pages */
2912 static struct iova *intel_alloc_iova(struct device *dev,
2913 struct dmar_domain *domain,
2914 unsigned long nrpages, uint64_t dma_mask)
2916 struct iova *iova = NULL;
2918 /* Restrict dma_mask to the width that the iommu can handle */
2919 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2921 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2923 * First try to allocate an io virtual address in
2924 * DMA_BIT_MASK(32) and if that fails then try allocating
2925 * from higher range
2927 iova = alloc_iova(&domain->iovad, nrpages,
2928 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2929 if (iova)
2930 return iova;
2932 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2933 if (unlikely(!iova)) {
2934 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2935 nrpages, dev_name(dev));
2936 return NULL;
2939 return iova;
2942 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2944 struct dmar_domain *domain;
2945 int ret;
2947 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2948 if (!domain) {
2949 printk(KERN_ERR "Allocating domain for %s failed",
2950 dev_name(dev));
2951 return NULL;
2954 /* make sure context mapping is ok */
2955 if (unlikely(!domain_context_mapped(dev))) {
2956 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2957 if (ret) {
2958 printk(KERN_ERR "Domain context map for %s failed",
2959 dev_name(dev));
2960 return NULL;
2964 return domain;
2967 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2969 struct device_domain_info *info;
2971 /* No lock here, assumes no domain exit in normal case */
2972 info = dev->archdata.iommu;
2973 if (likely(info))
2974 return info->domain;
2976 return __get_valid_domain_for_dev(dev);
2979 static int iommu_dummy(struct device *dev)
2981 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2984 /* Check if the dev needs to go through non-identity map and unmap process.*/
2985 static int iommu_no_mapping(struct device *dev)
2987 int found;
2989 if (iommu_dummy(dev))
2990 return 1;
2992 if (!iommu_identity_mapping)
2993 return 0;
2995 found = identity_mapping(dev);
2996 if (found) {
2997 if (iommu_should_identity_map(dev, 0))
2998 return 1;
2999 else {
3001 * 32 bit DMA is removed from si_domain and fall back
3002 * to non-identity mapping.
3004 domain_remove_one_dev_info(si_domain, dev);
3005 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
3006 dev_name(dev));
3007 return 0;
3009 } else {
3011 * In case of a detached 64 bit DMA device from vm, the device
3012 * is put into si_domain for identity mapping.
3014 if (iommu_should_identity_map(dev, 0)) {
3015 int ret;
3016 ret = domain_add_dev_info(si_domain, dev,
3017 hw_pass_through ?
3018 CONTEXT_TT_PASS_THROUGH :
3019 CONTEXT_TT_MULTI_LEVEL);
3020 if (!ret) {
3021 printk(KERN_INFO "64bit %s uses identity mapping\n",
3022 dev_name(dev));
3023 return 1;
3028 return 0;
3031 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3032 size_t size, int dir, u64 dma_mask)
3034 struct dmar_domain *domain;
3035 phys_addr_t start_paddr;
3036 struct iova *iova;
3037 int prot = 0;
3038 int ret;
3039 struct intel_iommu *iommu;
3040 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3042 BUG_ON(dir == DMA_NONE);
3044 if (iommu_no_mapping(dev))
3045 return paddr;
3047 domain = get_valid_domain_for_dev(dev);
3048 if (!domain)
3049 return 0;
3051 iommu = domain_get_iommu(domain);
3052 size = aligned_nrpages(paddr, size);
3054 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3055 if (!iova)
3056 goto error;
3059 * Check if DMAR supports zero-length reads on write only
3060 * mappings..
3062 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3063 !cap_zlr(iommu->cap))
3064 prot |= DMA_PTE_READ;
3065 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3066 prot |= DMA_PTE_WRITE;
3068 * paddr - (paddr + size) might be partial page, we should map the whole
3069 * page. Note: if two part of one page are separately mapped, we
3070 * might have two guest_addr mapping to the same host paddr, but this
3071 * is not a big problem
3073 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3074 mm_to_dma_pfn(paddr_pfn), size, prot);
3075 if (ret)
3076 goto error;
3078 /* it's a non-present to present mapping. Only flush if caching mode */
3079 if (cap_caching_mode(iommu->cap))
3080 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3081 else
3082 iommu_flush_write_buffer(iommu);
3084 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3085 start_paddr += paddr & ~PAGE_MASK;
3086 return start_paddr;
3088 error:
3089 if (iova)
3090 __free_iova(&domain->iovad, iova);
3091 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
3092 dev_name(dev), size, (unsigned long long)paddr, dir);
3093 return 0;
3096 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3097 unsigned long offset, size_t size,
3098 enum dma_data_direction dir,
3099 struct dma_attrs *attrs)
3101 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3102 dir, *dev->dma_mask);
3105 static void flush_unmaps(void)
3107 int i, j;
3109 timer_on = 0;
3111 /* just flush them all */
3112 for (i = 0; i < g_num_of_iommus; i++) {
3113 struct intel_iommu *iommu = g_iommus[i];
3114 if (!iommu)
3115 continue;
3117 if (!deferred_flush[i].next)
3118 continue;
3120 /* In caching mode, global flushes turn emulation expensive */
3121 if (!cap_caching_mode(iommu->cap))
3122 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3123 DMA_TLB_GLOBAL_FLUSH);
3124 for (j = 0; j < deferred_flush[i].next; j++) {
3125 unsigned long mask;
3126 struct iova *iova = deferred_flush[i].iova[j];
3127 struct dmar_domain *domain = deferred_flush[i].domain[j];
3129 /* On real hardware multiple invalidations are expensive */
3130 if (cap_caching_mode(iommu->cap))
3131 iommu_flush_iotlb_psi(iommu, domain->id,
3132 iova->pfn_lo, iova_size(iova),
3133 !deferred_flush[i].freelist[j], 0);
3134 else {
3135 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3136 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3137 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3139 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3140 if (deferred_flush[i].freelist[j])
3141 dma_free_pagelist(deferred_flush[i].freelist[j]);
3143 deferred_flush[i].next = 0;
3146 list_size = 0;
3149 static void flush_unmaps_timeout(unsigned long data)
3151 unsigned long flags;
3153 spin_lock_irqsave(&async_umap_flush_lock, flags);
3154 flush_unmaps();
3155 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3158 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3160 unsigned long flags;
3161 int next, iommu_id;
3162 struct intel_iommu *iommu;
3164 spin_lock_irqsave(&async_umap_flush_lock, flags);
3165 if (list_size == HIGH_WATER_MARK)
3166 flush_unmaps();
3168 iommu = domain_get_iommu(dom);
3169 iommu_id = iommu->seq_id;
3171 next = deferred_flush[iommu_id].next;
3172 deferred_flush[iommu_id].domain[next] = dom;
3173 deferred_flush[iommu_id].iova[next] = iova;
3174 deferred_flush[iommu_id].freelist[next] = freelist;
3175 deferred_flush[iommu_id].next++;
3177 if (!timer_on) {
3178 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3179 timer_on = 1;
3181 list_size++;
3182 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3185 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3187 struct dmar_domain *domain;
3188 unsigned long start_pfn, last_pfn;
3189 struct iova *iova;
3190 struct intel_iommu *iommu;
3191 struct page *freelist;
3193 if (iommu_no_mapping(dev))
3194 return;
3196 domain = find_domain(dev);
3197 BUG_ON(!domain);
3199 iommu = domain_get_iommu(domain);
3201 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3202 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3203 (unsigned long long)dev_addr))
3204 return;
3206 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3207 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3209 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3210 dev_name(dev), start_pfn, last_pfn);
3212 freelist = domain_unmap(domain, start_pfn, last_pfn);
3214 if (intel_iommu_strict) {
3215 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3216 last_pfn - start_pfn + 1, !freelist, 0);
3217 /* free iova */
3218 __free_iova(&domain->iovad, iova);
3219 dma_free_pagelist(freelist);
3220 } else {
3221 add_unmap(domain, iova, freelist);
3223 * queue up the release of the unmap to save the 1/6th of the
3224 * cpu used up by the iotlb flush operation...
3229 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3230 size_t size, enum dma_data_direction dir,
3231 struct dma_attrs *attrs)
3233 intel_unmap(dev, dev_addr);
3236 static void *intel_alloc_coherent(struct device *dev, size_t size,
3237 dma_addr_t *dma_handle, gfp_t flags,
3238 struct dma_attrs *attrs)
3240 struct page *page = NULL;
3241 int order;
3243 size = PAGE_ALIGN(size);
3244 order = get_order(size);
3246 if (!iommu_no_mapping(dev))
3247 flags &= ~(GFP_DMA | GFP_DMA32);
3248 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3249 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3250 flags |= GFP_DMA;
3251 else
3252 flags |= GFP_DMA32;
3255 if (flags & __GFP_WAIT) {
3256 unsigned int count = size >> PAGE_SHIFT;
3258 page = dma_alloc_from_contiguous(dev, count, order);
3259 if (page && iommu_no_mapping(dev) &&
3260 page_to_phys(page) + size > dev->coherent_dma_mask) {
3261 dma_release_from_contiguous(dev, page, count);
3262 page = NULL;
3266 if (!page)
3267 page = alloc_pages(flags, order);
3268 if (!page)
3269 return NULL;
3270 memset(page_address(page), 0, size);
3272 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3273 DMA_BIDIRECTIONAL,
3274 dev->coherent_dma_mask);
3275 if (*dma_handle)
3276 return page_address(page);
3277 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3278 __free_pages(page, order);
3280 return NULL;
3283 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3284 dma_addr_t dma_handle, struct dma_attrs *attrs)
3286 int order;
3287 struct page *page = virt_to_page(vaddr);
3289 size = PAGE_ALIGN(size);
3290 order = get_order(size);
3292 intel_unmap(dev, dma_handle);
3293 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3294 __free_pages(page, order);
3297 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3298 int nelems, enum dma_data_direction dir,
3299 struct dma_attrs *attrs)
3301 intel_unmap(dev, sglist[0].dma_address);
3304 static int intel_nontranslate_map_sg(struct device *hddev,
3305 struct scatterlist *sglist, int nelems, int dir)
3307 int i;
3308 struct scatterlist *sg;
3310 for_each_sg(sglist, sg, nelems, i) {
3311 BUG_ON(!sg_page(sg));
3312 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3313 sg->dma_length = sg->length;
3315 return nelems;
3318 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3319 enum dma_data_direction dir, struct dma_attrs *attrs)
3321 int i;
3322 struct dmar_domain *domain;
3323 size_t size = 0;
3324 int prot = 0;
3325 struct iova *iova = NULL;
3326 int ret;
3327 struct scatterlist *sg;
3328 unsigned long start_vpfn;
3329 struct intel_iommu *iommu;
3331 BUG_ON(dir == DMA_NONE);
3332 if (iommu_no_mapping(dev))
3333 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3335 domain = get_valid_domain_for_dev(dev);
3336 if (!domain)
3337 return 0;
3339 iommu = domain_get_iommu(domain);
3341 for_each_sg(sglist, sg, nelems, i)
3342 size += aligned_nrpages(sg->offset, sg->length);
3344 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3345 *dev->dma_mask);
3346 if (!iova) {
3347 sglist->dma_length = 0;
3348 return 0;
3352 * Check if DMAR supports zero-length reads on write only
3353 * mappings..
3355 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3356 !cap_zlr(iommu->cap))
3357 prot |= DMA_PTE_READ;
3358 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3359 prot |= DMA_PTE_WRITE;
3361 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3363 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3364 if (unlikely(ret)) {
3365 dma_pte_free_pagetable(domain, start_vpfn,
3366 start_vpfn + size - 1);
3367 __free_iova(&domain->iovad, iova);
3368 return 0;
3371 /* it's a non-present to present mapping. Only flush if caching mode */
3372 if (cap_caching_mode(iommu->cap))
3373 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3374 else
3375 iommu_flush_write_buffer(iommu);
3377 return nelems;
3380 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3382 return !dma_addr;
3385 struct dma_map_ops intel_dma_ops = {
3386 .alloc = intel_alloc_coherent,
3387 .free = intel_free_coherent,
3388 .map_sg = intel_map_sg,
3389 .unmap_sg = intel_unmap_sg,
3390 .map_page = intel_map_page,
3391 .unmap_page = intel_unmap_page,
3392 .mapping_error = intel_mapping_error,
3395 static inline int iommu_domain_cache_init(void)
3397 int ret = 0;
3399 iommu_domain_cache = kmem_cache_create("iommu_domain",
3400 sizeof(struct dmar_domain),
3402 SLAB_HWCACHE_ALIGN,
3404 NULL);
3405 if (!iommu_domain_cache) {
3406 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3407 ret = -ENOMEM;
3410 return ret;
3413 static inline int iommu_devinfo_cache_init(void)
3415 int ret = 0;
3417 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3418 sizeof(struct device_domain_info),
3420 SLAB_HWCACHE_ALIGN,
3421 NULL);
3422 if (!iommu_devinfo_cache) {
3423 printk(KERN_ERR "Couldn't create devinfo cache\n");
3424 ret = -ENOMEM;
3427 return ret;
3430 static inline int iommu_iova_cache_init(void)
3432 int ret = 0;
3434 iommu_iova_cache = kmem_cache_create("iommu_iova",
3435 sizeof(struct iova),
3437 SLAB_HWCACHE_ALIGN,
3438 NULL);
3439 if (!iommu_iova_cache) {
3440 printk(KERN_ERR "Couldn't create iova cache\n");
3441 ret = -ENOMEM;
3444 return ret;
3447 static int __init iommu_init_mempool(void)
3449 int ret;
3450 ret = iommu_iova_cache_init();
3451 if (ret)
3452 return ret;
3454 ret = iommu_domain_cache_init();
3455 if (ret)
3456 goto domain_error;
3458 ret = iommu_devinfo_cache_init();
3459 if (!ret)
3460 return ret;
3462 kmem_cache_destroy(iommu_domain_cache);
3463 domain_error:
3464 kmem_cache_destroy(iommu_iova_cache);
3466 return -ENOMEM;
3469 static void __init iommu_exit_mempool(void)
3471 kmem_cache_destroy(iommu_devinfo_cache);
3472 kmem_cache_destroy(iommu_domain_cache);
3473 kmem_cache_destroy(iommu_iova_cache);
3477 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3479 struct dmar_drhd_unit *drhd;
3480 u32 vtbar;
3481 int rc;
3483 /* We know that this device on this chipset has its own IOMMU.
3484 * If we find it under a different IOMMU, then the BIOS is lying
3485 * to us. Hope that the IOMMU for this device is actually
3486 * disabled, and it needs no translation...
3488 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3489 if (rc) {
3490 /* "can't" happen */
3491 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3492 return;
3494 vtbar &= 0xffff0000;
3496 /* we know that the this iommu should be at offset 0xa000 from vtbar */
3497 drhd = dmar_find_matched_drhd_unit(pdev);
3498 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3499 TAINT_FIRMWARE_WORKAROUND,
3500 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3501 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3503 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3505 static void __init init_no_remapping_devices(void)
3507 struct dmar_drhd_unit *drhd;
3508 struct device *dev;
3509 int i;
3511 for_each_drhd_unit(drhd) {
3512 if (!drhd->include_all) {
3513 for_each_active_dev_scope(drhd->devices,
3514 drhd->devices_cnt, i, dev)
3515 break;
3516 /* ignore DMAR unit if no devices exist */
3517 if (i == drhd->devices_cnt)
3518 drhd->ignored = 1;
3522 for_each_active_drhd_unit(drhd) {
3523 if (drhd->include_all)
3524 continue;
3526 for_each_active_dev_scope(drhd->devices,
3527 drhd->devices_cnt, i, dev)
3528 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3529 break;
3530 if (i < drhd->devices_cnt)
3531 continue;
3533 /* This IOMMU has *only* gfx devices. Either bypass it or
3534 set the gfx_mapped flag, as appropriate */
3535 if (dmar_map_gfx) {
3536 intel_iommu_gfx_mapped = 1;
3537 } else {
3538 drhd->ignored = 1;
3539 for_each_active_dev_scope(drhd->devices,
3540 drhd->devices_cnt, i, dev)
3541 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3546 #ifdef CONFIG_SUSPEND
3547 static int init_iommu_hw(void)
3549 struct dmar_drhd_unit *drhd;
3550 struct intel_iommu *iommu = NULL;
3552 for_each_active_iommu(iommu, drhd)
3553 if (iommu->qi)
3554 dmar_reenable_qi(iommu);
3556 for_each_iommu(iommu, drhd) {
3557 if (drhd->ignored) {
3559 * we always have to disable PMRs or DMA may fail on
3560 * this device
3562 if (force_on)
3563 iommu_disable_protect_mem_regions(iommu);
3564 continue;
3567 iommu_flush_write_buffer(iommu);
3569 iommu_set_root_entry(iommu);
3571 iommu->flush.flush_context(iommu, 0, 0, 0,
3572 DMA_CCMD_GLOBAL_INVL);
3573 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3574 iommu_enable_translation(iommu);
3575 iommu_disable_protect_mem_regions(iommu);
3578 return 0;
3581 static void iommu_flush_all(void)
3583 struct dmar_drhd_unit *drhd;
3584 struct intel_iommu *iommu;
3586 for_each_active_iommu(iommu, drhd) {
3587 iommu->flush.flush_context(iommu, 0, 0, 0,
3588 DMA_CCMD_GLOBAL_INVL);
3589 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3590 DMA_TLB_GLOBAL_FLUSH);
3594 static int iommu_suspend(void)
3596 struct dmar_drhd_unit *drhd;
3597 struct intel_iommu *iommu = NULL;
3598 unsigned long flag;
3600 for_each_active_iommu(iommu, drhd) {
3601 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3602 GFP_ATOMIC);
3603 if (!iommu->iommu_state)
3604 goto nomem;
3607 iommu_flush_all();
3609 for_each_active_iommu(iommu, drhd) {
3610 iommu_disable_translation(iommu);
3612 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3614 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3615 readl(iommu->reg + DMAR_FECTL_REG);
3616 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3617 readl(iommu->reg + DMAR_FEDATA_REG);
3618 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3619 readl(iommu->reg + DMAR_FEADDR_REG);
3620 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3621 readl(iommu->reg + DMAR_FEUADDR_REG);
3623 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3625 return 0;
3627 nomem:
3628 for_each_active_iommu(iommu, drhd)
3629 kfree(iommu->iommu_state);
3631 return -ENOMEM;
3634 static void iommu_resume(void)
3636 struct dmar_drhd_unit *drhd;
3637 struct intel_iommu *iommu = NULL;
3638 unsigned long flag;
3640 if (init_iommu_hw()) {
3641 if (force_on)
3642 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3643 else
3644 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3645 return;
3648 for_each_active_iommu(iommu, drhd) {
3650 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3652 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3653 iommu->reg + DMAR_FECTL_REG);
3654 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3655 iommu->reg + DMAR_FEDATA_REG);
3656 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3657 iommu->reg + DMAR_FEADDR_REG);
3658 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3659 iommu->reg + DMAR_FEUADDR_REG);
3661 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3664 for_each_active_iommu(iommu, drhd)
3665 kfree(iommu->iommu_state);
3668 static struct syscore_ops iommu_syscore_ops = {
3669 .resume = iommu_resume,
3670 .suspend = iommu_suspend,
3673 static void __init init_iommu_pm_ops(void)
3675 register_syscore_ops(&iommu_syscore_ops);
3678 #else
/* Without CONFIG_SUSPEND there is nothing to register. */
static inline void init_iommu_pm_ops(void)
{
}
#endif /* CONFIG_SUSPEND */
3683 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3685 struct acpi_dmar_reserved_memory *rmrr;
3686 struct dmar_rmrr_unit *rmrru;
3688 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3689 if (!rmrru)
3690 return -ENOMEM;
3692 rmrru->hdr = header;
3693 rmrr = (struct acpi_dmar_reserved_memory *)header;
3694 rmrru->base_address = rmrr->base_address;
3695 rmrru->end_address = rmrr->end_address;
3696 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3697 ((void *)rmrr) + rmrr->header.length,
3698 &rmrru->devices_cnt);
3699 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3700 kfree(rmrru);
3701 return -ENOMEM;
3704 list_add(&rmrru->list, &dmar_rmrr_units);
3706 return 0;
3709 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3711 struct dmar_atsr_unit *atsru;
3712 struct acpi_dmar_atsr *tmp;
3714 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3715 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3716 if (atsr->segment != tmp->segment)
3717 continue;
3718 if (atsr->header.length != tmp->header.length)
3719 continue;
3720 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3721 return atsru;
3724 return NULL;
3727 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3729 struct acpi_dmar_atsr *atsr;
3730 struct dmar_atsr_unit *atsru;
3732 if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3733 return 0;
3735 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3736 atsru = dmar_find_atsr(atsr);
3737 if (atsru)
3738 return 0;
3740 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3741 if (!atsru)
3742 return -ENOMEM;
3745 * If memory is allocated from slab by ACPI _DSM method, we need to
3746 * copy the memory content because the memory buffer will be freed
3747 * on return.
3749 atsru->hdr = (void *)(atsru + 1);
3750 memcpy(atsru->hdr, hdr, hdr->length);
3751 atsru->include_all = atsr->flags & 0x1;
3752 if (!atsru->include_all) {
3753 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3754 (void *)atsr + atsr->header.length,
3755 &atsru->devices_cnt);
3756 if (atsru->devices_cnt && atsru->devices == NULL) {
3757 kfree(atsru);
3758 return -ENOMEM;
3762 list_add_rcu(&atsru->list, &dmar_atsr_units);
3764 return 0;
3767 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3769 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3770 kfree(atsru);
3773 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3775 struct acpi_dmar_atsr *atsr;
3776 struct dmar_atsr_unit *atsru;
3778 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3779 atsru = dmar_find_atsr(atsr);
3780 if (atsru) {
3781 list_del_rcu(&atsru->list);
3782 synchronize_rcu();
3783 intel_iommu_free_atsr(atsru);
3786 return 0;
3789 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3791 int i;
3792 struct device *dev;
3793 struct acpi_dmar_atsr *atsr;
3794 struct dmar_atsr_unit *atsru;
3796 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3797 atsru = dmar_find_atsr(atsr);
3798 if (!atsru)
3799 return 0;
3801 if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
3802 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3803 i, dev)
3804 return -EBUSY;
3806 return 0;
3809 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3811 int sp, ret = 0;
3812 struct intel_iommu *iommu = dmaru->iommu;
3814 if (g_iommus[iommu->seq_id])
3815 return 0;
3817 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3818 pr_warn("IOMMU: %s doesn't support hardware pass through.\n",
3819 iommu->name);
3820 return -ENXIO;
3822 if (!ecap_sc_support(iommu->ecap) &&
3823 domain_update_iommu_snooping(iommu)) {
3824 pr_warn("IOMMU: %s doesn't support snooping.\n",
3825 iommu->name);
3826 return -ENXIO;
3828 sp = domain_update_iommu_superpage(iommu) - 1;
3829 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3830 pr_warn("IOMMU: %s doesn't support large page.\n",
3831 iommu->name);
3832 return -ENXIO;
3836 * Disable translation if already enabled prior to OS handover.
3838 if (iommu->gcmd & DMA_GCMD_TE)
3839 iommu_disable_translation(iommu);
3841 g_iommus[iommu->seq_id] = iommu;
3842 ret = iommu_init_domains(iommu);
3843 if (ret == 0)
3844 ret = iommu_alloc_root_entry(iommu);
3845 if (ret)
3846 goto out;
3848 if (dmaru->ignored) {
3850 * we always have to disable PMRs or DMA may fail on this device
3852 if (force_on)
3853 iommu_disable_protect_mem_regions(iommu);
3854 return 0;
3857 intel_iommu_init_qi(iommu);
3858 iommu_flush_write_buffer(iommu);
3859 ret = dmar_set_interrupt(iommu);
3860 if (ret)
3861 goto disable_iommu;
3863 iommu_set_root_entry(iommu);
3864 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3865 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3866 iommu_enable_translation(iommu);
3868 if (si_domain) {
3869 ret = iommu_attach_domain(si_domain, iommu);
3870 if (ret < 0 || si_domain->id != ret)
3871 goto disable_iommu;
3872 domain_attach_iommu(si_domain, iommu);
3875 iommu_disable_protect_mem_regions(iommu);
3876 return 0;
3878 disable_iommu:
3879 disable_dmar_iommu(iommu);
3880 out:
3881 free_dmar_iommu(iommu);
3882 return ret;
3885 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3887 int ret = 0;
3888 struct intel_iommu *iommu = dmaru->iommu;
3890 if (!intel_iommu_enabled)
3891 return 0;
3892 if (iommu == NULL)
3893 return -EINVAL;
3895 if (insert) {
3896 ret = intel_iommu_add(dmaru);
3897 } else {
3898 disable_dmar_iommu(iommu);
3899 free_dmar_iommu(iommu);
3902 return ret;
3905 static void intel_iommu_free_dmars(void)
3907 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3908 struct dmar_atsr_unit *atsru, *atsr_n;
3910 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3911 list_del(&rmrru->list);
3912 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3913 kfree(rmrru);
3916 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3917 list_del(&atsru->list);
3918 intel_iommu_free_atsr(atsru);
3922 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3924 int i, ret = 1;
3925 struct pci_bus *bus;
3926 struct pci_dev *bridge = NULL;
3927 struct device *tmp;
3928 struct acpi_dmar_atsr *atsr;
3929 struct dmar_atsr_unit *atsru;
3931 dev = pci_physfn(dev);
3932 for (bus = dev->bus; bus; bus = bus->parent) {
3933 bridge = bus->self;
3934 if (!bridge || !pci_is_pcie(bridge) ||
3935 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3936 return 0;
3937 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3938 break;
3940 if (!bridge)
3941 return 0;
3943 rcu_read_lock();
3944 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3945 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3946 if (atsr->segment != pci_domain_nr(dev->bus))
3947 continue;
3949 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3950 if (tmp == &bridge->dev)
3951 goto out;
3953 if (atsru->include_all)
3954 goto out;
3956 ret = 0;
3957 out:
3958 rcu_read_unlock();
3960 return ret;
3963 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3965 int ret = 0;
3966 struct dmar_rmrr_unit *rmrru;
3967 struct dmar_atsr_unit *atsru;
3968 struct acpi_dmar_atsr *atsr;
3969 struct acpi_dmar_reserved_memory *rmrr;
3971 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3972 return 0;
3974 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3975 rmrr = container_of(rmrru->hdr,
3976 struct acpi_dmar_reserved_memory, header);
3977 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3978 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3979 ((void *)rmrr) + rmrr->header.length,
3980 rmrr->segment, rmrru->devices,
3981 rmrru->devices_cnt);
3982 if(ret < 0)
3983 return ret;
3984 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3985 dmar_remove_dev_scope(info, rmrr->segment,
3986 rmrru->devices, rmrru->devices_cnt);
3990 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3991 if (atsru->include_all)
3992 continue;
3994 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3995 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3996 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3997 (void *)atsr + atsr->header.length,
3998 atsr->segment, atsru->devices,
3999 atsru->devices_cnt);
4000 if (ret > 0)
4001 break;
4002 else if(ret < 0)
4003 return ret;
4004 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4005 if (dmar_remove_dev_scope(info, atsr->segment,
4006 atsru->devices, atsru->devices_cnt))
4007 break;
4011 return 0;
4015 * Here we only respond to action of unbound device from driver.
4017 * Added device is not attached to its DMAR domain here yet. That will happen
4018 * when mapping the device to iova.
4020 static int device_notifier(struct notifier_block *nb,
4021 unsigned long action, void *data)
4023 struct device *dev = data;
4024 struct dmar_domain *domain;
4026 if (iommu_dummy(dev))
4027 return 0;
4029 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4030 return 0;
4032 domain = find_domain(dev);
4033 if (!domain)
4034 return 0;
4036 down_read(&dmar_global_lock);
4037 domain_remove_one_dev_info(domain, dev);
4038 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4039 domain_exit(domain);
4040 up_read(&dmar_global_lock);
4042 return 0;
4045 static struct notifier_block device_nb = {
4046 .notifier_call = device_notifier,
4049 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4050 unsigned long val, void *v)
4052 struct memory_notify *mhp = v;
4053 unsigned long long start, end;
4054 unsigned long start_vpfn, last_vpfn;
4056 switch (val) {
4057 case MEM_GOING_ONLINE:
4058 start = mhp->start_pfn << PAGE_SHIFT;
4059 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4060 if (iommu_domain_identity_map(si_domain, start, end)) {
4061 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
4062 start, end);
4063 return NOTIFY_BAD;
4065 break;
4067 case MEM_OFFLINE:
4068 case MEM_CANCEL_ONLINE:
4069 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4070 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4071 while (start_vpfn <= last_vpfn) {
4072 struct iova *iova;
4073 struct dmar_drhd_unit *drhd;
4074 struct intel_iommu *iommu;
4075 struct page *freelist;
4077 iova = find_iova(&si_domain->iovad, start_vpfn);
4078 if (iova == NULL) {
4079 pr_debug("dmar: failed get IOVA for PFN %lx\n",
4080 start_vpfn);
4081 break;
4084 iova = split_and_remove_iova(&si_domain->iovad, iova,
4085 start_vpfn, last_vpfn);
4086 if (iova == NULL) {
4087 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
4088 start_vpfn, last_vpfn);
4089 return NOTIFY_BAD;
4092 freelist = domain_unmap(si_domain, iova->pfn_lo,
4093 iova->pfn_hi);
4095 rcu_read_lock();
4096 for_each_active_iommu(iommu, drhd)
4097 iommu_flush_iotlb_psi(iommu, si_domain->id,
4098 iova->pfn_lo, iova_size(iova),
4099 !freelist, 0);
4100 rcu_read_unlock();
4101 dma_free_pagelist(freelist);
4103 start_vpfn = iova->pfn_hi + 1;
4104 free_iova_mem(iova);
4106 break;
4109 return NOTIFY_OK;
4112 static struct notifier_block intel_iommu_memory_nb = {
4113 .notifier_call = intel_iommu_memory_notifier,
4114 .priority = 0
4118 static ssize_t intel_iommu_show_version(struct device *dev,
4119 struct device_attribute *attr,
4120 char *buf)
4122 struct intel_iommu *iommu = dev_get_drvdata(dev);
4123 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4124 return sprintf(buf, "%d:%d\n",
4125 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4127 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4129 static ssize_t intel_iommu_show_address(struct device *dev,
4130 struct device_attribute *attr,
4131 char *buf)
4133 struct intel_iommu *iommu = dev_get_drvdata(dev);
4134 return sprintf(buf, "%llx\n", iommu->reg_phys);
4136 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4138 static ssize_t intel_iommu_show_cap(struct device *dev,
4139 struct device_attribute *attr,
4140 char *buf)
4142 struct intel_iommu *iommu = dev_get_drvdata(dev);
4143 return sprintf(buf, "%llx\n", iommu->cap);
4145 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4147 static ssize_t intel_iommu_show_ecap(struct device *dev,
4148 struct device_attribute *attr,
4149 char *buf)
4151 struct intel_iommu *iommu = dev_get_drvdata(dev);
4152 return sprintf(buf, "%llx\n", iommu->ecap);
4154 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4156 static struct attribute *intel_iommu_attrs[] = {
4157 &dev_attr_version.attr,
4158 &dev_attr_address.attr,
4159 &dev_attr_cap.attr,
4160 &dev_attr_ecap.attr,
4161 NULL,
4164 static struct attribute_group intel_iommu_group = {
4165 .name = "intel-iommu",
4166 .attrs = intel_iommu_attrs,
4169 const struct attribute_group *intel_iommu_groups[] = {
4170 &intel_iommu_group,
4171 NULL,
4174 int __init intel_iommu_init(void)
4176 int ret = -ENODEV;
4177 struct dmar_drhd_unit *drhd;
4178 struct intel_iommu *iommu;
4180 /* VT-d is required for a TXT/tboot launch, so enforce that */
4181 force_on = tboot_force_iommu();
4183 if (iommu_init_mempool()) {
4184 if (force_on)
4185 panic("tboot: Failed to initialize iommu memory\n");
4186 return -ENOMEM;
4189 down_write(&dmar_global_lock);
4190 if (dmar_table_init()) {
4191 if (force_on)
4192 panic("tboot: Failed to initialize DMAR table\n");
4193 goto out_free_dmar;
4197 * Disable translation if already enabled prior to OS handover.
4199 for_each_active_iommu(iommu, drhd)
4200 if (iommu->gcmd & DMA_GCMD_TE)
4201 iommu_disable_translation(iommu);
4203 if (dmar_dev_scope_init() < 0) {
4204 if (force_on)
4205 panic("tboot: Failed to initialize DMAR device scope\n");
4206 goto out_free_dmar;
4209 if (no_iommu || dmar_disabled)
4210 goto out_free_dmar;
4212 if (list_empty(&dmar_rmrr_units))
4213 printk(KERN_INFO "DMAR: No RMRR found\n");
4215 if (list_empty(&dmar_atsr_units))
4216 printk(KERN_INFO "DMAR: No ATSR found\n");
4218 if (dmar_init_reserved_ranges()) {
4219 if (force_on)
4220 panic("tboot: Failed to reserve iommu ranges\n");
4221 goto out_free_reserved_range;
4224 init_no_remapping_devices();
4226 ret = init_dmars();
4227 if (ret) {
4228 if (force_on)
4229 panic("tboot: Failed to initialize DMARs\n");
4230 printk(KERN_ERR "IOMMU: dmar init failed\n");
4231 goto out_free_reserved_range;
4233 up_write(&dmar_global_lock);
4234 printk(KERN_INFO
4235 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4237 init_timer(&unmap_timer);
4238 #ifdef CONFIG_SWIOTLB
4239 swiotlb = 0;
4240 #endif
4241 dma_ops = &intel_dma_ops;
4243 init_iommu_pm_ops();
4245 for_each_active_iommu(iommu, drhd)
4246 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4247 intel_iommu_groups,
4248 iommu->name);
4250 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4251 bus_register_notifier(&pci_bus_type, &device_nb);
4252 if (si_domain && !hw_pass_through)
4253 register_memory_notifier(&intel_iommu_memory_nb);
4255 intel_iommu_enabled = 1;
4257 return 0;
4259 out_free_reserved_range:
4260 put_iova_domain(&reserved_iova_list);
4261 out_free_dmar:
4262 intel_iommu_free_dmars();
4263 up_write(&dmar_global_lock);
4264 iommu_exit_mempool();
4265 return ret;
4268 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4270 struct intel_iommu *iommu = opaque;
4272 iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4273 return 0;
4277 * NB - intel-iommu lacks any sort of reference counting for the users of
4278 * dependent devices. If multiple endpoints have intersecting dependent
4279 * devices, unbinding the driver from any one of them will possibly leave
4280 * the others unable to operate.
4282 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4283 struct device *dev)
4285 if (!iommu || !dev || !dev_is_pci(dev))
4286 return;
4288 pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4291 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4292 struct device *dev)
4294 struct device_domain_info *info, *tmp;
4295 struct intel_iommu *iommu;
4296 unsigned long flags;
4297 int found = 0;
4298 u8 bus, devfn;
4300 iommu = device_to_iommu(dev, &bus, &devfn);
4301 if (!iommu)
4302 return;
4304 spin_lock_irqsave(&device_domain_lock, flags);
4305 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4306 if (info->iommu == iommu && info->bus == bus &&
4307 info->devfn == devfn) {
4308 unlink_domain_info(info);
4309 spin_unlock_irqrestore(&device_domain_lock, flags);
4311 iommu_disable_dev_iotlb(info);
4312 iommu_detach_dev(iommu, info->bus, info->devfn);
4313 iommu_detach_dependent_devices(iommu, dev);
4314 free_devinfo_mem(info);
4316 spin_lock_irqsave(&device_domain_lock, flags);
4318 if (found)
4319 break;
4320 else
4321 continue;
4324 /* if there is no other devices under the same iommu
4325 * owned by this domain, clear this iommu in iommu_bmp
4326 * update iommu count and coherency
4328 if (info->iommu == iommu)
4329 found = 1;
4332 spin_unlock_irqrestore(&device_domain_lock, flags);
4334 if (found == 0) {
4335 domain_detach_iommu(domain, iommu);
4336 if (!domain_type_is_vm_or_si(domain))
4337 iommu_detach_domain(domain, iommu);
4341 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4343 int adjust_width;
4345 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4346 domain_reserve_special_ranges(domain);
4348 /* calculate AGAW */
4349 domain->gaw = guest_width;
4350 adjust_width = guestwidth_to_adjustwidth(guest_width);
4351 domain->agaw = width_to_agaw(adjust_width);
4353 domain->iommu_coherency = 0;
4354 domain->iommu_snooping = 0;
4355 domain->iommu_superpage = 0;
4356 domain->max_addr = 0;
4358 /* always allocate the top pgd */
4359 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4360 if (!domain->pgd)
4361 return -ENOMEM;
4362 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4363 return 0;
4366 static int intel_iommu_domain_init(struct iommu_domain *domain)
4368 struct dmar_domain *dmar_domain;
4370 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4371 if (!dmar_domain) {
4372 printk(KERN_ERR
4373 "intel_iommu_domain_init: dmar_domain == NULL\n");
4374 return -ENOMEM;
4376 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4377 printk(KERN_ERR
4378 "intel_iommu_domain_init() failed\n");
4379 domain_exit(dmar_domain);
4380 return -ENOMEM;
4382 domain_update_iommu_cap(dmar_domain);
4383 domain->priv = dmar_domain;
4385 domain->geometry.aperture_start = 0;
4386 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4387 domain->geometry.force_aperture = true;
4389 return 0;
4392 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4394 struct dmar_domain *dmar_domain = domain->priv;
4396 domain->priv = NULL;
4397 domain_exit(dmar_domain);
4400 static int intel_iommu_attach_device(struct iommu_domain *domain,
4401 struct device *dev)
4403 struct dmar_domain *dmar_domain = domain->priv;
4404 struct intel_iommu *iommu;
4405 int addr_width;
4406 u8 bus, devfn;
4408 if (device_is_rmrr_locked(dev)) {
4409 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4410 return -EPERM;
4413 /* normally dev is not mapped */
4414 if (unlikely(domain_context_mapped(dev))) {
4415 struct dmar_domain *old_domain;
4417 old_domain = find_domain(dev);
4418 if (old_domain) {
4419 if (domain_type_is_vm_or_si(dmar_domain))
4420 domain_remove_one_dev_info(old_domain, dev);
4421 else
4422 domain_remove_dev_info(old_domain);
4424 if (!domain_type_is_vm_or_si(old_domain) &&
4425 list_empty(&old_domain->devices))
4426 domain_exit(old_domain);
4430 iommu = device_to_iommu(dev, &bus, &devfn);
4431 if (!iommu)
4432 return -ENODEV;
4434 /* check if this iommu agaw is sufficient for max mapped address */
4435 addr_width = agaw_to_width(iommu->agaw);
4436 if (addr_width > cap_mgaw(iommu->cap))
4437 addr_width = cap_mgaw(iommu->cap);
4439 if (dmar_domain->max_addr > (1LL << addr_width)) {
4440 printk(KERN_ERR "%s: iommu width (%d) is not "
4441 "sufficient for the mapped address (%llx)\n",
4442 __func__, addr_width, dmar_domain->max_addr);
4443 return -EFAULT;
4445 dmar_domain->gaw = addr_width;
4448 * Knock out extra levels of page tables if necessary
4450 while (iommu->agaw < dmar_domain->agaw) {
4451 struct dma_pte *pte;
4453 pte = dmar_domain->pgd;
4454 if (dma_pte_present(pte)) {
4455 dmar_domain->pgd = (struct dma_pte *)
4456 phys_to_virt(dma_pte_addr(pte));
4457 free_pgtable_page(pte);
4459 dmar_domain->agaw--;
4462 return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4465 static void intel_iommu_detach_device(struct iommu_domain *domain,
4466 struct device *dev)
4468 struct dmar_domain *dmar_domain = domain->priv;
4470 domain_remove_one_dev_info(dmar_domain, dev);
4473 static int intel_iommu_map(struct iommu_domain *domain,
4474 unsigned long iova, phys_addr_t hpa,
4475 size_t size, int iommu_prot)
4477 struct dmar_domain *dmar_domain = domain->priv;
4478 u64 max_addr;
4479 int prot = 0;
4480 int ret;
4482 if (iommu_prot & IOMMU_READ)
4483 prot |= DMA_PTE_READ;
4484 if (iommu_prot & IOMMU_WRITE)
4485 prot |= DMA_PTE_WRITE;
4486 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4487 prot |= DMA_PTE_SNP;
4489 max_addr = iova + size;
4490 if (dmar_domain->max_addr < max_addr) {
4491 u64 end;
4493 /* check if minimum agaw is sufficient for mapped address */
4494 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4495 if (end < max_addr) {
4496 printk(KERN_ERR "%s: iommu width (%d) is not "
4497 "sufficient for the mapped address (%llx)\n",
4498 __func__, dmar_domain->gaw, max_addr);
4499 return -EFAULT;
4501 dmar_domain->max_addr = max_addr;
4503 /* Round up size to next multiple of PAGE_SIZE, if it and
4504 the low bits of hpa would take us onto the next page */
4505 size = aligned_nrpages(hpa, size);
4506 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4507 hpa >> VTD_PAGE_SHIFT, size, prot);
4508 return ret;
4511 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4512 unsigned long iova, size_t size)
4514 struct dmar_domain *dmar_domain = domain->priv;
4515 struct page *freelist = NULL;
4516 struct intel_iommu *iommu;
4517 unsigned long start_pfn, last_pfn;
4518 unsigned int npages;
4519 int iommu_id, num, ndomains, level = 0;
4521 /* Cope with horrid API which requires us to unmap more than the
4522 size argument if it happens to be a large-page mapping. */
4523 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4524 BUG();
4526 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4527 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4529 start_pfn = iova >> VTD_PAGE_SHIFT;
4530 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4532 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4534 npages = last_pfn - start_pfn + 1;
4536 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4537 iommu = g_iommus[iommu_id];
4540 * find bit position of dmar_domain
4542 ndomains = cap_ndoms(iommu->cap);
4543 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4544 if (iommu->domains[num] == dmar_domain)
4545 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4546 npages, !freelist, 0);
4551 dma_free_pagelist(freelist);
4553 if (dmar_domain->max_addr == iova + size)
4554 dmar_domain->max_addr = iova;
4556 return size;
4559 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4560 dma_addr_t iova)
4562 struct dmar_domain *dmar_domain = domain->priv;
4563 struct dma_pte *pte;
4564 int level = 0;
4565 u64 phys = 0;
4567 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4568 if (pte)
4569 phys = dma_pte_addr(pte);
4571 return phys;
4574 static bool intel_iommu_capable(enum iommu_cap cap)
4576 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4577 return domain_update_iommu_snooping(NULL) == 1;
4578 if (cap == IOMMU_CAP_INTR_REMAP)
4579 return irq_remapping_enabled == 1;
4581 return false;
4584 static int intel_iommu_add_device(struct device *dev)
4586 struct intel_iommu *iommu;
4587 struct iommu_group *group;
4588 u8 bus, devfn;
4590 iommu = device_to_iommu(dev, &bus, &devfn);
4591 if (!iommu)
4592 return -ENODEV;
4594 iommu_device_link(iommu->iommu_dev, dev);
4596 group = iommu_group_get_for_dev(dev);
4598 if (IS_ERR(group))
4599 return PTR_ERR(group);
4601 iommu_group_put(group);
4602 return 0;
4605 static void intel_iommu_remove_device(struct device *dev)
4607 struct intel_iommu *iommu;
4608 u8 bus, devfn;
4610 iommu = device_to_iommu(dev, &bus, &devfn);
4611 if (!iommu)
4612 return;
4614 iommu_group_remove_device(dev);
4616 iommu_device_unlink(iommu->iommu_dev, dev);
4619 static const struct iommu_ops intel_iommu_ops = {
4620 .capable = intel_iommu_capable,
4621 .domain_init = intel_iommu_domain_init,
4622 .domain_destroy = intel_iommu_domain_destroy,
4623 .attach_dev = intel_iommu_attach_device,
4624 .detach_dev = intel_iommu_detach_device,
4625 .map = intel_iommu_map,
4626 .unmap = intel_iommu_unmap,
4627 .map_sg = default_iommu_map_sg,
4628 .iova_to_phys = intel_iommu_iova_to_phys,
4629 .add_device = intel_iommu_add_device,
4630 .remove_device = intel_iommu_remove_device,
4631 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4634 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4636 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4637 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4638 dmar_map_gfx = 0;
4641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4649 static void quirk_iommu_rwbf(struct pci_dev *dev)
4652 * Mobile 4 Series Chipset neglects to set RWBF capability,
4653 * but needs it. Same seems to hold for the desktop versions.
4655 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4656 rwbf_quirk = 1;
4659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4667 #define GGC 0x52
4668 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4669 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4670 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4671 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4672 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4673 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4674 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4675 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4677 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4679 unsigned short ggc;
4681 if (pci_read_config_word(dev, GGC, &ggc))
4682 return;
4684 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4685 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4686 dmar_map_gfx = 0;
4687 } else if (dmar_map_gfx) {
4688 /* we have to ensure the gfx device is idle before we flush */
4689 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4690 intel_iommu_strict = 1;
4693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4698 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4699 ISOCH DMAR unit for the Azalia sound device, but not give it any
4700 TLB entries, which causes it to deadlock. Check for that. We do
4701 this in a function called from init_dmars(), instead of in a PCI
4702 quirk, because we don't want to print the obnoxious "BIOS broken"
4703 message if VT-d is actually disabled.
4705 static void __init check_tylersburg_isoch(void)
4707 struct pci_dev *pdev;
4708 uint32_t vtisochctrl;
4710 /* If there's no Azalia in the system anyway, forget it. */
4711 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4712 if (!pdev)
4713 return;
4714 pci_dev_put(pdev);
4716 /* System Management Registers. Might be hidden, in which case
4717 we can't do the sanity check. But that's OK, because the
4718 known-broken BIOSes _don't_ actually hide it, so far. */
4719 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4720 if (!pdev)
4721 return;
4723 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4724 pci_dev_put(pdev);
4725 return;
4728 pci_dev_put(pdev);
4730 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4731 if (vtisochctrl & 1)
4732 return;
4734 /* Drop all bits other than the number of TLB entries */
4735 vtisochctrl &= 0x1c;
4737 /* If we have the recommended number of TLB entries (16), fine. */
4738 if (vtisochctrl == 0x10)
4739 return;
4741 /* Zero TLB entries? You get to ride the short bus to school. */
4742 if (!vtisochctrl) {
4743 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4744 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4745 dmi_get_system_info(DMI_BIOS_VENDOR),
4746 dmi_get_system_info(DMI_BIOS_VERSION),
4747 dmi_get_system_info(DMI_PRODUCT_VERSION));
4748 iommu_identity_mapping |= IDENTMAP_AZALIA;
4749 return;
4752 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4753 vtisochctrl);