drivers/iommu/intel-iommu.c
1 /*
2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
48 #define ROOT_SIZE VTD_PAGE_SIZE
49 #define CONTEXT_SIZE VTD_PAGE_SIZE
51 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55 #define IOAPIC_RANGE_START (0xfee00000)
56 #define IOAPIC_RANGE_END (0xfeefffff)
57 #define IOVA_START_ADDR (0x1000)
59 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61 #define MAX_AGAW_WIDTH 64
63 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
69 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
73 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
74 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
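/*
 * For example, with 4KiB pages IOVA_PFN(DMA_BIT_MASK(32)) evaluates to
 * 0xfffff, so DMA_32BIT_PFN is the last page frame reachable through a
 * 32-bit DMA mask; these constants bound IOVA allocation further down.
 */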
76 /* page table handling */
77 #define LEVEL_STRIDE (9)
78 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
81 /* This bitmap is used to advertise the page sizes our hardware supports
82 * to the IOMMU core, which will then use this information to split
83 * physically contiguous memory regions it is mapping into page sizes
84 * that we support.
86 * Traditionally the IOMMU core just handed us the mappings directly,
87 * after making sure the size is an order of a 4KiB page and that the
88 * mapping has natural alignment.
90 * To retain this behavior, we currently advertise that we support
91 * all page sizes that are an order of 4KiB.
93 * If at some point we'd like to utilize the IOMMU core's new behavior,
94 * we could change this to advertise the real page sizes we support. */
96 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
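/*
 * In the pgsize bitmap handed to the IOMMU core, bit n set means 2^n byte
 * pages are accepted; ~0xFFFUL clears bits 0-11, i.e. it advertises every
 * power-of-two size of 4KiB and larger, matching the legacy behaviour
 * described above.
 */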
98 static inline int agaw_to_level(int agaw)
100 return agaw + 2;
103 static inline int agaw_to_width(int agaw)
105 return 30 + agaw * LEVEL_STRIDE;
108 static inline int width_to_agaw(int width)
110 return (width - 30) / LEVEL_STRIDE;
113 static inline unsigned int level_to_offset_bits(int level)
115 return (level - 1) * LEVEL_STRIDE;
118 static inline int pfn_level_offset(unsigned long pfn, int level)
120 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
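/*
 * Worked example: with LEVEL_STRIDE == 9, a 48-bit address width gives a
 * 36-bit pfn split into four 9-bit indices (bits 0-8 at level 1, 9-17 at
 * level 2, 18-26 at level 3, 27-35 at level 4), each selecting one of the
 * 512 slots in a 4KiB table page.
 */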
123 static inline unsigned long level_mask(int level)
125 return -1UL << level_to_offset_bits(level);
128 static inline unsigned long level_size(int level)
130 return 1UL << level_to_offset_bits(level);
133 static inline unsigned long align_to_level(unsigned long pfn, int level)
135 return (pfn + level_size(level) - 1) & level_mask(level);
138 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 return 1 << ((lvl - 1) * LEVEL_STRIDE);
143 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
144 are never going to work. */
145 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
150 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 static inline unsigned long page_to_dma_pfn(struct page *pg)
156 return mm_to_dma_pfn(page_to_pfn(pg));
158 static inline unsigned long virt_to_dma_pfn(void *p)
160 return page_to_dma_pfn(virt_to_page(p));
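/*
 * On x86, PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, so the mm<->dma pfn
 * conversions above are effectively no-ops; the shifts only matter if the
 * CPU page size is ever larger than the 4KiB VT-d page size.
 */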
163 /* global iommu list, set NULL for ignored DMAR units */
164 static struct intel_iommu **g_iommus;
166 static void __init check_tylersburg_isoch(void);
167 static int rwbf_quirk;
170 /* set to 1 to panic the kernel if VT-d can't be enabled successfully
171 * (used when the kernel is launched with TXT) */
173 static int force_on = 0;
176 /* 0: Present
177 * 1-11: Reserved
178 * 12-63: Context Ptr (12 - (haw-1))
179 * 64-127: Reserved */
181 struct root_entry {
182 u64 val;
183 u64 rsvd1;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
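/* 4096 / 16 == 256 root entries, one for each possible PCI bus number. */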
186 static inline bool root_present(struct root_entry *root)
188 return (root->val & 1);
190 static inline void set_root_present(struct root_entry *root)
192 root->val |= 1;
194 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 root->val |= value & VTD_PAGE_MASK;
199 static inline struct context_entry *
200 get_context_addr_from_root(struct root_entry *root)
202 return (struct context_entry *)
203 (root_present(root)?phys_to_virt(
204 root->val & VTD_PAGE_MASK) :
205 NULL);
209 /* low 64 bits:
210 * 0: present
211 * 1: fault processing disable
212 * 2-3: translation type
213 * 12-63: address space root
214 * high 64 bits:
215 * 0-2: address width
216 * 3-6: aval
217 * 8-23: domain id */
219 struct context_entry {
220 u64 lo;
221 u64 hi;
224 static inline bool context_present(struct context_entry *context)
226 return (context->lo & 1);
228 static inline void context_set_present(struct context_entry *context)
230 context->lo |= 1;
233 static inline void context_set_fault_enable(struct context_entry *context)
235 context->lo &= (((u64)-1) << 2) | 1;
238 static inline void context_set_translation_type(struct context_entry *context,
239 unsigned long value)
241 context->lo &= (((u64)-1) << 4) | 3;
242 context->lo |= (value & 3) << 2;
245 static inline void context_set_address_root(struct context_entry *context,
246 unsigned long value)
248 context->lo |= value & VTD_PAGE_MASK;
251 static inline void context_set_address_width(struct context_entry *context,
252 unsigned long value)
254 context->hi |= value & 7;
257 static inline void context_set_domain_id(struct context_entry *context,
258 unsigned long value)
260 context->hi |= (value & ((1 << 16) - 1)) << 8;
263 static inline void context_clear_entry(struct context_entry *context)
265 context->lo = 0;
266 context->hi = 0;
270 /* 0: readable
271 * 1: writable
272 * 2-6: reserved
273 * 7: super page
274 * 8-10: available
275 * 11: snoop behavior
276 * 12-63: Host physical address */
278 struct dma_pte {
279 u64 val;
282 static inline void dma_clear_pte(struct dma_pte *pte)
284 pte->val = 0;
287 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 pte->val |= DMA_PTE_READ;
292 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 pte->val |= DMA_PTE_WRITE;
297 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 pte->val |= DMA_PTE_SNP;
302 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 pte->val = (pte->val & ~3) | (prot & 3);
307 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 #ifdef CONFIG_64BIT
310 return pte->val & VTD_PAGE_MASK;
311 #else
312 /* Must have a full atomic 64-bit read */
313 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
314 #endif
317 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
322 static inline bool dma_pte_present(struct dma_pte *pte)
324 return (pte->val & 3) != 0;
327 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 return (pte->val & (1 << 7));
332 static inline int first_pte_in_page(struct dma_pte *pte)
334 return !((unsigned long)pte & ~VTD_PAGE_MASK);
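/*
 * A pte is "first in page" exactly when its address is 4KiB aligned, i.e.
 * it occupies slot 0 of a page-table page; the mapping and clearing loops
 * below use this to notice when a run of writes crosses into a new table.
 */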
338 /* This domain is a static identity mapping domain.
339 * 1. This domain creates a static 1:1 mapping to all usable memory.
340 * 2. It maps to each iommu if successful.
341 * 3. Each iommu maps to this domain if successful. */
343 static struct dmar_domain *si_domain;
344 static int hw_pass_through = 1;
346 /* devices under the same p2p bridge are owned in one domain */
347 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349 /* domain represents a virtual machine; more than one device
350 * across iommus may be owned in one domain, e.g. a kvm guest. */
352 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
354 /* si_domain contains multiple devices */
355 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
357 /* define the limit of IOMMUs supported in each domain */
358 #ifdef CONFIG_X86
359 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
360 #else
361 # define IOMMU_UNITS_SUPPORTED 64
362 #endif
364 struct dmar_domain {
365 int id; /* domain id */
366 int nid; /* node id */
367 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
368 /* bitmap of iommus this domain uses*/
370 struct list_head devices; /* all devices' list */
371 struct iova_domain iovad; /* iova's that belong to this domain */
373 struct dma_pte *pgd; /* virtual address */
374 int gaw; /* max guest address width */
376 /* adjusted guest address width, 0 is level 2 30-bit */
377 int agaw;
379 int flags; /* flags to find out type of domain */
381 int iommu_coherency;/* indicate coherency of iommu access */
382 int iommu_snooping; /* indicate snooping control feature*/
383 int iommu_count; /* reference count of iommu */
384 int iommu_superpage;/* Level of superpages supported:
385 0 == 4KiB (no superpages), 1 == 2MiB,
386 2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
387 spinlock_t iommu_lock; /* protect iommu set in domain */
388 u64 max_addr; /* maximum mapped address */
391 /* PCI domain-device relationship */
392 struct device_domain_info {
393 struct list_head link; /* link to domain siblings */
394 struct list_head global; /* link to global list */
395 int segment; /* PCI domain */
396 u8 bus; /* PCI bus number */
397 u8 devfn; /* PCI devfn number */
398 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
399 struct intel_iommu *iommu; /* IOMMU used by this device */
400 struct dmar_domain *domain; /* pointer to domain */
403 static void flush_unmaps_timeout(unsigned long data);
405 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
407 #define HIGH_WATER_MARK 250
408 struct deferred_flush_tables {
409 int next;
410 struct iova *iova[HIGH_WATER_MARK];
411 struct dmar_domain *domain[HIGH_WATER_MARK];
414 static struct deferred_flush_tables *deferred_flush;
416 /* bitmap for indexing intel_iommus */
417 static int g_num_of_iommus;
419 static DEFINE_SPINLOCK(async_umap_flush_lock);
420 static LIST_HEAD(unmaps_to_do);
422 static int timer_on;
423 static long list_size;
425 static void domain_remove_dev_info(struct dmar_domain *domain);
427 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
428 int dmar_disabled = 0;
429 #else
430 int dmar_disabled = 1;
431 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433 int intel_iommu_enabled = 0;
434 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436 static int dmar_map_gfx = 1;
437 static int dmar_forcedac;
438 static int intel_iommu_strict;
439 static int intel_iommu_superpage = 1;
441 int intel_iommu_gfx_mapped;
442 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
445 static DEFINE_SPINLOCK(device_domain_lock);
446 static LIST_HEAD(device_domain_list);
448 static struct iommu_ops intel_iommu_ops;
450 static int __init intel_iommu_setup(char *str)
452 if (!str)
453 return -EINVAL;
454 while (*str) {
455 if (!strncmp(str, "on", 2)) {
456 dmar_disabled = 0;
457 printk(KERN_INFO "Intel-IOMMU: enabled\n");
458 } else if (!strncmp(str, "off", 3)) {
459 dmar_disabled = 1;
460 printk(KERN_INFO "Intel-IOMMU: disabled\n");
461 } else if (!strncmp(str, "igfx_off", 8)) {
462 dmar_map_gfx = 0;
463 printk(KERN_INFO
464 "Intel-IOMMU: disable GFX device mapping\n");
465 } else if (!strncmp(str, "forcedac", 8)) {
466 printk(KERN_INFO
467 "Intel-IOMMU: Forcing DAC for PCI devices\n");
468 dmar_forcedac = 1;
469 } else if (!strncmp(str, "strict", 6)) {
470 printk(KERN_INFO
471 "Intel-IOMMU: disable batched IOTLB flush\n");
472 intel_iommu_strict = 1;
473 } else if (!strncmp(str, "sp_off", 6)) {
474 printk(KERN_INFO
475 "Intel-IOMMU: disable supported super page\n");
476 intel_iommu_superpage = 0;
479 str += strcspn(str, ",");
480 while (*str == ',')
481 str++;
483 return 0;
485 __setup("intel_iommu=", intel_iommu_setup);
487 static struct kmem_cache *iommu_domain_cache;
488 static struct kmem_cache *iommu_devinfo_cache;
489 static struct kmem_cache *iommu_iova_cache;
491 static inline void *alloc_pgtable_page(int node)
493 struct page *page;
494 void *vaddr = NULL;
496 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
497 if (page)
498 vaddr = page_address(page);
499 return vaddr;
502 static inline void free_pgtable_page(void *vaddr)
504 free_page((unsigned long)vaddr);
507 static inline void *alloc_domain_mem(void)
509 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
512 static void free_domain_mem(void *vaddr)
514 kmem_cache_free(iommu_domain_cache, vaddr);
517 static inline void * alloc_devinfo_mem(void)
519 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
522 static inline void free_devinfo_mem(void *vaddr)
524 kmem_cache_free(iommu_devinfo_cache, vaddr);
527 struct iova *alloc_iova_mem(void)
529 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
532 void free_iova_mem(struct iova *iova)
534 kmem_cache_free(iommu_iova_cache, iova);
538 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540 unsigned long sagaw;
541 int agaw = -1;
543 sagaw = cap_sagaw(iommu->cap);
544 for (agaw = width_to_agaw(max_gaw);
545 agaw >= 0; agaw--) {
546 if (test_bit(agaw, &sagaw))
547 break;
550 return agaw;
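/*
 * Example: if cap_sagaw() reports support for 39-bit (agaw 1) and 48-bit
 * (agaw 2) tables, a max_gaw of 48 selects agaw 2, i.e. a 4-level page
 * table, since agaw_to_level(2) == 4.
 */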
554 /* Calculate max SAGAW for each iommu. */
556 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
562 /* calculate agaw for each iommu.
563 * "SAGAW" may be different across iommus; use a default agaw, and
564 * fall back to a smaller supported agaw for iommus that don't support the default. */
566 int iommu_calculate_agaw(struct intel_iommu *iommu)
568 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
571 /* This function only returns a single iommu in a domain */
572 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574 int iommu_id;
576 /* si_domain and vm domain should not get here. */
577 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
578 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
581 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
582 return NULL;
584 return g_iommus[iommu_id];
587 static void domain_update_iommu_coherency(struct dmar_domain *domain)
589 int i;
591 domain->iommu_coherency = 1;
593 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
594 if (!ecap_coherent(g_iommus[i]->ecap)) {
595 domain->iommu_coherency = 0;
596 break;
601 static void domain_update_iommu_snooping(struct dmar_domain *domain)
603 int i;
605 domain->iommu_snooping = 1;
607 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
608 if (!ecap_sc_support(g_iommus[i]->ecap)) {
609 domain->iommu_snooping = 0;
610 break;
615 static void domain_update_iommu_superpage(struct dmar_domain *domain)
617 struct dmar_drhd_unit *drhd;
618 struct intel_iommu *iommu = NULL;
619 int mask = 0xf;
621 if (!intel_iommu_superpage) {
622 domain->iommu_superpage = 0;
623 return;
626 /* set iommu_superpage to the smallest common denominator */
627 for_each_active_iommu(iommu, drhd) {
628 mask &= cap_super_page_val(iommu->cap);
629 if (!mask) {
630 break;
633 domain->iommu_superpage = fls(mask);
636 /* Some capabilities may be different across iommus */
637 static void domain_update_iommu_cap(struct dmar_domain *domain)
639 domain_update_iommu_coherency(domain);
640 domain_update_iommu_snooping(domain);
641 domain_update_iommu_superpage(domain);
644 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
646 struct dmar_drhd_unit *drhd = NULL;
647 int i;
649 for_each_drhd_unit(drhd) {
650 if (drhd->ignored)
651 continue;
652 if (segment != drhd->segment)
653 continue;
655 for (i = 0; i < drhd->devices_cnt; i++) {
656 if (drhd->devices[i] &&
657 drhd->devices[i]->bus->number == bus &&
658 drhd->devices[i]->devfn == devfn)
659 return drhd->iommu;
660 if (drhd->devices[i] &&
661 drhd->devices[i]->subordinate &&
662 drhd->devices[i]->subordinate->number <= bus &&
663 drhd->devices[i]->subordinate->subordinate >= bus)
664 return drhd->iommu;
667 if (drhd->include_all)
668 return drhd->iommu;
671 return NULL;
674 static void domain_flush_cache(struct dmar_domain *domain,
675 void *addr, int size)
677 if (!domain->iommu_coherency)
678 clflush_cache_range(addr, size);
681 /* Gets context entry for a given bus and devfn */
682 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
683 u8 bus, u8 devfn)
685 struct root_entry *root;
686 struct context_entry *context;
687 unsigned long phy_addr;
688 unsigned long flags;
690 spin_lock_irqsave(&iommu->lock, flags);
691 root = &iommu->root_entry[bus];
692 context = get_context_addr_from_root(root);
693 if (!context) {
694 context = (struct context_entry *)
695 alloc_pgtable_page(iommu->node);
696 if (!context) {
697 spin_unlock_irqrestore(&iommu->lock, flags);
698 return NULL;
700 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
701 phy_addr = virt_to_phys((void *)context);
702 set_root_value(root, phy_addr);
703 set_root_present(root);
704 __iommu_flush_cache(iommu, root, sizeof(*root));
706 spin_unlock_irqrestore(&iommu->lock, flags);
707 return &context[devfn];
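/*
 * Each root entry thus points at one 4KiB context table of 256 entries
 * (16 bytes each), one per devfn on that bus; the table is allocated
 * lazily the first time a device on the bus needs a context entry.
 */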
710 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
712 struct root_entry *root;
713 struct context_entry *context;
714 int ret;
715 unsigned long flags;
717 spin_lock_irqsave(&iommu->lock, flags);
718 root = &iommu->root_entry[bus];
719 context = get_context_addr_from_root(root);
720 if (!context) {
721 ret = 0;
722 goto out;
724 ret = context_present(&context[devfn]);
725 out:
726 spin_unlock_irqrestore(&iommu->lock, flags);
727 return ret;
730 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
732 struct root_entry *root;
733 struct context_entry *context;
734 unsigned long flags;
736 spin_lock_irqsave(&iommu->lock, flags);
737 root = &iommu->root_entry[bus];
738 context = get_context_addr_from_root(root);
739 if (context) {
740 context_clear_entry(&context[devfn]);
741 __iommu_flush_cache(iommu, &context[devfn], \
742 sizeof(*context));
744 spin_unlock_irqrestore(&iommu->lock, flags);
747 static void free_context_table(struct intel_iommu *iommu)
749 struct root_entry *root;
750 int i;
751 unsigned long flags;
752 struct context_entry *context;
754 spin_lock_irqsave(&iommu->lock, flags);
755 if (!iommu->root_entry) {
756 goto out;
758 for (i = 0; i < ROOT_ENTRY_NR; i++) {
759 root = &iommu->root_entry[i];
760 context = get_context_addr_from_root(root);
761 if (context)
762 free_pgtable_page(context);
764 free_pgtable_page(iommu->root_entry);
765 iommu->root_entry = NULL;
766 out:
767 spin_unlock_irqrestore(&iommu->lock, flags);
770 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
771 unsigned long pfn, int target_level)
773 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
774 struct dma_pte *parent, *pte = NULL;
775 int level = agaw_to_level(domain->agaw);
776 int offset;
778 BUG_ON(!domain->pgd);
779 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
780 parent = domain->pgd;
782 while (level > 0) {
783 void *tmp_page;
785 offset = pfn_level_offset(pfn, level);
786 pte = &parent[offset];
787 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
788 break;
789 if (level == target_level)
790 break;
792 if (!dma_pte_present(pte)) {
793 uint64_t pteval;
795 tmp_page = alloc_pgtable_page(domain->nid);
797 if (!tmp_page)
798 return NULL;
800 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
801 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
802 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
803 /* Someone else set it while we were thinking; use theirs. */
804 free_pgtable_page(tmp_page);
805 } else {
806 dma_pte_addr(pte);
807 domain_flush_cache(domain, pte, sizeof(*pte));
810 parent = phys_to_virt(dma_pte_addr(pte));
811 level--;
814 return pte;
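/*
 * The walk above descends from the top level towards target_level,
 * allocating intermediate table pages on demand; the cmpxchg64() keeps
 * two racing walkers from installing different pages in the same slot.
 */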
818 /* return address's pte at specific level */
819 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
820 unsigned long pfn,
821 int level, int *large_page)
823 struct dma_pte *parent, *pte = NULL;
824 int total = agaw_to_level(domain->agaw);
825 int offset;
827 parent = domain->pgd;
828 while (level <= total) {
829 offset = pfn_level_offset(pfn, total);
830 pte = &parent[offset];
831 if (level == total)
832 return pte;
834 if (!dma_pte_present(pte)) {
835 *large_page = total;
836 break;
839 if (pte->val & DMA_PTE_LARGE_PAGE) {
840 *large_page = total;
841 return pte;
844 parent = phys_to_virt(dma_pte_addr(pte));
845 total--;
847 return NULL;
851 /* clear last level pte, a tlb flush should follow */
851 static int dma_pte_clear_range(struct dmar_domain *domain,
852 unsigned long start_pfn,
853 unsigned long last_pfn)
855 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
856 unsigned int large_page = 1;
857 struct dma_pte *first_pte, *pte;
858 int order;
860 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
861 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
862 BUG_ON(start_pfn > last_pfn);
864 /* we don't need lock here; nobody else touches the iova range */
865 do {
866 large_page = 1;
867 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
868 if (!pte) {
869 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
870 continue;
872 do {
873 dma_clear_pte(pte);
874 start_pfn += lvl_to_nr_pages(large_page);
875 pte++;
876 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
878 domain_flush_cache(domain, first_pte,
879 (void *)pte - (void *)first_pte);
881 } while (start_pfn && start_pfn <= last_pfn);
883 order = (large_page - 1) * 9;
884 return order;
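/*
 * The returned order is (large_page - 1) * 9, i.e. the order in 4KiB
 * pages of the largest page-table level touched while clearing, which
 * lets callers report how large a block was actually cleared.
 */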
887 /* free page table pages. last level pte should already be cleared */
888 static void dma_pte_free_pagetable(struct dmar_domain *domain,
889 unsigned long start_pfn,
890 unsigned long last_pfn)
892 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
893 struct dma_pte *first_pte, *pte;
894 int total = agaw_to_level(domain->agaw);
895 int level;
896 unsigned long tmp;
897 int large_page = 2;
899 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
900 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
901 BUG_ON(start_pfn > last_pfn);
903 /* We don't need lock here; nobody else touches the iova range */
904 level = 2;
905 while (level <= total) {
906 tmp = align_to_level(start_pfn, level);
908 /* If we can't even clear one PTE at this level, we're done */
909 if (tmp + level_size(level) - 1 > last_pfn)
910 return;
912 do {
913 large_page = level;
914 first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
915 if (large_page > level)
916 level = large_page + 1;
917 if (!pte) {
918 tmp = align_to_level(tmp + 1, level + 1);
919 continue;
921 do {
922 if (dma_pte_present(pte)) {
923 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
924 dma_clear_pte(pte);
926 pte++;
927 tmp += level_size(level);
928 } while (!first_pte_in_page(pte) &&
929 tmp + level_size(level) - 1 <= last_pfn);
931 domain_flush_cache(domain, first_pte,
932 (void *)pte - (void *)first_pte);
934 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
935 level++;
937 /* free pgd */
938 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
939 free_pgtable_page(domain->pgd);
940 domain->pgd = NULL;
944 /* iommu handling */
945 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
947 struct root_entry *root;
948 unsigned long flags;
950 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
951 if (!root)
952 return -ENOMEM;
954 __iommu_flush_cache(iommu, root, ROOT_SIZE);
956 spin_lock_irqsave(&iommu->lock, flags);
957 iommu->root_entry = root;
958 spin_unlock_irqrestore(&iommu->lock, flags);
960 return 0;
963 static void iommu_set_root_entry(struct intel_iommu *iommu)
965 void *addr;
966 u32 sts;
967 unsigned long flag;
969 addr = iommu->root_entry;
971 raw_spin_lock_irqsave(&iommu->register_lock, flag);
972 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
974 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
976 /* Make sure hardware complete it */
977 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
978 readl, (sts & DMA_GSTS_RTPS), sts);
980 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
983 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
985 u32 val;
986 unsigned long flag;
988 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
989 return;
991 raw_spin_lock_irqsave(&iommu->register_lock, flag);
992 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
994 /* Make sure hardware complete it */
995 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
996 readl, (!(val & DMA_GSTS_WBFS)), val);
998 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1001 /* The return value determines whether we need a write buffer flush */
1002 static void __iommu_flush_context(struct intel_iommu *iommu,
1003 u16 did, u16 source_id, u8 function_mask,
1004 u64 type)
1006 u64 val = 0;
1007 unsigned long flag;
1009 switch (type) {
1010 case DMA_CCMD_GLOBAL_INVL:
1011 val = DMA_CCMD_GLOBAL_INVL;
1012 break;
1013 case DMA_CCMD_DOMAIN_INVL:
1014 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1015 break;
1016 case DMA_CCMD_DEVICE_INVL:
1017 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1018 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1019 break;
1020 default:
1021 BUG();
1023 val |= DMA_CCMD_ICC;
1025 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1026 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1028 /* Make sure hardware complete it */
1029 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1030 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1032 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1035 /* The return value determines whether we need a write buffer flush */
1036 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1037 u64 addr, unsigned int size_order, u64 type)
1039 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1040 u64 val = 0, val_iva = 0;
1041 unsigned long flag;
1043 switch (type) {
1044 case DMA_TLB_GLOBAL_FLUSH:
1045 /* global flush doesn't need to set IVA_REG */
1046 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1047 break;
1048 case DMA_TLB_DSI_FLUSH:
1049 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1050 break;
1051 case DMA_TLB_PSI_FLUSH:
1052 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053 /* Note: always flush non-leaf currently */
1054 val_iva = size_order | addr;
1055 break;
1056 default:
1057 BUG();
1059 /* Note: set drain read/write */
1060 #if 0
1062 /* This is probably to be super secure.. Looks like we can
1063 * ignore it without any impact. */
1065 if (cap_read_drain(iommu->cap))
1066 val |= DMA_TLB_READ_DRAIN;
1067 #endif
1068 if (cap_write_drain(iommu->cap))
1069 val |= DMA_TLB_WRITE_DRAIN;
1071 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1072 /* Note: Only uses first TLB reg currently */
1073 if (val_iva)
1074 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1075 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1077 /* Make sure hardware complete it */
1078 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1079 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1081 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1083 /* check IOTLB invalidation granularity */
1084 if (DMA_TLB_IAIG(val) == 0)
1085 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1086 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1087 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1088 (unsigned long long)DMA_TLB_IIRG(type),
1089 (unsigned long long)DMA_TLB_IAIG(val));
1092 static struct device_domain_info *iommu_support_dev_iotlb(
1093 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1095 int found = 0;
1096 unsigned long flags;
1097 struct device_domain_info *info;
1098 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1100 if (!ecap_dev_iotlb_support(iommu->ecap))
1101 return NULL;
1103 if (!iommu->qi)
1104 return NULL;
1106 spin_lock_irqsave(&device_domain_lock, flags);
1107 list_for_each_entry(info, &domain->devices, link)
1108 if (info->bus == bus && info->devfn == devfn) {
1109 found = 1;
1110 break;
1112 spin_unlock_irqrestore(&device_domain_lock, flags);
1114 if (!found || !info->dev)
1115 return NULL;
1117 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1118 return NULL;
1120 if (!dmar_find_matched_atsr_unit(info->dev))
1121 return NULL;
1123 info->iommu = iommu;
1125 return info;
1128 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1130 if (!info)
1131 return;
1133 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1136 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1138 if (!info->dev || !pci_ats_enabled(info->dev))
1139 return;
1141 pci_disable_ats(info->dev);
1144 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1145 u64 addr, unsigned mask)
1147 u16 sid, qdep;
1148 unsigned long flags;
1149 struct device_domain_info *info;
1151 spin_lock_irqsave(&device_domain_lock, flags);
1152 list_for_each_entry(info, &domain->devices, link) {
1153 if (!info->dev || !pci_ats_enabled(info->dev))
1154 continue;
1156 sid = info->bus << 8 | info->devfn;
1157 qdep = pci_ats_queue_depth(info->dev);
1158 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1160 spin_unlock_irqrestore(&device_domain_lock, flags);
1163 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1164 unsigned long pfn, unsigned int pages, int map)
1166 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1167 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1169 BUG_ON(pages == 0);
1172 /* Fall back to domain-selective flush if there is no PSI support or the
1173 * size is too big.
1174 * PSI requires the page size to be 2 ^ x, and the base address to be
1175 * naturally aligned to the size. */
1177 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1178 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1179 DMA_TLB_DSI_FLUSH);
1180 else
1181 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1182 DMA_TLB_PSI_FLUSH);
1185 /* In caching mode, changes of pages from non-present to present require
1186 * a flush. However, the device IOTLB doesn't need to be flushed in this case. */
1188 if (!cap_caching_mode(iommu->cap) || !map)
1189 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1192 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1194 u32 pmen;
1195 unsigned long flags;
1197 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1198 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1199 pmen &= ~DMA_PMEN_EPM;
1200 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1202 /* wait for the protected region status bit to clear */
1203 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1204 readl, !(pmen & DMA_PMEN_PRS), pmen);
1206 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1209 static int iommu_enable_translation(struct intel_iommu *iommu)
1211 u32 sts;
1212 unsigned long flags;
1214 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1215 iommu->gcmd |= DMA_GCMD_TE;
1216 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1218 /* Make sure hardware complete it */
1219 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1220 readl, (sts & DMA_GSTS_TES), sts);
1222 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1223 return 0;
1226 static int iommu_disable_translation(struct intel_iommu *iommu)
1228 u32 sts;
1229 unsigned long flag;
1231 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1232 iommu->gcmd &= ~DMA_GCMD_TE;
1233 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1235 /* Make sure hardware complete it */
1236 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1237 readl, (!(sts & DMA_GSTS_TES)), sts);
1239 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1240 return 0;
1244 static int iommu_init_domains(struct intel_iommu *iommu)
1246 unsigned long ndomains;
1247 unsigned long nlongs;
1249 ndomains = cap_ndoms(iommu->cap);
1250 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1251 ndomains);
1252 nlongs = BITS_TO_LONGS(ndomains);
1254 spin_lock_init(&iommu->lock);
1256 /* TBD: there might be 64K domains,
1257 * consider other allocation for future chip */
1259 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1260 if (!iommu->domain_ids) {
1261 printk(KERN_ERR "Allocating domain id array failed\n");
1262 return -ENOMEM;
1264 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1265 GFP_KERNEL);
1266 if (!iommu->domains) {
1267 printk(KERN_ERR "Allocating domain array failed\n");
1268 return -ENOMEM;
1272 /* If Caching mode is set, then invalid translations are tagged
1273 * with domain id 0. Hence we need to pre-allocate it. */
1275 if (cap_caching_mode(iommu->cap))
1276 set_bit(0, iommu->domain_ids);
1277 return 0;
1281 static void domain_exit(struct dmar_domain *domain);
1282 static void vm_domain_exit(struct dmar_domain *domain);
1284 void free_dmar_iommu(struct intel_iommu *iommu)
1286 struct dmar_domain *domain;
1287 int i;
1288 unsigned long flags;
1290 if ((iommu->domains) && (iommu->domain_ids)) {
1291 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1292 domain = iommu->domains[i];
1293 clear_bit(i, iommu->domain_ids);
1295 spin_lock_irqsave(&domain->iommu_lock, flags);
1296 if (--domain->iommu_count == 0) {
1297 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1298 vm_domain_exit(domain);
1299 else
1300 domain_exit(domain);
1302 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1306 if (iommu->gcmd & DMA_GCMD_TE)
1307 iommu_disable_translation(iommu);
1309 if (iommu->irq) {
1310 irq_set_handler_data(iommu->irq, NULL);
1311 /* This will mask the irq */
1312 free_irq(iommu->irq, iommu);
1313 destroy_irq(iommu->irq);
1316 kfree(iommu->domains);
1317 kfree(iommu->domain_ids);
1319 g_iommus[iommu->seq_id] = NULL;
1321 /* if all iommus are freed, free g_iommus */
1322 for (i = 0; i < g_num_of_iommus; i++) {
1323 if (g_iommus[i])
1324 break;
1327 if (i == g_num_of_iommus)
1328 kfree(g_iommus);
1330 /* free context mapping */
1331 free_context_table(iommu);
1334 static struct dmar_domain *alloc_domain(void)
1336 struct dmar_domain *domain;
1338 domain = alloc_domain_mem();
1339 if (!domain)
1340 return NULL;
1342 domain->nid = -1;
1343 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1344 domain->flags = 0;
1346 return domain;
1349 static int iommu_attach_domain(struct dmar_domain *domain,
1350 struct intel_iommu *iommu)
1352 int num;
1353 unsigned long ndomains;
1354 unsigned long flags;
1356 ndomains = cap_ndoms(iommu->cap);
1358 spin_lock_irqsave(&iommu->lock, flags);
1360 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1361 if (num >= ndomains) {
1362 spin_unlock_irqrestore(&iommu->lock, flags);
1363 printk(KERN_ERR "IOMMU: no free domain ids\n");
1364 return -ENOMEM;
1367 domain->id = num;
1368 set_bit(num, iommu->domain_ids);
1369 set_bit(iommu->seq_id, domain->iommu_bmp);
1370 iommu->domains[num] = domain;
1371 spin_unlock_irqrestore(&iommu->lock, flags);
1373 return 0;
1376 static void iommu_detach_domain(struct dmar_domain *domain,
1377 struct intel_iommu *iommu)
1379 unsigned long flags;
1380 int num, ndomains;
1381 int found = 0;
1383 spin_lock_irqsave(&iommu->lock, flags);
1384 ndomains = cap_ndoms(iommu->cap);
1385 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1386 if (iommu->domains[num] == domain) {
1387 found = 1;
1388 break;
1392 if (found) {
1393 clear_bit(num, iommu->domain_ids);
1394 clear_bit(iommu->seq_id, domain->iommu_bmp);
1395 iommu->domains[num] = NULL;
1397 spin_unlock_irqrestore(&iommu->lock, flags);
1400 static struct iova_domain reserved_iova_list;
1401 static struct lock_class_key reserved_rbtree_key;
1403 static int dmar_init_reserved_ranges(void)
1405 struct pci_dev *pdev = NULL;
1406 struct iova *iova;
1407 int i;
1409 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1411 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1412 &reserved_rbtree_key);
1414 /* IOAPIC ranges shouldn't be accessed by DMA */
1415 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1416 IOVA_PFN(IOAPIC_RANGE_END));
1417 if (!iova) {
1418 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1419 return -ENODEV;
1422 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1423 for_each_pci_dev(pdev) {
1424 struct resource *r;
1426 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1427 r = &pdev->resource[i];
1428 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1429 continue;
1430 iova = reserve_iova(&reserved_iova_list,
1431 IOVA_PFN(r->start),
1432 IOVA_PFN(r->end));
1433 if (!iova) {
1434 printk(KERN_ERR "Reserve iova failed\n");
1435 return -ENODEV;
1439 return 0;
1442 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1444 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1447 static inline int guestwidth_to_adjustwidth(int gaw)
1449 int agaw;
1450 int r = (gaw - 12) % 9;
1452 if (r == 0)
1453 agaw = gaw;
1454 else
1455 agaw = gaw + 9 - r;
1456 if (agaw > 64)
1457 agaw = 64;
1458 return agaw;
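/*
 * Example: a guest width of 36 bits gives r = (36 - 12) % 9 == 6, so the
 * width is rounded up to 39, the next size a whole number of 9-bit
 * page-table levels can cover; the result is capped at 64.
 */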
1461 static int domain_init(struct dmar_domain *domain, int guest_width)
1463 struct intel_iommu *iommu;
1464 int adjust_width, agaw;
1465 unsigned long sagaw;
1467 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1468 spin_lock_init(&domain->iommu_lock);
1470 domain_reserve_special_ranges(domain);
1472 /* calculate AGAW */
1473 iommu = domain_get_iommu(domain);
1474 if (guest_width > cap_mgaw(iommu->cap))
1475 guest_width = cap_mgaw(iommu->cap);
1476 domain->gaw = guest_width;
1477 adjust_width = guestwidth_to_adjustwidth(guest_width);
1478 agaw = width_to_agaw(adjust_width);
1479 sagaw = cap_sagaw(iommu->cap);
1480 if (!test_bit(agaw, &sagaw)) {
1481 /* hardware doesn't support it, choose a bigger one */
1482 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1483 agaw = find_next_bit(&sagaw, 5, agaw);
1484 if (agaw >= 5)
1485 return -ENODEV;
1487 domain->agaw = agaw;
1488 INIT_LIST_HEAD(&domain->devices);
1490 if (ecap_coherent(iommu->ecap))
1491 domain->iommu_coherency = 1;
1492 else
1493 domain->iommu_coherency = 0;
1495 if (ecap_sc_support(iommu->ecap))
1496 domain->iommu_snooping = 1;
1497 else
1498 domain->iommu_snooping = 0;
1500 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1501 domain->iommu_count = 1;
1502 domain->nid = iommu->node;
1504 /* always allocate the top pgd */
1505 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1506 if (!domain->pgd)
1507 return -ENOMEM;
1508 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1509 return 0;
1512 static void domain_exit(struct dmar_domain *domain)
1514 struct dmar_drhd_unit *drhd;
1515 struct intel_iommu *iommu;
1517 /* Domain 0 is reserved, so don't process it */
1518 if (!domain)
1519 return;
1521 /* Flush any lazy unmaps that may reference this domain */
1522 if (!intel_iommu_strict)
1523 flush_unmaps_timeout(0);
1525 domain_remove_dev_info(domain);
1526 /* destroy iovas */
1527 put_iova_domain(&domain->iovad);
1529 /* clear ptes */
1530 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1532 /* free page tables */
1533 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535 for_each_active_iommu(iommu, drhd)
1536 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1537 iommu_detach_domain(domain, iommu);
1539 free_domain_mem(domain);
1542 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1543 u8 bus, u8 devfn, int translation)
1545 struct context_entry *context;
1546 unsigned long flags;
1547 struct intel_iommu *iommu;
1548 struct dma_pte *pgd;
1549 unsigned long num;
1550 unsigned long ndomains;
1551 int id;
1552 int agaw;
1553 struct device_domain_info *info = NULL;
1555 pr_debug("Set context mapping for %02x:%02x.%d\n",
1556 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1558 BUG_ON(!domain->pgd);
1559 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1560 translation != CONTEXT_TT_MULTI_LEVEL);
1562 iommu = device_to_iommu(segment, bus, devfn);
1563 if (!iommu)
1564 return -ENODEV;
1566 context = device_to_context_entry(iommu, bus, devfn);
1567 if (!context)
1568 return -ENOMEM;
1569 spin_lock_irqsave(&iommu->lock, flags);
1570 if (context_present(context)) {
1571 spin_unlock_irqrestore(&iommu->lock, flags);
1572 return 0;
1575 id = domain->id;
1576 pgd = domain->pgd;
1578 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1579 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1580 int found = 0;
1582 /* find an available domain id for this device in iommu */
1583 ndomains = cap_ndoms(iommu->cap);
1584 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1585 if (iommu->domains[num] == domain) {
1586 id = num;
1587 found = 1;
1588 break;
1592 if (found == 0) {
1593 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1594 if (num >= ndomains) {
1595 spin_unlock_irqrestore(&iommu->lock, flags);
1596 printk(KERN_ERR "IOMMU: no free domain ids\n");
1597 return -EFAULT;
1600 set_bit(num, iommu->domain_ids);
1601 iommu->domains[num] = domain;
1602 id = num;
1605 /* Skip top levels of page tables for an
1606 * iommu which has a smaller agaw than the default.
1607 * Unnecessary for PT mode. */
1609 if (translation != CONTEXT_TT_PASS_THROUGH) {
1610 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1611 pgd = phys_to_virt(dma_pte_addr(pgd));
1612 if (!dma_pte_present(pgd)) {
1613 spin_unlock_irqrestore(&iommu->lock, flags);
1614 return -ENOMEM;
1620 context_set_domain_id(context, id);
1622 if (translation != CONTEXT_TT_PASS_THROUGH) {
1623 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1624 translation = info ? CONTEXT_TT_DEV_IOTLB :
1625 CONTEXT_TT_MULTI_LEVEL;
1628 /* In pass through mode, AW must be programmed to indicate the largest
1629 * AGAW value supported by hardware, and ASR is ignored by hardware. */
1631 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1632 context_set_address_width(context, iommu->msagaw);
1633 else {
1634 context_set_address_root(context, virt_to_phys(pgd));
1635 context_set_address_width(context, iommu->agaw);
1638 context_set_translation_type(context, translation);
1639 context_set_fault_enable(context);
1640 context_set_present(context);
1641 domain_flush_cache(domain, context, sizeof(*context));
1644 /* It's a non-present to present mapping. If hardware doesn't cache
1645 * non-present entries we only need to flush the write-buffer. If it
1646 * _does_ cache non-present entries, then it does so in the special
1647 * domain #0, which we have to flush: */
1649 if (cap_caching_mode(iommu->cap)) {
1650 iommu->flush.flush_context(iommu, 0,
1651 (((u16)bus) << 8) | devfn,
1652 DMA_CCMD_MASK_NOBIT,
1653 DMA_CCMD_DEVICE_INVL);
1654 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1655 } else {
1656 iommu_flush_write_buffer(iommu);
1658 iommu_enable_dev_iotlb(info);
1659 spin_unlock_irqrestore(&iommu->lock, flags);
1661 spin_lock_irqsave(&domain->iommu_lock, flags);
1662 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1663 domain->iommu_count++;
1664 if (domain->iommu_count == 1)
1665 domain->nid = iommu->node;
1666 domain_update_iommu_cap(domain);
1668 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1669 return 0;
1672 static int
1673 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1674 int translation)
1676 int ret;
1677 struct pci_dev *tmp, *parent;
1679 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1680 pdev->bus->number, pdev->devfn,
1681 translation);
1682 if (ret)
1683 return ret;
1685 /* dependent device mapping */
1686 tmp = pci_find_upstream_pcie_bridge(pdev);
1687 if (!tmp)
1688 return 0;
1689 /* Secondary interface's bus number and devfn 0 */
1690 parent = pdev->bus->self;
1691 while (parent != tmp) {
1692 ret = domain_context_mapping_one(domain,
1693 pci_domain_nr(parent->bus),
1694 parent->bus->number,
1695 parent->devfn, translation);
1696 if (ret)
1697 return ret;
1698 parent = parent->bus->self;
1700 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1701 return domain_context_mapping_one(domain,
1702 pci_domain_nr(tmp->subordinate),
1703 tmp->subordinate->number, 0,
1704 translation);
1705 else /* this is a legacy PCI bridge */
1706 return domain_context_mapping_one(domain,
1707 pci_domain_nr(tmp->bus),
1708 tmp->bus->number,
1709 tmp->devfn,
1710 translation);
1713 static int domain_context_mapped(struct pci_dev *pdev)
1715 int ret;
1716 struct pci_dev *tmp, *parent;
1717 struct intel_iommu *iommu;
1719 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1720 pdev->devfn);
1721 if (!iommu)
1722 return -ENODEV;
1724 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1725 if (!ret)
1726 return ret;
1727 /* dependent device mapping */
1728 tmp = pci_find_upstream_pcie_bridge(pdev);
1729 if (!tmp)
1730 return ret;
1731 /* Secondary interface's bus number and devfn 0 */
1732 parent = pdev->bus->self;
1733 while (parent != tmp) {
1734 ret = device_context_mapped(iommu, parent->bus->number,
1735 parent->devfn);
1736 if (!ret)
1737 return ret;
1738 parent = parent->bus->self;
1740 if (pci_is_pcie(tmp))
1741 return device_context_mapped(iommu, tmp->subordinate->number, 0);
1743 else
1744 return device_context_mapped(iommu, tmp->bus->number,
1745 tmp->devfn);
1748 /* Returns a number of VTD pages, but aligned to MM page size */
1749 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1750 size_t size)
1752 host_addr &= ~PAGE_MASK;
1753 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
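/*
 * Example: a 100-byte buffer starting 10 bytes into a page still spans one
 * whole MM page, so this returns one MM page's worth of VT-d pages (just
 * one on x86, where both page sizes are 4KiB).
 */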
1756 /* Return largest possible superpage level for a given mapping */
1757 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1758 unsigned long iov_pfn,
1759 unsigned long phy_pfn,
1760 unsigned long pages)
1762 int support, level = 1;
1763 unsigned long pfnmerge;
1765 support = domain->iommu_superpage;
1767 /* To use a large page, the virtual *and* physical addresses
1768 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1769 of them will mean we have to use smaller pages. So just
1770 merge them and check both at once. */
1771 pfnmerge = iov_pfn | phy_pfn;
1773 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1774 pages >>= VTD_STRIDE_SHIFT;
1775 if (!pages)
1776 break;
1777 pfnmerge >>= VTD_STRIDE_SHIFT;
1778 level++;
1779 support--;
1781 return level;
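/*
 * Example: with 2MiB superpage support (support == 1), a mapping whose
 * IOVA and physical pfns are both 512-page aligned and that still has at
 * least 512 pages to go returns level 2; anything smaller or misaligned
 * stays at level 1 (4KiB).
 */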
1784 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1785 struct scatterlist *sg, unsigned long phys_pfn,
1786 unsigned long nr_pages, int prot)
1788 struct dma_pte *first_pte = NULL, *pte = NULL;
1789 phys_addr_t uninitialized_var(pteval);
1790 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1791 unsigned long sg_res;
1792 unsigned int largepage_lvl = 0;
1793 unsigned long lvl_pages = 0;
1795 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1797 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1798 return -EINVAL;
1800 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1802 if (sg)
1803 sg_res = 0;
1804 else {
1805 sg_res = nr_pages + 1;
1806 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1809 while (nr_pages > 0) {
1810 uint64_t tmp;
1812 if (!sg_res) {
1813 sg_res = aligned_nrpages(sg->offset, sg->length);
1814 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1815 sg->dma_length = sg->length;
1816 pteval = page_to_phys(sg_page(sg)) | prot;
1817 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1820 if (!pte) {
1821 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1823 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1824 if (!pte)
1825 return -ENOMEM;
1826 /* It is a large page */
1827 if (largepage_lvl > 1)
1828 pteval |= DMA_PTE_LARGE_PAGE;
1829 else
1830 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1833 /* We don't need lock here, nobody else
1834 * touches the iova range */
1836 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1837 if (tmp) {
1838 static int dumps = 5;
1839 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1840 iov_pfn, tmp, (unsigned long long)pteval);
1841 if (dumps) {
1842 dumps--;
1843 debug_dma_dump_mappings(NULL);
1845 WARN_ON(1);
1848 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1850 BUG_ON(nr_pages < lvl_pages);
1851 BUG_ON(sg_res < lvl_pages);
1853 nr_pages -= lvl_pages;
1854 iov_pfn += lvl_pages;
1855 phys_pfn += lvl_pages;
1856 pteval += lvl_pages * VTD_PAGE_SIZE;
1857 sg_res -= lvl_pages;
1859 /* If the next PTE would be the first in a new page, then we
1860 need to flush the cache on the entries we've just written.
1861 And then we'll need to recalculate 'pte', so clear it and
1862 let it get set again in the if (!pte) block above.
1864 If we're done (!nr_pages) we need to flush the cache too.
1866 Also if we've been setting superpages, we may need to
1867 recalculate 'pte' and switch back to smaller pages for the
1868 end of the mapping, if the trailing size is not enough to
1869 use another superpage (i.e. sg_res < lvl_pages). */
1870 pte++;
1871 if (!nr_pages || first_pte_in_page(pte) ||
1872 (largepage_lvl > 1 && sg_res < lvl_pages)) {
1873 domain_flush_cache(domain, first_pte,
1874 (void *)pte - (void *)first_pte);
1875 pte = NULL;
1878 if (!sg_res && nr_pages)
1879 sg = sg_next(sg);
1881 return 0;
1884 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1885 struct scatterlist *sg, unsigned long nr_pages,
1886 int prot)
1888 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1891 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1892 unsigned long phys_pfn, unsigned long nr_pages,
1893 int prot)
1895 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1898 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1900 if (!iommu)
1901 return;
1903 clear_context_table(iommu, bus, devfn);
1904 iommu->flush.flush_context(iommu, 0, 0, 0,
1905 DMA_CCMD_GLOBAL_INVL);
1906 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1909 static void domain_remove_dev_info(struct dmar_domain *domain)
1911 struct device_domain_info *info;
1912 unsigned long flags;
1913 struct intel_iommu *iommu;
1915 spin_lock_irqsave(&device_domain_lock, flags);
1916 while (!list_empty(&domain->devices)) {
1917 info = list_entry(domain->devices.next,
1918 struct device_domain_info, link);
1919 list_del(&info->link);
1920 list_del(&info->global);
1921 if (info->dev)
1922 info->dev->dev.archdata.iommu = NULL;
1923 spin_unlock_irqrestore(&device_domain_lock, flags);
1925 iommu_disable_dev_iotlb(info);
1926 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1927 iommu_detach_dev(iommu, info->bus, info->devfn);
1928 free_devinfo_mem(info);
1930 spin_lock_irqsave(&device_domain_lock, flags);
1932 spin_unlock_irqrestore(&device_domain_lock, flags);
1936 /* find_domain
1937 * Note: we use struct pci_dev->dev.archdata.iommu to store the info */
1939 static struct dmar_domain *
1940 find_domain(struct pci_dev *pdev)
1942 struct device_domain_info *info;
1944 /* No lock here, assumes no domain exit in normal case */
1945 info = pdev->dev.archdata.iommu;
1946 if (info)
1947 return info->domain;
1948 return NULL;
1951 /* domain is initialized */
1952 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1954 struct dmar_domain *domain, *found = NULL;
1955 struct intel_iommu *iommu;
1956 struct dmar_drhd_unit *drhd;
1957 struct device_domain_info *info, *tmp;
1958 struct pci_dev *dev_tmp;
1959 unsigned long flags;
1960 int bus = 0, devfn = 0;
1961 int segment;
1962 int ret;
1964 domain = find_domain(pdev);
1965 if (domain)
1966 return domain;
1968 segment = pci_domain_nr(pdev->bus);
1970 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1971 if (dev_tmp) {
1972 if (pci_is_pcie(dev_tmp)) {
1973 bus = dev_tmp->subordinate->number;
1974 devfn = 0;
1975 } else {
1976 bus = dev_tmp->bus->number;
1977 devfn = dev_tmp->devfn;
1979 spin_lock_irqsave(&device_domain_lock, flags);
1980 list_for_each_entry(info, &device_domain_list, global) {
1981 if (info->segment == segment &&
1982 info->bus == bus && info->devfn == devfn) {
1983 found = info->domain;
1984 break;
1987 spin_unlock_irqrestore(&device_domain_lock, flags);
1988 /* pcie-pci bridge already has a domain; use it */
1989 if (found) {
1990 domain = found;
1991 goto found_domain;
1995 domain = alloc_domain();
1996 if (!domain)
1997 goto error;
1999 /* Allocate new domain for the device */
2000 drhd = dmar_find_matched_drhd_unit(pdev);
2001 if (!drhd) {
2002 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2003 pci_name(pdev));
2004 return NULL;
2006 iommu = drhd->iommu;
2008 ret = iommu_attach_domain(domain, iommu);
2009 if (ret) {
2010 free_domain_mem(domain);
2011 goto error;
2014 if (domain_init(domain, gaw)) {
2015 domain_exit(domain);
2016 goto error;
2019 /* register pcie-to-pci device */
2020 if (dev_tmp) {
2021 info = alloc_devinfo_mem();
2022 if (!info) {
2023 domain_exit(domain);
2024 goto error;
2026 info->segment = segment;
2027 info->bus = bus;
2028 info->devfn = devfn;
2029 info->dev = NULL;
2030 info->domain = domain;
2031 /* This domain is shared by devices under p2p bridge */
2032 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2034 /* pcie-to-pci bridge already has a domain; use it */
2035 found = NULL;
2036 spin_lock_irqsave(&device_domain_lock, flags);
2037 list_for_each_entry(tmp, &device_domain_list, global) {
2038 if (tmp->segment == segment &&
2039 tmp->bus == bus && tmp->devfn == devfn) {
2040 found = tmp->domain;
2041 break;
2044 if (found) {
2045 spin_unlock_irqrestore(&device_domain_lock, flags);
2046 free_devinfo_mem(info);
2047 domain_exit(domain);
2048 domain = found;
2049 } else {
2050 list_add(&info->link, &domain->devices);
2051 list_add(&info->global, &device_domain_list);
2052 spin_unlock_irqrestore(&device_domain_lock, flags);
2056 found_domain:
2057 info = alloc_devinfo_mem();
2058 if (!info)
2059 goto error;
2060 info->segment = segment;
2061 info->bus = pdev->bus->number;
2062 info->devfn = pdev->devfn;
2063 info->dev = pdev;
2064 info->domain = domain;
2065 spin_lock_irqsave(&device_domain_lock, flags);
2066 /* somebody is fast */
2067 found = find_domain(pdev);
2068 if (found != NULL) {
2069 spin_unlock_irqrestore(&device_domain_lock, flags);
2070 if (found != domain) {
2071 domain_exit(domain);
2072 domain = found;
2074 free_devinfo_mem(info);
2075 return domain;
2077 list_add(&info->link, &domain->devices);
2078 list_add(&info->global, &device_domain_list);
2079 pdev->dev.archdata.iommu = info;
2080 spin_unlock_irqrestore(&device_domain_lock, flags);
2081 return domain;
2082 error:
2083 /* recheck it here, maybe others set it */
2084 return find_domain(pdev);
2087 static int iommu_identity_mapping;
2088 #define IDENTMAP_ALL 1
2089 #define IDENTMAP_GFX 2
2090 #define IDENTMAP_AZALIA 4
2092 static int iommu_domain_identity_map(struct dmar_domain *domain,
2093 unsigned long long start,
2094 unsigned long long end)
2096 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2097 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2099 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2100 dma_to_mm_pfn(last_vpfn))) {
2101 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2102 return -ENOMEM;
2105 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2106 start, end, domain->id);
2108 * RMRR range might overlap with the physical memory range;
2109 * clear it first
2111 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2113 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2114 last_vpfn - first_vpfn + 1,
2115 DMA_PTE_READ|DMA_PTE_WRITE);
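/*
 * Illustrative sketch (separate user-space program, not part of this
 * driver): how iommu_domain_identity_map() collapses an RMRR byte range
 * into the VT-d page-frame range it reserves and maps 1:1.  Assumes
 * 4KiB VT-d pages (VTD_PAGE_SHIFT == 12); the range below is invented
 * for the example, not taken from a real DMAR table.
 */
#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_VTD_PAGE_SHIFT	12

int main(void)
{
	uint64_t start = 0xbf000000ULL;		/* hypothetical RMRR base */
	uint64_t end   = 0xbf1fffffULL;		/* hypothetical RMRR end  */
	uint64_t first_vpfn = start >> EXAMPLE_VTD_PAGE_SHIFT;
	uint64_t last_vpfn  = end >> EXAMPLE_VTD_PAGE_SHIFT;

	/* Identity mapping: IOVA page N points at physical page N. */
	printf("vpfn 0x%llx-0x%llx, %llu pages\n",
	       (unsigned long long)first_vpfn,
	       (unsigned long long)last_vpfn,
	       (unsigned long long)(last_vpfn - first_vpfn + 1));
	return 0;
}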
2118 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2119 unsigned long long start,
2120 unsigned long long end)
2122 struct dmar_domain *domain;
2123 int ret;
2125 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2126 if (!domain)
2127 return -ENOMEM;
2129 /* For _hardware_ passthrough, don't bother. But for software
2130 passthrough, we do it anyway -- it may indicate a memory
2131 range which is reserved in E820 and so didn't get set
2132 up in si_domain to start with */
2133 if (domain == si_domain && hw_pass_through) {
2134 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2135 pci_name(pdev), start, end);
2136 return 0;
2139 printk(KERN_INFO
2140 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2141 pci_name(pdev), start, end);
2143 if (end < start) {
2144 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2145 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2146 dmi_get_system_info(DMI_BIOS_VENDOR),
2147 dmi_get_system_info(DMI_BIOS_VERSION),
2148 dmi_get_system_info(DMI_PRODUCT_VERSION));
2149 ret = -EIO;
2150 goto error;
2153 if (end >> agaw_to_width(domain->agaw)) {
2154 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2155 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2156 agaw_to_width(domain->agaw),
2157 dmi_get_system_info(DMI_BIOS_VENDOR),
2158 dmi_get_system_info(DMI_BIOS_VERSION),
2159 dmi_get_system_info(DMI_PRODUCT_VERSION));
2160 ret = -EIO;
2161 goto error;
2164 ret = iommu_domain_identity_map(domain, start, end);
2165 if (ret)
2166 goto error;
2168 /* context entry init */
2169 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2170 if (ret)
2171 goto error;
2173 return 0;
2175 error:
2176 domain_exit(domain);
2177 return ret;
2180 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2181 struct pci_dev *pdev)
2183 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2184 return 0;
2185 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2186 rmrr->end_address);
2189 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2190 static inline void iommu_prepare_isa(void)
2192 struct pci_dev *pdev;
2193 int ret;
2195 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2196 if (!pdev)
2197 return;
2199 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2200 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2202 if (ret)
2203 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2204 "floppy might not work\n");
2207 #else
2208 static inline void iommu_prepare_isa(void)
2210 return;
2212 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2214 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2216 static int __init si_domain_init(int hw)
2218 struct dmar_drhd_unit *drhd;
2219 struct intel_iommu *iommu;
2220 int nid, ret = 0;
2222 si_domain = alloc_domain();
2223 if (!si_domain)
2224 return -EFAULT;
2226 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2228 for_each_active_iommu(iommu, drhd) {
2229 ret = iommu_attach_domain(si_domain, iommu);
2230 if (ret) {
2231 domain_exit(si_domain);
2232 return -EFAULT;
2236 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2237 domain_exit(si_domain);
2238 return -EFAULT;
2241 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2243 if (hw)
2244 return 0;
2246 for_each_online_node(nid) {
2247 unsigned long start_pfn, end_pfn;
2248 int i;
2250 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2251 ret = iommu_domain_identity_map(si_domain,
2252 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2253 if (ret)
2254 return ret;
2258 return 0;
2261 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2262 struct pci_dev *pdev);
2263 static int identity_mapping(struct pci_dev *pdev)
2265 struct device_domain_info *info;
2267 if (likely(!iommu_identity_mapping))
2268 return 0;
2270 info = pdev->dev.archdata.iommu;
2271 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2272 return (info->domain == si_domain);
2274 return 0;
2277 static int domain_add_dev_info(struct dmar_domain *domain,
2278 struct pci_dev *pdev,
2279 int translation)
2281 struct device_domain_info *info;
2282 unsigned long flags;
2283 int ret;
2285 info = alloc_devinfo_mem();
2286 if (!info)
2287 return -ENOMEM;
2289 ret = domain_context_mapping(domain, pdev, translation);
2290 if (ret) {
2291 free_devinfo_mem(info);
2292 return ret;
2295 info->segment = pci_domain_nr(pdev->bus);
2296 info->bus = pdev->bus->number;
2297 info->devfn = pdev->devfn;
2298 info->dev = pdev;
2299 info->domain = domain;
2301 spin_lock_irqsave(&device_domain_lock, flags);
2302 list_add(&info->link, &domain->devices);
2303 list_add(&info->global, &device_domain_list);
2304 pdev->dev.archdata.iommu = info;
2305 spin_unlock_irqrestore(&device_domain_lock, flags);
2307 return 0;
2310 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2312 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2313 return 1;
2315 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2316 return 1;
2318 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2319 return 0;
2322 * We want to start off with all devices in the 1:1 domain, and
2323 * take them out later if we find they can't access all of memory.
2325 * However, we can't do this for PCI devices behind bridges,
2326 * because all PCI devices behind the same bridge will end up
2327 * with the same source-id on their transactions.
2329 * Practically speaking, we can't change things around for these
2330 * devices at run-time, because we can't be sure there'll be no
2331 * DMA transactions in flight for any of their siblings.
2333 * So PCI devices (unless they're on the root bus) as well as
2334 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2335 * the 1:1 domain, just in _case_ one of their siblings turns out
2336 * not to be able to map all of memory.
2338 if (!pci_is_pcie(pdev)) {
2339 if (!pci_is_root_bus(pdev->bus))
2340 return 0;
2341 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2342 return 0;
2343 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2344 return 0;
2347 * At boot time, we don't yet know if devices will be 64-bit capable.
2348 * Assume that they will -- if they turn out not to be, then we can
2349 * take them out of the 1:1 domain later.
2351 if (!startup) {
2353 * If the device's dma_mask is less than the system's memory
2354 * size then this is not a candidate for identity mapping.
2356 u64 dma_mask = pdev->dma_mask;
2358 if (pdev->dev.coherent_dma_mask &&
2359 pdev->dev.coherent_dma_mask < dma_mask)
2360 dma_mask = pdev->dev.coherent_dma_mask;
2362 return dma_mask >= dma_get_required_mask(&pdev->dev);
2365 return 1;
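/*
 * Illustrative sketch (separate user-space program, hypothetical
 * numbers): the runtime check in iommu_should_identity_map() keeps a
 * device in the 1:1 domain only if the tighter of its streaming and
 * coherent DMA masks still covers the mask needed to reach all of
 * physical memory (what dma_get_required_mask() reports).
 */
#include <stdio.h>
#include <stdint.h>

static int example_identity_ok(uint64_t dma_mask, uint64_t coherent_mask,
			       uint64_t required_mask)
{
	if (coherent_mask && coherent_mask < dma_mask)
		dma_mask = coherent_mask;
	return dma_mask >= required_mask;
}

int main(void)
{
	/* With 8GiB of RAM a 33-bit mask is required. */
	uint64_t required = (1ULL << 33) - 1;

	printf("64-bit device: %d\n",		/* prints 1: stays 1:1     */
	       example_identity_ok(~0ULL, ~0ULL, required));
	printf("32-bit device: %d\n",		/* prints 0: gets remapped */
	       example_identity_ok((1ULL << 32) - 1, 0, required));
	return 0;
}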
2368 static int __init iommu_prepare_static_identity_mapping(int hw)
2370 struct pci_dev *pdev = NULL;
2371 int ret;
2373 ret = si_domain_init(hw);
2374 if (ret)
2375 return -EFAULT;
2377 for_each_pci_dev(pdev) {
2378 if (iommu_should_identity_map(pdev, 1)) {
2379 ret = domain_add_dev_info(si_domain, pdev,
2380 hw ? CONTEXT_TT_PASS_THROUGH :
2381 CONTEXT_TT_MULTI_LEVEL);
2382 if (ret) {
2383 /* device not associated with an iommu */
2384 if (ret == -ENODEV)
2385 continue;
2386 return ret;
2388 pr_info("IOMMU: %s identity mapping for device %s\n",
2389 hw ? "hardware" : "software", pci_name(pdev));
2393 return 0;
2396 static int __init init_dmars(void)
2398 struct dmar_drhd_unit *drhd;
2399 struct dmar_rmrr_unit *rmrr;
2400 struct pci_dev *pdev;
2401 struct intel_iommu *iommu;
2402 int i, ret;
2405 * for each drhd
2406 * allocate root
2407 * initialize and program root entry to not present
2408 * endfor
2410 for_each_drhd_unit(drhd) {
2412 * lock not needed as this is only incremented in the single-
2413 * threaded kernel __init code path; all other accesses are
2414 * read only
2416 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2417 g_num_of_iommus++;
2418 continue;
2420 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2421 IOMMU_UNITS_SUPPORTED);
2424 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2425 GFP_KERNEL);
2426 if (!g_iommus) {
2427 printk(KERN_ERR "Allocating global iommu array failed\n");
2428 ret = -ENOMEM;
2429 goto error;
2432 deferred_flush = kzalloc(g_num_of_iommus *
2433 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2434 if (!deferred_flush) {
2435 ret = -ENOMEM;
2436 goto error;
2439 for_each_drhd_unit(drhd) {
2440 if (drhd->ignored)
2441 continue;
2443 iommu = drhd->iommu;
2444 g_iommus[iommu->seq_id] = iommu;
2446 ret = iommu_init_domains(iommu);
2447 if (ret)
2448 goto error;
2451 * TBD:
2452 * we could share the same root & context tables
2453 * among all IOMMUs. Need to split it later.
2455 ret = iommu_alloc_root_entry(iommu);
2456 if (ret) {
2457 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2458 goto error;
2460 if (!ecap_pass_through(iommu->ecap))
2461 hw_pass_through = 0;
2465 * Start from a sane iommu hardware state.
2467 for_each_drhd_unit(drhd) {
2468 if (drhd->ignored)
2469 continue;
2471 iommu = drhd->iommu;
2474 * If the queued invalidation is already initialized by us
2475 * (for example, while enabling interrupt-remapping) then
2476 * things are already rolling from a sane state.
2478 if (iommu->qi)
2479 continue;
2482 * Clear any previous faults.
2484 dmar_fault(-1, iommu);
2486 * Disable queued invalidation if supported and already enabled
2487 * before OS handover.
2489 dmar_disable_qi(iommu);
2492 for_each_drhd_unit(drhd) {
2493 if (drhd->ignored)
2494 continue;
2496 iommu = drhd->iommu;
2498 if (dmar_enable_qi(iommu)) {
2500 * Queued Invalidate not enabled, use Register Based
2501 * Invalidate
2503 iommu->flush.flush_context = __iommu_flush_context;
2504 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2505 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2506 "invalidation\n",
2507 iommu->seq_id,
2508 (unsigned long long)drhd->reg_base_addr);
2509 } else {
2510 iommu->flush.flush_context = qi_flush_context;
2511 iommu->flush.flush_iotlb = qi_flush_iotlb;
2512 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2513 "invalidation\n",
2514 iommu->seq_id,
2515 (unsigned long long)drhd->reg_base_addr);
2519 if (iommu_pass_through)
2520 iommu_identity_mapping |= IDENTMAP_ALL;
2522 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2523 iommu_identity_mapping |= IDENTMAP_GFX;
2524 #endif
2526 check_tylersburg_isoch();
2529 * If pass-through is not set or not enabled, set up context entries for
2530 * identity mappings for rmrr, gfx, and isa, and possibly fall back to
2531 * static identity mapping if iommu_identity_mapping is set.
2533 if (iommu_identity_mapping) {
2534 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2535 if (ret) {
2536 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2537 goto error;
2541 * For each rmrr
2542 * for each dev attached to rmrr
2543 * do
2544 * locate drhd for dev, alloc domain for dev
2545 * allocate free domain
2546 * allocate page table entries for rmrr
2547 * if context not allocated for bus
2548 * allocate and init context
2549 * set present in root table for this bus
2550 * init context with domain, translation etc
2551 * endfor
2552 * endfor
2554 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2555 for_each_rmrr_units(rmrr) {
2556 for (i = 0; i < rmrr->devices_cnt; i++) {
2557 pdev = rmrr->devices[i];
2559 * some BIOSes list non-existent devices in the
2560 * DMAR table.
2562 if (!pdev)
2563 continue;
2564 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2565 if (ret)
2566 printk(KERN_ERR
2567 "IOMMU: mapping reserved region failed\n");
2571 iommu_prepare_isa();
2574 * for each drhd
2575 * enable fault log
2576 * global invalidate context cache
2577 * global invalidate iotlb
2578 * enable translation
2580 for_each_drhd_unit(drhd) {
2581 if (drhd->ignored) {
2583 * we always have to disable PMRs or DMA may fail on
2584 * this device
2586 if (force_on)
2587 iommu_disable_protect_mem_regions(drhd->iommu);
2588 continue;
2590 iommu = drhd->iommu;
2592 iommu_flush_write_buffer(iommu);
2594 ret = dmar_set_interrupt(iommu);
2595 if (ret)
2596 goto error;
2598 iommu_set_root_entry(iommu);
2600 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2601 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2603 ret = iommu_enable_translation(iommu);
2604 if (ret)
2605 goto error;
2607 iommu_disable_protect_mem_regions(iommu);
2610 return 0;
2611 error:
2612 for_each_drhd_unit(drhd) {
2613 if (drhd->ignored)
2614 continue;
2615 iommu = drhd->iommu;
2616 free_iommu(iommu);
2618 kfree(g_iommus);
2619 return ret;
2622 /* This takes a number of _MM_ pages, not VTD pages */
2623 static struct iova *intel_alloc_iova(struct device *dev,
2624 struct dmar_domain *domain,
2625 unsigned long nrpages, uint64_t dma_mask)
2627 struct pci_dev *pdev = to_pci_dev(dev);
2628 struct iova *iova = NULL;
2630 /* Restrict dma_mask to the width that the iommu can handle */
2631 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2633 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2635 * First try to allocate an io virtual address in
2636 * DMA_BIT_MASK(32) and if that fails then try allocating
2637 * from the higher range
2639 iova = alloc_iova(&domain->iovad, nrpages,
2640 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2641 if (iova)
2642 return iova;
2644 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2645 if (unlikely(!iova)) {
2646 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2647 nrpages, pci_name(pdev));
2648 return NULL;
2651 return iova;
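/*
 * Illustrative sketch (separate user-space program, assumes 4KiB pages):
 * intel_alloc_iova() works in page-frame numbers, so the "try below
 * 4GiB first" pass is capped at IOVA_PFN(DMA_BIT_MASK(32)) and only the
 * second pass goes up to the device's own mask.  The 40-bit device mask
 * below is invented for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_PAGE_SHIFT	12
#define EXAMPLE_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))
#define EXAMPLE_IOVA_PFN(addr)	((addr) >> EXAMPLE_PAGE_SHIFT)

int main(void)
{
	uint64_t dev_mask = EXAMPLE_BIT_MASK(40);

	printf("first pass limit:  pfn 0x%llx\n",	/* 0xfffff   */
	       (unsigned long long)EXAMPLE_IOVA_PFN(EXAMPLE_BIT_MASK(32)));
	printf("second pass limit: pfn 0x%llx\n",	/* 0xfffffff */
	       (unsigned long long)EXAMPLE_IOVA_PFN(dev_mask));
	return 0;
}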
2654 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2656 struct dmar_domain *domain;
2657 int ret;
2659 domain = get_domain_for_dev(pdev,
2660 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2661 if (!domain) {
2662 printk(KERN_ERR
2663 "Allocating domain for %s failed", pci_name(pdev));
2664 return NULL;
2667 /* make sure context mapping is ok */
2668 if (unlikely(!domain_context_mapped(pdev))) {
2669 ret = domain_context_mapping(domain, pdev,
2670 CONTEXT_TT_MULTI_LEVEL);
2671 if (ret) {
2672 printk(KERN_ERR
2673 "Domain context map for %s failed",
2674 pci_name(pdev));
2675 return NULL;
2679 return domain;
2682 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2684 struct device_domain_info *info;
2686 /* No lock here, assumes no domain exit in normal case */
2687 info = dev->dev.archdata.iommu;
2688 if (likely(info))
2689 return info->domain;
2691 return __get_valid_domain_for_dev(dev);
2694 static int iommu_dummy(struct pci_dev *pdev)
2696 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2699 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2700 static int iommu_no_mapping(struct device *dev)
2702 struct pci_dev *pdev;
2703 int found;
2705 if (unlikely(dev->bus != &pci_bus_type))
2706 return 1;
2708 pdev = to_pci_dev(dev);
2709 if (iommu_dummy(pdev))
2710 return 1;
2712 if (!iommu_identity_mapping)
2713 return 0;
2715 found = identity_mapping(pdev);
2716 if (found) {
2717 if (iommu_should_identity_map(pdev, 0))
2718 return 1;
2719 else {
2721 * The 32 bit DMA device is removed from si_domain and falls
2722 * back to non-identity mapping.
2724 domain_remove_one_dev_info(si_domain, pdev);
2725 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2726 pci_name(pdev));
2727 return 0;
2729 } else {
2731 * If a 64 bit DMA device was detached from a VM, the device
2732 * is put back into si_domain for identity mapping.
2734 if (iommu_should_identity_map(pdev, 0)) {
2735 int ret;
2736 ret = domain_add_dev_info(si_domain, pdev,
2737 hw_pass_through ?
2738 CONTEXT_TT_PASS_THROUGH :
2739 CONTEXT_TT_MULTI_LEVEL);
2740 if (!ret) {
2741 printk(KERN_INFO "64bit %s uses identity mapping\n",
2742 pci_name(pdev));
2743 return 1;
2748 return 0;
2751 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2752 size_t size, int dir, u64 dma_mask)
2754 struct pci_dev *pdev = to_pci_dev(hwdev);
2755 struct dmar_domain *domain;
2756 phys_addr_t start_paddr;
2757 struct iova *iova;
2758 int prot = 0;
2759 int ret;
2760 struct intel_iommu *iommu;
2761 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2763 BUG_ON(dir == DMA_NONE);
2765 if (iommu_no_mapping(hwdev))
2766 return paddr;
2768 domain = get_valid_domain_for_dev(pdev);
2769 if (!domain)
2770 return 0;
2772 iommu = domain_get_iommu(domain);
2773 size = aligned_nrpages(paddr, size);
2775 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2776 if (!iova)
2777 goto error;
2780 * Check if DMAR supports zero-length reads on write only
2781 * mappings..
2783 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2784 !cap_zlr(iommu->cap))
2785 prot |= DMA_PTE_READ;
2786 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2787 prot |= DMA_PTE_WRITE;
2789 * paddr through (paddr + size) might span a partial page, so we map the
2790 * whole page. Note: if two parts of one page are mapped separately, we
2791 * might have two guest addresses mapping to the same host paddr, but
2792 * this is not a big problem
2794 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2795 mm_to_dma_pfn(paddr_pfn), size, prot);
2796 if (ret)
2797 goto error;
2799 /* it's a non-present to present mapping. Only flush if caching mode */
2800 if (cap_caching_mode(iommu->cap))
2801 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2802 else
2803 iommu_flush_write_buffer(iommu);
2805 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2806 start_paddr += paddr & ~PAGE_MASK;
2807 return start_paddr;
2809 error:
2810 if (iova)
2811 __free_iova(&domain->iovad, iova);
2812 printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2813 pci_name(pdev), size, (unsigned long long)paddr, dir);
2814 return 0;
2817 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2818 unsigned long offset, size_t size,
2819 enum dma_data_direction dir,
2820 struct dma_attrs *attrs)
2822 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2823 dir, to_pci_dev(dev)->dma_mask);
2826 static void flush_unmaps(void)
2828 int i, j;
2830 timer_on = 0;
2832 /* just flush them all */
2833 for (i = 0; i < g_num_of_iommus; i++) {
2834 struct intel_iommu *iommu = g_iommus[i];
2835 if (!iommu)
2836 continue;
2838 if (!deferred_flush[i].next)
2839 continue;
2841 /* In caching mode, global flushes make emulation expensive */
2842 if (!cap_caching_mode(iommu->cap))
2843 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2844 DMA_TLB_GLOBAL_FLUSH);
2845 for (j = 0; j < deferred_flush[i].next; j++) {
2846 unsigned long mask;
2847 struct iova *iova = deferred_flush[i].iova[j];
2848 struct dmar_domain *domain = deferred_flush[i].domain[j];
2850 /* On real hardware multiple invalidations are expensive */
2851 if (cap_caching_mode(iommu->cap))
2852 iommu_flush_iotlb_psi(iommu, domain->id,
2853 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2854 else {
2855 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2856 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2857 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2859 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2861 deferred_flush[i].next = 0;
2864 list_size = 0;
2867 static void flush_unmaps_timeout(unsigned long data)
2869 unsigned long flags;
2871 spin_lock_irqsave(&async_umap_flush_lock, flags);
2872 flush_unmaps();
2873 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2876 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2878 unsigned long flags;
2879 int next, iommu_id;
2880 struct intel_iommu *iommu;
2882 spin_lock_irqsave(&async_umap_flush_lock, flags);
2883 if (list_size == HIGH_WATER_MARK)
2884 flush_unmaps();
2886 iommu = domain_get_iommu(dom);
2887 iommu_id = iommu->seq_id;
2889 next = deferred_flush[iommu_id].next;
2890 deferred_flush[iommu_id].domain[next] = dom;
2891 deferred_flush[iommu_id].iova[next] = iova;
2892 deferred_flush[iommu_id].next++;
2894 if (!timer_on) {
2895 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2896 timer_on = 1;
2898 list_size++;
2899 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
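/*
 * Illustrative sketch (separate user-space program): the shape of the
 * deferred-unmap batching used by add_unmap()/flush_unmaps() -- queue
 * entries until a high-water mark (or, in the driver, a 10ms timer
 * fires) and then pay for a single flush covering the whole batch.
 * The batch size and entry type are invented; locking, timers and the
 * per-IOMMU tables are omitted.
 */
#include <stdio.h>

#define EXAMPLE_HIGH_WATER_MARK	250	/* hypothetical batch limit */

struct example_unmap {
	unsigned long pfn_lo, pfn_hi;
};

static struct example_unmap example_pending[EXAMPLE_HIGH_WATER_MARK];
static int example_npending;

static void example_flush_unmaps(void)
{
	/* One expensive IOTLB flush amortized over the whole batch. */
	printf("flushing %d deferred unmaps\n", example_npending);
	example_npending = 0;
}

static void example_add_unmap(unsigned long pfn_lo, unsigned long pfn_hi)
{
	if (example_npending == EXAMPLE_HIGH_WATER_MARK)
		example_flush_unmaps();
	example_pending[example_npending].pfn_lo = pfn_lo;
	example_pending[example_npending].pfn_hi = pfn_hi;
	example_npending++;
}

int main(void)
{
	unsigned long i;

	for (i = 0; i < 600; i++)
		example_add_unmap(i, i);
	example_flush_unmaps();		/* flush the tail of the batch */
	return 0;
}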
2902 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2903 size_t size, enum dma_data_direction dir,
2904 struct dma_attrs *attrs)
2906 struct pci_dev *pdev = to_pci_dev(dev);
2907 struct dmar_domain *domain;
2908 unsigned long start_pfn, last_pfn;
2909 struct iova *iova;
2910 struct intel_iommu *iommu;
2912 if (iommu_no_mapping(dev))
2913 return;
2915 domain = find_domain(pdev);
2916 BUG_ON(!domain);
2918 iommu = domain_get_iommu(domain);
2920 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2921 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2922 (unsigned long long)dev_addr))
2923 return;
2925 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2926 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2928 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2929 pci_name(pdev), start_pfn, last_pfn);
2931 /* clear the whole page */
2932 dma_pte_clear_range(domain, start_pfn, last_pfn);
2934 /* free page tables */
2935 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2937 if (intel_iommu_strict) {
2938 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2939 last_pfn - start_pfn + 1, 0);
2940 /* free iova */
2941 __free_iova(&domain->iovad, iova);
2942 } else {
2943 add_unmap(domain, iova);
2945 * queue up the release of the unmap to save the roughly 1/6th of
2946 * the cpu time otherwise used up by the iotlb flush operation...
2951 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2952 dma_addr_t *dma_handle, gfp_t flags,
2953 struct dma_attrs *attrs)
2955 void *vaddr;
2956 int order;
2958 size = PAGE_ALIGN(size);
2959 order = get_order(size);
2961 if (!iommu_no_mapping(hwdev))
2962 flags &= ~(GFP_DMA | GFP_DMA32);
2963 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2964 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2965 flags |= GFP_DMA;
2966 else
2967 flags |= GFP_DMA32;
2970 vaddr = (void *)__get_free_pages(flags, order);
2971 if (!vaddr)
2972 return NULL;
2973 memset(vaddr, 0, size);
2975 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2976 DMA_BIDIRECTIONAL,
2977 hwdev->coherent_dma_mask);
2978 if (*dma_handle)
2979 return vaddr;
2980 free_pages((unsigned long)vaddr, order);
2981 return NULL;
2984 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2985 dma_addr_t dma_handle, struct dma_attrs *attrs)
2987 int order;
2989 size = PAGE_ALIGN(size);
2990 order = get_order(size);
2992 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2993 free_pages((unsigned long)vaddr, order);
2996 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2997 int nelems, enum dma_data_direction dir,
2998 struct dma_attrs *attrs)
3000 struct pci_dev *pdev = to_pci_dev(hwdev);
3001 struct dmar_domain *domain;
3002 unsigned long start_pfn, last_pfn;
3003 struct iova *iova;
3004 struct intel_iommu *iommu;
3006 if (iommu_no_mapping(hwdev))
3007 return;
3009 domain = find_domain(pdev);
3010 BUG_ON(!domain);
3012 iommu = domain_get_iommu(domain);
3014 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3015 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3016 (unsigned long long)sglist[0].dma_address))
3017 return;
3019 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3020 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3022 /* clear the whole page */
3023 dma_pte_clear_range(domain, start_pfn, last_pfn);
3025 /* free page tables */
3026 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3028 if (intel_iommu_strict) {
3029 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3030 last_pfn - start_pfn + 1, 0);
3031 /* free iova */
3032 __free_iova(&domain->iovad, iova);
3033 } else {
3034 add_unmap(domain, iova);
3036 * queue up the release of the unmap to save the roughly 1/6th of
3037 * the cpu time otherwise used up by the iotlb flush operation...
3042 static int intel_nontranslate_map_sg(struct device *hddev,
3043 struct scatterlist *sglist, int nelems, int dir)
3045 int i;
3046 struct scatterlist *sg;
3048 for_each_sg(sglist, sg, nelems, i) {
3049 BUG_ON(!sg_page(sg));
3050 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3051 sg->dma_length = sg->length;
3053 return nelems;
3056 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3057 enum dma_data_direction dir, struct dma_attrs *attrs)
3059 int i;
3060 struct pci_dev *pdev = to_pci_dev(hwdev);
3061 struct dmar_domain *domain;
3062 size_t size = 0;
3063 int prot = 0;
3064 struct iova *iova = NULL;
3065 int ret;
3066 struct scatterlist *sg;
3067 unsigned long start_vpfn;
3068 struct intel_iommu *iommu;
3070 BUG_ON(dir == DMA_NONE);
3071 if (iommu_no_mapping(hwdev))
3072 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3074 domain = get_valid_domain_for_dev(pdev);
3075 if (!domain)
3076 return 0;
3078 iommu = domain_get_iommu(domain);
3080 for_each_sg(sglist, sg, nelems, i)
3081 size += aligned_nrpages(sg->offset, sg->length);
3083 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3084 pdev->dma_mask);
3085 if (!iova) {
3086 sglist->dma_length = 0;
3087 return 0;
3091 * Check if DMAR supports zero-length reads on write only
3092 * mappings..
3094 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3095 !cap_zlr(iommu->cap))
3096 prot |= DMA_PTE_READ;
3097 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3098 prot |= DMA_PTE_WRITE;
3100 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3102 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3103 if (unlikely(ret)) {
3104 /* clear the page */
3105 dma_pte_clear_range(domain, start_vpfn,
3106 start_vpfn + size - 1);
3107 /* free page tables */
3108 dma_pte_free_pagetable(domain, start_vpfn,
3109 start_vpfn + size - 1);
3110 /* free iova */
3111 __free_iova(&domain->iovad, iova);
3112 return 0;
3115 /* it's a non-present to present mapping. Only flush if caching mode */
3116 if (cap_caching_mode(iommu->cap))
3117 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3118 else
3119 iommu_flush_write_buffer(iommu);
3121 return nelems;
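/*
 * Illustrative sketch (kernel context, hypothetical driver code): how a
 * driver's scatter-gather I/O reaches intel_map_sg()/intel_unmap_sg()
 * through the generic DMA API.  The device pointer, scatterlist and
 * nents are assumed to come from the caller; real code would also
 * consume sg_dma_address()/sg_dma_len() of the returned entries.
 */
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <linux/errno.h>

static int example_sg_dma(struct device *dev, struct scatterlist *sgl,
			  int nents)
{
	int mapped;

	mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);	/* -> intel_map_sg */
	if (!mapped)
		return -ENOMEM;

	/* ... program sg_dma_address()/sg_dma_len() of the first "mapped"
	 * entries into the hardware and wait for the transfer ... */

	dma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);		/* -> intel_unmap_sg */
	return 0;
}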
3124 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3126 return !dma_addr;
3129 struct dma_map_ops intel_dma_ops = {
3130 .alloc = intel_alloc_coherent,
3131 .free = intel_free_coherent,
3132 .map_sg = intel_map_sg,
3133 .unmap_sg = intel_unmap_sg,
3134 .map_page = intel_map_page,
3135 .unmap_page = intel_unmap_page,
3136 .mapping_error = intel_mapping_error,
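/*
 * Illustrative sketch (kernel context, hypothetical driver code): once
 * dma_ops points at intel_dma_ops, ordinary DMA API calls are what end
 * up in the .map_page/.unmap_page/.alloc/.free handlers above.  The
 * buffer, length and direction are placeholders for the example.
 */
#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int example_streaming_and_coherent(struct device *dev,
					  void *buf, size_t len)
{
	dma_addr_t handle, coherent_handle;
	void *coherent;

	/* Streaming mapping: dispatches to intel_map_page(). */
	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;
	/* ... hand "handle" to the device and run the transfer ... */
	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);

	/* Coherent allocation: dispatches to intel_alloc_coherent(). */
	coherent = dma_alloc_coherent(dev, PAGE_SIZE, &coherent_handle,
				      GFP_KERNEL);
	if (!coherent)
		return -ENOMEM;
	dma_free_coherent(dev, PAGE_SIZE, coherent, coherent_handle);
	return 0;
}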
3139 static inline int iommu_domain_cache_init(void)
3141 int ret = 0;
3143 iommu_domain_cache = kmem_cache_create("iommu_domain",
3144 sizeof(struct dmar_domain),
3146 SLAB_HWCACHE_ALIGN,
3148 NULL);
3149 if (!iommu_domain_cache) {
3150 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3151 ret = -ENOMEM;
3154 return ret;
3157 static inline int iommu_devinfo_cache_init(void)
3159 int ret = 0;
3161 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3162 sizeof(struct device_domain_info),
3164 SLAB_HWCACHE_ALIGN,
3165 NULL);
3166 if (!iommu_devinfo_cache) {
3167 printk(KERN_ERR "Couldn't create devinfo cache\n");
3168 ret = -ENOMEM;
3171 return ret;
3174 static inline int iommu_iova_cache_init(void)
3176 int ret = 0;
3178 iommu_iova_cache = kmem_cache_create("iommu_iova",
3179 sizeof(struct iova),
3181 SLAB_HWCACHE_ALIGN,
3182 NULL);
3183 if (!iommu_iova_cache) {
3184 printk(KERN_ERR "Couldn't create iova cache\n");
3185 ret = -ENOMEM;
3188 return ret;
3191 static int __init iommu_init_mempool(void)
3193 int ret;
3194 ret = iommu_iova_cache_init();
3195 if (ret)
3196 return ret;
3198 ret = iommu_domain_cache_init();
3199 if (ret)
3200 goto domain_error;
3202 ret = iommu_devinfo_cache_init();
3203 if (!ret)
3204 return ret;
3206 kmem_cache_destroy(iommu_domain_cache);
3207 domain_error:
3208 kmem_cache_destroy(iommu_iova_cache);
3210 return -ENOMEM;
3213 static void __init iommu_exit_mempool(void)
3215 kmem_cache_destroy(iommu_devinfo_cache);
3216 kmem_cache_destroy(iommu_domain_cache);
3217 kmem_cache_destroy(iommu_iova_cache);
3221 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3223 struct dmar_drhd_unit *drhd;
3224 u32 vtbar;
3225 int rc;
3227 /* We know that this device on this chipset has its own IOMMU.
3228 * If we find it under a different IOMMU, then the BIOS is lying
3229 * to us. Hope that the IOMMU for this device is actually
3230 * disabled, and it needs no translation...
3232 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3233 if (rc) {
3234 /* "can't" happen */
3235 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3236 return;
3238 vtbar &= 0xffff0000;
3240 /* we know that this iommu should be at offset 0xa000 from vtbar */
3241 drhd = dmar_find_matched_drhd_unit(pdev);
3242 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3243 TAINT_FIRMWARE_WORKAROUND,
3244 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3245 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3247 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3249 static void __init init_no_remapping_devices(void)
3251 struct dmar_drhd_unit *drhd;
3253 for_each_drhd_unit(drhd) {
3254 if (!drhd->include_all) {
3255 int i;
3256 for (i = 0; i < drhd->devices_cnt; i++)
3257 if (drhd->devices[i] != NULL)
3258 break;
3259 /* ignore DMAR unit if no pci devices exist */
3260 if (i == drhd->devices_cnt)
3261 drhd->ignored = 1;
3265 for_each_drhd_unit(drhd) {
3266 int i;
3267 if (drhd->ignored || drhd->include_all)
3268 continue;
3270 for (i = 0; i < drhd->devices_cnt; i++)
3271 if (drhd->devices[i] &&
3272 !IS_GFX_DEVICE(drhd->devices[i]))
3273 break;
3275 if (i < drhd->devices_cnt)
3276 continue;
3278 /* This IOMMU has *only* gfx devices. Either bypass it or
3279 set the gfx_mapped flag, as appropriate */
3280 if (dmar_map_gfx) {
3281 intel_iommu_gfx_mapped = 1;
3282 } else {
3283 drhd->ignored = 1;
3284 for (i = 0; i < drhd->devices_cnt; i++) {
3285 if (!drhd->devices[i])
3286 continue;
3287 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3293 #ifdef CONFIG_SUSPEND
3294 static int init_iommu_hw(void)
3296 struct dmar_drhd_unit *drhd;
3297 struct intel_iommu *iommu = NULL;
3299 for_each_active_iommu(iommu, drhd)
3300 if (iommu->qi)
3301 dmar_reenable_qi(iommu);
3303 for_each_iommu(iommu, drhd) {
3304 if (drhd->ignored) {
3306 * we always have to disable PMRs or DMA may fail on
3307 * this device
3309 if (force_on)
3310 iommu_disable_protect_mem_regions(iommu);
3311 continue;
3314 iommu_flush_write_buffer(iommu);
3316 iommu_set_root_entry(iommu);
3318 iommu->flush.flush_context(iommu, 0, 0, 0,
3319 DMA_CCMD_GLOBAL_INVL);
3320 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3321 DMA_TLB_GLOBAL_FLUSH);
3322 if (iommu_enable_translation(iommu))
3323 return 1;
3324 iommu_disable_protect_mem_regions(iommu);
3327 return 0;
3330 static void iommu_flush_all(void)
3332 struct dmar_drhd_unit *drhd;
3333 struct intel_iommu *iommu;
3335 for_each_active_iommu(iommu, drhd) {
3336 iommu->flush.flush_context(iommu, 0, 0, 0,
3337 DMA_CCMD_GLOBAL_INVL);
3338 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3339 DMA_TLB_GLOBAL_FLUSH);
3343 static int iommu_suspend(void)
3345 struct dmar_drhd_unit *drhd;
3346 struct intel_iommu *iommu = NULL;
3347 unsigned long flag;
3349 for_each_active_iommu(iommu, drhd) {
3350 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3351 GFP_ATOMIC);
3352 if (!iommu->iommu_state)
3353 goto nomem;
3356 iommu_flush_all();
3358 for_each_active_iommu(iommu, drhd) {
3359 iommu_disable_translation(iommu);
3361 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3363 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3364 readl(iommu->reg + DMAR_FECTL_REG);
3365 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3366 readl(iommu->reg + DMAR_FEDATA_REG);
3367 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3368 readl(iommu->reg + DMAR_FEADDR_REG);
3369 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3370 readl(iommu->reg + DMAR_FEUADDR_REG);
3372 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3374 return 0;
3376 nomem:
3377 for_each_active_iommu(iommu, drhd)
3378 kfree(iommu->iommu_state);
3380 return -ENOMEM;
3383 static void iommu_resume(void)
3385 struct dmar_drhd_unit *drhd;
3386 struct intel_iommu *iommu = NULL;
3387 unsigned long flag;
3389 if (init_iommu_hw()) {
3390 if (force_on)
3391 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3392 else
3393 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3394 return;
3397 for_each_active_iommu(iommu, drhd) {
3399 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3401 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3402 iommu->reg + DMAR_FECTL_REG);
3403 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3404 iommu->reg + DMAR_FEDATA_REG);
3405 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3406 iommu->reg + DMAR_FEADDR_REG);
3407 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3408 iommu->reg + DMAR_FEUADDR_REG);
3410 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3413 for_each_active_iommu(iommu, drhd)
3414 kfree(iommu->iommu_state);
3417 static struct syscore_ops iommu_syscore_ops = {
3418 .resume = iommu_resume,
3419 .suspend = iommu_suspend,
3422 static void __init init_iommu_pm_ops(void)
3424 register_syscore_ops(&iommu_syscore_ops);
3427 #else
3428 static inline void init_iommu_pm_ops(void) {}
3429 #endif /* CONFIG_SUSPEND */
3431 LIST_HEAD(dmar_rmrr_units);
3433 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3435 list_add(&rmrr->list, &dmar_rmrr_units);
3439 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3441 struct acpi_dmar_reserved_memory *rmrr;
3442 struct dmar_rmrr_unit *rmrru;
3444 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3445 if (!rmrru)
3446 return -ENOMEM;
3448 rmrru->hdr = header;
3449 rmrr = (struct acpi_dmar_reserved_memory *)header;
3450 rmrru->base_address = rmrr->base_address;
3451 rmrru->end_address = rmrr->end_address;
3453 dmar_register_rmrr_unit(rmrru);
3454 return 0;
3457 static int __init
3458 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3460 struct acpi_dmar_reserved_memory *rmrr;
3461 int ret;
3463 rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3464 ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3465 ((void *)rmrr) + rmrr->header.length,
3466 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3468 if (ret || (rmrru->devices_cnt == 0)) {
3469 list_del(&rmrru->list);
3470 kfree(rmrru);
3472 return ret;
3475 static LIST_HEAD(dmar_atsr_units);
3477 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3479 struct acpi_dmar_atsr *atsr;
3480 struct dmar_atsr_unit *atsru;
3482 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3483 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3484 if (!atsru)
3485 return -ENOMEM;
3487 atsru->hdr = hdr;
3488 atsru->include_all = atsr->flags & 0x1;
3490 list_add(&atsru->list, &dmar_atsr_units);
3492 return 0;
3495 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3497 int rc;
3498 struct acpi_dmar_atsr *atsr;
3500 if (atsru->include_all)
3501 return 0;
3503 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3504 rc = dmar_parse_dev_scope((void *)(atsr + 1),
3505 (void *)atsr + atsr->header.length,
3506 &atsru->devices_cnt, &atsru->devices,
3507 atsr->segment);
3508 if (rc || !atsru->devices_cnt) {
3509 list_del(&atsru->list);
3510 kfree(atsru);
3513 return rc;
3516 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3518 int i;
3519 struct pci_bus *bus;
3520 struct acpi_dmar_atsr *atsr;
3521 struct dmar_atsr_unit *atsru;
3523 dev = pci_physfn(dev);
3525 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3526 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3527 if (atsr->segment == pci_domain_nr(dev->bus))
3528 goto found;
3531 return 0;
3533 found:
3534 for (bus = dev->bus; bus; bus = bus->parent) {
3535 struct pci_dev *bridge = bus->self;
3537 if (!bridge || !pci_is_pcie(bridge) ||
3538 bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3539 return 0;
3541 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3542 for (i = 0; i < atsru->devices_cnt; i++)
3543 if (atsru->devices[i] == bridge)
3544 return 1;
3545 break;
3549 if (atsru->include_all)
3550 return 1;
3552 return 0;
3555 int __init dmar_parse_rmrr_atsr_dev(void)
3557 struct dmar_rmrr_unit *rmrr, *rmrr_n;
3558 struct dmar_atsr_unit *atsr, *atsr_n;
3559 int ret = 0;
3561 list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3562 ret = rmrr_parse_dev(rmrr);
3563 if (ret)
3564 return ret;
3567 list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3568 ret = atsr_parse_dev(atsr);
3569 if (ret)
3570 return ret;
3573 return ret;
3577 * Here we only respond to the action of a device being unbound from its driver.
3579 * A newly added device is not attached to its DMAR domain here yet. That happens
3580 * when the device is mapped to an iova.
3582 static int device_notifier(struct notifier_block *nb,
3583 unsigned long action, void *data)
3585 struct device *dev = data;
3586 struct pci_dev *pdev = to_pci_dev(dev);
3587 struct dmar_domain *domain;
3589 if (iommu_no_mapping(dev))
3590 return 0;
3592 domain = find_domain(pdev);
3593 if (!domain)
3594 return 0;
3596 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3597 domain_remove_one_dev_info(domain, pdev);
3599 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3600 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3601 list_empty(&domain->devices))
3602 domain_exit(domain);
3605 return 0;
3608 static struct notifier_block device_nb = {
3609 .notifier_call = device_notifier,
3612 int __init intel_iommu_init(void)
3614 int ret = 0;
3616 /* VT-d is required for a TXT/tboot launch, so enforce that */
3617 force_on = tboot_force_iommu();
3619 if (dmar_table_init()) {
3620 if (force_on)
3621 panic("tboot: Failed to initialize DMAR table\n");
3622 return -ENODEV;
3625 if (dmar_dev_scope_init() < 0) {
3626 if (force_on)
3627 panic("tboot: Failed to initialize DMAR device scope\n");
3628 return -ENODEV;
3631 if (no_iommu || dmar_disabled)
3632 return -ENODEV;
3634 if (iommu_init_mempool()) {
3635 if (force_on)
3636 panic("tboot: Failed to initialize iommu memory\n");
3637 return -ENODEV;
3640 if (list_empty(&dmar_rmrr_units))
3641 printk(KERN_INFO "DMAR: No RMRR found\n");
3643 if (list_empty(&dmar_atsr_units))
3644 printk(KERN_INFO "DMAR: No ATSR found\n");
3646 if (dmar_init_reserved_ranges()) {
3647 if (force_on)
3648 panic("tboot: Failed to reserve iommu ranges\n");
3649 return -ENODEV;
3652 init_no_remapping_devices();
3654 ret = init_dmars();
3655 if (ret) {
3656 if (force_on)
3657 panic("tboot: Failed to initialize DMARs\n");
3658 printk(KERN_ERR "IOMMU: dmar init failed\n");
3659 put_iova_domain(&reserved_iova_list);
3660 iommu_exit_mempool();
3661 return ret;
3663 printk(KERN_INFO
3664 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3666 init_timer(&unmap_timer);
3667 #ifdef CONFIG_SWIOTLB
3668 swiotlb = 0;
3669 #endif
3670 dma_ops = &intel_dma_ops;
3672 init_iommu_pm_ops();
3674 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3676 bus_register_notifier(&pci_bus_type, &device_nb);
3678 intel_iommu_enabled = 1;
3680 return 0;
3683 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3684 struct pci_dev *pdev)
3686 struct pci_dev *tmp, *parent;
3688 if (!iommu || !pdev)
3689 return;
3691 /* dependent device detach */
3692 tmp = pci_find_upstream_pcie_bridge(pdev);
3693 /* Secondary interface's bus number and devfn 0 */
3694 if (tmp) {
3695 parent = pdev->bus->self;
3696 while (parent != tmp) {
3697 iommu_detach_dev(iommu, parent->bus->number,
3698 parent->devfn);
3699 parent = parent->bus->self;
3701 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3702 iommu_detach_dev(iommu,
3703 tmp->subordinate->number, 0);
3704 else /* this is a legacy PCI bridge */
3705 iommu_detach_dev(iommu, tmp->bus->number,
3706 tmp->devfn);
3710 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3711 struct pci_dev *pdev)
3713 struct device_domain_info *info;
3714 struct intel_iommu *iommu;
3715 unsigned long flags;
3716 int found = 0;
3717 struct list_head *entry, *tmp;
3719 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3720 pdev->devfn);
3721 if (!iommu)
3722 return;
3724 spin_lock_irqsave(&device_domain_lock, flags);
3725 list_for_each_safe(entry, tmp, &domain->devices) {
3726 info = list_entry(entry, struct device_domain_info, link);
3727 if (info->segment == pci_domain_nr(pdev->bus) &&
3728 info->bus == pdev->bus->number &&
3729 info->devfn == pdev->devfn) {
3730 list_del(&info->link);
3731 list_del(&info->global);
3732 if (info->dev)
3733 info->dev->dev.archdata.iommu = NULL;
3734 spin_unlock_irqrestore(&device_domain_lock, flags);
3736 iommu_disable_dev_iotlb(info);
3737 iommu_detach_dev(iommu, info->bus, info->devfn);
3738 iommu_detach_dependent_devices(iommu, pdev);
3739 free_devinfo_mem(info);
3741 spin_lock_irqsave(&device_domain_lock, flags);
3743 if (found)
3744 break;
3745 else
3746 continue;
3749 /* if there are no other devices under the same iommu
3750 * owned by this domain, clear this iommu in iommu_bmp and
3751 * update the iommu count and coherency
3753 if (iommu == device_to_iommu(info->segment, info->bus,
3754 info->devfn))
3755 found = 1;
3758 spin_unlock_irqrestore(&device_domain_lock, flags);
3760 if (found == 0) {
3761 unsigned long tmp_flags;
3762 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3763 clear_bit(iommu->seq_id, domain->iommu_bmp);
3764 domain->iommu_count--;
3765 domain_update_iommu_cap(domain);
3766 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3768 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3769 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3770 spin_lock_irqsave(&iommu->lock, tmp_flags);
3771 clear_bit(domain->id, iommu->domain_ids);
3772 iommu->domains[domain->id] = NULL;
3773 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3778 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3780 struct device_domain_info *info;
3781 struct intel_iommu *iommu;
3782 unsigned long flags1, flags2;
3784 spin_lock_irqsave(&device_domain_lock, flags1);
3785 while (!list_empty(&domain->devices)) {
3786 info = list_entry(domain->devices.next,
3787 struct device_domain_info, link);
3788 list_del(&info->link);
3789 list_del(&info->global);
3790 if (info->dev)
3791 info->dev->dev.archdata.iommu = NULL;
3793 spin_unlock_irqrestore(&device_domain_lock, flags1);
3795 iommu_disable_dev_iotlb(info);
3796 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3797 iommu_detach_dev(iommu, info->bus, info->devfn);
3798 iommu_detach_dependent_devices(iommu, info->dev);
3800 /* clear this iommu in iommu_bmp, update iommu count
3801 * and capabilities
3803 spin_lock_irqsave(&domain->iommu_lock, flags2);
3804 if (test_and_clear_bit(iommu->seq_id,
3805 domain->iommu_bmp)) {
3806 domain->iommu_count--;
3807 domain_update_iommu_cap(domain);
3809 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3811 free_devinfo_mem(info);
3812 spin_lock_irqsave(&device_domain_lock, flags1);
3814 spin_unlock_irqrestore(&device_domain_lock, flags1);
3817 /* domain id for virtual machine, it won't be set in context */
3818 static unsigned long vm_domid;
3820 static struct dmar_domain *iommu_alloc_vm_domain(void)
3822 struct dmar_domain *domain;
3824 domain = alloc_domain_mem();
3825 if (!domain)
3826 return NULL;
3828 domain->id = vm_domid++;
3829 domain->nid = -1;
3830 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3831 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3833 return domain;
3836 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3838 int adjust_width;
3840 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3841 spin_lock_init(&domain->iommu_lock);
3843 domain_reserve_special_ranges(domain);
3845 /* calculate AGAW */
3846 domain->gaw = guest_width;
3847 adjust_width = guestwidth_to_adjustwidth(guest_width);
3848 domain->agaw = width_to_agaw(adjust_width);
3850 INIT_LIST_HEAD(&domain->devices);
3852 domain->iommu_count = 0;
3853 domain->iommu_coherency = 0;
3854 domain->iommu_snooping = 0;
3855 domain->iommu_superpage = 0;
3856 domain->max_addr = 0;
3857 domain->nid = -1;
3859 /* always allocate the top pgd */
3860 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3861 if (!domain->pgd)
3862 return -ENOMEM;
3863 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3864 return 0;
3867 static void iommu_free_vm_domain(struct dmar_domain *domain)
3869 unsigned long flags;
3870 struct dmar_drhd_unit *drhd;
3871 struct intel_iommu *iommu;
3872 unsigned long i;
3873 unsigned long ndomains;
3875 for_each_drhd_unit(drhd) {
3876 if (drhd->ignored)
3877 continue;
3878 iommu = drhd->iommu;
3880 ndomains = cap_ndoms(iommu->cap);
3881 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3882 if (iommu->domains[i] == domain) {
3883 spin_lock_irqsave(&iommu->lock, flags);
3884 clear_bit(i, iommu->domain_ids);
3885 iommu->domains[i] = NULL;
3886 spin_unlock_irqrestore(&iommu->lock, flags);
3887 break;
3893 static void vm_domain_exit(struct dmar_domain *domain)
3895 /* Domain 0 is reserved, so don't process it */
3896 if (!domain)
3897 return;
3899 vm_domain_remove_all_dev_info(domain);
3900 /* destroy iovas */
3901 put_iova_domain(&domain->iovad);
3903 /* clear ptes */
3904 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3906 /* free page tables */
3907 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3909 iommu_free_vm_domain(domain);
3910 free_domain_mem(domain);
3913 static int intel_iommu_domain_init(struct iommu_domain *domain)
3915 struct dmar_domain *dmar_domain;
3917 dmar_domain = iommu_alloc_vm_domain();
3918 if (!dmar_domain) {
3919 printk(KERN_ERR
3920 "intel_iommu_domain_init: dmar_domain == NULL\n");
3921 return -ENOMEM;
3923 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3924 printk(KERN_ERR
3925 "intel_iommu_domain_init() failed\n");
3926 vm_domain_exit(dmar_domain);
3927 return -ENOMEM;
3929 domain_update_iommu_cap(dmar_domain);
3930 domain->priv = dmar_domain;
3932 return 0;
3935 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3937 struct dmar_domain *dmar_domain = domain->priv;
3939 domain->priv = NULL;
3940 vm_domain_exit(dmar_domain);
3943 static int intel_iommu_attach_device(struct iommu_domain *domain,
3944 struct device *dev)
3946 struct dmar_domain *dmar_domain = domain->priv;
3947 struct pci_dev *pdev = to_pci_dev(dev);
3948 struct intel_iommu *iommu;
3949 int addr_width;
3951 /* normally pdev is not mapped */
3952 if (unlikely(domain_context_mapped(pdev))) {
3953 struct dmar_domain *old_domain;
3955 old_domain = find_domain(pdev);
3956 if (old_domain) {
3957 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3958 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3959 domain_remove_one_dev_info(old_domain, pdev);
3960 else
3961 domain_remove_dev_info(old_domain);
3965 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3966 pdev->devfn);
3967 if (!iommu)
3968 return -ENODEV;
3970 /* check if this iommu agaw is sufficient for max mapped address */
3971 addr_width = agaw_to_width(iommu->agaw);
3972 if (addr_width > cap_mgaw(iommu->cap))
3973 addr_width = cap_mgaw(iommu->cap);
3975 if (dmar_domain->max_addr > (1LL << addr_width)) {
3976 printk(KERN_ERR "%s: iommu width (%d) is not "
3977 "sufficient for the mapped address (%llx)\n",
3978 __func__, addr_width, dmar_domain->max_addr);
3979 return -EFAULT;
3981 dmar_domain->gaw = addr_width;
3984 * Knock out extra levels of page tables if necessary
3986 while (iommu->agaw < dmar_domain->agaw) {
3987 struct dma_pte *pte;
3989 pte = dmar_domain->pgd;
3990 if (dma_pte_present(pte)) {
3991 dmar_domain->pgd = (struct dma_pte *)
3992 phys_to_virt(dma_pte_addr(pte));
3993 free_pgtable_page(pte);
3995 dmar_domain->agaw--;
3998 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4001 static void intel_iommu_detach_device(struct iommu_domain *domain,
4002 struct device *dev)
4004 struct dmar_domain *dmar_domain = domain->priv;
4005 struct pci_dev *pdev = to_pci_dev(dev);
4007 domain_remove_one_dev_info(dmar_domain, pdev);
4010 static int intel_iommu_map(struct iommu_domain *domain,
4011 unsigned long iova, phys_addr_t hpa,
4012 size_t size, int iommu_prot)
4014 struct dmar_domain *dmar_domain = domain->priv;
4015 u64 max_addr;
4016 int prot = 0;
4017 int ret;
4019 if (iommu_prot & IOMMU_READ)
4020 prot |= DMA_PTE_READ;
4021 if (iommu_prot & IOMMU_WRITE)
4022 prot |= DMA_PTE_WRITE;
4023 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4024 prot |= DMA_PTE_SNP;
4026 max_addr = iova + size;
4027 if (dmar_domain->max_addr < max_addr) {
4028 u64 end;
4030 /* check if minimum agaw is sufficient for mapped address */
4031 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4032 if (end < max_addr) {
4033 printk(KERN_ERR "%s: iommu width (%d) is not "
4034 "sufficient for the mapped address (%llx)\n",
4035 __func__, dmar_domain->gaw, max_addr);
4036 return -EFAULT;
4038 dmar_domain->max_addr = max_addr;
4040 /* Round up size to next multiple of PAGE_SIZE, if it and
4041 the low bits of hpa would take us onto the next page */
4042 size = aligned_nrpages(hpa, size);
4043 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4044 hpa >> VTD_PAGE_SHIFT, size, prot);
4045 return ret;
4048 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4049 unsigned long iova, size_t size)
4051 struct dmar_domain *dmar_domain = domain->priv;
4052 int order;
4054 order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4055 (iova + size - 1) >> VTD_PAGE_SHIFT);
4057 if (dmar_domain->max_addr == iova + size)
4058 dmar_domain->max_addr = iova;
4060 return PAGE_SIZE << order;
4063 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4064 unsigned long iova)
4066 struct dmar_domain *dmar_domain = domain->priv;
4067 struct dma_pte *pte;
4068 u64 phys = 0;
4070 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4071 if (pte)
4072 phys = dma_pte_addr(pte);
4074 return phys;
4077 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4078 unsigned long cap)
4080 struct dmar_domain *dmar_domain = domain->priv;
4082 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4083 return dmar_domain->iommu_snooping;
4084 if (cap == IOMMU_CAP_INTR_REMAP)
4085 return intr_remapping_enabled;
4087 return 0;
4091 * Group numbers are arbitrary. Devices with the same group number
4092 * indicate that the iommu cannot differentiate between them. To avoid
4093 * tracking used groups we just use the seg|bus|devfn of the lowest
4094 * level at which we're able to differentiate devices
4096 static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
4098 struct pci_dev *pdev = to_pci_dev(dev);
4099 struct pci_dev *bridge;
4100 union {
4101 struct {
4102 u8 devfn;
4103 u8 bus;
4104 u16 segment;
4105 } pci;
4106 u32 group;
4107 } id;
4109 if (iommu_no_mapping(dev))
4110 return -ENODEV;
4112 id.pci.segment = pci_domain_nr(pdev->bus);
4113 id.pci.bus = pdev->bus->number;
4114 id.pci.devfn = pdev->devfn;
4116 if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn))
4117 return -ENODEV;
4119 bridge = pci_find_upstream_pcie_bridge(pdev);
4120 if (bridge) {
4121 if (pci_is_pcie(bridge)) {
4122 id.pci.bus = bridge->subordinate->number;
4123 id.pci.devfn = 0;
4124 } else {
4125 id.pci.bus = bridge->bus->number;
4126 id.pci.devfn = bridge->devfn;
4130 if (!pdev->is_virtfn && iommu_group_mf)
4131 id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);
4133 *groupid = id.group;
4135 return 0;
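/*
 * Illustrative sketch (separate user-space program, little-endian
 * layout assumed, device numbers invented): how the anonymous union in
 * intel_iommu_device_group() packs segment, bus and devfn into a single
 * 32-bit group id.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	union {
		struct {
			uint8_t  devfn;
			uint8_t  bus;
			uint16_t segment;
		} pci;
		uint32_t group;
	} id;

	id.pci.segment = 0;			/* hypothetical domain 0000 */
	id.pci.bus     = 0x03;			/* hypothetical bus 03      */
	id.pci.devfn   = (0x1c << 3) | 0;	/* slot 1c, function 0      */

	/* On x86 this prints 0x000003e0: segment<<16 | bus<<8 | devfn. */
	printf("groupid = 0x%08x\n", id.group);
	return 0;
}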
4138 static struct iommu_ops intel_iommu_ops = {
4139 .domain_init = intel_iommu_domain_init,
4140 .domain_destroy = intel_iommu_domain_destroy,
4141 .attach_dev = intel_iommu_attach_device,
4142 .detach_dev = intel_iommu_detach_device,
4143 .map = intel_iommu_map,
4144 .unmap = intel_iommu_unmap,
4145 .iova_to_phys = intel_iommu_iova_to_phys,
4146 .domain_has_cap = intel_iommu_domain_has_cap,
4147 .device_group = intel_iommu_device_group,
4148 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
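/*
 * Illustrative sketch (kernel context, hypothetical caller such as a
 * device-assignment backend): the generic IOMMU API calls that land in
 * the intel_iommu_ops handlers above.  The IOVA, the page to map and
 * the error handling shape are invented for the example.
 */
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/errno.h>

static int example_assign_device(struct pci_dev *pdev, phys_addr_t page_phys)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc(&pci_bus_type);	/* -> domain_init    */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, &pdev->dev);	/* -> attach_dev     */
	if (ret)
		goto out_free;

	/* Map one 4KiB page at IOVA 0x100000: lands in intel_iommu_map(). */
	ret = iommu_map(domain, 0x100000, page_phys, PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* ... the device can now DMA to/from IOVA 0x100000 ... */

	iommu_unmap(domain, 0x100000, PAGE_SIZE);	/* -> intel_iommu_unmap */
out_detach:
	iommu_detach_device(domain, &pdev->dev);	/* -> detach_dev     */
out_free:
	iommu_domain_free(domain);			/* -> domain_destroy */
	return ret;
}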
4151 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4154 * Mobile 4 Series Chipset neglects to set RWBF capability,
4155 * but needs it:
4157 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4158 rwbf_quirk = 1;
4160 /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4161 if (dev->revision == 0x07) {
4162 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4163 dmar_map_gfx = 0;
4167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4169 #define GGC 0x52
4170 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4171 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4172 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4173 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4174 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4175 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4176 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4177 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4179 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4181 unsigned short ggc;
4183 if (pci_read_config_word(dev, GGC, &ggc))
4184 return;
4186 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4187 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4188 dmar_map_gfx = 0;
4189 } else if (dmar_map_gfx) {
4190 /* we have to ensure the gfx device is idle before we flush */
4191 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4192 intel_iommu_strict = 1;
4195 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4196 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4198 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4200 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4201 ISOCH DMAR unit for the Azalia sound device, but not give it any
4202 TLB entries, which causes it to deadlock. Check for that. We do
4203 this in a function called from init_dmars(), instead of in a PCI
4204 quirk, because we don't want to print the obnoxious "BIOS broken"
4205 message if VT-d is actually disabled.
4207 static void __init check_tylersburg_isoch(void)
4209 struct pci_dev *pdev;
4210 uint32_t vtisochctrl;
4212 /* If there's no Azalia in the system anyway, forget it. */
4213 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4214 if (!pdev)
4215 return;
4216 pci_dev_put(pdev);
4218 /* System Management Registers. Might be hidden, in which case
4219 we can't do the sanity check. But that's OK, because the
4220 known-broken BIOSes _don't_ actually hide it, so far. */
4221 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4222 if (!pdev)
4223 return;
4225 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4226 pci_dev_put(pdev);
4227 return;
4230 pci_dev_put(pdev);
4232 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4233 if (vtisochctrl & 1)
4234 return;
4236 /* Drop all bits other than the number of TLB entries */
4237 vtisochctrl &= 0x1c;
4239 /* If we have the recommended number of TLB entries (16), fine. */
4240 if (vtisochctrl == 0x10)
4241 return;
4243 /* Zero TLB entries? You get to ride the short bus to school. */
4244 if (!vtisochctrl) {
4245 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4246 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4247 dmi_get_system_info(DMI_BIOS_VENDOR),
4248 dmi_get_system_info(DMI_BIOS_VERSION),
4249 dmi_get_system_info(DMI_PRODUCT_VERSION));
4250 iommu_identity_mapping |= IDENTMAP_AZALIA;
4251 return;
4254 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4255 vtisochctrl);
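/*
 * Illustrative sketch (separate user-space program, sample register
 * values invented): the same decode of the Tylersburg VTISOCHCTRL value
 * that check_tylersburg_isoch() performs above -- bit 0 set means the
 * Azalia DMA is routed to the non-isoch DMAR unit, otherwise the bits
 * under mask 0x1c carry the number of TLB entries (16 is recommended).
 */
#include <stdio.h>
#include <stdint.h>

static void example_decode_vtisochctrl(uint32_t vtisochctrl)
{
	if (vtisochctrl & 1) {
		printf("0x%02x: routed to non-isoch unit, nothing to do\n",
		       vtisochctrl);
		return;
	}
	vtisochctrl &= 0x1c;	/* keep only the TLB-entry count */
	if (vtisochctrl == 0x10)
		printf("0x10: recommended 16 TLB entries, fine\n");
	else if (!vtisochctrl)
		printf("0x00: no TLB entries, force Azalia identity map\n");
	else
		printf("0x%02x: fewer TLB entries than recommended, warn\n",
		       vtisochctrl);
}

int main(void)
{
	example_decode_vtisochctrl(0x01);
	example_decode_vtisochctrl(0x10);
	example_decode_vtisochctrl(0x00);
	example_decode_vtisochctrl(0x08);
	return 0;
}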