drivers/pci/intel-iommu.c
1 /*
2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <linux/pci-ats.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
45 #include "pci.h"
47 #define ROOT_SIZE VTD_PAGE_SIZE
48 #define CONTEXT_SIZE VTD_PAGE_SIZE
50 #define IS_BRIDGE_HOST_DEVICE(pdev) \
51 ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62 #define MAX_AGAW_WIDTH 64
64 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
70 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
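/*
 * Illustration (values follow from the macros above, assuming the usual
 * 4KiB VTD_PAGE_SIZE): for the default 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) is 2^36 - 1.  On a 32-bit kernel DOMAIN_MAX_PFN()
 * clamps that to (unsigned long)-1, which is what lets the rest of the
 * code use 'unsigned long' for PFNs as the comment above says.
 */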
73 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
77 /* page table handling */
78 #define LEVEL_STRIDE (9)
79 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
81 static inline int agaw_to_level(int agaw)
83 return agaw + 2;
86 static inline int agaw_to_width(int agaw)
88 return 30 + agaw * LEVEL_STRIDE;
91 static inline int width_to_agaw(int width)
93 return (width - 30) / LEVEL_STRIDE;
96 static inline unsigned int level_to_offset_bits(int level)
98 return (level - 1) * LEVEL_STRIDE;
101 static inline int pfn_level_offset(unsigned long pfn, int level)
103 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
106 static inline unsigned long level_mask(int level)
108 return -1UL << level_to_offset_bits(level);
111 static inline unsigned long level_size(int level)
113 return 1UL << level_to_offset_bits(level);
116 static inline unsigned long align_to_level(unsigned long pfn, int level)
118 return (pfn + level_size(level) - 1) & level_mask(level);
121 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
123 return 1 << ((lvl - 1) * LEVEL_STRIDE);
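/*
 * Worked example for the helpers above: an adjusted guest address width
 * of 48 bits gives width_to_agaw(48) == 2, agaw_to_level(2) == 4 (a
 * four-level page table) and level_to_offset_bits(4) == 27, so
 * pfn_level_offset() indexes the top level with bits 27-35 of a DMA pfn.
 */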
126 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
127 are never going to work. */
128 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
130 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
133 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
135 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
137 static inline unsigned long page_to_dma_pfn(struct page *pg)
139 return mm_to_dma_pfn(page_to_pfn(pg));
141 static inline unsigned long virt_to_dma_pfn(void *p)
143 return page_to_dma_pfn(virt_to_page(p));
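/*
 * Example: with 4KiB MM pages, PAGE_SHIFT == VTD_PAGE_SHIFT and the two
 * conversions above are identities; a 64KiB-page build would shift by 4,
 * i.e. one MM pfn covers 16 VT-d pfns, which is why VT-d pages must be
 * no larger than MM pages.
 */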
146 /* global iommu list, set NULL for ignored DMAR units */
147 static struct intel_iommu **g_iommus;
149 static void __init check_tylersburg_isoch(void);
150 static int rwbf_quirk;
153 * set to 1 to panic the kernel if VT-d can't be enabled successfully
154 * (used when kernel is launched w/ TXT)
156 static int force_on = 0;
159 * 0: Present
160 * 1-11: Reserved
161 * 12-63: Context Ptr (12 - (haw-1))
162 * 64-127: Reserved
164 struct root_entry {
165 u64 val;
166 u64 rsvd1;
168 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
169 static inline bool root_present(struct root_entry *root)
171 return (root->val & 1);
173 static inline void set_root_present(struct root_entry *root)
175 root->val |= 1;
177 static inline void set_root_value(struct root_entry *root, unsigned long value)
179 root->val |= value & VTD_PAGE_MASK;
182 static inline struct context_entry *
183 get_context_addr_from_root(struct root_entry *root)
185 return (struct context_entry *)
186 (root_present(root)?phys_to_virt(
187 root->val & VTD_PAGE_MASK) :
188 NULL);
192 * low 64 bits:
193 * 0: present
194 * 1: fault processing disable
195 * 2-3: translation type
196 * 12-63: address space root
197 * high 64 bits:
198 * 0-2: address width
199 * 3-6: avail
200 * 8-23: domain id
202 struct context_entry {
203 u64 lo;
204 u64 hi;
207 static inline bool context_present(struct context_entry *context)
209 return (context->lo & 1);
211 static inline void context_set_present(struct context_entry *context)
213 context->lo |= 1;
216 static inline void context_set_fault_enable(struct context_entry *context)
218 context->lo &= (((u64)-1) << 2) | 1;
221 static inline void context_set_translation_type(struct context_entry *context,
222 unsigned long value)
224 context->lo &= (((u64)-1) << 4) | 3;
225 context->lo |= (value & 3) << 2;
228 static inline void context_set_address_root(struct context_entry *context,
229 unsigned long value)
231 context->lo |= value & VTD_PAGE_MASK;
234 static inline void context_set_address_width(struct context_entry *context,
235 unsigned long value)
237 context->hi |= value & 7;
240 static inline void context_set_domain_id(struct context_entry *context,
241 unsigned long value)
243 context->hi |= (value & ((1 << 16) - 1)) << 8;
246 static inline void context_clear_entry(struct context_entry *context)
248 context->lo = 0;
249 context->hi = 0;
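/*
 * Illustrative lookup implied by the two structures above: the root table
 * is indexed by PCI bus number and each present root entry points to a
 * 256-entry context table indexed by devfn, roughly
 *
 *	ctx = get_context_addr_from_root(&root_entry[bus]);
 *	entry = &ctx[devfn];
 *
 * device_to_context_entry() below does exactly this, allocating the
 * context table on first use.
 */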
253 * 0: readable
254 * 1: writable
255 * 2-6: reserved
256 * 7: super page
257 * 8-10: available
258 * 11: snoop behavior
259 * 12-63: Host physical address
261 struct dma_pte {
262 u64 val;
265 static inline void dma_clear_pte(struct dma_pte *pte)
267 pte->val = 0;
270 static inline void dma_set_pte_readable(struct dma_pte *pte)
272 pte->val |= DMA_PTE_READ;
275 static inline void dma_set_pte_writable(struct dma_pte *pte)
277 pte->val |= DMA_PTE_WRITE;
280 static inline void dma_set_pte_snp(struct dma_pte *pte)
282 pte->val |= DMA_PTE_SNP;
285 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
287 pte->val = (pte->val & ~3) | (prot & 3);
290 static inline u64 dma_pte_addr(struct dma_pte *pte)
292 #ifdef CONFIG_64BIT
293 return pte->val & VTD_PAGE_MASK;
294 #else
295 /* Must have a full atomic 64-bit read */
296 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
297 #endif
300 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
302 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
305 static inline bool dma_pte_present(struct dma_pte *pte)
307 return (pte->val & 3) != 0;
310 static inline bool dma_pte_superpage(struct dma_pte *pte)
312 return (pte->val & (1 << 7));
315 static inline int first_pte_in_page(struct dma_pte *pte)
317 return !((unsigned long)pte & ~VTD_PAGE_MASK);
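/*
 * Example PTE built from the helpers above (bit layout per the comment):
 * a readable, writable, snooped mapping of host pfn 0x12345 ends up as
 * (0x12345ULL << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE |
 * DMA_PTE_SNP, i.e. bits 0, 1 and 11 set plus the frame address in
 * bits 12-63.
 */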
321 * This domain is a static identity mapping domain.
322 * 1. This domain creates a static 1:1 mapping to all usable memory.
323 * 2. It maps to each iommu if successful.
324 * 3. Each iommu maps to this domain if successful.
326 static struct dmar_domain *si_domain;
327 static int hw_pass_through = 1;
329 /* devices under the same p2p bridge are owned in one domain */
330 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
332 /* domain represents a virtual machine; more than one device
333 * across iommus may be owned by one domain, e.g. a kvm guest.
335 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
337 /* si_domain contains multiple devices */
338 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
340 struct dmar_domain {
341 int id; /* domain id */
342 int nid; /* node id */
343 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
345 struct list_head devices; /* all devices' list */
346 struct iova_domain iovad; /* iova's that belong to this domain */
348 struct dma_pte *pgd; /* virtual address */
349 int gaw; /* max guest address width */
351 /* adjusted guest address width, 0 is level 2 30-bit */
352 int agaw;
354 int flags; /* flags to find out type of domain */
356 int iommu_coherency;/* indicate coherency of iommu access */
357 int iommu_snooping; /* indicate snooping control feature*/
358 int iommu_count; /* reference count of iommu */
359 int iommu_superpage;/* Level of superpages supported:
360 0 == 4KiB (no superpages), 1 == 2MiB,
361 2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
362 spinlock_t iommu_lock; /* protect iommu set in domain */
363 u64 max_addr; /* maximum mapped address */
366 /* PCI domain-device relationship */
367 struct device_domain_info {
368 struct list_head link; /* link to domain siblings */
369 struct list_head global; /* link to global list */
370 int segment; /* PCI domain */
371 u8 bus; /* PCI bus number */
372 u8 devfn; /* PCI devfn number */
373 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
374 struct intel_iommu *iommu; /* IOMMU used by this device */
375 struct dmar_domain *domain; /* pointer to domain */
378 static void flush_unmaps_timeout(unsigned long data);
380 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
382 #define HIGH_WATER_MARK 250
383 struct deferred_flush_tables {
384 int next;
385 struct iova *iova[HIGH_WATER_MARK];
386 struct dmar_domain *domain[HIGH_WATER_MARK];
389 static struct deferred_flush_tables *deferred_flush;
391 /* bitmap for indexing intel_iommus */
392 static int g_num_of_iommus;
394 static DEFINE_SPINLOCK(async_umap_flush_lock);
395 static LIST_HEAD(unmaps_to_do);
397 static int timer_on;
398 static long list_size;
400 static void domain_remove_dev_info(struct dmar_domain *domain);
402 #ifdef CONFIG_DMAR_DEFAULT_ON
403 int dmar_disabled = 0;
404 #else
405 int dmar_disabled = 1;
406 #endif /*CONFIG_DMAR_DEFAULT_ON*/
408 static int dmar_map_gfx = 1;
409 static int dmar_forcedac;
410 static int intel_iommu_strict;
411 static int intel_iommu_superpage = 1;
413 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
414 static DEFINE_SPINLOCK(device_domain_lock);
415 static LIST_HEAD(device_domain_list);
417 static struct iommu_ops intel_iommu_ops;
419 static int __init intel_iommu_setup(char *str)
421 if (!str)
422 return -EINVAL;
423 while (*str) {
424 if (!strncmp(str, "on", 2)) {
425 dmar_disabled = 0;
426 printk(KERN_INFO "Intel-IOMMU: enabled\n");
427 } else if (!strncmp(str, "off", 3)) {
428 dmar_disabled = 1;
429 printk(KERN_INFO "Intel-IOMMU: disabled\n");
430 } else if (!strncmp(str, "igfx_off", 8)) {
431 dmar_map_gfx = 0;
432 printk(KERN_INFO
433 "Intel-IOMMU: disable GFX device mapping\n");
434 } else if (!strncmp(str, "forcedac", 8)) {
435 printk(KERN_INFO
436 "Intel-IOMMU: Forcing DAC for PCI devices\n");
437 dmar_forcedac = 1;
438 } else if (!strncmp(str, "strict", 6)) {
439 printk(KERN_INFO
440 "Intel-IOMMU: disable batched IOTLB flush\n");
441 intel_iommu_strict = 1;
442 } else if (!strncmp(str, "sp_off", 6)) {
443 printk(KERN_INFO
444 "Intel-IOMMU: disable supported super page\n");
445 intel_iommu_superpage = 0;
448 str += strcspn(str, ",");
449 while (*str == ',')
450 str++;
452 return 0;
454 __setup("intel_iommu=", intel_iommu_setup);
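/*
 * Example usage of the option parser above: booting with
 * "intel_iommu=on,strict,igfx_off" enables the IOMMU, disables batched
 * IOTLB flushing and leaves the graphics device unmapped; options are
 * comma-separated and processed left to right.
 */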
456 static struct kmem_cache *iommu_domain_cache;
457 static struct kmem_cache *iommu_devinfo_cache;
458 static struct kmem_cache *iommu_iova_cache;
460 static inline void *alloc_pgtable_page(int node)
462 struct page *page;
463 void *vaddr = NULL;
465 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
466 if (page)
467 vaddr = page_address(page);
468 return vaddr;
471 static inline void free_pgtable_page(void *vaddr)
473 free_page((unsigned long)vaddr);
476 static inline void *alloc_domain_mem(void)
478 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
481 static void free_domain_mem(void *vaddr)
483 kmem_cache_free(iommu_domain_cache, vaddr);
486 static inline void * alloc_devinfo_mem(void)
488 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
491 static inline void free_devinfo_mem(void *vaddr)
493 kmem_cache_free(iommu_devinfo_cache, vaddr);
496 struct iova *alloc_iova_mem(void)
498 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
501 void free_iova_mem(struct iova *iova)
503 kmem_cache_free(iommu_iova_cache, iova);
507 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
509 unsigned long sagaw;
510 int agaw = -1;
512 sagaw = cap_sagaw(iommu->cap);
513 for (agaw = width_to_agaw(max_gaw);
514 agaw >= 0; agaw--) {
515 if (test_bit(agaw, &sagaw))
516 break;
519 return agaw;
523 * Calculate max SAGAW for each iommu.
525 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
527 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
531 * calculate agaw for each iommu.
532 * "SAGAW" may be different across iommus, use a default agaw, and
533 * fall back to a smaller supported agaw for iommus that don't support the default.
535 int iommu_calculate_agaw(struct intel_iommu *iommu)
537 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
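/*
 * Example: an iommu whose SAGAW field advertises bits 2 and 3 (4- and
 * 5-level tables) yields agaw 2 from iommu_calculate_agaw() (the search
 * starts at width_to_agaw(48) == 2) and agaw 3 from
 * iommu_calculate_max_sagaw().
 */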
540 /* This function only returns a single iommu in a domain */
541 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
543 int iommu_id;
545 /* si_domain and vm domain should not get here. */
546 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
547 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
549 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
550 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
551 return NULL;
553 return g_iommus[iommu_id];
556 static void domain_update_iommu_coherency(struct dmar_domain *domain)
558 int i;
560 domain->iommu_coherency = 1;
562 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
563 if (!ecap_coherent(g_iommus[i]->ecap)) {
564 domain->iommu_coherency = 0;
565 break;
570 static void domain_update_iommu_snooping(struct dmar_domain *domain)
572 int i;
574 domain->iommu_snooping = 1;
576 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
577 if (!ecap_sc_support(g_iommus[i]->ecap)) {
578 domain->iommu_snooping = 0;
579 break;
584 static void domain_update_iommu_superpage(struct dmar_domain *domain)
586 struct dmar_drhd_unit *drhd;
587 struct intel_iommu *iommu = NULL;
588 int mask = 0xf;
590 if (!intel_iommu_superpage) {
591 domain->iommu_superpage = 0;
592 return;
595 /* set iommu_superpage to the smallest common denominator */
596 for_each_active_iommu(iommu, drhd) {
597 mask &= cap_super_page_val(iommu->cap);
598 if (!mask) {
599 break;
602 domain->iommu_superpage = fls(mask);
605 /* Some capabilities may be different across iommus */
606 static void domain_update_iommu_cap(struct dmar_domain *domain)
608 domain_update_iommu_coherency(domain);
609 domain_update_iommu_snooping(domain);
610 domain_update_iommu_superpage(domain);
613 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
615 struct dmar_drhd_unit *drhd = NULL;
616 int i;
618 for_each_drhd_unit(drhd) {
619 if (drhd->ignored)
620 continue;
621 if (segment != drhd->segment)
622 continue;
624 for (i = 0; i < drhd->devices_cnt; i++) {
625 if (drhd->devices[i] &&
626 drhd->devices[i]->bus->number == bus &&
627 drhd->devices[i]->devfn == devfn)
628 return drhd->iommu;
629 if (drhd->devices[i] &&
630 drhd->devices[i]->subordinate &&
631 drhd->devices[i]->subordinate->number <= bus &&
632 drhd->devices[i]->subordinate->subordinate >= bus)
633 return drhd->iommu;
636 if (drhd->include_all)
637 return drhd->iommu;
640 return NULL;
643 static void domain_flush_cache(struct dmar_domain *domain,
644 void *addr, int size)
646 if (!domain->iommu_coherency)
647 clflush_cache_range(addr, size);
650 /* Gets context entry for a given bus and devfn */
651 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
652 u8 bus, u8 devfn)
654 struct root_entry *root;
655 struct context_entry *context;
656 unsigned long phy_addr;
657 unsigned long flags;
659 spin_lock_irqsave(&iommu->lock, flags);
660 root = &iommu->root_entry[bus];
661 context = get_context_addr_from_root(root);
662 if (!context) {
663 context = (struct context_entry *)
664 alloc_pgtable_page(iommu->node);
665 if (!context) {
666 spin_unlock_irqrestore(&iommu->lock, flags);
667 return NULL;
669 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
670 phy_addr = virt_to_phys((void *)context);
671 set_root_value(root, phy_addr);
672 set_root_present(root);
673 __iommu_flush_cache(iommu, root, sizeof(*root));
675 spin_unlock_irqrestore(&iommu->lock, flags);
676 return &context[devfn];
679 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
681 struct root_entry *root;
682 struct context_entry *context;
683 int ret;
684 unsigned long flags;
686 spin_lock_irqsave(&iommu->lock, flags);
687 root = &iommu->root_entry[bus];
688 context = get_context_addr_from_root(root);
689 if (!context) {
690 ret = 0;
691 goto out;
693 ret = context_present(&context[devfn]);
694 out:
695 spin_unlock_irqrestore(&iommu->lock, flags);
696 return ret;
699 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
701 struct root_entry *root;
702 struct context_entry *context;
703 unsigned long flags;
705 spin_lock_irqsave(&iommu->lock, flags);
706 root = &iommu->root_entry[bus];
707 context = get_context_addr_from_root(root);
708 if (context) {
709 context_clear_entry(&context[devfn]);
710 __iommu_flush_cache(iommu, &context[devfn], \
711 sizeof(*context));
713 spin_unlock_irqrestore(&iommu->lock, flags);
716 static void free_context_table(struct intel_iommu *iommu)
718 struct root_entry *root;
719 int i;
720 unsigned long flags;
721 struct context_entry *context;
723 spin_lock_irqsave(&iommu->lock, flags);
724 if (!iommu->root_entry) {
725 goto out;
727 for (i = 0; i < ROOT_ENTRY_NR; i++) {
728 root = &iommu->root_entry[i];
729 context = get_context_addr_from_root(root);
730 if (context)
731 free_pgtable_page(context);
733 free_pgtable_page(iommu->root_entry);
734 iommu->root_entry = NULL;
735 out:
736 spin_unlock_irqrestore(&iommu->lock, flags);
739 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
740 unsigned long pfn, int target_level)
742 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
743 struct dma_pte *parent, *pte = NULL;
744 int level = agaw_to_level(domain->agaw);
745 int offset;
747 BUG_ON(!domain->pgd);
748 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
749 parent = domain->pgd;
751 while (level > 0) {
752 void *tmp_page;
754 offset = pfn_level_offset(pfn, level);
755 pte = &parent[offset];
756 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
757 break;
758 if (level == target_level)
759 break;
761 if (!dma_pte_present(pte)) {
762 uint64_t pteval;
764 tmp_page = alloc_pgtable_page(domain->nid);
766 if (!tmp_page)
767 return NULL;
769 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
770 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
771 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
772 /* Someone else set it while we were thinking; use theirs. */
773 free_pgtable_page(tmp_page);
774 } else {
775 dma_pte_addr(pte);
776 domain_flush_cache(domain, pte, sizeof(*pte));
779 parent = phys_to_virt(dma_pte_addr(pte));
780 level--;
783 return pte;
787 /* return address's pte at specific level */
788 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
789 unsigned long pfn,
790 int level, int *large_page)
792 struct dma_pte *parent, *pte = NULL;
793 int total = agaw_to_level(domain->agaw);
794 int offset;
796 parent = domain->pgd;
797 while (level <= total) {
798 offset = pfn_level_offset(pfn, total);
799 pte = &parent[offset];
800 if (level == total)
801 return pte;
803 if (!dma_pte_present(pte)) {
804 *large_page = total;
805 break;
808 if (pte->val & DMA_PTE_LARGE_PAGE) {
809 *large_page = total;
810 return pte;
813 parent = phys_to_virt(dma_pte_addr(pte));
814 total--;
816 return NULL;
819 /* clear last level pte, a tlb flush should follow */
820 static int dma_pte_clear_range(struct dmar_domain *domain,
821 unsigned long start_pfn,
822 unsigned long last_pfn)
824 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
825 unsigned int large_page = 1;
826 struct dma_pte *first_pte, *pte;
827 int order;
829 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
830 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
831 BUG_ON(start_pfn > last_pfn);
833 /* we don't need lock here; nobody else touches the iova range */
834 do {
835 large_page = 1;
836 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
837 if (!pte) {
838 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
839 continue;
841 do {
842 dma_clear_pte(pte);
843 start_pfn += lvl_to_nr_pages(large_page);
844 pte++;
845 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
847 domain_flush_cache(domain, first_pte,
848 (void *)pte - (void *)first_pte);
850 } while (start_pfn && start_pfn <= last_pfn);
852 order = (large_page - 1) * 9;
853 return order;
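/*
 * Note on the return value above: 'order' is in VT-d pages.  If the last
 * PTEs cleared were 2MiB superpage entries (large_page == 2) the caller
 * gets order 9, matching lvl_to_nr_pages(2) == 512 pages per entry.
 */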
856 /* free page table pages. last level pte should already be cleared */
857 static void dma_pte_free_pagetable(struct dmar_domain *domain,
858 unsigned long start_pfn,
859 unsigned long last_pfn)
861 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
862 struct dma_pte *first_pte, *pte;
863 int total = agaw_to_level(domain->agaw);
864 int level;
865 unsigned long tmp;
866 int large_page = 2;
868 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
869 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
870 BUG_ON(start_pfn > last_pfn);
872 /* We don't need lock here; nobody else touches the iova range */
873 level = 2;
874 while (level <= total) {
875 tmp = align_to_level(start_pfn, level);
877 /* If we can't even clear one PTE at this level, we're done */
878 if (tmp + level_size(level) - 1 > last_pfn)
879 return;
881 do {
882 large_page = level;
883 first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
884 if (large_page > level)
885 level = large_page + 1;
886 if (!pte) {
887 tmp = align_to_level(tmp + 1, level + 1);
888 continue;
890 do {
891 if (dma_pte_present(pte)) {
892 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
893 dma_clear_pte(pte);
895 pte++;
896 tmp += level_size(level);
897 } while (!first_pte_in_page(pte) &&
898 tmp + level_size(level) - 1 <= last_pfn);
900 domain_flush_cache(domain, first_pte,
901 (void *)pte - (void *)first_pte);
903 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
904 level++;
906 /* free pgd */
907 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
908 free_pgtable_page(domain->pgd);
909 domain->pgd = NULL;
913 /* iommu handling */
914 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
916 struct root_entry *root;
917 unsigned long flags;
919 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
920 if (!root)
921 return -ENOMEM;
923 __iommu_flush_cache(iommu, root, ROOT_SIZE);
925 spin_lock_irqsave(&iommu->lock, flags);
926 iommu->root_entry = root;
927 spin_unlock_irqrestore(&iommu->lock, flags);
929 return 0;
932 static void iommu_set_root_entry(struct intel_iommu *iommu)
934 void *addr;
935 u32 sts;
936 unsigned long flag;
938 addr = iommu->root_entry;
940 spin_lock_irqsave(&iommu->register_lock, flag);
941 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
943 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
945 /* Make sure hardware complete it */
946 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
947 readl, (sts & DMA_GSTS_RTPS), sts);
949 spin_unlock_irqrestore(&iommu->register_lock, flag);
952 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
954 u32 val;
955 unsigned long flag;
957 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
958 return;
960 spin_lock_irqsave(&iommu->register_lock, flag);
961 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
963 /* Make sure hardware complete it */
964 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
965 readl, (!(val & DMA_GSTS_WBFS)), val);
967 spin_unlock_irqrestore(&iommu->register_lock, flag);
970 /* return value determines if we need a write buffer flush */
971 static void __iommu_flush_context(struct intel_iommu *iommu,
972 u16 did, u16 source_id, u8 function_mask,
973 u64 type)
975 u64 val = 0;
976 unsigned long flag;
978 switch (type) {
979 case DMA_CCMD_GLOBAL_INVL:
980 val = DMA_CCMD_GLOBAL_INVL;
981 break;
982 case DMA_CCMD_DOMAIN_INVL:
983 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
984 break;
985 case DMA_CCMD_DEVICE_INVL:
986 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
987 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
988 break;
989 default:
990 BUG();
992 val |= DMA_CCMD_ICC;
994 spin_lock_irqsave(&iommu->register_lock, flag);
995 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
997 /* Make sure hardware complete it */
998 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
999 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1001 spin_unlock_irqrestore(&iommu->register_lock, flag);
1004 /* return value determines if we need a write buffer flush */
1005 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1006 u64 addr, unsigned int size_order, u64 type)
1008 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1009 u64 val = 0, val_iva = 0;
1010 unsigned long flag;
1012 switch (type) {
1013 case DMA_TLB_GLOBAL_FLUSH:
1014 /* global flush doesn't need to set IVA_REG */
1015 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1016 break;
1017 case DMA_TLB_DSI_FLUSH:
1018 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1019 break;
1020 case DMA_TLB_PSI_FLUSH:
1021 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1022 /* Note: always flush non-leaf currently */
1023 val_iva = size_order | addr;
1024 break;
1025 default:
1026 BUG();
1028 /* Note: set drain read/write */
1029 #if 0
1031 * This is probably to be super secure.. Looks like we can
1032 * ignore it without any impact.
1034 if (cap_read_drain(iommu->cap))
1035 val |= DMA_TLB_READ_DRAIN;
1036 #endif
1037 if (cap_write_drain(iommu->cap))
1038 val |= DMA_TLB_WRITE_DRAIN;
1040 spin_lock_irqsave(&iommu->register_lock, flag);
1041 /* Note: Only uses first TLB reg currently */
1042 if (val_iva)
1043 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1044 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1046 /* Make sure hardware complete it */
1047 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1048 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1050 spin_unlock_irqrestore(&iommu->register_lock, flag);
1052 /* check IOTLB invalidation granularity */
1053 if (DMA_TLB_IAIG(val) == 0)
1054 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1055 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1056 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1057 (unsigned long long)DMA_TLB_IIRG(type),
1058 (unsigned long long)DMA_TLB_IAIG(val));
1061 static struct device_domain_info *iommu_support_dev_iotlb(
1062 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1064 int found = 0;
1065 unsigned long flags;
1066 struct device_domain_info *info;
1067 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1069 if (!ecap_dev_iotlb_support(iommu->ecap))
1070 return NULL;
1072 if (!iommu->qi)
1073 return NULL;
1075 spin_lock_irqsave(&device_domain_lock, flags);
1076 list_for_each_entry(info, &domain->devices, link)
1077 if (info->bus == bus && info->devfn == devfn) {
1078 found = 1;
1079 break;
1081 spin_unlock_irqrestore(&device_domain_lock, flags);
1083 if (!found || !info->dev)
1084 return NULL;
1086 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1087 return NULL;
1089 if (!dmar_find_matched_atsr_unit(info->dev))
1090 return NULL;
1092 info->iommu = iommu;
1094 return info;
1097 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1099 if (!info)
1100 return;
1102 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1105 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1107 if (!info->dev || !pci_ats_enabled(info->dev))
1108 return;
1110 pci_disable_ats(info->dev);
1113 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1114 u64 addr, unsigned mask)
1116 u16 sid, qdep;
1117 unsigned long flags;
1118 struct device_domain_info *info;
1120 spin_lock_irqsave(&device_domain_lock, flags);
1121 list_for_each_entry(info, &domain->devices, link) {
1122 if (!info->dev || !pci_ats_enabled(info->dev))
1123 continue;
1125 sid = info->bus << 8 | info->devfn;
1126 qdep = pci_ats_queue_depth(info->dev);
1127 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1129 spin_unlock_irqrestore(&device_domain_lock, flags);
1132 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1133 unsigned long pfn, unsigned int pages, int map)
1135 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1136 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1138 BUG_ON(pages == 0);
1141 * Fallback to domain selective flush if no PSI support or the size is
1142 * too big.
1143 * PSI requires page size to be 2 ^ x, and the base address is naturally
1144 * aligned to the size
1146 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1147 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1148 DMA_TLB_DSI_FLUSH);
1149 else
1150 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1151 DMA_TLB_PSI_FLUSH);
1154 * In caching mode, changes of pages from non-present to present require
1155 * flush. However, device IOTLB doesn't need to be flushed in this case.
1157 if (!cap_caching_mode(iommu->cap) || !map)
1158 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
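/*
 * Example: flushing 5 pages gives mask == ilog2(__roundup_pow_of_two(5)) == 3,
 * so the PSI invalidation above covers a naturally aligned block of 8
 * pages; when mask exceeds cap_max_amask_val() the code falls back to a
 * domain-selective flush instead.
 */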
1161 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1163 u32 pmen;
1164 unsigned long flags;
1166 spin_lock_irqsave(&iommu->register_lock, flags);
1167 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1168 pmen &= ~DMA_PMEN_EPM;
1169 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1171 /* wait for the protected region status bit to clear */
1172 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1173 readl, !(pmen & DMA_PMEN_PRS), pmen);
1175 spin_unlock_irqrestore(&iommu->register_lock, flags);
1178 static int iommu_enable_translation(struct intel_iommu *iommu)
1180 u32 sts;
1181 unsigned long flags;
1183 spin_lock_irqsave(&iommu->register_lock, flags);
1184 iommu->gcmd |= DMA_GCMD_TE;
1185 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1187 /* Make sure hardware complete it */
1188 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1189 readl, (sts & DMA_GSTS_TES), sts);
1191 spin_unlock_irqrestore(&iommu->register_lock, flags);
1192 return 0;
1195 static int iommu_disable_translation(struct intel_iommu *iommu)
1197 u32 sts;
1198 unsigned long flag;
1200 spin_lock_irqsave(&iommu->register_lock, flag);
1201 iommu->gcmd &= ~DMA_GCMD_TE;
1202 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1204 /* Make sure hardware complete it */
1205 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1206 readl, (!(sts & DMA_GSTS_TES)), sts);
1208 spin_unlock_irqrestore(&iommu->register_lock, flag);
1209 return 0;
1213 static int iommu_init_domains(struct intel_iommu *iommu)
1215 unsigned long ndomains;
1216 unsigned long nlongs;
1218 ndomains = cap_ndoms(iommu->cap);
1219 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1220 ndomains);
1221 nlongs = BITS_TO_LONGS(ndomains);
1223 spin_lock_init(&iommu->lock);
1225 /* TBD: there might be 64K domains,
1226 * consider other allocation for future chip
1228 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1229 if (!iommu->domain_ids) {
1230 printk(KERN_ERR "Allocating domain id array failed\n");
1231 return -ENOMEM;
1233 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1234 GFP_KERNEL);
1235 if (!iommu->domains) {
1236 printk(KERN_ERR "Allocating domain array failed\n");
1237 return -ENOMEM;
1241 * if Caching mode is set, then invalid translations are tagged
1242 * with domainid 0. Hence we need to pre-allocate it.
1244 if (cap_caching_mode(iommu->cap))
1245 set_bit(0, iommu->domain_ids);
1246 return 0;
1250 static void domain_exit(struct dmar_domain *domain);
1251 static void vm_domain_exit(struct dmar_domain *domain);
1253 void free_dmar_iommu(struct intel_iommu *iommu)
1255 struct dmar_domain *domain;
1256 int i;
1257 unsigned long flags;
1259 if ((iommu->domains) && (iommu->domain_ids)) {
1260 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1261 domain = iommu->domains[i];
1262 clear_bit(i, iommu->domain_ids);
1264 spin_lock_irqsave(&domain->iommu_lock, flags);
1265 if (--domain->iommu_count == 0) {
1266 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1267 vm_domain_exit(domain);
1268 else
1269 domain_exit(domain);
1271 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1275 if (iommu->gcmd & DMA_GCMD_TE)
1276 iommu_disable_translation(iommu);
1278 if (iommu->irq) {
1279 irq_set_handler_data(iommu->irq, NULL);
1280 /* This will mask the irq */
1281 free_irq(iommu->irq, iommu);
1282 destroy_irq(iommu->irq);
1285 kfree(iommu->domains);
1286 kfree(iommu->domain_ids);
1288 g_iommus[iommu->seq_id] = NULL;
1290 /* if all iommus are freed, free g_iommus */
1291 for (i = 0; i < g_num_of_iommus; i++) {
1292 if (g_iommus[i])
1293 break;
1296 if (i == g_num_of_iommus)
1297 kfree(g_iommus);
1299 /* free context mapping */
1300 free_context_table(iommu);
1303 static struct dmar_domain *alloc_domain(void)
1305 struct dmar_domain *domain;
1307 domain = alloc_domain_mem();
1308 if (!domain)
1309 return NULL;
1311 domain->nid = -1;
1312 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1313 domain->flags = 0;
1315 return domain;
1318 static int iommu_attach_domain(struct dmar_domain *domain,
1319 struct intel_iommu *iommu)
1321 int num;
1322 unsigned long ndomains;
1323 unsigned long flags;
1325 ndomains = cap_ndoms(iommu->cap);
1327 spin_lock_irqsave(&iommu->lock, flags);
1329 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1330 if (num >= ndomains) {
1331 spin_unlock_irqrestore(&iommu->lock, flags);
1332 printk(KERN_ERR "IOMMU: no free domain ids\n");
1333 return -ENOMEM;
1336 domain->id = num;
1337 set_bit(num, iommu->domain_ids);
1338 set_bit(iommu->seq_id, &domain->iommu_bmp);
1339 iommu->domains[num] = domain;
1340 spin_unlock_irqrestore(&iommu->lock, flags);
1342 return 0;
1345 static void iommu_detach_domain(struct dmar_domain *domain,
1346 struct intel_iommu *iommu)
1348 unsigned long flags;
1349 int num, ndomains;
1350 int found = 0;
1352 spin_lock_irqsave(&iommu->lock, flags);
1353 ndomains = cap_ndoms(iommu->cap);
1354 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1355 if (iommu->domains[num] == domain) {
1356 found = 1;
1357 break;
1361 if (found) {
1362 clear_bit(num, iommu->domain_ids);
1363 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1364 iommu->domains[num] = NULL;
1366 spin_unlock_irqrestore(&iommu->lock, flags);
1369 static struct iova_domain reserved_iova_list;
1370 static struct lock_class_key reserved_rbtree_key;
1372 static int dmar_init_reserved_ranges(void)
1374 struct pci_dev *pdev = NULL;
1375 struct iova *iova;
1376 int i;
1378 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1380 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1381 &reserved_rbtree_key);
1383 /* IOAPIC ranges shouldn't be accessed by DMA */
1384 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1385 IOVA_PFN(IOAPIC_RANGE_END));
1386 if (!iova) {
1387 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1388 return -ENODEV;
1391 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1392 for_each_pci_dev(pdev) {
1393 struct resource *r;
1395 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1396 r = &pdev->resource[i];
1397 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1398 continue;
1399 iova = reserve_iova(&reserved_iova_list,
1400 IOVA_PFN(r->start),
1401 IOVA_PFN(r->end));
1402 if (!iova) {
1403 printk(KERN_ERR "Reserve iova failed\n");
1404 return -ENODEV;
1408 return 0;
1411 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1413 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1416 static inline int guestwidth_to_adjustwidth(int gaw)
1418 int agaw;
1419 int r = (gaw - 12) % 9;
1421 if (r == 0)
1422 agaw = gaw;
1423 else
1424 agaw = gaw + 9 - r;
1425 if (agaw > 64)
1426 agaw = 64;
1427 return agaw;
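/*
 * Example: guestwidth_to_adjustwidth(39) == 39 since 39 - 12 is a
 * multiple of 9, while a 36-bit guest width is rounded up to 39 so whole
 * page-table levels cover it; anything above 64 is clamped to 64.
 */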
1430 static int domain_init(struct dmar_domain *domain, int guest_width)
1432 struct intel_iommu *iommu;
1433 int adjust_width, agaw;
1434 unsigned long sagaw;
1436 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1437 spin_lock_init(&domain->iommu_lock);
1439 domain_reserve_special_ranges(domain);
1441 /* calculate AGAW */
1442 iommu = domain_get_iommu(domain);
1443 if (guest_width > cap_mgaw(iommu->cap))
1444 guest_width = cap_mgaw(iommu->cap);
1445 domain->gaw = guest_width;
1446 adjust_width = guestwidth_to_adjustwidth(guest_width);
1447 agaw = width_to_agaw(adjust_width);
1448 sagaw = cap_sagaw(iommu->cap);
1449 if (!test_bit(agaw, &sagaw)) {
1450 /* hardware doesn't support it, choose a bigger one */
1451 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1452 agaw = find_next_bit(&sagaw, 5, agaw);
1453 if (agaw >= 5)
1454 return -ENODEV;
1456 domain->agaw = agaw;
1457 INIT_LIST_HEAD(&domain->devices);
1459 if (ecap_coherent(iommu->ecap))
1460 domain->iommu_coherency = 1;
1461 else
1462 domain->iommu_coherency = 0;
1464 if (ecap_sc_support(iommu->ecap))
1465 domain->iommu_snooping = 1;
1466 else
1467 domain->iommu_snooping = 0;
1469 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1470 domain->iommu_count = 1;
1471 domain->nid = iommu->node;
1473 /* always allocate the top pgd */
1474 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1475 if (!domain->pgd)
1476 return -ENOMEM;
1477 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1478 return 0;
1481 static void domain_exit(struct dmar_domain *domain)
1483 struct dmar_drhd_unit *drhd;
1484 struct intel_iommu *iommu;
1486 /* Domain 0 is reserved, so don't process it */
1487 if (!domain)
1488 return;
1490 /* Flush any lazy unmaps that may reference this domain */
1491 if (!intel_iommu_strict)
1492 flush_unmaps_timeout(0);
1494 domain_remove_dev_info(domain);
1495 /* destroy iovas */
1496 put_iova_domain(&domain->iovad);
1498 /* clear ptes */
1499 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1501 /* free page tables */
1502 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1504 for_each_active_iommu(iommu, drhd)
1505 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1506 iommu_detach_domain(domain, iommu);
1508 free_domain_mem(domain);
1511 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1512 u8 bus, u8 devfn, int translation)
1514 struct context_entry *context;
1515 unsigned long flags;
1516 struct intel_iommu *iommu;
1517 struct dma_pte *pgd;
1518 unsigned long num;
1519 unsigned long ndomains;
1520 int id;
1521 int agaw;
1522 struct device_domain_info *info = NULL;
1524 pr_debug("Set context mapping for %02x:%02x.%d\n",
1525 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1527 BUG_ON(!domain->pgd);
1528 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1529 translation != CONTEXT_TT_MULTI_LEVEL);
1531 iommu = device_to_iommu(segment, bus, devfn);
1532 if (!iommu)
1533 return -ENODEV;
1535 context = device_to_context_entry(iommu, bus, devfn);
1536 if (!context)
1537 return -ENOMEM;
1538 spin_lock_irqsave(&iommu->lock, flags);
1539 if (context_present(context)) {
1540 spin_unlock_irqrestore(&iommu->lock, flags);
1541 return 0;
1544 id = domain->id;
1545 pgd = domain->pgd;
1547 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1548 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1549 int found = 0;
1551 /* find an available domain id for this device in iommu */
1552 ndomains = cap_ndoms(iommu->cap);
1553 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1554 if (iommu->domains[num] == domain) {
1555 id = num;
1556 found = 1;
1557 break;
1561 if (found == 0) {
1562 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1563 if (num >= ndomains) {
1564 spin_unlock_irqrestore(&iommu->lock, flags);
1565 printk(KERN_ERR "IOMMU: no free domain ids\n");
1566 return -EFAULT;
1569 set_bit(num, iommu->domain_ids);
1570 iommu->domains[num] = domain;
1571 id = num;
1574 /* Skip top levels of page tables for
1575 * iommu which has less agaw than default.
1576 * Unnecessary for PT mode.
1578 if (translation != CONTEXT_TT_PASS_THROUGH) {
1579 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1580 pgd = phys_to_virt(dma_pte_addr(pgd));
1581 if (!dma_pte_present(pgd)) {
1582 spin_unlock_irqrestore(&iommu->lock, flags);
1583 return -ENOMEM;
1589 context_set_domain_id(context, id);
1591 if (translation != CONTEXT_TT_PASS_THROUGH) {
1592 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1593 translation = info ? CONTEXT_TT_DEV_IOTLB :
1594 CONTEXT_TT_MULTI_LEVEL;
1597 * In pass through mode, AW must be programmed to indicate the largest
1598 * AGAW value supported by hardware. And ASR is ignored by hardware.
1600 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1601 context_set_address_width(context, iommu->msagaw);
1602 else {
1603 context_set_address_root(context, virt_to_phys(pgd));
1604 context_set_address_width(context, iommu->agaw);
1607 context_set_translation_type(context, translation);
1608 context_set_fault_enable(context);
1609 context_set_present(context);
1610 domain_flush_cache(domain, context, sizeof(*context));
1613 * It's a non-present to present mapping. If hardware doesn't cache
1614 * non-present entries we only need to flush the write-buffer. If it
1615 * _does_ cache non-present entries, then it does so in the special
1616 * domain #0, which we have to flush:
1618 if (cap_caching_mode(iommu->cap)) {
1619 iommu->flush.flush_context(iommu, 0,
1620 (((u16)bus) << 8) | devfn,
1621 DMA_CCMD_MASK_NOBIT,
1622 DMA_CCMD_DEVICE_INVL);
1623 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1624 } else {
1625 iommu_flush_write_buffer(iommu);
1627 iommu_enable_dev_iotlb(info);
1628 spin_unlock_irqrestore(&iommu->lock, flags);
1630 spin_lock_irqsave(&domain->iommu_lock, flags);
1631 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1632 domain->iommu_count++;
1633 if (domain->iommu_count == 1)
1634 domain->nid = iommu->node;
1635 domain_update_iommu_cap(domain);
1637 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1638 return 0;
1641 static int
1642 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1643 int translation)
1645 int ret;
1646 struct pci_dev *tmp, *parent;
1648 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1649 pdev->bus->number, pdev->devfn,
1650 translation);
1651 if (ret)
1652 return ret;
1654 /* dependent device mapping */
1655 tmp = pci_find_upstream_pcie_bridge(pdev);
1656 if (!tmp)
1657 return 0;
1658 /* Secondary interface's bus number and devfn 0 */
1659 parent = pdev->bus->self;
1660 while (parent != tmp) {
1661 ret = domain_context_mapping_one(domain,
1662 pci_domain_nr(parent->bus),
1663 parent->bus->number,
1664 parent->devfn, translation);
1665 if (ret)
1666 return ret;
1667 parent = parent->bus->self;
1669 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1670 return domain_context_mapping_one(domain,
1671 pci_domain_nr(tmp->subordinate),
1672 tmp->subordinate->number, 0,
1673 translation);
1674 else /* this is a legacy PCI bridge */
1675 return domain_context_mapping_one(domain,
1676 pci_domain_nr(tmp->bus),
1677 tmp->bus->number,
1678 tmp->devfn,
1679 translation);
1682 static int domain_context_mapped(struct pci_dev *pdev)
1684 int ret;
1685 struct pci_dev *tmp, *parent;
1686 struct intel_iommu *iommu;
1688 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1689 pdev->devfn);
1690 if (!iommu)
1691 return -ENODEV;
1693 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1694 if (!ret)
1695 return ret;
1696 /* dependent device mapping */
1697 tmp = pci_find_upstream_pcie_bridge(pdev);
1698 if (!tmp)
1699 return ret;
1700 /* Secondary interface's bus number and devfn 0 */
1701 parent = pdev->bus->self;
1702 while (parent != tmp) {
1703 ret = device_context_mapped(iommu, parent->bus->number,
1704 parent->devfn);
1705 if (!ret)
1706 return ret;
1707 parent = parent->bus->self;
1709 if (pci_is_pcie(tmp))
1710 return device_context_mapped(iommu, tmp->subordinate->number,
1711 0);
1712 else
1713 return device_context_mapped(iommu, tmp->bus->number,
1714 tmp->devfn);
1717 /* Returns a number of VTD pages, but aligned to MM page size */
1718 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1719 size_t size)
1721 host_addr &= ~PAGE_MASK;
1722 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
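/*
 * Example: aligned_nrpages(0x1234, 0x1000) masks the address to 0x234,
 * and PAGE_ALIGN(0x234 + 0x1000) == 0x2000, i.e. two VT-d pages on a
 * 4KiB-page kernel because the buffer straddles a page boundary.
 */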
1725 /* Return largest possible superpage level for a given mapping */
1726 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1727 unsigned long iov_pfn,
1728 unsigned long phy_pfn,
1729 unsigned long pages)
1731 int support, level = 1;
1732 unsigned long pfnmerge;
1734 support = domain->iommu_superpage;
1736 /* To use a large page, the virtual *and* physical addresses
1737 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1738 of them will mean we have to use smaller pages. So just
1739 merge them and check both at once. */
1740 pfnmerge = iov_pfn | phy_pfn;
1742 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1743 pages >>= VTD_STRIDE_SHIFT;
1744 if (!pages)
1745 break;
1746 pfnmerge >>= VTD_STRIDE_SHIFT;
1747 level++;
1748 support--;
1750 return level;
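/*
 * Example: with domain->iommu_superpage == 1 (2MiB pages supported),
 * iov_pfn == 0x200, phy_pfn == 0x400 and pages == 512, both pfns are
 * 2MiB-aligned and the run is long enough, so the loop above promotes
 * once and returns level 2; a misaligned pfn or a shorter run returns
 * level 1 and the mapping falls back to 4KiB PTEs.
 */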
1753 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1754 struct scatterlist *sg, unsigned long phys_pfn,
1755 unsigned long nr_pages, int prot)
1757 struct dma_pte *first_pte = NULL, *pte = NULL;
1758 phys_addr_t uninitialized_var(pteval);
1759 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1760 unsigned long sg_res;
1761 unsigned int largepage_lvl = 0;
1762 unsigned long lvl_pages = 0;
1764 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1766 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1767 return -EINVAL;
1769 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1771 if (sg)
1772 sg_res = 0;
1773 else {
1774 sg_res = nr_pages + 1;
1775 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1778 while (nr_pages > 0) {
1779 uint64_t tmp;
1781 if (!sg_res) {
1782 sg_res = aligned_nrpages(sg->offset, sg->length);
1783 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1784 sg->dma_length = sg->length;
1785 pteval = page_to_phys(sg_page(sg)) | prot;
1786 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1789 if (!pte) {
1790 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1792 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1793 if (!pte)
1794 return -ENOMEM;
1795 /* It is a large page */
1796 if (largepage_lvl > 1)
1797 pteval |= DMA_PTE_LARGE_PAGE;
1798 else
1799 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1802 /* We don't need lock here, nobody else
1803 * touches the iova range
1805 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1806 if (tmp) {
1807 static int dumps = 5;
1808 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1809 iov_pfn, tmp, (unsigned long long)pteval);
1810 if (dumps) {
1811 dumps--;
1812 debug_dma_dump_mappings(NULL);
1814 WARN_ON(1);
1817 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1819 BUG_ON(nr_pages < lvl_pages);
1820 BUG_ON(sg_res < lvl_pages);
1822 nr_pages -= lvl_pages;
1823 iov_pfn += lvl_pages;
1824 phys_pfn += lvl_pages;
1825 pteval += lvl_pages * VTD_PAGE_SIZE;
1826 sg_res -= lvl_pages;
1828 /* If the next PTE would be the first in a new page, then we
1829 need to flush the cache on the entries we've just written.
1830 And then we'll need to recalculate 'pte', so clear it and
1831 let it get set again in the if (!pte) block above.
1833 If we're done (!nr_pages) we need to flush the cache too.
1835 Also if we've been setting superpages, we may need to
1836 recalculate 'pte' and switch back to smaller pages for the
1837 end of the mapping, if the trailing size is not enough to
1838 use another superpage (i.e. sg_res < lvl_pages). */
1839 pte++;
1840 if (!nr_pages || first_pte_in_page(pte) ||
1841 (largepage_lvl > 1 && sg_res < lvl_pages)) {
1842 domain_flush_cache(domain, first_pte,
1843 (void *)pte - (void *)first_pte);
1844 pte = NULL;
1847 if (!sg_res && nr_pages)
1848 sg = sg_next(sg);
1850 return 0;
1853 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1854 struct scatterlist *sg, unsigned long nr_pages,
1855 int prot)
1857 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1860 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1861 unsigned long phys_pfn, unsigned long nr_pages,
1862 int prot)
1864 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1867 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1869 if (!iommu)
1870 return;
1872 clear_context_table(iommu, bus, devfn);
1873 iommu->flush.flush_context(iommu, 0, 0, 0,
1874 DMA_CCMD_GLOBAL_INVL);
1875 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1878 static void domain_remove_dev_info(struct dmar_domain *domain)
1880 struct device_domain_info *info;
1881 unsigned long flags;
1882 struct intel_iommu *iommu;
1884 spin_lock_irqsave(&device_domain_lock, flags);
1885 while (!list_empty(&domain->devices)) {
1886 info = list_entry(domain->devices.next,
1887 struct device_domain_info, link);
1888 list_del(&info->link);
1889 list_del(&info->global);
1890 if (info->dev)
1891 info->dev->dev.archdata.iommu = NULL;
1892 spin_unlock_irqrestore(&device_domain_lock, flags);
1894 iommu_disable_dev_iotlb(info);
1895 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1896 iommu_detach_dev(iommu, info->bus, info->devfn);
1897 free_devinfo_mem(info);
1899 spin_lock_irqsave(&device_domain_lock, flags);
1901 spin_unlock_irqrestore(&device_domain_lock, flags);
1905 * find_domain
1906 * Note: struct pci_dev->dev.archdata.iommu stores the info
1908 static struct dmar_domain *
1909 find_domain(struct pci_dev *pdev)
1911 struct device_domain_info *info;
1913 /* No lock here, assumes no domain exit in normal case */
1914 info = pdev->dev.archdata.iommu;
1915 if (info)
1916 return info->domain;
1917 return NULL;
1920 /* domain is initialized */
1921 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1923 struct dmar_domain *domain, *found = NULL;
1924 struct intel_iommu *iommu;
1925 struct dmar_drhd_unit *drhd;
1926 struct device_domain_info *info, *tmp;
1927 struct pci_dev *dev_tmp;
1928 unsigned long flags;
1929 int bus = 0, devfn = 0;
1930 int segment;
1931 int ret;
1933 domain = find_domain(pdev);
1934 if (domain)
1935 return domain;
1937 segment = pci_domain_nr(pdev->bus);
1939 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1940 if (dev_tmp) {
1941 if (pci_is_pcie(dev_tmp)) {
1942 bus = dev_tmp->subordinate->number;
1943 devfn = 0;
1944 } else {
1945 bus = dev_tmp->bus->number;
1946 devfn = dev_tmp->devfn;
1948 spin_lock_irqsave(&device_domain_lock, flags);
1949 list_for_each_entry(info, &device_domain_list, global) {
1950 if (info->segment == segment &&
1951 info->bus == bus && info->devfn == devfn) {
1952 found = info->domain;
1953 break;
1956 spin_unlock_irqrestore(&device_domain_lock, flags);
1957 /* pcie-pci bridge already has a domain, use it */
1958 if (found) {
1959 domain = found;
1960 goto found_domain;
1964 domain = alloc_domain();
1965 if (!domain)
1966 goto error;
1968 /* Allocate new domain for the device */
1969 drhd = dmar_find_matched_drhd_unit(pdev);
1970 if (!drhd) {
1971 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1972 pci_name(pdev));
1973 return NULL;
1975 iommu = drhd->iommu;
1977 ret = iommu_attach_domain(domain, iommu);
1978 if (ret) {
1979 free_domain_mem(domain);
1980 goto error;
1983 if (domain_init(domain, gaw)) {
1984 domain_exit(domain);
1985 goto error;
1988 /* register pcie-to-pci device */
1989 if (dev_tmp) {
1990 info = alloc_devinfo_mem();
1991 if (!info) {
1992 domain_exit(domain);
1993 goto error;
1995 info->segment = segment;
1996 info->bus = bus;
1997 info->devfn = devfn;
1998 info->dev = NULL;
1999 info->domain = domain;
2000 /* This domain is shared by devices under p2p bridge */
2001 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2003 /* pcie-to-pci bridge already has a domain, use it */
2004 found = NULL;
2005 spin_lock_irqsave(&device_domain_lock, flags);
2006 list_for_each_entry(tmp, &device_domain_list, global) {
2007 if (tmp->segment == segment &&
2008 tmp->bus == bus && tmp->devfn == devfn) {
2009 found = tmp->domain;
2010 break;
2013 if (found) {
2014 spin_unlock_irqrestore(&device_domain_lock, flags);
2015 free_devinfo_mem(info);
2016 domain_exit(domain);
2017 domain = found;
2018 } else {
2019 list_add(&info->link, &domain->devices);
2020 list_add(&info->global, &device_domain_list);
2021 spin_unlock_irqrestore(&device_domain_lock, flags);
2025 found_domain:
2026 info = alloc_devinfo_mem();
2027 if (!info)
2028 goto error;
2029 info->segment = segment;
2030 info->bus = pdev->bus->number;
2031 info->devfn = pdev->devfn;
2032 info->dev = pdev;
2033 info->domain = domain;
2034 spin_lock_irqsave(&device_domain_lock, flags);
2035 /* somebody is fast */
2036 found = find_domain(pdev);
2037 if (found != NULL) {
2038 spin_unlock_irqrestore(&device_domain_lock, flags);
2039 if (found != domain) {
2040 domain_exit(domain);
2041 domain = found;
2043 free_devinfo_mem(info);
2044 return domain;
2046 list_add(&info->link, &domain->devices);
2047 list_add(&info->global, &device_domain_list);
2048 pdev->dev.archdata.iommu = info;
2049 spin_unlock_irqrestore(&device_domain_lock, flags);
2050 return domain;
2051 error:
2052 /* recheck it here, maybe others set it */
2053 return find_domain(pdev);
2056 static int iommu_identity_mapping;
2057 #define IDENTMAP_ALL 1
2058 #define IDENTMAP_GFX 2
2059 #define IDENTMAP_AZALIA 4
2061 static int iommu_domain_identity_map(struct dmar_domain *domain,
2062 unsigned long long start,
2063 unsigned long long end)
2065 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2066 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2068 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2069 dma_to_mm_pfn(last_vpfn))) {
2070 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2071 return -ENOMEM;
2074 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2075 start, end, domain->id);
2077 * RMRR range might have overlap with physical memory range,
2078 * clear it first
2080 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2082 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2083 last_vpfn - first_vpfn + 1,
2084 DMA_PTE_READ|DMA_PTE_WRITE);
2087 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2088 unsigned long long start,
2089 unsigned long long end)
2091 struct dmar_domain *domain;
2092 int ret;
2094 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2095 if (!domain)
2096 return -ENOMEM;
2098 /* For _hardware_ passthrough, don't bother. But for software
2099 passthrough, we do it anyway -- it may indicate a memory
2100 range which is reserved in E820, so which didn't get set
2101 up to start with in si_domain */
2102 if (domain == si_domain && hw_pass_through) {
2103 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2104 pci_name(pdev), start, end);
2105 return 0;
2108 printk(KERN_INFO
2109 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2110 pci_name(pdev), start, end);
2112 if (end < start) {
2113 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2114 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2115 dmi_get_system_info(DMI_BIOS_VENDOR),
2116 dmi_get_system_info(DMI_BIOS_VERSION),
2117 dmi_get_system_info(DMI_PRODUCT_VERSION));
2118 ret = -EIO;
2119 goto error;
2122 if (end >> agaw_to_width(domain->agaw)) {
2123 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2124 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2125 agaw_to_width(domain->agaw),
2126 dmi_get_system_info(DMI_BIOS_VENDOR),
2127 dmi_get_system_info(DMI_BIOS_VERSION),
2128 dmi_get_system_info(DMI_PRODUCT_VERSION));
2129 ret = -EIO;
2130 goto error;
2133 ret = iommu_domain_identity_map(domain, start, end);
2134 if (ret)
2135 goto error;
2137 /* context entry init */
2138 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2139 if (ret)
2140 goto error;
2142 return 0;
2144 error:
2145 domain_exit(domain);
2146 return ret;
2149 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2150 struct pci_dev *pdev)
2152 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2153 return 0;
2154 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2155 rmrr->end_address);
2158 #ifdef CONFIG_DMAR_FLOPPY_WA
2159 static inline void iommu_prepare_isa(void)
2161 struct pci_dev *pdev;
2162 int ret;
2164 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2165 if (!pdev)
2166 return;
2168 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2169 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2171 if (ret)
2172 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2173 "floppy might not work\n");
2176 #else
2177 static inline void iommu_prepare_isa(void)
2179 return;
2181 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2183 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2185 static int __init si_domain_work_fn(unsigned long start_pfn,
2186 unsigned long end_pfn, void *datax)
2188 int *ret = datax;
2190 *ret = iommu_domain_identity_map(si_domain,
2191 (uint64_t)start_pfn << PAGE_SHIFT,
2192 (uint64_t)end_pfn << PAGE_SHIFT);
2193 return *ret;
2197 static int __init si_domain_init(int hw)
2199 struct dmar_drhd_unit *drhd;
2200 struct intel_iommu *iommu;
2201 int nid, ret = 0;
2203 si_domain = alloc_domain();
2204 if (!si_domain)
2205 return -EFAULT;
2207 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2209 for_each_active_iommu(iommu, drhd) {
2210 ret = iommu_attach_domain(si_domain, iommu);
2211 if (ret) {
2212 domain_exit(si_domain);
2213 return -EFAULT;
2217 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2218 domain_exit(si_domain);
2219 return -EFAULT;
2222 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2224 if (hw)
2225 return 0;
2227 for_each_online_node(nid) {
2228 work_with_active_regions(nid, si_domain_work_fn, &ret);
2229 if (ret)
2230 return ret;
2233 return 0;
2236 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2237 struct pci_dev *pdev);
2238 static int identity_mapping(struct pci_dev *pdev)
2240 struct device_domain_info *info;
2242 if (likely(!iommu_identity_mapping))
2243 return 0;
2245 info = pdev->dev.archdata.iommu;
2246 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2247 return (info->domain == si_domain);
2249 return 0;
2252 static int domain_add_dev_info(struct dmar_domain *domain,
2253 struct pci_dev *pdev,
2254 int translation)
2256 struct device_domain_info *info;
2257 unsigned long flags;
2258 int ret;
2260 info = alloc_devinfo_mem();
2261 if (!info)
2262 return -ENOMEM;
2264 ret = domain_context_mapping(domain, pdev, translation);
2265 if (ret) {
2266 free_devinfo_mem(info);
2267 return ret;
2270 info->segment = pci_domain_nr(pdev->bus);
2271 info->bus = pdev->bus->number;
2272 info->devfn = pdev->devfn;
2273 info->dev = pdev;
2274 info->domain = domain;
2276 spin_lock_irqsave(&device_domain_lock, flags);
2277 list_add(&info->link, &domain->devices);
2278 list_add(&info->global, &device_domain_list);
2279 pdev->dev.archdata.iommu = info;
2280 spin_unlock_irqrestore(&device_domain_lock, flags);
2282 return 0;
2285 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2287 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2288 return 1;
2290 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2291 return 1;
2293 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2294 return 0;
2296 /*
2297 * We want to start off with all devices in the 1:1 domain, and
2298 * take them out later if we find they can't access all of memory.
2299 *
2300 * However, we can't do this for PCI devices behind bridges,
2301 * because all PCI devices behind the same bridge will end up
2302 * with the same source-id on their transactions.
2303 *
2304 * Practically speaking, we can't change things around for these
2305 * devices at run-time, because we can't be sure there'll be no
2306 * DMA transactions in flight for any of their siblings.
2307 *
2308 * So PCI devices (unless they're on the root bus) as well as
2309 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2310 * the 1:1 domain, just in _case_ one of their siblings turns out
2311 * not to be able to map all of memory.
2312 */
2313 if (!pci_is_pcie(pdev)) {
2314 if (!pci_is_root_bus(pdev->bus))
2315 return 0;
2316 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2317 return 0;
2318 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2319 return 0;
2321 /*
2322 * At boot time, we don't yet know if devices will be 64-bit capable.
2323 * Assume that they will -- if they turn out not to be, then we can
2324 * take them out of the 1:1 domain later.
2325 */
2326 if (!startup) {
2327 /*
2328 * If the device's dma_mask is less than the system's memory
2329 * size then this is not a candidate for identity mapping.
2330 */
2331 u64 dma_mask = pdev->dma_mask;
2333 if (pdev->dev.coherent_dma_mask &&
2334 pdev->dev.coherent_dma_mask < dma_mask)
2335 dma_mask = pdev->dev.coherent_dma_mask;
2337 return dma_mask >= dma_get_required_mask(&pdev->dev);
2340 return 1;
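/*
 * Illustrative sketch (not part of the original file): the post-boot
 * candidacy check in iommu_should_identity_map() reduced to plain C.
 * device_can_identity_map() is a hypothetical name; the logic mirrors the
 * dma_mask comparison above when startup == 0.
 */
#include <stdint.h>

static int device_can_identity_map(uint64_t dma_mask,
				   uint64_t coherent_dma_mask,
				   uint64_t required_mask)
{
	/* use the more restrictive of the two device masks */
	if (coherent_dma_mask && coherent_dma_mask < dma_mask)
		dma_mask = coherent_dma_mask;

	/* keep the 1:1 mapping only if the device can reach all of memory */
	return dma_mask >= required_mask;
}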
2343 static int __init iommu_prepare_static_identity_mapping(int hw)
2345 struct pci_dev *pdev = NULL;
2346 int ret;
2348 ret = si_domain_init(hw);
2349 if (ret)
2350 return -EFAULT;
2352 for_each_pci_dev(pdev) {
2353 /* Skip Host/PCI Bridge devices */
2354 if (IS_BRIDGE_HOST_DEVICE(pdev))
2355 continue;
2356 if (iommu_should_identity_map(pdev, 1)) {
2357 printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2358 hw ? "hardware" : "software", pci_name(pdev));
2360 ret = domain_add_dev_info(si_domain, pdev,
2361 hw ? CONTEXT_TT_PASS_THROUGH :
2362 CONTEXT_TT_MULTI_LEVEL);
2363 if (ret)
2364 return ret;
2368 return 0;
2371 static int __init init_dmars(void)
2373 struct dmar_drhd_unit *drhd;
2374 struct dmar_rmrr_unit *rmrr;
2375 struct pci_dev *pdev;
2376 struct intel_iommu *iommu;
2377 int i, ret;
2379 /*
2380 * for each drhd
2381 * allocate root
2382 * initialize and program root entry to not present
2383 * endfor
2384 */
2385 for_each_drhd_unit(drhd) {
2386 g_num_of_iommus++;
2387 /*
2388 * lock not needed as this is only incremented in the single-
2389 * threaded kernel __init code path; all other accesses are
2390 * read-only
2391 */
2394 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2395 GFP_KERNEL);
2396 if (!g_iommus) {
2397 printk(KERN_ERR "Allocating global iommu array failed\n");
2398 ret = -ENOMEM;
2399 goto error;
2402 deferred_flush = kzalloc(g_num_of_iommus *
2403 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2404 if (!deferred_flush) {
2405 ret = -ENOMEM;
2406 goto error;
2409 for_each_drhd_unit(drhd) {
2410 if (drhd->ignored)
2411 continue;
2413 iommu = drhd->iommu;
2414 g_iommus[iommu->seq_id] = iommu;
2416 ret = iommu_init_domains(iommu);
2417 if (ret)
2418 goto error;
2420 /*
2421 * TBD:
2422 * we could share the same root & context tables
2423 * among all IOMMUs. Need to split it later.
2424 */
2425 ret = iommu_alloc_root_entry(iommu);
2426 if (ret) {
2427 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2428 goto error;
2430 if (!ecap_pass_through(iommu->ecap))
2431 hw_pass_through = 0;
2434 /*
2435 * Start from a sane iommu hardware state.
2436 */
2437 for_each_drhd_unit(drhd) {
2438 if (drhd->ignored)
2439 continue;
2441 iommu = drhd->iommu;
2443 /*
2444 * If the queued invalidation is already initialized by us
2445 * (for example, while enabling interrupt-remapping) then
2446 * things are already rolling from a sane state.
2447 */
2448 if (iommu->qi)
2449 continue;
2451 /*
2452 * Clear any previous faults.
2453 */
2454 dmar_fault(-1, iommu);
2455 /*
2456 * Disable queued invalidation if supported and already enabled
2457 * before OS handover.
2458 */
2459 dmar_disable_qi(iommu);
2462 for_each_drhd_unit(drhd) {
2463 if (drhd->ignored)
2464 continue;
2466 iommu = drhd->iommu;
2468 if (dmar_enable_qi(iommu)) {
2469 /*
2470 * Queued invalidation not enabled, use register-based
2471 * invalidation
2472 */
2473 iommu->flush.flush_context = __iommu_flush_context;
2474 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2475 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2476 "invalidation\n",
2477 iommu->seq_id,
2478 (unsigned long long)drhd->reg_base_addr);
2479 } else {
2480 iommu->flush.flush_context = qi_flush_context;
2481 iommu->flush.flush_iotlb = qi_flush_iotlb;
2482 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2483 "invalidation\n",
2484 iommu->seq_id,
2485 (unsigned long long)drhd->reg_base_addr);
2489 if (iommu_pass_through)
2490 iommu_identity_mapping |= IDENTMAP_ALL;
2492 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2493 iommu_identity_mapping |= IDENTMAP_GFX;
2494 #endif
2496 check_tylersburg_isoch();
2498 /*
2499 * If pass through is not set or not enabled, set up context entries
2500 * for identity mappings for rmrr, gfx, and isa, and possibly fall
2501 * back to static identity mapping if iommu_identity_mapping is set.
2502 */
2503 if (iommu_identity_mapping) {
2504 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2505 if (ret) {
2506 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2507 goto error;
2510 /*
2511 * For each rmrr
2512 * for each dev attached to rmrr
2513 * do
2514 * locate drhd for dev, alloc domain for dev
2515 * allocate free domain
2516 * allocate page table entries for rmrr
2517 * if context not allocated for bus
2518 * allocate and init context
2519 * set present in root table for this bus
2520 * init context with domain, translation etc
2521 * endfor
2522 * endfor
2523 */
2524 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2525 for_each_rmrr_units(rmrr) {
2526 for (i = 0; i < rmrr->devices_cnt; i++) {
2527 pdev = rmrr->devices[i];
2528 /*
2529 * some BIOSes list non-existent devices in the
2530 * DMAR table.
2531 */
2532 if (!pdev)
2533 continue;
2534 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2535 if (ret)
2536 printk(KERN_ERR
2537 "IOMMU: mapping reserved region failed\n");
2541 iommu_prepare_isa();
2543 /*
2544 * for each drhd
2545 * enable fault log
2546 * global invalidate context cache
2547 * global invalidate iotlb
2548 * enable translation
2549 */
2550 for_each_drhd_unit(drhd) {
2551 if (drhd->ignored) {
2552 /*
2553 * we always have to disable PMRs or DMA may fail on
2554 * this device
2555 */
2556 if (force_on)
2557 iommu_disable_protect_mem_regions(drhd->iommu);
2558 continue;
2560 iommu = drhd->iommu;
2562 iommu_flush_write_buffer(iommu);
2564 ret = dmar_set_interrupt(iommu);
2565 if (ret)
2566 goto error;
2568 iommu_set_root_entry(iommu);
2570 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2571 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2573 ret = iommu_enable_translation(iommu);
2574 if (ret)
2575 goto error;
2577 iommu_disable_protect_mem_regions(iommu);
2580 return 0;
2581 error:
2582 for_each_drhd_unit(drhd) {
2583 if (drhd->ignored)
2584 continue;
2585 iommu = drhd->iommu;
2586 free_iommu(iommu);
2588 kfree(g_iommus);
2589 return ret;
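/*
 * Illustrative sketch (not part of the original file): how init_dmars()
 * above chooses between queued and register-based invalidation, modelled
 * with plain function pointers. All names here (flush_fn, pick_flush_ops)
 * are hypothetical; only the "queued if available, else register-based"
 * choice mirrors the code.
 */
#include <stdio.h>

typedef void (*flush_fn)(void);

static void register_based_flush(void) { puts("register-based invalidation"); }
static void queued_flush(void)         { puts("queued invalidation"); }

static flush_fn pick_flush_ops(int qi_enabled)
{
	return qi_enabled ? queued_flush : register_based_flush;
}

int main(void)
{
	pick_flush_ops(0)();	/* dmar_enable_qi() failed    -> register based */
	pick_flush_ops(1)();	/* dmar_enable_qi() succeeded -> queued */
	return 0;
}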
2592 /* This takes a number of _MM_ pages, not VTD pages */
2593 static struct iova *intel_alloc_iova(struct device *dev,
2594 struct dmar_domain *domain,
2595 unsigned long nrpages, uint64_t dma_mask)
2597 struct pci_dev *pdev = to_pci_dev(dev);
2598 struct iova *iova = NULL;
2600 /* Restrict dma_mask to the width that the iommu can handle */
2601 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2603 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2604 /*
2605 * First try to allocate an io virtual address in
2606 * DMA_BIT_MASK(32) and if that fails then try allocating
2607 * from higher range
2608 */
2609 iova = alloc_iova(&domain->iovad, nrpages,
2610 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2611 if (iova)
2612 return iova;
2614 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2615 if (unlikely(!iova)) {
2616 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2617 nrpages, pci_name(pdev));
2618 return NULL;
2621 return iova;
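/*
 * Illustrative sketch (not part of the original file): the two-step IOVA
 * allocation policy of intel_alloc_iova() above. try_alloc_range() is a
 * hypothetical stand-in for alloc_iova(); only the "prefer the 32-bit
 * range, then fall back to the full mask" ordering mirrors the code.
 */
#include <stdint.h>

#define DMA_BIT_MASK32 0xffffffffULL

/* trivial stand-in allocator: pretend the allocation always succeeds */
static uint64_t try_alloc_range(uint64_t nrpages, uint64_t limit_pfn)
{
	(void)nrpages;
	return limit_pfn;
}

static uint64_t alloc_iova_sketch(uint64_t nrpages, uint64_t dma_mask,
				  int forcedac)
{
	uint64_t pfn = 0;

	if (!forcedac && dma_mask > DMA_BIT_MASK32) {
		/* keep legacy 32-bit devices happy: try the low space first */
		pfn = try_alloc_range(nrpages, DMA_BIT_MASK32 >> 12);
		if (pfn)
			return pfn;
	}
	/* otherwise (or on failure) allocate anywhere the mask allows */
	return try_alloc_range(nrpages, dma_mask >> 12);
}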
2624 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2626 struct dmar_domain *domain;
2627 int ret;
2629 domain = get_domain_for_dev(pdev,
2630 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2631 if (!domain) {
2632 printk(KERN_ERR
2633 "Allocating domain for %s failed\n", pci_name(pdev));
2634 return NULL;
2637 /* make sure context mapping is ok */
2638 if (unlikely(!domain_context_mapped(pdev))) {
2639 ret = domain_context_mapping(domain, pdev,
2640 CONTEXT_TT_MULTI_LEVEL);
2641 if (ret) {
2642 printk(KERN_ERR
2643 "Domain context map for %s failed",
2644 pci_name(pdev));
2645 return NULL;
2649 return domain;
2652 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2654 struct device_domain_info *info;
2656 /* No lock here, assumes no domain exit in normal case */
2657 info = dev->dev.archdata.iommu;
2658 if (likely(info))
2659 return info->domain;
2661 return __get_valid_domain_for_dev(dev);
2664 static int iommu_dummy(struct pci_dev *pdev)
2666 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2669 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2670 static int iommu_no_mapping(struct device *dev)
2672 struct pci_dev *pdev;
2673 int found;
2675 if (unlikely(dev->bus != &pci_bus_type))
2676 return 1;
2678 pdev = to_pci_dev(dev);
2679 if (iommu_dummy(pdev))
2680 return 1;
2682 if (!iommu_identity_mapping)
2683 return 0;
2685 found = identity_mapping(pdev);
2686 if (found) {
2687 if (iommu_should_identity_map(pdev, 0))
2688 return 1;
2689 else {
2690 /*
2691 * 32 bit DMA device is removed from si_domain and falls back
2692 * to non-identity mapping.
2693 */
2694 domain_remove_one_dev_info(si_domain, pdev);
2695 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2696 pci_name(pdev));
2697 return 0;
2699 } else {
2700 /*
2701 * In case a 64 bit DMA device has been detached from a VM, the
2702 * device is put back into si_domain for identity mapping.
2703 */
2704 if (iommu_should_identity_map(pdev, 0)) {
2705 int ret;
2706 ret = domain_add_dev_info(si_domain, pdev,
2707 hw_pass_through ?
2708 CONTEXT_TT_PASS_THROUGH :
2709 CONTEXT_TT_MULTI_LEVEL);
2710 if (!ret) {
2711 printk(KERN_INFO "64bit %s uses identity mapping\n",
2712 pci_name(pdev));
2713 return 1;
2718 return 0;
2721 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2722 size_t size, int dir, u64 dma_mask)
2724 struct pci_dev *pdev = to_pci_dev(hwdev);
2725 struct dmar_domain *domain;
2726 phys_addr_t start_paddr;
2727 struct iova *iova;
2728 int prot = 0;
2729 int ret;
2730 struct intel_iommu *iommu;
2731 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2733 BUG_ON(dir == DMA_NONE);
2735 if (iommu_no_mapping(hwdev))
2736 return paddr;
2738 domain = get_valid_domain_for_dev(pdev);
2739 if (!domain)
2740 return 0;
2742 iommu = domain_get_iommu(domain);
2743 size = aligned_nrpages(paddr, size);
2745 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2746 if (!iova)
2747 goto error;
2749 /*
2750 * Check if DMAR supports zero-length reads on write-only
2751 * mappings.
2752 */
2753 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2754 !cap_zlr(iommu->cap))
2755 prot |= DMA_PTE_READ;
2756 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2757 prot |= DMA_PTE_WRITE;
2758 /*
2759 * paddr ~ paddr + size might cover only part of a page, so map the
2760 * whole page. Note: if two parts of one page are mapped separately,
2761 * we might end up with two guest addresses mapping to the same host
2762 * paddr, but this is not a big problem
2763 */
2764 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2765 mm_to_dma_pfn(paddr_pfn), size, prot);
2766 if (ret)
2767 goto error;
2769 /* it's a non-present to present mapping. Only flush if caching mode */
2770 if (cap_caching_mode(iommu->cap))
2771 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2772 else
2773 iommu_flush_write_buffer(iommu);
2775 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2776 start_paddr += paddr & ~PAGE_MASK;
2777 return start_paddr;
2779 error:
2780 if (iova)
2781 __free_iova(&domain->iovad, iova);
2782 printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2783 pci_name(pdev), size, (unsigned long long)paddr, dir);
2784 return 0;
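/*
 * Illustrative sketch (not part of the original file): the size/offset
 * handling of __intel_map_single() as a standalone computation.
 * PAGE_SHIFT_X and the sample values are hypothetical; the rounding
 * mirrors the "map whole pages, hand back base + offset" behaviour above.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT_X 12
#define PAGE_SIZE_X  (1ULL << PAGE_SHIFT_X)

int main(void)
{
	uint64_t paddr = 0x12345678, size = 0x2000;	/* sample request */
	uint64_t offset = paddr & (PAGE_SIZE_X - 1);

	/* number of whole pages covering [paddr, paddr + size) */
	uint64_t nrpages = (offset + size + PAGE_SIZE_X - 1) >> PAGE_SHIFT_X;

	uint64_t iova_base = 0xffff0000;		/* pretend allocation */
	uint64_t dma_addr  = iova_base + offset;	/* re-add the page offset */

	printf("map %llu page(s), return dma addr 0x%llx\n",
	       (unsigned long long)nrpages, (unsigned long long)dma_addr);
	return 0;
}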
2787 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2788 unsigned long offset, size_t size,
2789 enum dma_data_direction dir,
2790 struct dma_attrs *attrs)
2792 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2793 dir, to_pci_dev(dev)->dma_mask);
2796 static void flush_unmaps(void)
2798 int i, j;
2800 timer_on = 0;
2802 /* just flush them all */
2803 for (i = 0; i < g_num_of_iommus; i++) {
2804 struct intel_iommu *iommu = g_iommus[i];
2805 if (!iommu)
2806 continue;
2808 if (!deferred_flush[i].next)
2809 continue;
2811 /* In caching mode, a global flush makes emulation expensive */
2812 if (!cap_caching_mode(iommu->cap))
2813 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2814 DMA_TLB_GLOBAL_FLUSH);
2815 for (j = 0; j < deferred_flush[i].next; j++) {
2816 unsigned long mask;
2817 struct iova *iova = deferred_flush[i].iova[j];
2818 struct dmar_domain *domain = deferred_flush[i].domain[j];
2820 /* On real hardware multiple invalidations are expensive */
2821 if (cap_caching_mode(iommu->cap))
2822 iommu_flush_iotlb_psi(iommu, domain->id,
2823 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2824 else {
2825 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2826 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2827 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2829 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2831 deferred_flush[i].next = 0;
2834 list_size = 0;
2837 static void flush_unmaps_timeout(unsigned long data)
2839 unsigned long flags;
2841 spin_lock_irqsave(&async_umap_flush_lock, flags);
2842 flush_unmaps();
2843 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2846 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2848 unsigned long flags;
2849 int next, iommu_id;
2850 struct intel_iommu *iommu;
2852 spin_lock_irqsave(&async_umap_flush_lock, flags);
2853 if (list_size == HIGH_WATER_MARK)
2854 flush_unmaps();
2856 iommu = domain_get_iommu(dom);
2857 iommu_id = iommu->seq_id;
2859 next = deferred_flush[iommu_id].next;
2860 deferred_flush[iommu_id].domain[next] = dom;
2861 deferred_flush[iommu_id].iova[next] = iova;
2862 deferred_flush[iommu_id].next++;
2864 if (!timer_on) {
2865 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2866 timer_on = 1;
2868 list_size++;
2869 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
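/*
 * Illustrative sketch (not part of the original file): the deferred-unmap
 * batching idea behind add_unmap()/flush_unmaps(), stripped of locking and
 * timers. HIGH_WATER_X, pending[] and queue_unmap() are hypothetical; the
 * point is that IOVAs are collected so one IOTLB flush is amortised over a
 * batch instead of being paid on every unmap.
 */
#include <stdio.h>

#define HIGH_WATER_X 8

static unsigned long pending[HIGH_WATER_X];
static int npending;

static void flush_batch(void)
{
	/* one (notionally expensive) global IOTLB flush for the whole batch */
	printf("flush iotlb, freeing %d iova(s)\n", npending);
	npending = 0;
}

static void queue_unmap(unsigned long iova_pfn)
{
	if (npending == HIGH_WATER_X)
		flush_batch();
	pending[npending++] = iova_pfn;
	/* the driver also arms a 10ms timer so small batches still drain */
}

int main(void)
{
	for (unsigned long pfn = 0; pfn < 20; pfn++)
		queue_unmap(pfn);
	flush_batch();
	return 0;
}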
2872 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2873 size_t size, enum dma_data_direction dir,
2874 struct dma_attrs *attrs)
2876 struct pci_dev *pdev = to_pci_dev(dev);
2877 struct dmar_domain *domain;
2878 unsigned long start_pfn, last_pfn;
2879 struct iova *iova;
2880 struct intel_iommu *iommu;
2882 if (iommu_no_mapping(dev))
2883 return;
2885 domain = find_domain(pdev);
2886 BUG_ON(!domain);
2888 iommu = domain_get_iommu(domain);
2890 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2891 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2892 (unsigned long long)dev_addr))
2893 return;
2895 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2896 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2898 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2899 pci_name(pdev), start_pfn, last_pfn);
2901 /* clear the whole page */
2902 dma_pte_clear_range(domain, start_pfn, last_pfn);
2904 /* free page tables */
2905 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2907 if (intel_iommu_strict) {
2908 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2909 last_pfn - start_pfn + 1, 0);
2910 /* free iova */
2911 __free_iova(&domain->iovad, iova);
2912 } else {
2913 add_unmap(domain, iova);
2914 /*
2915 * queue up the release of the unmap to save the 1/6th of the
2916 * cpu used up by the iotlb flush operation...
2917 */
2921 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2922 dma_addr_t *dma_handle, gfp_t flags)
2924 void *vaddr;
2925 int order;
2927 size = PAGE_ALIGN(size);
2928 order = get_order(size);
2930 if (!iommu_no_mapping(hwdev))
2931 flags &= ~(GFP_DMA | GFP_DMA32);
2932 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2933 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2934 flags |= GFP_DMA;
2935 else
2936 flags |= GFP_DMA32;
2939 vaddr = (void *)__get_free_pages(flags, order);
2940 if (!vaddr)
2941 return NULL;
2942 memset(vaddr, 0, size);
2944 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2945 DMA_BIDIRECTIONAL,
2946 hwdev->coherent_dma_mask);
2947 if (*dma_handle)
2948 return vaddr;
2949 free_pages((unsigned long)vaddr, order);
2950 return NULL;
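/*
 * Illustrative sketch (not part of the original file): the zone-selection
 * logic of intel_alloc_coherent() as a pure function. The GFP_*_X values
 * and pick_gfp_zone() are hypothetical stand-ins; the decision structure
 * (no restriction when the IOMMU remaps, otherwise GFP_DMA or GFP_DMA32
 * depending on the coherent mask) mirrors the code above.
 */
#include <stdint.h>

enum gfp_zone { GFP_ANY_X, GFP_DMA32_X, GFP_DMA_X };

static enum gfp_zone pick_gfp_zone(int iommu_translates,
				   uint64_t coherent_mask,
				   uint64_t required_mask)
{
	if (iommu_translates)
		return GFP_ANY_X;	/* IOMMU remaps; no zone limit needed */
	if (coherent_mask >= required_mask)
		return GFP_ANY_X;	/* device can reach all of memory */
	if (coherent_mask < 0xffffffffULL)
		return GFP_DMA_X;	/* below 32 bits: use the low DMA zone */
	return GFP_DMA32_X;		/* otherwise stay under 4GiB */
}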
2953 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2954 dma_addr_t dma_handle)
2956 int order;
2958 size = PAGE_ALIGN(size);
2959 order = get_order(size);
2961 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2962 free_pages((unsigned long)vaddr, order);
2965 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2966 int nelems, enum dma_data_direction dir,
2967 struct dma_attrs *attrs)
2969 struct pci_dev *pdev = to_pci_dev(hwdev);
2970 struct dmar_domain *domain;
2971 unsigned long start_pfn, last_pfn;
2972 struct iova *iova;
2973 struct intel_iommu *iommu;
2975 if (iommu_no_mapping(hwdev))
2976 return;
2978 domain = find_domain(pdev);
2979 BUG_ON(!domain);
2981 iommu = domain_get_iommu(domain);
2983 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2984 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2985 (unsigned long long)sglist[0].dma_address))
2986 return;
2988 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2989 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2991 /* clear the whole page */
2992 dma_pte_clear_range(domain, start_pfn, last_pfn);
2994 /* free page tables */
2995 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2997 if (intel_iommu_strict) {
2998 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2999 last_pfn - start_pfn + 1, 0);
3000 /* free iova */
3001 __free_iova(&domain->iovad, iova);
3002 } else {
3003 add_unmap(domain, iova);
3004 /*
3005 * queue up the release of the unmap to save the 1/6th of the
3006 * cpu used up by the iotlb flush operation...
3007 */
3011 static int intel_nontranslate_map_sg(struct device *hwdev,
3012 struct scatterlist *sglist, int nelems, int dir)
3014 int i;
3015 struct scatterlist *sg;
3017 for_each_sg(sglist, sg, nelems, i) {
3018 BUG_ON(!sg_page(sg));
3019 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3020 sg->dma_length = sg->length;
3022 return nelems;
3025 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3026 enum dma_data_direction dir, struct dma_attrs *attrs)
3028 int i;
3029 struct pci_dev *pdev = to_pci_dev(hwdev);
3030 struct dmar_domain *domain;
3031 size_t size = 0;
3032 int prot = 0;
3033 struct iova *iova = NULL;
3034 int ret;
3035 struct scatterlist *sg;
3036 unsigned long start_vpfn;
3037 struct intel_iommu *iommu;
3039 BUG_ON(dir == DMA_NONE);
3040 if (iommu_no_mapping(hwdev))
3041 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3043 domain = get_valid_domain_for_dev(pdev);
3044 if (!domain)
3045 return 0;
3047 iommu = domain_get_iommu(domain);
3049 for_each_sg(sglist, sg, nelems, i)
3050 size += aligned_nrpages(sg->offset, sg->length);
3052 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3053 pdev->dma_mask);
3054 if (!iova) {
3055 sglist->dma_length = 0;
3056 return 0;
3059 /*
3060 * Check if DMAR supports zero-length reads on write-only
3061 * mappings.
3062 */
3063 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3064 !cap_zlr(iommu->cap))
3065 prot |= DMA_PTE_READ;
3066 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3067 prot |= DMA_PTE_WRITE;
3069 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3071 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3072 if (unlikely(ret)) {
3073 /* clear the page */
3074 dma_pte_clear_range(domain, start_vpfn,
3075 start_vpfn + size - 1);
3076 /* free page tables */
3077 dma_pte_free_pagetable(domain, start_vpfn,
3078 start_vpfn + size - 1);
3079 /* free iova */
3080 __free_iova(&domain->iovad, iova);
3081 return 0;
3084 /* it's a non-present to present mapping. Only flush if caching mode */
3085 if (cap_caching_mode(iommu->cap))
3086 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3087 else
3088 iommu_flush_write_buffer(iommu);
3090 return nelems;
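/*
 * Illustrative sketch (not part of the original file): how intel_map_sg()
 * sizes its single IOVA allocation, shown over a plain array instead of a
 * scatterlist. struct seg and total_sg_pages() are hypothetical; the
 * per-segment rounding matches the aligned_nrpages(sg->offset, sg->length)
 * accumulation above.
 */
#include <stdint.h>
#include <stddef.h>

#define PAGE_SHIFT_X 12
#define PAGE_SIZE_X  (1ULL << PAGE_SHIFT_X)

struct seg { uint64_t offset; uint64_t length; };

static uint64_t total_sg_pages(const struct seg *sg, size_t nelems)
{
	uint64_t pages = 0;

	for (size_t i = 0; i < nelems; i++)
		pages += (sg[i].offset % PAGE_SIZE_X + sg[i].length
			  + PAGE_SIZE_X - 1) >> PAGE_SHIFT_X;
	return pages;	/* one IOVA region of this many pages is allocated */
}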
3093 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3095 return !dma_addr;
3098 struct dma_map_ops intel_dma_ops = {
3099 .alloc_coherent = intel_alloc_coherent,
3100 .free_coherent = intel_free_coherent,
3101 .map_sg = intel_map_sg,
3102 .unmap_sg = intel_unmap_sg,
3103 .map_page = intel_map_page,
3104 .unmap_page = intel_unmap_page,
3105 .mapping_error = intel_mapping_error,
3108 static inline int iommu_domain_cache_init(void)
3110 int ret = 0;
3112 iommu_domain_cache = kmem_cache_create("iommu_domain",
3113 sizeof(struct dmar_domain),
3115 SLAB_HWCACHE_ALIGN,
3117 NULL);
3118 if (!iommu_domain_cache) {
3119 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3120 ret = -ENOMEM;
3123 return ret;
3126 static inline int iommu_devinfo_cache_init(void)
3128 int ret = 0;
3130 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3131 sizeof(struct device_domain_info),
3133 SLAB_HWCACHE_ALIGN,
3134 NULL);
3135 if (!iommu_devinfo_cache) {
3136 printk(KERN_ERR "Couldn't create devinfo cache\n");
3137 ret = -ENOMEM;
3140 return ret;
3143 static inline int iommu_iova_cache_init(void)
3145 int ret = 0;
3147 iommu_iova_cache = kmem_cache_create("iommu_iova",
3148 sizeof(struct iova),
3150 SLAB_HWCACHE_ALIGN,
3151 NULL);
3152 if (!iommu_iova_cache) {
3153 printk(KERN_ERR "Couldn't create iova cache\n");
3154 ret = -ENOMEM;
3157 return ret;
3160 static int __init iommu_init_mempool(void)
3162 int ret;
3163 ret = iommu_iova_cache_init();
3164 if (ret)
3165 return ret;
3167 ret = iommu_domain_cache_init();
3168 if (ret)
3169 goto domain_error;
3171 ret = iommu_devinfo_cache_init();
3172 if (!ret)
3173 return ret;
3175 kmem_cache_destroy(iommu_domain_cache);
3176 domain_error:
3177 kmem_cache_destroy(iommu_iova_cache);
3179 return -ENOMEM;
3182 static void __init iommu_exit_mempool(void)
3184 kmem_cache_destroy(iommu_devinfo_cache);
3185 kmem_cache_destroy(iommu_domain_cache);
3186 kmem_cache_destroy(iommu_iova_cache);
3190 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3192 struct dmar_drhd_unit *drhd;
3193 u32 vtbar;
3194 int rc;
3196 /* We know that this device on this chipset has its own IOMMU.
3197 * If we find it under a different IOMMU, then the BIOS is lying
3198 * to us. Hope that the IOMMU for this device is actually
3199 * disabled, and it needs no translation...
3200 */
3201 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3202 if (rc) {
3203 /* "can't" happen */
3204 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3205 return;
3207 vtbar &= 0xffff0000;
3209 /* we know that this iommu should be at offset 0xa000 from vtbar */
3210 drhd = dmar_find_matched_drhd_unit(pdev);
3211 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3212 TAINT_FIRMWARE_WORKAROUND,
3213 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3214 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3216 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3218 static void __init init_no_remapping_devices(void)
3220 struct dmar_drhd_unit *drhd;
3222 for_each_drhd_unit(drhd) {
3223 if (!drhd->include_all) {
3224 int i;
3225 for (i = 0; i < drhd->devices_cnt; i++)
3226 if (drhd->devices[i] != NULL)
3227 break;
3228 /* ignore DMAR unit if no pci devices exist */
3229 if (i == drhd->devices_cnt)
3230 drhd->ignored = 1;
3234 if (dmar_map_gfx)
3235 return;
3237 for_each_drhd_unit(drhd) {
3238 int i;
3239 if (drhd->ignored || drhd->include_all)
3240 continue;
3242 for (i = 0; i < drhd->devices_cnt; i++)
3243 if (drhd->devices[i] &&
3244 !IS_GFX_DEVICE(drhd->devices[i]))
3245 break;
3247 if (i < drhd->devices_cnt)
3248 continue;
3250 /* bypass IOMMU if it is just for gfx devices */
3251 drhd->ignored = 1;
3252 for (i = 0; i < drhd->devices_cnt; i++) {
3253 if (!drhd->devices[i])
3254 continue;
3255 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3260 #ifdef CONFIG_SUSPEND
3261 static int init_iommu_hw(void)
3263 struct dmar_drhd_unit *drhd;
3264 struct intel_iommu *iommu = NULL;
3266 for_each_active_iommu(iommu, drhd)
3267 if (iommu->qi)
3268 dmar_reenable_qi(iommu);
3270 for_each_iommu(iommu, drhd) {
3271 if (drhd->ignored) {
3272 /*
3273 * we always have to disable PMRs or DMA may fail on
3274 * this device
3275 */
3276 if (force_on)
3277 iommu_disable_protect_mem_regions(iommu);
3278 continue;
3281 iommu_flush_write_buffer(iommu);
3283 iommu_set_root_entry(iommu);
3285 iommu->flush.flush_context(iommu, 0, 0, 0,
3286 DMA_CCMD_GLOBAL_INVL);
3287 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3288 DMA_TLB_GLOBAL_FLUSH);
3289 if (iommu_enable_translation(iommu))
3290 return 1;
3291 iommu_disable_protect_mem_regions(iommu);
3294 return 0;
3297 static void iommu_flush_all(void)
3299 struct dmar_drhd_unit *drhd;
3300 struct intel_iommu *iommu;
3302 for_each_active_iommu(iommu, drhd) {
3303 iommu->flush.flush_context(iommu, 0, 0, 0,
3304 DMA_CCMD_GLOBAL_INVL);
3305 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3306 DMA_TLB_GLOBAL_FLUSH);
3310 static int iommu_suspend(void)
3312 struct dmar_drhd_unit *drhd;
3313 struct intel_iommu *iommu = NULL;
3314 unsigned long flag;
3316 for_each_active_iommu(iommu, drhd) {
3317 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3318 GFP_ATOMIC);
3319 if (!iommu->iommu_state)
3320 goto nomem;
3323 iommu_flush_all();
3325 for_each_active_iommu(iommu, drhd) {
3326 iommu_disable_translation(iommu);
3328 spin_lock_irqsave(&iommu->register_lock, flag);
3330 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3331 readl(iommu->reg + DMAR_FECTL_REG);
3332 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3333 readl(iommu->reg + DMAR_FEDATA_REG);
3334 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3335 readl(iommu->reg + DMAR_FEADDR_REG);
3336 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3337 readl(iommu->reg + DMAR_FEUADDR_REG);
3339 spin_unlock_irqrestore(&iommu->register_lock, flag);
3341 return 0;
3343 nomem:
3344 for_each_active_iommu(iommu, drhd)
3345 kfree(iommu->iommu_state);
3347 return -ENOMEM;
3350 static void iommu_resume(void)
3352 struct dmar_drhd_unit *drhd;
3353 struct intel_iommu *iommu = NULL;
3354 unsigned long flag;
3356 if (init_iommu_hw()) {
3357 if (force_on)
3358 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3359 else
3360 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3361 return;
3364 for_each_active_iommu(iommu, drhd) {
3366 spin_lock_irqsave(&iommu->register_lock, flag);
3368 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3369 iommu->reg + DMAR_FECTL_REG);
3370 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3371 iommu->reg + DMAR_FEDATA_REG);
3372 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3373 iommu->reg + DMAR_FEADDR_REG);
3374 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3375 iommu->reg + DMAR_FEUADDR_REG);
3377 spin_unlock_irqrestore(&iommu->register_lock, flag);
3380 for_each_active_iommu(iommu, drhd)
3381 kfree(iommu->iommu_state);
3384 static struct syscore_ops iommu_syscore_ops = {
3385 .resume = iommu_resume,
3386 .suspend = iommu_suspend,
3389 static void __init init_iommu_pm_ops(void)
3391 register_syscore_ops(&iommu_syscore_ops);
3394 #else
3395 static inline void init_iommu_pm_ops(void) {}
3396 #endif /* CONFIG_SUSPEND */
3398 /*
3399 * Here we only respond to the unbind action of a device from its driver.
3400 *
3401 * An added device is not attached to its DMAR domain here yet. That will
3402 * happen when the device is mapped to an iova.
3403 */
3404 static int device_notifier(struct notifier_block *nb,
3405 unsigned long action, void *data)
3407 struct device *dev = data;
3408 struct pci_dev *pdev = to_pci_dev(dev);
3409 struct dmar_domain *domain;
3411 if (iommu_no_mapping(dev))
3412 return 0;
3414 domain = find_domain(pdev);
3415 if (!domain)
3416 return 0;
3418 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3419 domain_remove_one_dev_info(domain, pdev);
3421 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3422 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3423 list_empty(&domain->devices))
3424 domain_exit(domain);
3427 return 0;
3430 static struct notifier_block device_nb = {
3431 .notifier_call = device_notifier,
3434 int __init intel_iommu_init(void)
3436 int ret = 0;
3438 /* VT-d is required for a TXT/tboot launch, so enforce that */
3439 force_on = tboot_force_iommu();
3441 if (dmar_table_init()) {
3442 if (force_on)
3443 panic("tboot: Failed to initialize DMAR table\n");
3444 return -ENODEV;
3447 if (dmar_dev_scope_init()) {
3448 if (force_on)
3449 panic("tboot: Failed to initialize DMAR device scope\n");
3450 return -ENODEV;
3453 /*
3454 * Check the need for DMA-remapping initialization now.
3455 * Above initialization will also be used by Interrupt-remapping.
3456 */
3457 if (no_iommu || dmar_disabled)
3458 return -ENODEV;
3460 if (iommu_init_mempool()) {
3461 if (force_on)
3462 panic("tboot: Failed to initialize iommu memory\n");
3463 return -ENODEV;
3466 if (dmar_init_reserved_ranges()) {
3467 if (force_on)
3468 panic("tboot: Failed to reserve iommu ranges\n");
3469 return -ENODEV;
3472 init_no_remapping_devices();
3474 ret = init_dmars();
3475 if (ret) {
3476 if (force_on)
3477 panic("tboot: Failed to initialize DMARs\n");
3478 printk(KERN_ERR "IOMMU: dmar init failed\n");
3479 put_iova_domain(&reserved_iova_list);
3480 iommu_exit_mempool();
3481 return ret;
3483 printk(KERN_INFO
3484 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3486 init_timer(&unmap_timer);
3487 #ifdef CONFIG_SWIOTLB
3488 swiotlb = 0;
3489 #endif
3490 dma_ops = &intel_dma_ops;
3492 init_iommu_pm_ops();
3494 register_iommu(&intel_iommu_ops);
3496 bus_register_notifier(&pci_bus_type, &device_nb);
3498 return 0;
3501 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3502 struct pci_dev *pdev)
3504 struct pci_dev *tmp, *parent;
3506 if (!iommu || !pdev)
3507 return;
3509 /* dependent device detach */
3510 tmp = pci_find_upstream_pcie_bridge(pdev);
3511 /* Secondary interface's bus number and devfn 0 */
3512 if (tmp) {
3513 parent = pdev->bus->self;
3514 while (parent != tmp) {
3515 iommu_detach_dev(iommu, parent->bus->number,
3516 parent->devfn);
3517 parent = parent->bus->self;
3519 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3520 iommu_detach_dev(iommu,
3521 tmp->subordinate->number, 0);
3522 else /* this is a legacy PCI bridge */
3523 iommu_detach_dev(iommu, tmp->bus->number,
3524 tmp->devfn);
3528 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3529 struct pci_dev *pdev)
3531 struct device_domain_info *info;
3532 struct intel_iommu *iommu;
3533 unsigned long flags;
3534 int found = 0;
3535 struct list_head *entry, *tmp;
3537 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3538 pdev->devfn);
3539 if (!iommu)
3540 return;
3542 spin_lock_irqsave(&device_domain_lock, flags);
3543 list_for_each_safe(entry, tmp, &domain->devices) {
3544 info = list_entry(entry, struct device_domain_info, link);
3545 if (info->segment == pci_domain_nr(pdev->bus) &&
3546 info->bus == pdev->bus->number &&
3547 info->devfn == pdev->devfn) {
3548 list_del(&info->link);
3549 list_del(&info->global);
3550 if (info->dev)
3551 info->dev->dev.archdata.iommu = NULL;
3552 spin_unlock_irqrestore(&device_domain_lock, flags);
3554 iommu_disable_dev_iotlb(info);
3555 iommu_detach_dev(iommu, info->bus, info->devfn);
3556 iommu_detach_dependent_devices(iommu, pdev);
3557 free_devinfo_mem(info);
3559 spin_lock_irqsave(&device_domain_lock, flags);
3561 if (found)
3562 break;
3563 else
3564 continue;
3567 /* if there are no other devices under the same iommu
3568 * owned by this domain, clear this iommu in iommu_bmp,
3569 * update iommu count and coherency
3570 */
3571 if (iommu == device_to_iommu(info->segment, info->bus,
3572 info->devfn))
3573 found = 1;
3576 if (found == 0) {
3577 unsigned long tmp_flags;
3578 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3579 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3580 domain->iommu_count--;
3581 domain_update_iommu_cap(domain);
3582 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3584 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3585 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3586 spin_lock_irqsave(&iommu->lock, tmp_flags);
3587 clear_bit(domain->id, iommu->domain_ids);
3588 iommu->domains[domain->id] = NULL;
3589 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3593 spin_unlock_irqrestore(&device_domain_lock, flags);
3596 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3598 struct device_domain_info *info;
3599 struct intel_iommu *iommu;
3600 unsigned long flags1, flags2;
3602 spin_lock_irqsave(&device_domain_lock, flags1);
3603 while (!list_empty(&domain->devices)) {
3604 info = list_entry(domain->devices.next,
3605 struct device_domain_info, link);
3606 list_del(&info->link);
3607 list_del(&info->global);
3608 if (info->dev)
3609 info->dev->dev.archdata.iommu = NULL;
3611 spin_unlock_irqrestore(&device_domain_lock, flags1);
3613 iommu_disable_dev_iotlb(info);
3614 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3615 iommu_detach_dev(iommu, info->bus, info->devfn);
3616 iommu_detach_dependent_devices(iommu, info->dev);
3618 /* clear this iommu in iommu_bmp, update iommu count
3619 * and capabilities
3620 */
3621 spin_lock_irqsave(&domain->iommu_lock, flags2);
3622 if (test_and_clear_bit(iommu->seq_id,
3623 &domain->iommu_bmp)) {
3624 domain->iommu_count--;
3625 domain_update_iommu_cap(domain);
3627 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3629 free_devinfo_mem(info);
3630 spin_lock_irqsave(&device_domain_lock, flags1);
3632 spin_unlock_irqrestore(&device_domain_lock, flags1);
3635 /* domain id for virtual machine, it won't be set in context */
3636 static unsigned long vm_domid;
3638 static struct dmar_domain *iommu_alloc_vm_domain(void)
3640 struct dmar_domain *domain;
3642 domain = alloc_domain_mem();
3643 if (!domain)
3644 return NULL;
3646 domain->id = vm_domid++;
3647 domain->nid = -1;
3648 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3649 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3651 return domain;
3654 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3656 int adjust_width;
3658 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3659 spin_lock_init(&domain->iommu_lock);
3661 domain_reserve_special_ranges(domain);
3663 /* calculate AGAW */
3664 domain->gaw = guest_width;
3665 adjust_width = guestwidth_to_adjustwidth(guest_width);
3666 domain->agaw = width_to_agaw(adjust_width);
3668 INIT_LIST_HEAD(&domain->devices);
3670 domain->iommu_count = 0;
3671 domain->iommu_coherency = 0;
3672 domain->iommu_snooping = 0;
3673 domain->iommu_superpage = 0;
3674 domain->max_addr = 0;
3675 domain->nid = -1;
3677 /* always allocate the top pgd */
3678 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3679 if (!domain->pgd)
3680 return -ENOMEM;
3681 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3682 return 0;
3685 static void iommu_free_vm_domain(struct dmar_domain *domain)
3687 unsigned long flags;
3688 struct dmar_drhd_unit *drhd;
3689 struct intel_iommu *iommu;
3690 unsigned long i;
3691 unsigned long ndomains;
3693 for_each_drhd_unit(drhd) {
3694 if (drhd->ignored)
3695 continue;
3696 iommu = drhd->iommu;
3698 ndomains = cap_ndoms(iommu->cap);
3699 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3700 if (iommu->domains[i] == domain) {
3701 spin_lock_irqsave(&iommu->lock, flags);
3702 clear_bit(i, iommu->domain_ids);
3703 iommu->domains[i] = NULL;
3704 spin_unlock_irqrestore(&iommu->lock, flags);
3705 break;
3711 static void vm_domain_exit(struct dmar_domain *domain)
3713 /* Domain 0 is reserved, so don't process it */
3714 if (!domain)
3715 return;
3717 vm_domain_remove_all_dev_info(domain);
3718 /* destroy iovas */
3719 put_iova_domain(&domain->iovad);
3721 /* clear ptes */
3722 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3724 /* free page tables */
3725 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3727 iommu_free_vm_domain(domain);
3728 free_domain_mem(domain);
3731 static int intel_iommu_domain_init(struct iommu_domain *domain)
3733 struct dmar_domain *dmar_domain;
3735 dmar_domain = iommu_alloc_vm_domain();
3736 if (!dmar_domain) {
3737 printk(KERN_ERR
3738 "intel_iommu_domain_init: dmar_domain == NULL\n");
3739 return -ENOMEM;
3741 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3742 printk(KERN_ERR
3743 "intel_iommu_domain_init() failed\n");
3744 vm_domain_exit(dmar_domain);
3745 return -ENOMEM;
3747 domain_update_iommu_cap(dmar_domain);
3748 domain->priv = dmar_domain;
3750 return 0;
3753 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3755 struct dmar_domain *dmar_domain = domain->priv;
3757 domain->priv = NULL;
3758 vm_domain_exit(dmar_domain);
3761 static int intel_iommu_attach_device(struct iommu_domain *domain,
3762 struct device *dev)
3764 struct dmar_domain *dmar_domain = domain->priv;
3765 struct pci_dev *pdev = to_pci_dev(dev);
3766 struct intel_iommu *iommu;
3767 int addr_width;
3769 /* normally pdev is not mapped */
3770 if (unlikely(domain_context_mapped(pdev))) {
3771 struct dmar_domain *old_domain;
3773 old_domain = find_domain(pdev);
3774 if (old_domain) {
3775 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3776 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3777 domain_remove_one_dev_info(old_domain, pdev);
3778 else
3779 domain_remove_dev_info(old_domain);
3783 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3784 pdev->devfn);
3785 if (!iommu)
3786 return -ENODEV;
3788 /* check if this iommu agaw is sufficient for max mapped address */
3789 addr_width = agaw_to_width(iommu->agaw);
3790 if (addr_width > cap_mgaw(iommu->cap))
3791 addr_width = cap_mgaw(iommu->cap);
3793 if (dmar_domain->max_addr > (1LL << addr_width)) {
3794 printk(KERN_ERR "%s: iommu width (%d) is not "
3795 "sufficient for the mapped address (%llx)\n",
3796 __func__, addr_width, dmar_domain->max_addr);
3797 return -EFAULT;
3799 dmar_domain->gaw = addr_width;
3801 /*
3802 * Knock out extra levels of page tables if necessary
3803 */
3804 while (iommu->agaw < dmar_domain->agaw) {
3805 struct dma_pte *pte;
3807 pte = dmar_domain->pgd;
3808 if (dma_pte_present(pte)) {
3809 dmar_domain->pgd = (struct dma_pte *)
3810 phys_to_virt(dma_pte_addr(pte));
3811 free_pgtable_page(pte);
3813 dmar_domain->agaw--;
3816 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3819 static void intel_iommu_detach_device(struct iommu_domain *domain,
3820 struct device *dev)
3822 struct dmar_domain *dmar_domain = domain->priv;
3823 struct pci_dev *pdev = to_pci_dev(dev);
3825 domain_remove_one_dev_info(dmar_domain, pdev);
3828 static int intel_iommu_map(struct iommu_domain *domain,
3829 unsigned long iova, phys_addr_t hpa,
3830 int gfp_order, int iommu_prot)
3832 struct dmar_domain *dmar_domain = domain->priv;
3833 u64 max_addr;
3834 int prot = 0;
3835 size_t size;
3836 int ret;
3838 if (iommu_prot & IOMMU_READ)
3839 prot |= DMA_PTE_READ;
3840 if (iommu_prot & IOMMU_WRITE)
3841 prot |= DMA_PTE_WRITE;
3842 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3843 prot |= DMA_PTE_SNP;
3845 size = PAGE_SIZE << gfp_order;
3846 max_addr = iova + size;
3847 if (dmar_domain->max_addr < max_addr) {
3848 u64 end;
3850 /* check if minimum agaw is sufficient for mapped address */
3851 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3852 if (end < max_addr) {
3853 printk(KERN_ERR "%s: iommu width (%d) is not "
3854 "sufficient for the mapped address (%llx)\n",
3855 __func__, dmar_domain->gaw, max_addr);
3856 return -EFAULT;
3858 dmar_domain->max_addr = max_addr;
3860 /* Round up size to next multiple of PAGE_SIZE, if it and
3861 the low bits of hpa would take us onto the next page */
3862 size = aligned_nrpages(hpa, size);
3863 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3864 hpa >> VTD_PAGE_SHIFT, size, prot);
3865 return ret;
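/*
 * Illustrative sketch (not part of the original file): the size and bounds
 * arithmetic of intel_iommu_map(). map_fits_domain() is a hypothetical
 * helper; it mirrors the "PAGE_SIZE << gfp_order must stay within the
 * domain's guest address width" check above.
 */
#include <stdint.h>

#define PAGE_SIZE_X 4096ULL

/* gaw = guest address width in bits, e.g. 48 */
static int map_fits_domain(uint64_t iova, int gfp_order, int gaw)
{
	uint64_t size = PAGE_SIZE_X << gfp_order;
	uint64_t end  = (gaw >= 64) ? UINT64_MAX : (1ULL << gaw);

	return iova + size <= end;	/* reject mappings past the domain top */
}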
3868 static int intel_iommu_unmap(struct iommu_domain *domain,
3869 unsigned long iova, int gfp_order)
3871 struct dmar_domain *dmar_domain = domain->priv;
3872 size_t size = PAGE_SIZE << gfp_order;
3873 int order;
3875 order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3876 (iova + size - 1) >> VTD_PAGE_SHIFT);
3878 if (dmar_domain->max_addr == iova + size)
3879 dmar_domain->max_addr = iova;
3881 return order;
3884 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3885 unsigned long iova)
3887 struct dmar_domain *dmar_domain = domain->priv;
3888 struct dma_pte *pte;
3889 u64 phys = 0;
3891 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
3892 if (pte)
3893 phys = dma_pte_addr(pte);
3895 return phys;
3898 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3899 unsigned long cap)
3901 struct dmar_domain *dmar_domain = domain->priv;
3903 if (cap == IOMMU_CAP_CACHE_COHERENCY)
3904 return dmar_domain->iommu_snooping;
3905 if (cap == IOMMU_CAP_INTR_REMAP)
3906 return intr_remapping_enabled;
3908 return 0;
3911 static struct iommu_ops intel_iommu_ops = {
3912 .domain_init = intel_iommu_domain_init,
3913 .domain_destroy = intel_iommu_domain_destroy,
3914 .attach_dev = intel_iommu_attach_device,
3915 .detach_dev = intel_iommu_detach_device,
3916 .map = intel_iommu_map,
3917 .unmap = intel_iommu_unmap,
3918 .iova_to_phys = intel_iommu_iova_to_phys,
3919 .domain_has_cap = intel_iommu_domain_has_cap,
3922 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3924 /*
3925 * Mobile 4 Series Chipset neglects to set RWBF capability,
3926 * but needs it:
3927 */
3928 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3929 rwbf_quirk = 1;
3931 /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3932 if (dev->revision == 0x07) {
3933 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3934 dmar_map_gfx = 0;
3938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3940 #define GGC 0x52
3941 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
3942 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
3943 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
3944 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
3945 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
3946 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
3947 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
3948 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
3950 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3952 unsigned short ggc;
3954 if (pci_read_config_word(dev, GGC, &ggc))
3955 return;
3957 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3958 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3959 dmar_map_gfx = 0;
3962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3967 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3968 ISOCH DMAR unit for the Azalia sound device, but not give it any
3969 TLB entries, which causes it to deadlock. Check for that. We do
3970 this in a function called from init_dmars(), instead of in a PCI
3971 quirk, because we don't want to print the obnoxious "BIOS broken"
3972 message if VT-d is actually disabled.
3973 */
3974 static void __init check_tylersburg_isoch(void)
3976 struct pci_dev *pdev;
3977 uint32_t vtisochctrl;
3979 /* If there's no Azalia in the system anyway, forget it. */
3980 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3981 if (!pdev)
3982 return;
3983 pci_dev_put(pdev);
3985 /* System Management Registers. Might be hidden, in which case
3986 we can't do the sanity check. But that's OK, because the
3987 known-broken BIOSes _don't_ actually hide it, so far. */
3988 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3989 if (!pdev)
3990 return;
3992 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3993 pci_dev_put(pdev);
3994 return;
3997 pci_dev_put(pdev);
3999 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4000 if (vtisochctrl & 1)
4001 return;
4003 /* Drop all bits other than the number of TLB entries */
4004 vtisochctrl &= 0x1c;
4006 /* If we have the recommended number of TLB entries (16), fine. */
4007 if (vtisochctrl == 0x10)
4008 return;
4010 /* Zero TLB entries? You get to ride the short bus to school. */
4011 if (!vtisochctrl) {
4012 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4013 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4014 dmi_get_system_info(DMI_BIOS_VENDOR),
4015 dmi_get_system_info(DMI_BIOS_VERSION),
4016 dmi_get_system_info(DMI_PRODUCT_VERSION));
4017 iommu_identity_mapping |= IDENTMAP_AZALIA;
4018 return;
4021 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4022 vtisochctrl);
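/*
 * Illustrative sketch (not part of the original file): decoding the Azalia
 * isoch register the way check_tylersburg_isoch() does above.
 * decode_vtisochctrl() is a hypothetical helper; the masks (bit 0 for
 * non-isoch routing, 0x1c for the TLB-entry field, 0x10 == the recommended
 * 16 entries) come from the code above.
 */
#include <stdint.h>
#include <stdio.h>

static void decode_vtisochctrl(uint32_t vtisochctrl)
{
	if (vtisochctrl & 1) {
		puts("Azalia DMA routed to the non-isoch DMAR unit: fine");
		return;
	}
	vtisochctrl &= 0x1c;		/* keep only the TLB entry field */
	if (vtisochctrl == 0x10)
		puts("isoch DMAR unit has the recommended 16 TLB entries");
	else if (!vtisochctrl)
		puts("broken BIOS: isoch DMAR unit has no TLB entries");
	else
		printf("unusual TLB allocation for isoch unit: 0x%x\n",
		       vtisochctrl);
}

int main(void)
{
	decode_vtisochctrl(0x10);	/* sample value: 16 entries, routed to isoch */
	return 0;
}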