intel-iommu: Remove Host Bridge devices from identity mapping
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] drivers/pci/intel-iommu.c
blob 9593496fe4c4df810d92ada661705a5b96720967
1 /*
2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
46 #define ROOT_SIZE VTD_PAGE_SIZE
47 #define CONTEXT_SIZE VTD_PAGE_SIZE
49 #define IS_BRIDGE_HOST_DEVICE(pdev) \
50 ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
51 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
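/*
 * These helpers classify a pci_dev by its class code: pdev->class holds the
 * 24-bit class/subclass/prog-if value, so (class >> 8) yields the 16-bit
 * base-class/subclass pair (e.g. PCI_CLASS_BRIDGE_HOST) and (class >> 16)
 * the base class alone.  IS_BRIDGE_HOST_DEVICE() is what lets us skip PCI
 * host bridges when populating the static identity map below, and
 * IS_AZALIA() matches the Intel HD-audio (Azalia) device handled by the
 * Tylersburg isoch quirk (check_tylersburg_isoch()).
 */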
55 #define IOAPIC_RANGE_START (0xfee00000)
56 #define IOAPIC_RANGE_END (0xfeefffff)
57 #define IOVA_START_ADDR (0x1000)
59 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61 #define MAX_AGAW_WIDTH 64
63 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
69 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
73 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
74 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
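/*
 * Worked example of the limits above, assuming 4KiB pages: with the default
 * 48-bit guest address width, __DOMAIN_MAX_PFN(48) = 2^36 - 1 and
 * DOMAIN_MAX_ADDR(48) is just under 256TiB.  DMA_32BIT_PFN is
 * IOVA_PFN(0xffffffff) = 0xfffff, the last IOVA page reachable through a
 * 32-bit DMA mask.
 */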
76 /* page table handling */
77 #define LEVEL_STRIDE (9)
78 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
80 static inline int agaw_to_level(int agaw)
82 return agaw + 2;
85 static inline int agaw_to_width(int agaw)
87 return 30 + agaw * LEVEL_STRIDE;
90 static inline int width_to_agaw(int width)
92 return (width - 30) / LEVEL_STRIDE;
95 static inline unsigned int level_to_offset_bits(int level)
97 return (level - 1) * LEVEL_STRIDE;
100 static inline int pfn_level_offset(unsigned long pfn, int level)
102 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
105 static inline unsigned long level_mask(int level)
107 return -1UL << level_to_offset_bits(level);
110 static inline unsigned long level_size(int level)
112 return 1UL << level_to_offset_bits(level);
115 static inline unsigned long align_to_level(unsigned long pfn, int level)
117 return (pfn + level_size(level) - 1) & level_mask(level);
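/*
 * With LEVEL_STRIDE == 9 each page-table level translates 9 bits of the DMA
 * pfn, so agaw_to_level()/agaw_to_width() give: agaw 0 -> 2 levels, 30-bit;
 * agaw 1 -> 3 levels, 39-bit; agaw 2 -> 4 levels, 48-bit.  For example,
 * level_size(2) is 512 pfns, i.e. one level-2 entry covers 2MiB, and
 * align_to_level(pfn, 2) rounds pfn up to the next 512-pfn boundary.
 */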
120 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
121 are never going to work. */
122 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
124 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
127 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
129 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
131 static inline unsigned long page_to_dma_pfn(struct page *pg)
133 return mm_to_dma_pfn(page_to_pfn(pg));
135 static inline unsigned long virt_to_dma_pfn(void *p)
137 return page_to_dma_pfn(virt_to_page(p));
140 /* global iommu list, set NULL for ignored DMAR units */
141 static struct intel_iommu **g_iommus;
143 static void __init check_tylersburg_isoch(void);
144 static int rwbf_quirk;
147 * 0: Present
148 * 1-11: Reserved
149 * 12-63: Context Ptr (12 - (haw-1))
150 * 64-127: Reserved
152 struct root_entry {
153 u64 val;
154 u64 rsvd1;
156 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
157 static inline bool root_present(struct root_entry *root)
159 return (root->val & 1);
161 static inline void set_root_present(struct root_entry *root)
163 root->val |= 1;
165 static inline void set_root_value(struct root_entry *root, unsigned long value)
167 root->val |= value & VTD_PAGE_MASK;
170 static inline struct context_entry *
171 get_context_addr_from_root(struct root_entry *root)
173 return (struct context_entry *)
174 (root_present(root)?phys_to_virt(
175 root->val & VTD_PAGE_MASK) :
176 NULL);
180 * low 64 bits:
181 * 0: present
182 * 1: fault processing disable
183 * 2-3: translation type
184 * 12-63: address space root
185 * high 64 bits:
186 * 0-2: address width
187 * 3-6: aval
188 * 8-23: domain id
190 struct context_entry {
191 u64 lo;
192 u64 hi;
195 static inline bool context_present(struct context_entry *context)
197 return (context->lo & 1);
199 static inline void context_set_present(struct context_entry *context)
201 context->lo |= 1;
204 static inline void context_set_fault_enable(struct context_entry *context)
206 context->lo &= (((u64)-1) << 2) | 1;
209 static inline void context_set_translation_type(struct context_entry *context,
210 unsigned long value)
212 context->lo &= (((u64)-1) << 4) | 3;
213 context->lo |= (value & 3) << 2;
216 static inline void context_set_address_root(struct context_entry *context,
217 unsigned long value)
219 context->lo |= value & VTD_PAGE_MASK;
222 static inline void context_set_address_width(struct context_entry *context,
223 unsigned long value)
225 context->hi |= value & 7;
228 static inline void context_set_domain_id(struct context_entry *context,
229 unsigned long value)
231 context->hi |= (value & ((1 << 16) - 1)) << 8;
234 static inline void context_clear_entry(struct context_entry *context)
236 context->lo = 0;
237 context->hi = 0;
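/*
 * The accessors above pack the fields of the layout documented before
 * struct context_entry: context_set_domain_id() places a 16-bit domain id
 * in hi[8:23], context_set_address_width() writes the 3-bit AW field with
 * the agaw value, and context_set_translation_type() clears lo[2:3] before
 * or-ing in the new type (multi-level, device-IOTLB or pass-through).
 */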
241 * 0: readable
242 * 1: writable
243 * 2-6: reserved
244 * 7: super page
245 * 8-10: available
246 * 11: snoop behavior
247 * 12-63: Host physical address
249 struct dma_pte {
250 u64 val;
253 static inline void dma_clear_pte(struct dma_pte *pte)
255 pte->val = 0;
258 static inline void dma_set_pte_readable(struct dma_pte *pte)
260 pte->val |= DMA_PTE_READ;
263 static inline void dma_set_pte_writable(struct dma_pte *pte)
265 pte->val |= DMA_PTE_WRITE;
268 static inline void dma_set_pte_snp(struct dma_pte *pte)
270 pte->val |= DMA_PTE_SNP;
273 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
275 pte->val = (pte->val & ~3) | (prot & 3);
278 static inline u64 dma_pte_addr(struct dma_pte *pte)
280 #ifdef CONFIG_64BIT
281 return pte->val & VTD_PAGE_MASK;
282 #else
283 /* Must have a full atomic 64-bit read */
284 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
285 #endif
288 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
290 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
293 static inline bool dma_pte_present(struct dma_pte *pte)
295 return (pte->val & 3) != 0;
298 static inline int first_pte_in_page(struct dma_pte *pte)
300 return !((unsigned long)pte & ~VTD_PAGE_MASK);
304 * This domain is a statically identity mapping domain.
305 * 1. This domain creates a static 1:1 mapping to all usable memory.
306 * 2. It maps to each iommu if successful.
307 * 3. Each iommu maps to this domain if successful.
309 static struct dmar_domain *si_domain;
310 static int hw_pass_through = 1;
312 /* devices under the same p2p bridge are owned in one domain */
313 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
315 /* domain represents a virtual machine; more than one device
316 * across iommus may be owned by one domain, e.g. a kvm guest.
318 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
320 /* si_domain contains multiple devices */
321 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
323 struct dmar_domain {
324 int id; /* domain id */
325 int nid; /* node id */
326 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
328 struct list_head devices; /* all devices' list */
329 struct iova_domain iovad; /* iova's that belong to this domain */
331 struct dma_pte *pgd; /* virtual address */
332 int gaw; /* max guest address width */
334 /* adjusted guest address width, 0 is level 2 30-bit */
335 int agaw;
337 int flags; /* flags to find out type of domain */
339 int iommu_coherency;/* indicate coherency of iommu access */
340 int iommu_snooping; /* indicate snooping control feature*/
341 int iommu_count; /* reference count of iommu */
342 spinlock_t iommu_lock; /* protect iommu set in domain */
343 u64 max_addr; /* maximum mapped address */
346 /* PCI domain-device relationship */
347 struct device_domain_info {
348 struct list_head link; /* link to domain siblings */
349 struct list_head global; /* link to global list */
350 int segment; /* PCI domain */
351 u8 bus; /* PCI bus number */
352 u8 devfn; /* PCI devfn number */
353 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
354 struct intel_iommu *iommu; /* IOMMU used by this device */
355 struct dmar_domain *domain; /* pointer to domain */
358 static void flush_unmaps_timeout(unsigned long data);
360 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
362 #define HIGH_WATER_MARK 250
363 struct deferred_flush_tables {
364 int next;
365 struct iova *iova[HIGH_WATER_MARK];
366 struct dmar_domain *domain[HIGH_WATER_MARK];
369 static struct deferred_flush_tables *deferred_flush;
371 /* bitmap for indexing intel_iommus */
372 static int g_num_of_iommus;
374 static DEFINE_SPINLOCK(async_umap_flush_lock);
375 static LIST_HEAD(unmaps_to_do);
377 static int timer_on;
378 static long list_size;
380 static void domain_remove_dev_info(struct dmar_domain *domain);
382 #ifdef CONFIG_DMAR_DEFAULT_ON
383 int dmar_disabled = 0;
384 #else
385 int dmar_disabled = 1;
386 #endif /*CONFIG_DMAR_DEFAULT_ON*/
388 static int dmar_map_gfx = 1;
389 static int dmar_forcedac;
390 static int intel_iommu_strict;
392 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
393 static DEFINE_SPINLOCK(device_domain_lock);
394 static LIST_HEAD(device_domain_list);
396 static struct iommu_ops intel_iommu_ops;
398 static int __init intel_iommu_setup(char *str)
400 if (!str)
401 return -EINVAL;
402 while (*str) {
403 if (!strncmp(str, "on", 2)) {
404 dmar_disabled = 0;
405 printk(KERN_INFO "Intel-IOMMU: enabled\n");
406 } else if (!strncmp(str, "off", 3)) {
407 dmar_disabled = 1;
408 printk(KERN_INFO "Intel-IOMMU: disabled\n");
409 } else if (!strncmp(str, "igfx_off", 8)) {
410 dmar_map_gfx = 0;
411 printk(KERN_INFO
412 "Intel-IOMMU: disable GFX device mapping\n");
413 } else if (!strncmp(str, "forcedac", 8)) {
414 printk(KERN_INFO
415 "Intel-IOMMU: Forcing DAC for PCI devices\n");
416 dmar_forcedac = 1;
417 } else if (!strncmp(str, "strict", 6)) {
418 printk(KERN_INFO
419 "Intel-IOMMU: disable batched IOTLB flush\n");
420 intel_iommu_strict = 1;
423 str += strcspn(str, ",");
424 while (*str == ',')
425 str++;
427 return 0;
429 __setup("intel_iommu=", intel_iommu_setup);
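/*
 * The boot parameter parsed above takes a comma-separated option list,
 * e.g. "intel_iommu=on,strict" or "intel_iommu=on,igfx_off,forcedac";
 * the strcspn()/comma loop advances from one option to the next.
 */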
431 static struct kmem_cache *iommu_domain_cache;
432 static struct kmem_cache *iommu_devinfo_cache;
433 static struct kmem_cache *iommu_iova_cache;
435 static inline void *alloc_pgtable_page(int node)
437 struct page *page;
438 void *vaddr = NULL;
440 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
441 if (page)
442 vaddr = page_address(page);
443 return vaddr;
446 static inline void free_pgtable_page(void *vaddr)
448 free_page((unsigned long)vaddr);
451 static inline void *alloc_domain_mem(void)
453 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
456 static void free_domain_mem(void *vaddr)
458 kmem_cache_free(iommu_domain_cache, vaddr);
461 static inline void * alloc_devinfo_mem(void)
463 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
466 static inline void free_devinfo_mem(void *vaddr)
468 kmem_cache_free(iommu_devinfo_cache, vaddr);
471 struct iova *alloc_iova_mem(void)
473 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
476 void free_iova_mem(struct iova *iova)
478 kmem_cache_free(iommu_iova_cache, iova);
482 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
484 unsigned long sagaw;
485 int agaw = -1;
487 sagaw = cap_sagaw(iommu->cap);
488 for (agaw = width_to_agaw(max_gaw);
489 agaw >= 0; agaw--) {
490 if (test_bit(agaw, &sagaw))
491 break;
494 return agaw;
498 * Calculate max SAGAW for each iommu.
500 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
502 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
506 * calculate agaw for each iommu.
507 * "SAGAW" may be different across iommus, use a default agaw, and
508 * fall back to a smaller supported agaw for iommus that don't support the default.
510 int iommu_calculate_agaw(struct intel_iommu *iommu)
512 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
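/*
 * Example: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48, width_to_agaw(48) is
 * (48 - 30) / 9 = 2, so __iommu_calculate_agaw() starts at agaw 2 and walks
 * down the SAGAW capability bits; hardware that only supports 3-level
 * tables (bit 1 set) ends up with agaw 1, i.e. a 39-bit domain.
 */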
515 /* This function only returns a single iommu in a domain */
516 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
518 int iommu_id;
520 /* si_domain and vm domain should not get here. */
521 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
522 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
524 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
525 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
526 return NULL;
528 return g_iommus[iommu_id];
531 static void domain_update_iommu_coherency(struct dmar_domain *domain)
533 int i;
535 domain->iommu_coherency = 1;
537 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
538 if (!ecap_coherent(g_iommus[i]->ecap)) {
539 domain->iommu_coherency = 0;
540 break;
545 static void domain_update_iommu_snooping(struct dmar_domain *domain)
547 int i;
549 domain->iommu_snooping = 1;
551 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
552 if (!ecap_sc_support(g_iommus[i]->ecap)) {
553 domain->iommu_snooping = 0;
554 break;
559 /* Some capabilities may be different across iommus */
560 static void domain_update_iommu_cap(struct dmar_domain *domain)
562 domain_update_iommu_coherency(domain);
563 domain_update_iommu_snooping(domain);
566 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
568 struct dmar_drhd_unit *drhd = NULL;
569 int i;
571 for_each_drhd_unit(drhd) {
572 if (drhd->ignored)
573 continue;
574 if (segment != drhd->segment)
575 continue;
577 for (i = 0; i < drhd->devices_cnt; i++) {
578 if (drhd->devices[i] &&
579 drhd->devices[i]->bus->number == bus &&
580 drhd->devices[i]->devfn == devfn)
581 return drhd->iommu;
582 if (drhd->devices[i] &&
583 drhd->devices[i]->subordinate &&
584 drhd->devices[i]->subordinate->number <= bus &&
585 drhd->devices[i]->subordinate->subordinate >= bus)
586 return drhd->iommu;
589 if (drhd->include_all)
590 return drhd->iommu;
593 return NULL;
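/*
 * The lookup above mirrors the DMAR device-scope rules: a unit claims a
 * device if the device is listed explicitly, if a listed bridge's
 * secondary..subordinate bus range contains the device's bus, or if the
 * unit is the catch-all INCLUDE_ALL DRHD for the segment.
 */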
596 static void domain_flush_cache(struct dmar_domain *domain,
597 void *addr, int size)
599 if (!domain->iommu_coherency)
600 clflush_cache_range(addr, size);
603 /* Gets context entry for a given bus and devfn */
604 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
605 u8 bus, u8 devfn)
607 struct root_entry *root;
608 struct context_entry *context;
609 unsigned long phy_addr;
610 unsigned long flags;
612 spin_lock_irqsave(&iommu->lock, flags);
613 root = &iommu->root_entry[bus];
614 context = get_context_addr_from_root(root);
615 if (!context) {
616 context = (struct context_entry *)
617 alloc_pgtable_page(iommu->node);
618 if (!context) {
619 spin_unlock_irqrestore(&iommu->lock, flags);
620 return NULL;
622 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
623 phy_addr = virt_to_phys((void *)context);
624 set_root_value(root, phy_addr);
625 set_root_present(root);
626 __iommu_flush_cache(iommu, root, sizeof(*root));
628 spin_unlock_irqrestore(&iommu->lock, flags);
629 return &context[devfn];
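/*
 * Root/context table layout: the per-IOMMU root table has 256 entries
 * indexed by bus number, and each present root entry points to a 4KiB
 * context table of 256 entries indexed by devfn.  The helper above
 * allocates a bus's context table lazily under iommu->lock and flushes the
 * new lines out of the CPU cache for non-coherent IOMMUs.
 */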
632 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
634 struct root_entry *root;
635 struct context_entry *context;
636 int ret;
637 unsigned long flags;
639 spin_lock_irqsave(&iommu->lock, flags);
640 root = &iommu->root_entry[bus];
641 context = get_context_addr_from_root(root);
642 if (!context) {
643 ret = 0;
644 goto out;
646 ret = context_present(&context[devfn]);
647 out:
648 spin_unlock_irqrestore(&iommu->lock, flags);
649 return ret;
652 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
654 struct root_entry *root;
655 struct context_entry *context;
656 unsigned long flags;
658 spin_lock_irqsave(&iommu->lock, flags);
659 root = &iommu->root_entry[bus];
660 context = get_context_addr_from_root(root);
661 if (context) {
662 context_clear_entry(&context[devfn]);
663 __iommu_flush_cache(iommu, &context[devfn], \
664 sizeof(*context));
666 spin_unlock_irqrestore(&iommu->lock, flags);
669 static void free_context_table(struct intel_iommu *iommu)
671 struct root_entry *root;
672 int i;
673 unsigned long flags;
674 struct context_entry *context;
676 spin_lock_irqsave(&iommu->lock, flags);
677 if (!iommu->root_entry) {
678 goto out;
680 for (i = 0; i < ROOT_ENTRY_NR; i++) {
681 root = &iommu->root_entry[i];
682 context = get_context_addr_from_root(root);
683 if (context)
684 free_pgtable_page(context);
686 free_pgtable_page(iommu->root_entry);
687 iommu->root_entry = NULL;
688 out:
689 spin_unlock_irqrestore(&iommu->lock, flags);
692 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
693 unsigned long pfn)
695 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
696 struct dma_pte *parent, *pte = NULL;
697 int level = agaw_to_level(domain->agaw);
698 int offset;
700 BUG_ON(!domain->pgd);
701 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
702 parent = domain->pgd;
704 while (level > 0) {
705 void *tmp_page;
707 offset = pfn_level_offset(pfn, level);
708 pte = &parent[offset];
709 if (level == 1)
710 break;
712 if (!dma_pte_present(pte)) {
713 uint64_t pteval;
715 tmp_page = alloc_pgtable_page(domain->nid);
717 if (!tmp_page)
718 return NULL;
720 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
721 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
722 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
723 /* Someone else set it while we were thinking; use theirs. */
724 free_pgtable_page(tmp_page);
725 } else {
726 dma_pte_addr(pte);
727 domain_flush_cache(domain, pte, sizeof(*pte));
730 parent = phys_to_virt(dma_pte_addr(pte));
731 level--;
734 return pte;
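/*
 * pfn_to_dma_pte() walks (and, where needed, builds) the page table from
 * domain->pgd down to a level-1 entry.  Missing intermediate tables are
 * allocated with alloc_pgtable_page() and installed with cmpxchg64(), so
 * concurrent mappers need no extra locking; the loser of a race simply
 * frees its spare page and continues with the winner's table.
 */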
737 /* return address's pte at specific level */
738 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
739 unsigned long pfn,
740 int level)
742 struct dma_pte *parent, *pte = NULL;
743 int total = agaw_to_level(domain->agaw);
744 int offset;
746 parent = domain->pgd;
747 while (level <= total) {
748 offset = pfn_level_offset(pfn, total);
749 pte = &parent[offset];
750 if (level == total)
751 return pte;
753 if (!dma_pte_present(pte))
754 break;
755 parent = phys_to_virt(dma_pte_addr(pte));
756 total--;
758 return NULL;
761 /* clear last level pte, a tlb flush should be followed */
762 static void dma_pte_clear_range(struct dmar_domain *domain,
763 unsigned long start_pfn,
764 unsigned long last_pfn)
766 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
767 struct dma_pte *first_pte, *pte;
769 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
770 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
771 BUG_ON(start_pfn > last_pfn);
773 /* we don't need lock here; nobody else touches the iova range */
774 do {
775 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
776 if (!pte) {
777 start_pfn = align_to_level(start_pfn + 1, 2);
778 continue;
780 do {
781 dma_clear_pte(pte);
782 start_pfn++;
783 pte++;
784 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
786 domain_flush_cache(domain, first_pte,
787 (void *)pte - (void *)first_pte);
789 } while (start_pfn && start_pfn <= last_pfn);
792 /* free page table pages. last level pte should already be cleared */
793 static void dma_pte_free_pagetable(struct dmar_domain *domain,
794 unsigned long start_pfn,
795 unsigned long last_pfn)
797 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
798 struct dma_pte *first_pte, *pte;
799 int total = agaw_to_level(domain->agaw);
800 int level;
801 unsigned long tmp;
803 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
804 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
805 BUG_ON(start_pfn > last_pfn);
807 /* We don't need lock here; nobody else touches the iova range */
808 level = 2;
809 while (level <= total) {
810 tmp = align_to_level(start_pfn, level);
812 /* If we can't even clear one PTE at this level, we're done */
813 if (tmp + level_size(level) - 1 > last_pfn)
814 return;
816 do {
817 first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
818 if (!pte) {
819 tmp = align_to_level(tmp + 1, level + 1);
820 continue;
822 do {
823 if (dma_pte_present(pte)) {
824 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
825 dma_clear_pte(pte);
827 pte++;
828 tmp += level_size(level);
829 } while (!first_pte_in_page(pte) &&
830 tmp + level_size(level) - 1 <= last_pfn);
832 domain_flush_cache(domain, first_pte,
833 (void *)pte - (void *)first_pte);
835 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
836 level++;
838 /* free pgd */
839 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
840 free_pgtable_page(domain->pgd);
841 domain->pgd = NULL;
845 /* iommu handling */
846 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
848 struct root_entry *root;
849 unsigned long flags;
851 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
852 if (!root)
853 return -ENOMEM;
855 __iommu_flush_cache(iommu, root, ROOT_SIZE);
857 spin_lock_irqsave(&iommu->lock, flags);
858 iommu->root_entry = root;
859 spin_unlock_irqrestore(&iommu->lock, flags);
861 return 0;
864 static void iommu_set_root_entry(struct intel_iommu *iommu)
866 void *addr;
867 u32 sts;
868 unsigned long flag;
870 addr = iommu->root_entry;
872 spin_lock_irqsave(&iommu->register_lock, flag);
873 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
875 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
877 /* Make sure hardware completes it */
878 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
879 readl, (sts & DMA_GSTS_RTPS), sts);
881 spin_unlock_irqrestore(&iommu->register_lock, flag);
884 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
886 u32 val;
887 unsigned long flag;
889 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
890 return;
892 spin_lock_irqsave(&iommu->register_lock, flag);
893 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
895 /* Make sure hardware completes it */
896 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
897 readl, (!(val & DMA_GSTS_WBFS)), val);
899 spin_unlock_irqrestore(&iommu->register_lock, flag);
902 /* return value determines if we need a write buffer flush */
903 static void __iommu_flush_context(struct intel_iommu *iommu,
904 u16 did, u16 source_id, u8 function_mask,
905 u64 type)
907 u64 val = 0;
908 unsigned long flag;
910 switch (type) {
911 case DMA_CCMD_GLOBAL_INVL:
912 val = DMA_CCMD_GLOBAL_INVL;
913 break;
914 case DMA_CCMD_DOMAIN_INVL:
915 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
916 break;
917 case DMA_CCMD_DEVICE_INVL:
918 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
919 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
920 break;
921 default:
922 BUG();
924 val |= DMA_CCMD_ICC;
926 spin_lock_irqsave(&iommu->register_lock, flag);
927 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
929 /* Make sure hardware completes it */
930 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
931 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
933 spin_unlock_irqrestore(&iommu->register_lock, flag);
936 /* return value determines if we need a write buffer flush */
937 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
938 u64 addr, unsigned int size_order, u64 type)
940 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
941 u64 val = 0, val_iva = 0;
942 unsigned long flag;
944 switch (type) {
945 case DMA_TLB_GLOBAL_FLUSH:
946 /* global flush doesn't need set IVA_REG */
947 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
948 break;
949 case DMA_TLB_DSI_FLUSH:
950 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
951 break;
952 case DMA_TLB_PSI_FLUSH:
953 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
954 /* Note: always flush non-leaf currently */
955 val_iva = size_order | addr;
956 break;
957 default:
958 BUG();
960 /* Note: set drain read/write */
961 #if 0
963 * This is probably just to be extra safe. Looks like we can
964 * ignore it without any impact.
966 if (cap_read_drain(iommu->cap))
967 val |= DMA_TLB_READ_DRAIN;
968 #endif
969 if (cap_write_drain(iommu->cap))
970 val |= DMA_TLB_WRITE_DRAIN;
972 spin_lock_irqsave(&iommu->register_lock, flag);
973 /* Note: Only uses first TLB reg currently */
974 if (val_iva)
975 dmar_writeq(iommu->reg + tlb_offset, val_iva);
976 dmar_writeq(iommu->reg + tlb_offset + 8, val);
978 /* Make sure hardware completes it */
979 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
980 dmar_readq, (!(val & DMA_TLB_IVT)), val);
982 spin_unlock_irqrestore(&iommu->register_lock, flag);
984 /* check IOTLB invalidation granularity */
985 if (DMA_TLB_IAIG(val) == 0)
986 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
987 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
988 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
989 (unsigned long long)DMA_TLB_IIRG(type),
990 (unsigned long long)DMA_TLB_IAIG(val));
993 static struct device_domain_info *iommu_support_dev_iotlb(
994 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
996 int found = 0;
997 unsigned long flags;
998 struct device_domain_info *info;
999 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1001 if (!ecap_dev_iotlb_support(iommu->ecap))
1002 return NULL;
1004 if (!iommu->qi)
1005 return NULL;
1007 spin_lock_irqsave(&device_domain_lock, flags);
1008 list_for_each_entry(info, &domain->devices, link)
1009 if (info->bus == bus && info->devfn == devfn) {
1010 found = 1;
1011 break;
1013 spin_unlock_irqrestore(&device_domain_lock, flags);
1015 if (!found || !info->dev)
1016 return NULL;
1018 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1019 return NULL;
1021 if (!dmar_find_matched_atsr_unit(info->dev))
1022 return NULL;
1024 info->iommu = iommu;
1026 return info;
1029 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1031 if (!info)
1032 return;
1034 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1037 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1039 if (!info->dev || !pci_ats_enabled(info->dev))
1040 return;
1042 pci_disable_ats(info->dev);
1045 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1046 u64 addr, unsigned mask)
1048 u16 sid, qdep;
1049 unsigned long flags;
1050 struct device_domain_info *info;
1052 spin_lock_irqsave(&device_domain_lock, flags);
1053 list_for_each_entry(info, &domain->devices, link) {
1054 if (!info->dev || !pci_ats_enabled(info->dev))
1055 continue;
1057 sid = info->bus << 8 | info->devfn;
1058 qdep = pci_ats_queue_depth(info->dev);
1059 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1061 spin_unlock_irqrestore(&device_domain_lock, flags);
1064 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1065 unsigned long pfn, unsigned int pages, int map)
1067 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1068 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1070 BUG_ON(pages == 0);
1073 * Fallback to domain selective flush if no PSI support or the size is
1074 * too big.
1075 * PSI requires page size to be 2 ^ x, and the base address is naturally
1076 * aligned to the size
1078 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1079 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1080 DMA_TLB_DSI_FLUSH);
1081 else
1082 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1083 DMA_TLB_PSI_FLUSH);
1086 * In caching mode, changes of pages from non-present to present require
1087 * flush. However, device IOTLB doesn't need to be flushed in this case.
1089 if (!cap_caching_mode(iommu->cap) || !map)
1090 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
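/*
 * The PSI mask is the log2 of the (power-of-two rounded) page count, e.g.
 * pages == 1 -> mask 0 (one 4KiB page), pages == 3 -> mask 2 (a 16KiB
 * aligned region).  If the hardware lacks page-selective invalidation or
 * the mask exceeds cap_max_amask_val(), we fall back to a domain-selective
 * flush instead.
 */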
1093 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1095 u32 pmen;
1096 unsigned long flags;
1098 spin_lock_irqsave(&iommu->register_lock, flags);
1099 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1100 pmen &= ~DMA_PMEN_EPM;
1101 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1103 /* wait for the protected region status bit to clear */
1104 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1105 readl, !(pmen & DMA_PMEN_PRS), pmen);
1107 spin_unlock_irqrestore(&iommu->register_lock, flags);
1110 static int iommu_enable_translation(struct intel_iommu *iommu)
1112 u32 sts;
1113 unsigned long flags;
1115 spin_lock_irqsave(&iommu->register_lock, flags);
1116 iommu->gcmd |= DMA_GCMD_TE;
1117 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1119 /* Make sure hardware completes it */
1120 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1121 readl, (sts & DMA_GSTS_TES), sts);
1123 spin_unlock_irqrestore(&iommu->register_lock, flags);
1124 return 0;
1127 static int iommu_disable_translation(struct intel_iommu *iommu)
1129 u32 sts;
1130 unsigned long flag;
1132 spin_lock_irqsave(&iommu->register_lock, flag);
1133 iommu->gcmd &= ~DMA_GCMD_TE;
1134 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1136 /* Make sure hardware completes it */
1137 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1138 readl, (!(sts & DMA_GSTS_TES)), sts);
1140 spin_unlock_irqrestore(&iommu->register_lock, flag);
1141 return 0;
1145 static int iommu_init_domains(struct intel_iommu *iommu)
1147 unsigned long ndomains;
1148 unsigned long nlongs;
1150 ndomains = cap_ndoms(iommu->cap);
1151 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1152 ndomains);
1153 nlongs = BITS_TO_LONGS(ndomains);
1155 spin_lock_init(&iommu->lock);
1157 /* TBD: there might be 64K domains,
1158 * consider other allocation for future chip
1160 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1161 if (!iommu->domain_ids) {
1162 printk(KERN_ERR "Allocating domain id array failed\n");
1163 return -ENOMEM;
1165 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1166 GFP_KERNEL);
1167 if (!iommu->domains) {
1168 printk(KERN_ERR "Allocating domain array failed\n");
1169 return -ENOMEM;
1173 * if Caching mode is set, then invalid translations are tagged
1174 * with domainid 0. Hence we need to pre-allocate it.
1176 if (cap_caching_mode(iommu->cap))
1177 set_bit(0, iommu->domain_ids);
1178 return 0;
1182 static void domain_exit(struct dmar_domain *domain);
1183 static void vm_domain_exit(struct dmar_domain *domain);
1185 void free_dmar_iommu(struct intel_iommu *iommu)
1187 struct dmar_domain *domain;
1188 int i;
1189 unsigned long flags;
1191 if ((iommu->domains) && (iommu->domain_ids)) {
1192 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1193 domain = iommu->domains[i];
1194 clear_bit(i, iommu->domain_ids);
1196 spin_lock_irqsave(&domain->iommu_lock, flags);
1197 if (--domain->iommu_count == 0) {
1198 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1199 vm_domain_exit(domain);
1200 else
1201 domain_exit(domain);
1203 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1207 if (iommu->gcmd & DMA_GCMD_TE)
1208 iommu_disable_translation(iommu);
1210 if (iommu->irq) {
1211 irq_set_handler_data(iommu->irq, NULL);
1212 /* This will mask the irq */
1213 free_irq(iommu->irq, iommu);
1214 destroy_irq(iommu->irq);
1217 kfree(iommu->domains);
1218 kfree(iommu->domain_ids);
1220 g_iommus[iommu->seq_id] = NULL;
1222 /* if all iommus are freed, free g_iommus */
1223 for (i = 0; i < g_num_of_iommus; i++) {
1224 if (g_iommus[i])
1225 break;
1228 if (i == g_num_of_iommus)
1229 kfree(g_iommus);
1231 /* free context mapping */
1232 free_context_table(iommu);
1235 static struct dmar_domain *alloc_domain(void)
1237 struct dmar_domain *domain;
1239 domain = alloc_domain_mem();
1240 if (!domain)
1241 return NULL;
1243 domain->nid = -1;
1244 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1245 domain->flags = 0;
1247 return domain;
1250 static int iommu_attach_domain(struct dmar_domain *domain,
1251 struct intel_iommu *iommu)
1253 int num;
1254 unsigned long ndomains;
1255 unsigned long flags;
1257 ndomains = cap_ndoms(iommu->cap);
1259 spin_lock_irqsave(&iommu->lock, flags);
1261 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1262 if (num >= ndomains) {
1263 spin_unlock_irqrestore(&iommu->lock, flags);
1264 printk(KERN_ERR "IOMMU: no free domain ids\n");
1265 return -ENOMEM;
1268 domain->id = num;
1269 set_bit(num, iommu->domain_ids);
1270 set_bit(iommu->seq_id, &domain->iommu_bmp);
1271 iommu->domains[num] = domain;
1272 spin_unlock_irqrestore(&iommu->lock, flags);
1274 return 0;
1277 static void iommu_detach_domain(struct dmar_domain *domain,
1278 struct intel_iommu *iommu)
1280 unsigned long flags;
1281 int num, ndomains;
1282 int found = 0;
1284 spin_lock_irqsave(&iommu->lock, flags);
1285 ndomains = cap_ndoms(iommu->cap);
1286 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1287 if (iommu->domains[num] == domain) {
1288 found = 1;
1289 break;
1293 if (found) {
1294 clear_bit(num, iommu->domain_ids);
1295 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1296 iommu->domains[num] = NULL;
1298 spin_unlock_irqrestore(&iommu->lock, flags);
1301 static struct iova_domain reserved_iova_list;
1302 static struct lock_class_key reserved_rbtree_key;
1304 static int dmar_init_reserved_ranges(void)
1306 struct pci_dev *pdev = NULL;
1307 struct iova *iova;
1308 int i;
1310 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1312 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1313 &reserved_rbtree_key);
1315 /* IOAPIC ranges shouldn't be accessed by DMA */
1316 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1317 IOVA_PFN(IOAPIC_RANGE_END));
1318 if (!iova) {
1319 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1320 return -ENODEV;
1323 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1324 for_each_pci_dev(pdev) {
1325 struct resource *r;
1327 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1328 r = &pdev->resource[i];
1329 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1330 continue;
1331 iova = reserve_iova(&reserved_iova_list,
1332 IOVA_PFN(r->start),
1333 IOVA_PFN(r->end));
1334 if (!iova) {
1335 printk(KERN_ERR "Reserve iova failed\n");
1336 return -ENODEV;
1340 return 0;
1343 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1345 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1348 static inline int guestwidth_to_adjustwidth(int gaw)
1350 int agaw;
1351 int r = (gaw - 12) % 9;
1353 if (r == 0)
1354 agaw = gaw;
1355 else
1356 agaw = gaw + 9 - r;
1357 if (agaw > 64)
1358 agaw = 64;
1359 return agaw;
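/*
 * The adjusted width must be expressible as 12 + 9*n page-table bits, so
 * guestwidth_to_adjustwidth() rounds up to the next such value: 48 stays
 * 48 (r == 0), while e.g. 40 gives r == 1 and is bumped to 48; anything
 * larger than 64 is clamped to 64.
 */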
1362 static int domain_init(struct dmar_domain *domain, int guest_width)
1364 struct intel_iommu *iommu;
1365 int adjust_width, agaw;
1366 unsigned long sagaw;
1368 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1369 spin_lock_init(&domain->iommu_lock);
1371 domain_reserve_special_ranges(domain);
1373 /* calculate AGAW */
1374 iommu = domain_get_iommu(domain);
1375 if (guest_width > cap_mgaw(iommu->cap))
1376 guest_width = cap_mgaw(iommu->cap);
1377 domain->gaw = guest_width;
1378 adjust_width = guestwidth_to_adjustwidth(guest_width);
1379 agaw = width_to_agaw(adjust_width);
1380 sagaw = cap_sagaw(iommu->cap);
1381 if (!test_bit(agaw, &sagaw)) {
1382 /* hardware doesn't support it, choose a bigger one */
1383 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1384 agaw = find_next_bit(&sagaw, 5, agaw);
1385 if (agaw >= 5)
1386 return -ENODEV;
1388 domain->agaw = agaw;
1389 INIT_LIST_HEAD(&domain->devices);
1391 if (ecap_coherent(iommu->ecap))
1392 domain->iommu_coherency = 1;
1393 else
1394 domain->iommu_coherency = 0;
1396 if (ecap_sc_support(iommu->ecap))
1397 domain->iommu_snooping = 1;
1398 else
1399 domain->iommu_snooping = 0;
1401 domain->iommu_count = 1;
1402 domain->nid = iommu->node;
1404 /* always allocate the top pgd */
1405 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1406 if (!domain->pgd)
1407 return -ENOMEM;
1408 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1409 return 0;
1412 static void domain_exit(struct dmar_domain *domain)
1414 struct dmar_drhd_unit *drhd;
1415 struct intel_iommu *iommu;
1417 /* Domain 0 is reserved, so don't process it */
1418 if (!domain)
1419 return;
1421 /* Flush any lazy unmaps that may reference this domain */
1422 if (!intel_iommu_strict)
1423 flush_unmaps_timeout(0);
1425 domain_remove_dev_info(domain);
1426 /* destroy iovas */
1427 put_iova_domain(&domain->iovad);
1429 /* clear ptes */
1430 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1432 /* free page tables */
1433 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1435 for_each_active_iommu(iommu, drhd)
1436 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1437 iommu_detach_domain(domain, iommu);
1439 free_domain_mem(domain);
1442 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1443 u8 bus, u8 devfn, int translation)
1445 struct context_entry *context;
1446 unsigned long flags;
1447 struct intel_iommu *iommu;
1448 struct dma_pte *pgd;
1449 unsigned long num;
1450 unsigned long ndomains;
1451 int id;
1452 int agaw;
1453 struct device_domain_info *info = NULL;
1455 pr_debug("Set context mapping for %02x:%02x.%d\n",
1456 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1458 BUG_ON(!domain->pgd);
1459 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1460 translation != CONTEXT_TT_MULTI_LEVEL);
1462 iommu = device_to_iommu(segment, bus, devfn);
1463 if (!iommu)
1464 return -ENODEV;
1466 context = device_to_context_entry(iommu, bus, devfn);
1467 if (!context)
1468 return -ENOMEM;
1469 spin_lock_irqsave(&iommu->lock, flags);
1470 if (context_present(context)) {
1471 spin_unlock_irqrestore(&iommu->lock, flags);
1472 return 0;
1475 id = domain->id;
1476 pgd = domain->pgd;
1478 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1479 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1480 int found = 0;
1482 /* find an available domain id for this device in iommu */
1483 ndomains = cap_ndoms(iommu->cap);
1484 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1485 if (iommu->domains[num] == domain) {
1486 id = num;
1487 found = 1;
1488 break;
1492 if (found == 0) {
1493 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1494 if (num >= ndomains) {
1495 spin_unlock_irqrestore(&iommu->lock, flags);
1496 printk(KERN_ERR "IOMMU: no free domain ids\n");
1497 return -EFAULT;
1500 set_bit(num, iommu->domain_ids);
1501 iommu->domains[num] = domain;
1502 id = num;
1505 /* Skip top levels of page tables for
1506 * iommu which has less agaw than default.
1507 * Unnecessary for PT mode.
1509 if (translation != CONTEXT_TT_PASS_THROUGH) {
1510 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1511 pgd = phys_to_virt(dma_pte_addr(pgd));
1512 if (!dma_pte_present(pgd)) {
1513 spin_unlock_irqrestore(&iommu->lock, flags);
1514 return -ENOMEM;
1520 context_set_domain_id(context, id);
1522 if (translation != CONTEXT_TT_PASS_THROUGH) {
1523 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1524 translation = info ? CONTEXT_TT_DEV_IOTLB :
1525 CONTEXT_TT_MULTI_LEVEL;
1528 * In pass through mode, AW must be programmed to indicate the largest
1529 * AGAW value supported by hardware. And ASR is ignored by hardware.
1531 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1532 context_set_address_width(context, iommu->msagaw);
1533 else {
1534 context_set_address_root(context, virt_to_phys(pgd));
1535 context_set_address_width(context, iommu->agaw);
1538 context_set_translation_type(context, translation);
1539 context_set_fault_enable(context);
1540 context_set_present(context);
1541 domain_flush_cache(domain, context, sizeof(*context));
1544 * It's a non-present to present mapping. If hardware doesn't cache
1545 * non-present entries we only need to flush the write-buffer. If it
1546 * _does_ cache non-present entries, then it does so in the special
1547 * domain #0, which we have to flush:
1549 if (cap_caching_mode(iommu->cap)) {
1550 iommu->flush.flush_context(iommu, 0,
1551 (((u16)bus) << 8) | devfn,
1552 DMA_CCMD_MASK_NOBIT,
1553 DMA_CCMD_DEVICE_INVL);
1554 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1555 } else {
1556 iommu_flush_write_buffer(iommu);
1558 iommu_enable_dev_iotlb(info);
1559 spin_unlock_irqrestore(&iommu->lock, flags);
1561 spin_lock_irqsave(&domain->iommu_lock, flags);
1562 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1563 domain->iommu_count++;
1564 if (domain->iommu_count == 1)
1565 domain->nid = iommu->node;
1566 domain_update_iommu_cap(domain);
1568 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1569 return 0;
1572 static int
1573 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1574 int translation)
1576 int ret;
1577 struct pci_dev *tmp, *parent;
1579 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1580 pdev->bus->number, pdev->devfn,
1581 translation);
1582 if (ret)
1583 return ret;
1585 /* dependent device mapping */
1586 tmp = pci_find_upstream_pcie_bridge(pdev);
1587 if (!tmp)
1588 return 0;
1589 /* Secondary interface's bus number and devfn 0 */
1590 parent = pdev->bus->self;
1591 while (parent != tmp) {
1592 ret = domain_context_mapping_one(domain,
1593 pci_domain_nr(parent->bus),
1594 parent->bus->number,
1595 parent->devfn, translation);
1596 if (ret)
1597 return ret;
1598 parent = parent->bus->self;
1600 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1601 return domain_context_mapping_one(domain,
1602 pci_domain_nr(tmp->subordinate),
1603 tmp->subordinate->number, 0,
1604 translation);
1605 else /* this is a legacy PCI bridge */
1606 return domain_context_mapping_one(domain,
1607 pci_domain_nr(tmp->bus),
1608 tmp->bus->number,
1609 tmp->devfn,
1610 translation);
1613 static int domain_context_mapped(struct pci_dev *pdev)
1615 int ret;
1616 struct pci_dev *tmp, *parent;
1617 struct intel_iommu *iommu;
1619 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1620 pdev->devfn);
1621 if (!iommu)
1622 return -ENODEV;
1624 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1625 if (!ret)
1626 return ret;
1627 /* dependent device mapping */
1628 tmp = pci_find_upstream_pcie_bridge(pdev);
1629 if (!tmp)
1630 return ret;
1631 /* Secondary interface's bus number and devfn 0 */
1632 parent = pdev->bus->self;
1633 while (parent != tmp) {
1634 ret = device_context_mapped(iommu, parent->bus->number,
1635 parent->devfn);
1636 if (!ret)
1637 return ret;
1638 parent = parent->bus->self;
1640 if (pci_is_pcie(tmp))
1641 return device_context_mapped(iommu, tmp->subordinate->number,
1642 0);
1643 else
1644 return device_context_mapped(iommu, tmp->bus->number,
1645 tmp->devfn);
1648 /* Returns a number of VTD pages, but aligned to MM page size */
1649 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1650 size_t size)
1652 host_addr &= ~PAGE_MASK;
1653 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
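/*
 * Example with 4KiB MM pages: host_addr 0x1200 and size 0x800 leave an
 * in-page offset of 0x200, PAGE_ALIGN(0x200 + 0x800) = 0x1000, so one
 * VT-d page is reserved; size 0xf00 would align up to 0x2000 and need two.
 */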
1656 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1657 struct scatterlist *sg, unsigned long phys_pfn,
1658 unsigned long nr_pages, int prot)
1660 struct dma_pte *first_pte = NULL, *pte = NULL;
1661 phys_addr_t uninitialized_var(pteval);
1662 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1663 unsigned long sg_res;
1665 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1667 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1668 return -EINVAL;
1670 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1672 if (sg)
1673 sg_res = 0;
1674 else {
1675 sg_res = nr_pages + 1;
1676 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1679 while (nr_pages--) {
1680 uint64_t tmp;
1682 if (!sg_res) {
1683 sg_res = aligned_nrpages(sg->offset, sg->length);
1684 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1685 sg->dma_length = sg->length;
1686 pteval = page_to_phys(sg_page(sg)) | prot;
1688 if (!pte) {
1689 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1690 if (!pte)
1691 return -ENOMEM;
1693 /* We don't need lock here, nobody else
1694 * touches the iova range
1696 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1697 if (tmp) {
1698 static int dumps = 5;
1699 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1700 iov_pfn, tmp, (unsigned long long)pteval);
1701 if (dumps) {
1702 dumps--;
1703 debug_dma_dump_mappings(NULL);
1705 WARN_ON(1);
1707 pte++;
1708 if (!nr_pages || first_pte_in_page(pte)) {
1709 domain_flush_cache(domain, first_pte,
1710 (void *)pte - (void *)first_pte);
1711 pte = NULL;
1713 iov_pfn++;
1714 pteval += VTD_PAGE_SIZE;
1715 sg_res--;
1716 if (!sg_res)
1717 sg = sg_next(sg);
1719 return 0;
1722 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1723 struct scatterlist *sg, unsigned long nr_pages,
1724 int prot)
1726 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1729 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1730 unsigned long phys_pfn, unsigned long nr_pages,
1731 int prot)
1733 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1736 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1738 if (!iommu)
1739 return;
1741 clear_context_table(iommu, bus, devfn);
1742 iommu->flush.flush_context(iommu, 0, 0, 0,
1743 DMA_CCMD_GLOBAL_INVL);
1744 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1747 static void domain_remove_dev_info(struct dmar_domain *domain)
1749 struct device_domain_info *info;
1750 unsigned long flags;
1751 struct intel_iommu *iommu;
1753 spin_lock_irqsave(&device_domain_lock, flags);
1754 while (!list_empty(&domain->devices)) {
1755 info = list_entry(domain->devices.next,
1756 struct device_domain_info, link);
1757 list_del(&info->link);
1758 list_del(&info->global);
1759 if (info->dev)
1760 info->dev->dev.archdata.iommu = NULL;
1761 spin_unlock_irqrestore(&device_domain_lock, flags);
1763 iommu_disable_dev_iotlb(info);
1764 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1765 iommu_detach_dev(iommu, info->bus, info->devfn);
1766 free_devinfo_mem(info);
1768 spin_lock_irqsave(&device_domain_lock, flags);
1770 spin_unlock_irqrestore(&device_domain_lock, flags);
1774 * find_domain
1775 * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1777 static struct dmar_domain *
1778 find_domain(struct pci_dev *pdev)
1780 struct device_domain_info *info;
1782 /* No lock here, assumes no domain exit in normal case */
1783 info = pdev->dev.archdata.iommu;
1784 if (info)
1785 return info->domain;
1786 return NULL;
1789 /* domain is initialized */
1790 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1792 struct dmar_domain *domain, *found = NULL;
1793 struct intel_iommu *iommu;
1794 struct dmar_drhd_unit *drhd;
1795 struct device_domain_info *info, *tmp;
1796 struct pci_dev *dev_tmp;
1797 unsigned long flags;
1798 int bus = 0, devfn = 0;
1799 int segment;
1800 int ret;
1802 domain = find_domain(pdev);
1803 if (domain)
1804 return domain;
1806 segment = pci_domain_nr(pdev->bus);
1808 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1809 if (dev_tmp) {
1810 if (pci_is_pcie(dev_tmp)) {
1811 bus = dev_tmp->subordinate->number;
1812 devfn = 0;
1813 } else {
1814 bus = dev_tmp->bus->number;
1815 devfn = dev_tmp->devfn;
1817 spin_lock_irqsave(&device_domain_lock, flags);
1818 list_for_each_entry(info, &device_domain_list, global) {
1819 if (info->segment == segment &&
1820 info->bus == bus && info->devfn == devfn) {
1821 found = info->domain;
1822 break;
1825 spin_unlock_irqrestore(&device_domain_lock, flags);
1826 /* pcie-pci bridge already has a domain, use it */
1827 if (found) {
1828 domain = found;
1829 goto found_domain;
1833 domain = alloc_domain();
1834 if (!domain)
1835 goto error;
1837 /* Allocate new domain for the device */
1838 drhd = dmar_find_matched_drhd_unit(pdev);
1839 if (!drhd) {
1840 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1841 pci_name(pdev));
1842 return NULL;
1844 iommu = drhd->iommu;
1846 ret = iommu_attach_domain(domain, iommu);
1847 if (ret) {
1848 free_domain_mem(domain);
1849 goto error;
1852 if (domain_init(domain, gaw)) {
1853 domain_exit(domain);
1854 goto error;
1857 /* register pcie-to-pci device */
1858 if (dev_tmp) {
1859 info = alloc_devinfo_mem();
1860 if (!info) {
1861 domain_exit(domain);
1862 goto error;
1864 info->segment = segment;
1865 info->bus = bus;
1866 info->devfn = devfn;
1867 info->dev = NULL;
1868 info->domain = domain;
1869 /* This domain is shared by devices under p2p bridge */
1870 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1872 /* pcie-to-pci bridge already has a domain, use it */
1873 found = NULL;
1874 spin_lock_irqsave(&device_domain_lock, flags);
1875 list_for_each_entry(tmp, &device_domain_list, global) {
1876 if (tmp->segment == segment &&
1877 tmp->bus == bus && tmp->devfn == devfn) {
1878 found = tmp->domain;
1879 break;
1882 if (found) {
1883 spin_unlock_irqrestore(&device_domain_lock, flags);
1884 free_devinfo_mem(info);
1885 domain_exit(domain);
1886 domain = found;
1887 } else {
1888 list_add(&info->link, &domain->devices);
1889 list_add(&info->global, &device_domain_list);
1890 spin_unlock_irqrestore(&device_domain_lock, flags);
1894 found_domain:
1895 info = alloc_devinfo_mem();
1896 if (!info)
1897 goto error;
1898 info->segment = segment;
1899 info->bus = pdev->bus->number;
1900 info->devfn = pdev->devfn;
1901 info->dev = pdev;
1902 info->domain = domain;
1903 spin_lock_irqsave(&device_domain_lock, flags);
1904 /* somebody is fast */
1905 found = find_domain(pdev);
1906 if (found != NULL) {
1907 spin_unlock_irqrestore(&device_domain_lock, flags);
1908 if (found != domain) {
1909 domain_exit(domain);
1910 domain = found;
1912 free_devinfo_mem(info);
1913 return domain;
1915 list_add(&info->link, &domain->devices);
1916 list_add(&info->global, &device_domain_list);
1917 pdev->dev.archdata.iommu = info;
1918 spin_unlock_irqrestore(&device_domain_lock, flags);
1919 return domain;
1920 error:
1921 /* recheck it here, maybe others set it */
1922 return find_domain(pdev);
1925 static int iommu_identity_mapping;
1926 #define IDENTMAP_ALL 1
1927 #define IDENTMAP_GFX 2
1928 #define IDENTMAP_AZALIA 4
1930 static int iommu_domain_identity_map(struct dmar_domain *domain,
1931 unsigned long long start,
1932 unsigned long long end)
1934 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1935 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1937 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1938 dma_to_mm_pfn(last_vpfn))) {
1939 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1940 return -ENOMEM;
1943 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1944 start, end, domain->id);
1946 * The RMRR range might overlap with the physical memory range,
1947 * clear it first
1949 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1951 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1952 last_vpfn - first_vpfn + 1,
1953 DMA_PTE_READ|DMA_PTE_WRITE);
1956 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1957 unsigned long long start,
1958 unsigned long long end)
1960 struct dmar_domain *domain;
1961 int ret;
1963 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1964 if (!domain)
1965 return -ENOMEM;
1967 /* For _hardware_ passthrough, don't bother. But for software
1968 passthrough, we do it anyway -- it may indicate a memory
1969 range which is reserved in E820 and so didn't get set
1970 up to start with in si_domain */
1971 if (domain == si_domain && hw_pass_through) {
1972 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1973 pci_name(pdev), start, end);
1974 return 0;
1977 printk(KERN_INFO
1978 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1979 pci_name(pdev), start, end);
1981 if (end < start) {
1982 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1983 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1984 dmi_get_system_info(DMI_BIOS_VENDOR),
1985 dmi_get_system_info(DMI_BIOS_VERSION),
1986 dmi_get_system_info(DMI_PRODUCT_VERSION));
1987 ret = -EIO;
1988 goto error;
1991 if (end >> agaw_to_width(domain->agaw)) {
1992 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1993 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1994 agaw_to_width(domain->agaw),
1995 dmi_get_system_info(DMI_BIOS_VENDOR),
1996 dmi_get_system_info(DMI_BIOS_VERSION),
1997 dmi_get_system_info(DMI_PRODUCT_VERSION));
1998 ret = -EIO;
1999 goto error;
2002 ret = iommu_domain_identity_map(domain, start, end);
2003 if (ret)
2004 goto error;
2006 /* context entry init */
2007 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2008 if (ret)
2009 goto error;
2011 return 0;
2013 error:
2014 domain_exit(domain);
2015 return ret;
2018 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2019 struct pci_dev *pdev)
2021 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2022 return 0;
2023 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2024 rmrr->end_address + 1);
2027 #ifdef CONFIG_DMAR_FLOPPY_WA
2028 static inline void iommu_prepare_isa(void)
2030 struct pci_dev *pdev;
2031 int ret;
2033 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2034 if (!pdev)
2035 return;
2037 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2038 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2040 if (ret)
2041 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2042 "floppy might not work\n");
2045 #else
2046 static inline void iommu_prepare_isa(void)
2048 return;
2050 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2052 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2054 static int __init si_domain_work_fn(unsigned long start_pfn,
2055 unsigned long end_pfn, void *datax)
2057 int *ret = datax;
2059 *ret = iommu_domain_identity_map(si_domain,
2060 (uint64_t)start_pfn << PAGE_SHIFT,
2061 (uint64_t)end_pfn << PAGE_SHIFT);
2062 return *ret;
2066 static int __init si_domain_init(int hw)
2068 struct dmar_drhd_unit *drhd;
2069 struct intel_iommu *iommu;
2070 int nid, ret = 0;
2072 si_domain = alloc_domain();
2073 if (!si_domain)
2074 return -EFAULT;
2076 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2078 for_each_active_iommu(iommu, drhd) {
2079 ret = iommu_attach_domain(si_domain, iommu);
2080 if (ret) {
2081 domain_exit(si_domain);
2082 return -EFAULT;
2086 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2087 domain_exit(si_domain);
2088 return -EFAULT;
2091 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2093 if (hw)
2094 return 0;
2096 for_each_online_node(nid) {
2097 work_with_active_regions(nid, si_domain_work_fn, &ret);
2098 if (ret)
2099 return ret;
2102 return 0;
2105 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2106 struct pci_dev *pdev);
2107 static int identity_mapping(struct pci_dev *pdev)
2109 struct device_domain_info *info;
2111 if (likely(!iommu_identity_mapping))
2112 return 0;
2114 info = pdev->dev.archdata.iommu;
2115 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2116 return (info->domain == si_domain);
2118 return 0;
2121 static int domain_add_dev_info(struct dmar_domain *domain,
2122 struct pci_dev *pdev,
2123 int translation)
2125 struct device_domain_info *info;
2126 unsigned long flags;
2127 int ret;
2129 info = alloc_devinfo_mem();
2130 if (!info)
2131 return -ENOMEM;
2133 ret = domain_context_mapping(domain, pdev, translation);
2134 if (ret) {
2135 free_devinfo_mem(info);
2136 return ret;
2139 info->segment = pci_domain_nr(pdev->bus);
2140 info->bus = pdev->bus->number;
2141 info->devfn = pdev->devfn;
2142 info->dev = pdev;
2143 info->domain = domain;
2145 spin_lock_irqsave(&device_domain_lock, flags);
2146 list_add(&info->link, &domain->devices);
2147 list_add(&info->global, &device_domain_list);
2148 pdev->dev.archdata.iommu = info;
2149 spin_unlock_irqrestore(&device_domain_lock, flags);
2151 return 0;
2154 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2156 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2157 return 1;
2159 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2160 return 1;
2162 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2163 return 0;
2166 * We want to start off with all devices in the 1:1 domain, and
2167 * take them out later if we find they can't access all of memory.
2169 * However, we can't do this for PCI devices behind bridges,
2170 * because all PCI devices behind the same bridge will end up
2171 * with the same source-id on their transactions.
2173 * Practically speaking, we can't change things around for these
2174 * devices at run-time, because we can't be sure there'll be no
2175 * DMA transactions in flight for any of their siblings.
2177 * So PCI devices (unless they're on the root bus) as well as
2178 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2179 * the 1:1 domain, just in _case_ one of their siblings turns out
2180 * not to be able to map all of memory.
2182 if (!pci_is_pcie(pdev)) {
2183 if (!pci_is_root_bus(pdev->bus))
2184 return 0;
2185 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2186 return 0;
2187 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2188 return 0;
2191 * At boot time, we don't yet know if devices will be 64-bit capable.
2192 * Assume that they will -- if they turn out not to be, then we can
2193 * take them out of the 1:1 domain later.
2195 if (!startup) {
2197 * If the device's dma_mask is less than the system's memory
2198 * size then this is not a candidate for identity mapping.
2200 u64 dma_mask = pdev->dma_mask;
2202 if (pdev->dev.coherent_dma_mask &&
2203 pdev->dev.coherent_dma_mask < dma_mask)
2204 dma_mask = pdev->dev.coherent_dma_mask;
2206 return dma_mask >= dma_get_required_mask(&pdev->dev);
2209 return 1;
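/*
 * Illustrative example (not from this file): on a machine with 8GiB of
 * RAM, dma_get_required_mask() reports a mask wider than 32 bits, so a
 * device whose dma_mask is DMA_BIT_MASK(32) fails the check above once
 * startup is over and is given a translated (non-identity) domain
 * instead of staying in the 1:1 domain.
 */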
2212 static int __init iommu_prepare_static_identity_mapping(int hw)
2214 struct pci_dev *pdev = NULL;
2215 int ret;
2217 ret = si_domain_init(hw);
2218 if (ret)
2219 return -EFAULT;
2221 for_each_pci_dev(pdev) {
2222 /* Skip Host/PCI Bridge devices */
2223 if (IS_BRIDGE_HOST_DEVICE(pdev))
2224 continue;
2225 if (iommu_should_identity_map(pdev, 1)) {
2226 printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2227 hw ? "hardware" : "software", pci_name(pdev));
2229 ret = domain_add_dev_info(si_domain, pdev,
2230 hw ? CONTEXT_TT_PASS_THROUGH :
2231 CONTEXT_TT_MULTI_LEVEL);
2232 if (ret)
2233 return ret;
2237 return 0;
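/*
 * With hardware pass-through the context entries above are programmed
 * as CONTEXT_TT_PASS_THROUGH, so the IOMMU forwards DMA untranslated;
 * otherwise devices are attached to si_domain with
 * CONTEXT_TT_MULTI_LEVEL and go through the 1:1 page tables built by
 * si_domain_init().
 */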
2240 static int __init init_dmars(int force_on)
2242 struct dmar_drhd_unit *drhd;
2243 struct dmar_rmrr_unit *rmrr;
2244 struct pci_dev *pdev;
2245 struct intel_iommu *iommu;
2246 int i, ret;
2249 * for each drhd
2250 * allocate root
2251 * initialize and program root entry to not present
2252 * endfor
2254 for_each_drhd_unit(drhd) {
2255 g_num_of_iommus++;
2257 * lock not needed as this is only incremented in the
2258 * single-threaded kernel __init code path; all other
2259 * accesses are read-only
2263 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2264 GFP_KERNEL);
2265 if (!g_iommus) {
2266 printk(KERN_ERR "Allocating global iommu array failed\n");
2267 ret = -ENOMEM;
2268 goto error;
2271 deferred_flush = kzalloc(g_num_of_iommus *
2272 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2273 if (!deferred_flush) {
2274 ret = -ENOMEM;
2275 goto error;
2278 for_each_drhd_unit(drhd) {
2279 if (drhd->ignored)
2280 continue;
2282 iommu = drhd->iommu;
2283 g_iommus[iommu->seq_id] = iommu;
2285 ret = iommu_init_domains(iommu);
2286 if (ret)
2287 goto error;
2290 * TBD:
2291 * we could share the same root & context tables
2292 * among all IOMMUs. Need to split it later.
2294 ret = iommu_alloc_root_entry(iommu);
2295 if (ret) {
2296 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2297 goto error;
2299 if (!ecap_pass_through(iommu->ecap))
2300 hw_pass_through = 0;
2304 * Start from the sane iommu hardware state.
2306 for_each_drhd_unit(drhd) {
2307 if (drhd->ignored)
2308 continue;
2310 iommu = drhd->iommu;
2313 * If queued invalidation was already initialized by us
2314 * (for example, while enabling interrupt-remapping), then
2315 * things are already rolling from a sane state.
2317 if (iommu->qi)
2318 continue;
2321 * Clear any previous faults.
2323 dmar_fault(-1, iommu);
2325 * Disable queued invalidation if supported and already enabled
2326 * before OS handover.
2328 dmar_disable_qi(iommu);
2331 for_each_drhd_unit(drhd) {
2332 if (drhd->ignored)
2333 continue;
2335 iommu = drhd->iommu;
2337 if (dmar_enable_qi(iommu)) {
2339 * Queued Invalidate not enabled, use Register Based
2340 * Invalidate
2342 iommu->flush.flush_context = __iommu_flush_context;
2343 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2344 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2345 "invalidation\n",
2346 iommu->seq_id,
2347 (unsigned long long)drhd->reg_base_addr);
2348 } else {
2349 iommu->flush.flush_context = qi_flush_context;
2350 iommu->flush.flush_iotlb = qi_flush_iotlb;
2351 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2352 "invalidation\n",
2353 iommu->seq_id,
2354 (unsigned long long)drhd->reg_base_addr);
2358 if (iommu_pass_through)
2359 iommu_identity_mapping |= IDENTMAP_ALL;
2361 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2362 iommu_identity_mapping |= IDENTMAP_GFX;
2363 #endif
2365 check_tylersburg_isoch();
2368 * If pass-through is not set or not enabled, set up context entries
2369 * for identity mappings for RMRR, GFX and ISA, and fall back to static
2370 * identity mapping if iommu_identity_mapping is set.
2372 if (iommu_identity_mapping) {
2373 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2374 if (ret) {
2375 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2376 goto error;
2380 * For each rmrr
2381 * for each dev attached to rmrr
2382 * do
2383 * locate drhd for dev, alloc domain for dev
2384 * allocate free domain
2385 * allocate page table entries for rmrr
2386 * if context not allocated for bus
2387 * allocate and init context
2388 * set present in root table for this bus
2389 * init context with domain, translation etc
2390 * endfor
2391 * endfor
2393 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2394 for_each_rmrr_units(rmrr) {
2395 for (i = 0; i < rmrr->devices_cnt; i++) {
2396 pdev = rmrr->devices[i];
2398 * some BIOSes list non-existent devices in the
2399 * DMAR table.
2401 if (!pdev)
2402 continue;
2403 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2404 if (ret)
2405 printk(KERN_ERR
2406 "IOMMU: mapping reserved region failed\n");
2410 iommu_prepare_isa();
2413 * for each drhd
2414 * enable fault log
2415 * global invalidate context cache
2416 * global invalidate iotlb
2417 * enable translation
2419 for_each_drhd_unit(drhd) {
2420 if (drhd->ignored) {
2422 * we always have to disable PMRs or DMA may fail on
2423 * this device
2425 if (force_on)
2426 iommu_disable_protect_mem_regions(drhd->iommu);
2427 continue;
2429 iommu = drhd->iommu;
2431 iommu_flush_write_buffer(iommu);
2433 ret = dmar_set_interrupt(iommu);
2434 if (ret)
2435 goto error;
2437 iommu_set_root_entry(iommu);
2439 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2440 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2442 ret = iommu_enable_translation(iommu);
2443 if (ret)
2444 goto error;
2446 iommu_disable_protect_mem_regions(iommu);
2449 return 0;
2450 error:
2451 for_each_drhd_unit(drhd) {
2452 if (drhd->ignored)
2453 continue;
2454 iommu = drhd->iommu;
2455 free_iommu(iommu);
2457 kfree(g_iommus);
2458 return ret;
2461 /* This takes a number of _MM_ pages, not VTD pages */
2462 static struct iova *intel_alloc_iova(struct device *dev,
2463 struct dmar_domain *domain,
2464 unsigned long nrpages, uint64_t dma_mask)
2466 struct pci_dev *pdev = to_pci_dev(dev);
2467 struct iova *iova = NULL;
2469 /* Restrict dma_mask to the width that the iommu can handle */
2470 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2472 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2474 * First try to allocate an io virtual address in
2475 * DMA_BIT_MASK(32) and if that fails then try allocating
2476 * from higher range
2478 iova = alloc_iova(&domain->iovad, nrpages,
2479 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2480 if (iova)
2481 return iova;
2483 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2484 if (unlikely(!iova)) {
2485 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2486 nrpages, pci_name(pdev));
2487 return NULL;
2490 return iova;
2493 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2495 struct dmar_domain *domain;
2496 int ret;
2498 domain = get_domain_for_dev(pdev,
2499 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2500 if (!domain) {
2501 printk(KERN_ERR
2502 "Allocating domain for %s failed", pci_name(pdev));
2503 return NULL;
2506 /* make sure context mapping is ok */
2507 if (unlikely(!domain_context_mapped(pdev))) {
2508 ret = domain_context_mapping(domain, pdev,
2509 CONTEXT_TT_MULTI_LEVEL);
2510 if (ret) {
2511 printk(KERN_ERR
2512 "Domain context map for %s failed",
2513 pci_name(pdev));
2514 return NULL;
2518 return domain;
2521 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2523 struct device_domain_info *info;
2525 /* No lock here, assumes no domain exit in normal case */
2526 info = dev->dev.archdata.iommu;
2527 if (likely(info))
2528 return info->domain;
2530 return __get_valid_domain_for_dev(dev);
2533 static int iommu_dummy(struct pci_dev *pdev)
2535 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2538 /* Check if the pdev needs to go through non-identity map and unmap process. */
2539 static int iommu_no_mapping(struct device *dev)
2541 struct pci_dev *pdev;
2542 int found;
2544 if (unlikely(dev->bus != &pci_bus_type))
2545 return 1;
2547 pdev = to_pci_dev(dev);
2548 if (iommu_dummy(pdev))
2549 return 1;
2551 if (!iommu_identity_mapping)
2552 return 0;
2554 found = identity_mapping(pdev);
2555 if (found) {
2556 if (iommu_should_identity_map(pdev, 0))
2557 return 1;
2558 else {
2560 * A 32-bit DMA device is removed from si_domain and falls
2561 * back to non-identity mapping.
2563 domain_remove_one_dev_info(si_domain, pdev);
2564 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2565 pci_name(pdev));
2566 return 0;
2568 } else {
2570 * If a 64-bit DMA device is detached from a VM, the device
2571 * is put back into si_domain for identity mapping.
2573 if (iommu_should_identity_map(pdev, 0)) {
2574 int ret;
2575 ret = domain_add_dev_info(si_domain, pdev,
2576 hw_pass_through ?
2577 CONTEXT_TT_PASS_THROUGH :
2578 CONTEXT_TT_MULTI_LEVEL);
2579 if (!ret) {
2580 printk(KERN_INFO "64bit %s uses identity mapping\n",
2581 pci_name(pdev));
2582 return 1;
2587 return 0;
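/*
 * Note that membership of si_domain is dynamic: a device that can no
 * longer satisfy iommu_should_identity_map() is demoted to a private
 * translated domain, while a 64-bit capable device (e.g. one returned
 * from a virtual machine) is put back into si_domain here.
 */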
2590 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2591 size_t size, int dir, u64 dma_mask)
2593 struct pci_dev *pdev = to_pci_dev(hwdev);
2594 struct dmar_domain *domain;
2595 phys_addr_t start_paddr;
2596 struct iova *iova;
2597 int prot = 0;
2598 int ret;
2599 struct intel_iommu *iommu;
2600 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2602 BUG_ON(dir == DMA_NONE);
2604 if (iommu_no_mapping(hwdev))
2605 return paddr;
2607 domain = get_valid_domain_for_dev(pdev);
2608 if (!domain)
2609 return 0;
2611 iommu = domain_get_iommu(domain);
2612 size = aligned_nrpages(paddr, size);
2614 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2615 if (!iova)
2616 goto error;
2619 * Check if DMAR supports zero-length reads on write only
2620 * mappings..
2622 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2623 !cap_zlr(iommu->cap))
2624 prot |= DMA_PTE_READ;
2625 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2626 prot |= DMA_PTE_WRITE;
2628 * The range paddr to (paddr + size) might cover only part of a
2629 * page; we should map the whole page. Note: if two parts of one
2630 * page are mapped separately, we might have two guest addresses
2631 * mapping to the same host paddr, but this is not a big problem
2633 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2634 mm_to_dma_pfn(paddr_pfn), size, prot);
2635 if (ret)
2636 goto error;
2638 /* it's a non-present to present mapping. Only flush if caching mode */
2639 if (cap_caching_mode(iommu->cap))
2640 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2641 else
2642 iommu_flush_write_buffer(iommu);
2644 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2645 start_paddr += paddr & ~PAGE_MASK;
2646 return start_paddr;
2648 error:
2649 if (iova)
2650 __free_iova(&domain->iovad, iova);
2651 printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2652 pci_name(pdev), size, (unsigned long long)paddr, dir);
2653 return 0;
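/*
 * Summary of the map path above: allocate an IOVA range sized in
 * pages, choose read/write protections (honouring cap_zlr()), fill the
 * domain page tables with domain_pfn_mapping(), flush (a PSI IOTLB
 * flush in caching mode, otherwise just the write buffer) and return
 * the IOVA plus the sub-page offset of paddr.
 */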
2656 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2657 unsigned long offset, size_t size,
2658 enum dma_data_direction dir,
2659 struct dma_attrs *attrs)
2661 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2662 dir, to_pci_dev(dev)->dma_mask);
2665 static void flush_unmaps(void)
2667 int i, j;
2669 timer_on = 0;
2671 /* just flush them all */
2672 for (i = 0; i < g_num_of_iommus; i++) {
2673 struct intel_iommu *iommu = g_iommus[i];
2674 if (!iommu)
2675 continue;
2677 if (!deferred_flush[i].next)
2678 continue;
2680 /* In caching mode, global flushes make emulation expensive */
2681 if (!cap_caching_mode(iommu->cap))
2682 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2683 DMA_TLB_GLOBAL_FLUSH);
2684 for (j = 0; j < deferred_flush[i].next; j++) {
2685 unsigned long mask;
2686 struct iova *iova = deferred_flush[i].iova[j];
2687 struct dmar_domain *domain = deferred_flush[i].domain[j];
2689 /* On real hardware multiple invalidations are expensive */
2690 if (cap_caching_mode(iommu->cap))
2691 iommu_flush_iotlb_psi(iommu, domain->id,
2692 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2693 else {
2694 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2695 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2696 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2698 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2700 deferred_flush[i].next = 0;
2703 list_size = 0;
2706 static void flush_unmaps_timeout(unsigned long data)
2708 unsigned long flags;
2710 spin_lock_irqsave(&async_umap_flush_lock, flags);
2711 flush_unmaps();
2712 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2715 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2717 unsigned long flags;
2718 int next, iommu_id;
2719 struct intel_iommu *iommu;
2721 spin_lock_irqsave(&async_umap_flush_lock, flags);
2722 if (list_size == HIGH_WATER_MARK)
2723 flush_unmaps();
2725 iommu = domain_get_iommu(dom);
2726 iommu_id = iommu->seq_id;
2728 next = deferred_flush[iommu_id].next;
2729 deferred_flush[iommu_id].domain[next] = dom;
2730 deferred_flush[iommu_id].iova[next] = iova;
2731 deferred_flush[iommu_id].next++;
2733 if (!timer_on) {
2734 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2735 timer_on = 1;
2737 list_size++;
2738 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
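/*
 * In non-strict mode unmaps are batched per IOMMU in deferred_flush[]:
 * the queue is drained either when HIGH_WATER_MARK entries accumulate
 * or when the 10ms unmap_timer fires, amortising the cost of IOTLB
 * invalidation across many unmaps.
 */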
2741 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2742 size_t size, enum dma_data_direction dir,
2743 struct dma_attrs *attrs)
2745 struct pci_dev *pdev = to_pci_dev(dev);
2746 struct dmar_domain *domain;
2747 unsigned long start_pfn, last_pfn;
2748 struct iova *iova;
2749 struct intel_iommu *iommu;
2751 if (iommu_no_mapping(dev))
2752 return;
2754 domain = find_domain(pdev);
2755 BUG_ON(!domain);
2757 iommu = domain_get_iommu(domain);
2759 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2760 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2761 (unsigned long long)dev_addr))
2762 return;
2764 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2765 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2767 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2768 pci_name(pdev), start_pfn, last_pfn);
2770 /* clear the whole page */
2771 dma_pte_clear_range(domain, start_pfn, last_pfn);
2773 /* free page tables */
2774 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2776 if (intel_iommu_strict) {
2777 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2778 last_pfn - start_pfn + 1, 0);
2779 /* free iova */
2780 __free_iova(&domain->iovad, iova);
2781 } else {
2782 add_unmap(domain, iova);
2784 * queue up the release of the unmap to save the 1/6th of the
2785 * cpu used up by the iotlb flush operation...
2790 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2791 dma_addr_t *dma_handle, gfp_t flags)
2793 void *vaddr;
2794 int order;
2796 size = PAGE_ALIGN(size);
2797 order = get_order(size);
2799 if (!iommu_no_mapping(hwdev))
2800 flags &= ~(GFP_DMA | GFP_DMA32);
2801 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2802 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2803 flags |= GFP_DMA;
2804 else
2805 flags |= GFP_DMA32;
2808 vaddr = (void *)__get_free_pages(flags, order);
2809 if (!vaddr)
2810 return NULL;
2811 memset(vaddr, 0, size);
2813 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2814 DMA_BIDIRECTIONAL,
2815 hwdev->coherent_dma_mask);
2816 if (*dma_handle)
2817 return vaddr;
2818 free_pages((unsigned long)vaddr, order);
2819 return NULL;
2822 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2823 dma_addr_t dma_handle)
2825 int order;
2827 size = PAGE_ALIGN(size);
2828 order = get_order(size);
2830 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2831 free_pages((unsigned long)vaddr, order);
2834 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2835 int nelems, enum dma_data_direction dir,
2836 struct dma_attrs *attrs)
2838 struct pci_dev *pdev = to_pci_dev(hwdev);
2839 struct dmar_domain *domain;
2840 unsigned long start_pfn, last_pfn;
2841 struct iova *iova;
2842 struct intel_iommu *iommu;
2844 if (iommu_no_mapping(hwdev))
2845 return;
2847 domain = find_domain(pdev);
2848 BUG_ON(!domain);
2850 iommu = domain_get_iommu(domain);
2852 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2853 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2854 (unsigned long long)sglist[0].dma_address))
2855 return;
2857 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2858 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2860 /* clear the whole page */
2861 dma_pte_clear_range(domain, start_pfn, last_pfn);
2863 /* free page tables */
2864 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2866 if (intel_iommu_strict) {
2867 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2868 last_pfn - start_pfn + 1, 0);
2869 /* free iova */
2870 __free_iova(&domain->iovad, iova);
2871 } else {
2872 add_unmap(domain, iova);
2874 * queue up the release of the unmap to save the 1/6th of the
2875 * cpu used up by the iotlb flush operation...
2880 static int intel_nontranslate_map_sg(struct device *hddev,
2881 struct scatterlist *sglist, int nelems, int dir)
2883 int i;
2884 struct scatterlist *sg;
2886 for_each_sg(sglist, sg, nelems, i) {
2887 BUG_ON(!sg_page(sg));
2888 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2889 sg->dma_length = sg->length;
2891 return nelems;
2894 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2895 enum dma_data_direction dir, struct dma_attrs *attrs)
2897 int i;
2898 struct pci_dev *pdev = to_pci_dev(hwdev);
2899 struct dmar_domain *domain;
2900 size_t size = 0;
2901 int prot = 0;
2902 struct iova *iova = NULL;
2903 int ret;
2904 struct scatterlist *sg;
2905 unsigned long start_vpfn;
2906 struct intel_iommu *iommu;
2908 BUG_ON(dir == DMA_NONE);
2909 if (iommu_no_mapping(hwdev))
2910 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2912 domain = get_valid_domain_for_dev(pdev);
2913 if (!domain)
2914 return 0;
2916 iommu = domain_get_iommu(domain);
2918 for_each_sg(sglist, sg, nelems, i)
2919 size += aligned_nrpages(sg->offset, sg->length);
2921 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2922 pdev->dma_mask);
2923 if (!iova) {
2924 sglist->dma_length = 0;
2925 return 0;
2929 * Check if DMAR supports zero-length reads on write only
2930 * mappings..
2932 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2933 !cap_zlr(iommu->cap))
2934 prot |= DMA_PTE_READ;
2935 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2936 prot |= DMA_PTE_WRITE;
2938 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2940 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2941 if (unlikely(ret)) {
2942 /* clear the page */
2943 dma_pte_clear_range(domain, start_vpfn,
2944 start_vpfn + size - 1);
2945 /* free page tables */
2946 dma_pte_free_pagetable(domain, start_vpfn,
2947 start_vpfn + size - 1);
2948 /* free iova */
2949 __free_iova(&domain->iovad, iova);
2950 return 0;
2953 /* it's a non-present to present mapping. Only flush if caching mode */
2954 if (cap_caching_mode(iommu->cap))
2955 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2956 else
2957 iommu_flush_write_buffer(iommu);
2959 return nelems;
2962 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2964 return !dma_addr;
2967 struct dma_map_ops intel_dma_ops = {
2968 .alloc_coherent = intel_alloc_coherent,
2969 .free_coherent = intel_free_coherent,
2970 .map_sg = intel_map_sg,
2971 .unmap_sg = intel_unmap_sg,
2972 .map_page = intel_map_page,
2973 .unmap_page = intel_unmap_page,
2974 .mapping_error = intel_mapping_error,
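/*
 * Illustrative usage (not part of this file): once dma_ops points at
 * intel_dma_ops (see intel_iommu_init() below), an ordinary driver
 * call such as
 *
 *	dma_addr_t handle = dma_map_page(&pdev->dev, page, 0, len,
 *					 DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		return -EIO;
 *
 * is routed to intel_map_page() and intel_mapping_error() above.
 */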
2977 static inline int iommu_domain_cache_init(void)
2979 int ret = 0;
2981 iommu_domain_cache = kmem_cache_create("iommu_domain",
2982 sizeof(struct dmar_domain),
2983 0,
2984 SLAB_HWCACHE_ALIGN,
2986 NULL);
2987 if (!iommu_domain_cache) {
2988 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2989 ret = -ENOMEM;
2992 return ret;
2995 static inline int iommu_devinfo_cache_init(void)
2997 int ret = 0;
2999 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3000 sizeof(struct device_domain_info),
3001 0,
3002 SLAB_HWCACHE_ALIGN,
3003 NULL);
3004 if (!iommu_devinfo_cache) {
3005 printk(KERN_ERR "Couldn't create devinfo cache\n");
3006 ret = -ENOMEM;
3009 return ret;
3012 static inline int iommu_iova_cache_init(void)
3014 int ret = 0;
3016 iommu_iova_cache = kmem_cache_create("iommu_iova",
3017 sizeof(struct iova),
3018 0,
3019 SLAB_HWCACHE_ALIGN,
3020 NULL);
3021 if (!iommu_iova_cache) {
3022 printk(KERN_ERR "Couldn't create iova cache\n");
3023 ret = -ENOMEM;
3026 return ret;
3029 static int __init iommu_init_mempool(void)
3031 int ret;
3032 ret = iommu_iova_cache_init();
3033 if (ret)
3034 return ret;
3036 ret = iommu_domain_cache_init();
3037 if (ret)
3038 goto domain_error;
3040 ret = iommu_devinfo_cache_init();
3041 if (!ret)
3042 return ret;
3044 kmem_cache_destroy(iommu_domain_cache);
3045 domain_error:
3046 kmem_cache_destroy(iommu_iova_cache);
3048 return -ENOMEM;
3051 static void __init iommu_exit_mempool(void)
3053 kmem_cache_destroy(iommu_devinfo_cache);
3054 kmem_cache_destroy(iommu_domain_cache);
3055 kmem_cache_destroy(iommu_iova_cache);
3059 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3061 struct dmar_drhd_unit *drhd;
3062 u32 vtbar;
3063 int rc;
3065 /* We know that this device on this chipset has its own IOMMU.
3066 * If we find it under a different IOMMU, then the BIOS is lying
3067 * to us. Hope that the IOMMU for this device is actually
3068 * disabled, and it needs no translation...
3070 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3071 if (rc) {
3072 /* "can't" happen */
3073 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3074 return;
3076 vtbar &= 0xffff0000;
3078 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3079 drhd = dmar_find_matched_drhd_unit(pdev);
3080 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3081 TAINT_FIRMWARE_WORKAROUND,
3082 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3083 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3085 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3087 static void __init init_no_remapping_devices(void)
3089 struct dmar_drhd_unit *drhd;
3091 for_each_drhd_unit(drhd) {
3092 if (!drhd->include_all) {
3093 int i;
3094 for (i = 0; i < drhd->devices_cnt; i++)
3095 if (drhd->devices[i] != NULL)
3096 break;
3097 /* ignore DMAR unit if no pci devices exist */
3098 if (i == drhd->devices_cnt)
3099 drhd->ignored = 1;
3103 if (dmar_map_gfx)
3104 return;
3106 for_each_drhd_unit(drhd) {
3107 int i;
3108 if (drhd->ignored || drhd->include_all)
3109 continue;
3111 for (i = 0; i < drhd->devices_cnt; i++)
3112 if (drhd->devices[i] &&
3113 !IS_GFX_DEVICE(drhd->devices[i]))
3114 break;
3116 if (i < drhd->devices_cnt)
3117 continue;
3119 /* bypass IOMMU if it is just for gfx devices */
3120 drhd->ignored = 1;
3121 for (i = 0; i < drhd->devices_cnt; i++) {
3122 if (!drhd->devices[i])
3123 continue;
3124 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
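/*
 * init_no_remapping_devices(): DRHD units whose device scope lists no
 * present devices are marked ignored; unless dmar_map_gfx is set,
 * units covering only graphics devices are ignored too and their
 * devices are tagged DUMMY_DEVICE_DOMAIN_INFO so they bypass
 * translation entirely.
 */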
3129 #ifdef CONFIG_SUSPEND
3130 static int init_iommu_hw(void)
3132 struct dmar_drhd_unit *drhd;
3133 struct intel_iommu *iommu = NULL;
3135 for_each_active_iommu(iommu, drhd)
3136 if (iommu->qi)
3137 dmar_reenable_qi(iommu);
3139 for_each_active_iommu(iommu, drhd) {
3140 iommu_flush_write_buffer(iommu);
3142 iommu_set_root_entry(iommu);
3144 iommu->flush.flush_context(iommu, 0, 0, 0,
3145 DMA_CCMD_GLOBAL_INVL);
3146 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3147 DMA_TLB_GLOBAL_FLUSH);
3148 iommu_enable_translation(iommu);
3149 iommu_disable_protect_mem_regions(iommu);
3152 return 0;
3155 static void iommu_flush_all(void)
3157 struct dmar_drhd_unit *drhd;
3158 struct intel_iommu *iommu;
3160 for_each_active_iommu(iommu, drhd) {
3161 iommu->flush.flush_context(iommu, 0, 0, 0,
3162 DMA_CCMD_GLOBAL_INVL);
3163 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3164 DMA_TLB_GLOBAL_FLUSH);
3168 static int iommu_suspend(void)
3170 struct dmar_drhd_unit *drhd;
3171 struct intel_iommu *iommu = NULL;
3172 unsigned long flag;
3174 for_each_active_iommu(iommu, drhd) {
3175 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3176 GFP_ATOMIC);
3177 if (!iommu->iommu_state)
3178 goto nomem;
3181 iommu_flush_all();
3183 for_each_active_iommu(iommu, drhd) {
3184 iommu_disable_translation(iommu);
3186 spin_lock_irqsave(&iommu->register_lock, flag);
3188 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3189 readl(iommu->reg + DMAR_FECTL_REG);
3190 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3191 readl(iommu->reg + DMAR_FEDATA_REG);
3192 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3193 readl(iommu->reg + DMAR_FEADDR_REG);
3194 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3195 readl(iommu->reg + DMAR_FEUADDR_REG);
3197 spin_unlock_irqrestore(&iommu->register_lock, flag);
3199 return 0;
3201 nomem:
3202 for_each_active_iommu(iommu, drhd)
3203 kfree(iommu->iommu_state);
3205 return -ENOMEM;
3208 static void iommu_resume(void)
3210 struct dmar_drhd_unit *drhd;
3211 struct intel_iommu *iommu = NULL;
3212 unsigned long flag;
3214 if (init_iommu_hw()) {
3215 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3216 return;
3219 for_each_active_iommu(iommu, drhd) {
3221 spin_lock_irqsave(&iommu->register_lock, flag);
3223 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3224 iommu->reg + DMAR_FECTL_REG);
3225 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3226 iommu->reg + DMAR_FEDATA_REG);
3227 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3228 iommu->reg + DMAR_FEADDR_REG);
3229 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3230 iommu->reg + DMAR_FEUADDR_REG);
3232 spin_unlock_irqrestore(&iommu->register_lock, flag);
3235 for_each_active_iommu(iommu, drhd)
3236 kfree(iommu->iommu_state);
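/*
 * Only the four fault-event registers are saved per IOMMU across
 * suspend; iommu_resume() relies on init_iommu_hw() to reprogram the
 * root entries, queued invalidation and translation, and then rewrites
 * the saved fault-event registers.
 */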
3239 static struct syscore_ops iommu_syscore_ops = {
3240 .resume = iommu_resume,
3241 .suspend = iommu_suspend,
3244 static void __init init_iommu_pm_ops(void)
3246 register_syscore_ops(&iommu_syscore_ops);
3249 #else
3250 static inline void init_iommu_pm_ops(void) {}
3251 #endif /* CONFIG_SUSPEND */
3254 * Here we only respond to the action of a device being unbound from
3256 * its driver. A newly added device is not attached to its DMAR domain
3257 * here yet; that will happen when the device is mapped to an iova.
3259 static int device_notifier(struct notifier_block *nb,
3260 unsigned long action, void *data)
3262 struct device *dev = data;
3263 struct pci_dev *pdev = to_pci_dev(dev);
3264 struct dmar_domain *domain;
3266 if (iommu_no_mapping(dev))
3267 return 0;
3269 domain = find_domain(pdev);
3270 if (!domain)
3271 return 0;
3273 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3274 domain_remove_one_dev_info(domain, pdev);
3276 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3277 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3278 list_empty(&domain->devices))
3279 domain_exit(domain);
3282 return 0;
3285 static struct notifier_block device_nb = {
3286 .notifier_call = device_notifier,
3289 int __init intel_iommu_init(void)
3291 int ret = 0;
3292 int force_on = 0;
3294 /* VT-d is required for a TXT/tboot launch, so enforce that */
3295 force_on = tboot_force_iommu();
3297 if (dmar_table_init()) {
3298 if (force_on)
3299 panic("tboot: Failed to initialize DMAR table\n");
3300 return -ENODEV;
3303 if (dmar_dev_scope_init()) {
3304 if (force_on)
3305 panic("tboot: Failed to initialize DMAR device scope\n");
3306 return -ENODEV;
3310 * Check the need for DMA-remapping initialization now.
3311 * Above initialization will also be used by Interrupt-remapping.
3313 if (no_iommu || dmar_disabled)
3314 return -ENODEV;
3316 if (iommu_init_mempool()) {
3317 if (force_on)
3318 panic("tboot: Failed to initialize iommu memory\n");
3319 return -ENODEV;
3322 if (dmar_init_reserved_ranges()) {
3323 if (force_on)
3324 panic("tboot: Failed to reserve iommu ranges\n");
3325 return -ENODEV;
3328 init_no_remapping_devices();
3330 ret = init_dmars(force_on);
3331 if (ret) {
3332 if (force_on)
3333 panic("tboot: Failed to initialize DMARs\n");
3334 printk(KERN_ERR "IOMMU: dmar init failed\n");
3335 put_iova_domain(&reserved_iova_list);
3336 iommu_exit_mempool();
3337 return ret;
3339 printk(KERN_INFO
3340 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3342 init_timer(&unmap_timer);
3343 #ifdef CONFIG_SWIOTLB
3344 swiotlb = 0;
3345 #endif
3346 dma_ops = &intel_dma_ops;
3348 init_iommu_pm_ops();
3350 register_iommu(&intel_iommu_ops);
3352 bus_register_notifier(&pci_bus_type, &device_nb);
3354 return 0;
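/*
 * The ordering above matters: the DMAR table and device scope are
 * parsed first, mempools and reserved IOVA ranges are set up, then
 * init_dmars() programs the hardware, and only after that are
 * intel_dma_ops installed, the IOMMU API ops registered and the bus
 * notifier added.
 */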
3357 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3358 struct pci_dev *pdev)
3360 struct pci_dev *tmp, *parent;
3362 if (!iommu || !pdev)
3363 return;
3365 /* dependent device detach */
3366 tmp = pci_find_upstream_pcie_bridge(pdev);
3367 /* Secondary interface's bus number and devfn 0 */
3368 if (tmp) {
3369 parent = pdev->bus->self;
3370 while (parent != tmp) {
3371 iommu_detach_dev(iommu, parent->bus->number,
3372 parent->devfn);
3373 parent = parent->bus->self;
3375 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3376 iommu_detach_dev(iommu,
3377 tmp->subordinate->number, 0);
3378 else /* this is a legacy PCI bridge */
3379 iommu_detach_dev(iommu, tmp->bus->number,
3380 tmp->devfn);
3384 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3385 struct pci_dev *pdev)
3387 struct device_domain_info *info;
3388 struct intel_iommu *iommu;
3389 unsigned long flags;
3390 int found = 0;
3391 struct list_head *entry, *tmp;
3393 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3394 pdev->devfn);
3395 if (!iommu)
3396 return;
3398 spin_lock_irqsave(&device_domain_lock, flags);
3399 list_for_each_safe(entry, tmp, &domain->devices) {
3400 info = list_entry(entry, struct device_domain_info, link);
3401 /* No need to compare PCI domain; it has to be the same */
3402 if (info->bus == pdev->bus->number &&
3403 info->devfn == pdev->devfn) {
3404 list_del(&info->link);
3405 list_del(&info->global);
3406 if (info->dev)
3407 info->dev->dev.archdata.iommu = NULL;
3408 spin_unlock_irqrestore(&device_domain_lock, flags);
3410 iommu_disable_dev_iotlb(info);
3411 iommu_detach_dev(iommu, info->bus, info->devfn);
3412 iommu_detach_dependent_devices(iommu, pdev);
3413 free_devinfo_mem(info);
3415 spin_lock_irqsave(&device_domain_lock, flags);
3417 if (found)
3418 break;
3419 else
3420 continue;
3423 /* if there are no other devices under the same iommu
3424 * owned by this domain, clear this iommu in iommu_bmp and
3425 * update the iommu count and coherency
3427 if (iommu == device_to_iommu(info->segment, info->bus,
3428 info->devfn))
3429 found = 1;
3432 if (found == 0) {
3433 unsigned long tmp_flags;
3434 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3435 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3436 domain->iommu_count--;
3437 domain_update_iommu_cap(domain);
3438 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3440 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3441 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3442 spin_lock_irqsave(&iommu->lock, tmp_flags);
3443 clear_bit(domain->id, iommu->domain_ids);
3444 iommu->domains[domain->id] = NULL;
3445 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3449 spin_unlock_irqrestore(&device_domain_lock, flags);
3452 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3454 struct device_domain_info *info;
3455 struct intel_iommu *iommu;
3456 unsigned long flags1, flags2;
3458 spin_lock_irqsave(&device_domain_lock, flags1);
3459 while (!list_empty(&domain->devices)) {
3460 info = list_entry(domain->devices.next,
3461 struct device_domain_info, link);
3462 list_del(&info->link);
3463 list_del(&info->global);
3464 if (info->dev)
3465 info->dev->dev.archdata.iommu = NULL;
3467 spin_unlock_irqrestore(&device_domain_lock, flags1);
3469 iommu_disable_dev_iotlb(info);
3470 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3471 iommu_detach_dev(iommu, info->bus, info->devfn);
3472 iommu_detach_dependent_devices(iommu, info->dev);
3474 /* clear this iommu in iommu_bmp, update iommu count
3475 * and capabilities
3477 spin_lock_irqsave(&domain->iommu_lock, flags2);
3478 if (test_and_clear_bit(iommu->seq_id,
3479 &domain->iommu_bmp)) {
3480 domain->iommu_count--;
3481 domain_update_iommu_cap(domain);
3483 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3485 free_devinfo_mem(info);
3486 spin_lock_irqsave(&device_domain_lock, flags1);
3488 spin_unlock_irqrestore(&device_domain_lock, flags1);
3491 /* domain id for virtual machine, it won't be set in context */
3492 static unsigned long vm_domid;
3494 static struct dmar_domain *iommu_alloc_vm_domain(void)
3496 struct dmar_domain *domain;
3498 domain = alloc_domain_mem();
3499 if (!domain)
3500 return NULL;
3502 domain->id = vm_domid++;
3503 domain->nid = -1;
3504 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3505 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3507 return domain;
3510 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3512 int adjust_width;
3514 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3515 spin_lock_init(&domain->iommu_lock);
3517 domain_reserve_special_ranges(domain);
3519 /* calculate AGAW */
3520 domain->gaw = guest_width;
3521 adjust_width = guestwidth_to_adjustwidth(guest_width);
3522 domain->agaw = width_to_agaw(adjust_width);
3524 INIT_LIST_HEAD(&domain->devices);
3526 domain->iommu_count = 0;
3527 domain->iommu_coherency = 0;
3528 domain->iommu_snooping = 0;
3529 domain->max_addr = 0;
3530 domain->nid = -1;
3532 /* always allocate the top pgd */
3533 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3534 if (!domain->pgd)
3535 return -ENOMEM;
3536 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3537 return 0;
3540 static void iommu_free_vm_domain(struct dmar_domain *domain)
3542 unsigned long flags;
3543 struct dmar_drhd_unit *drhd;
3544 struct intel_iommu *iommu;
3545 unsigned long i;
3546 unsigned long ndomains;
3548 for_each_drhd_unit(drhd) {
3549 if (drhd->ignored)
3550 continue;
3551 iommu = drhd->iommu;
3553 ndomains = cap_ndoms(iommu->cap);
3554 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3555 if (iommu->domains[i] == domain) {
3556 spin_lock_irqsave(&iommu->lock, flags);
3557 clear_bit(i, iommu->domain_ids);
3558 iommu->domains[i] = NULL;
3559 spin_unlock_irqrestore(&iommu->lock, flags);
3560 break;
3566 static void vm_domain_exit(struct dmar_domain *domain)
3568 /* Domain 0 is reserved, so don't process it */
3569 if (!domain)
3570 return;
3572 vm_domain_remove_all_dev_info(domain);
3573 /* destroy iovas */
3574 put_iova_domain(&domain->iovad);
3576 /* clear ptes */
3577 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3579 /* free page tables */
3580 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3582 iommu_free_vm_domain(domain);
3583 free_domain_mem(domain);
3586 static int intel_iommu_domain_init(struct iommu_domain *domain)
3588 struct dmar_domain *dmar_domain;
3590 dmar_domain = iommu_alloc_vm_domain();
3591 if (!dmar_domain) {
3592 printk(KERN_ERR
3593 "intel_iommu_domain_init: dmar_domain == NULL\n");
3594 return -ENOMEM;
3596 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3597 printk(KERN_ERR
3598 "intel_iommu_domain_init() failed\n");
3599 vm_domain_exit(dmar_domain);
3600 return -ENOMEM;
3602 domain->priv = dmar_domain;
3604 return 0;
3607 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3609 struct dmar_domain *dmar_domain = domain->priv;
3611 domain->priv = NULL;
3612 vm_domain_exit(dmar_domain);
3615 static int intel_iommu_attach_device(struct iommu_domain *domain,
3616 struct device *dev)
3618 struct dmar_domain *dmar_domain = domain->priv;
3619 struct pci_dev *pdev = to_pci_dev(dev);
3620 struct intel_iommu *iommu;
3621 int addr_width;
3623 /* normally pdev is not mapped */
3624 if (unlikely(domain_context_mapped(pdev))) {
3625 struct dmar_domain *old_domain;
3627 old_domain = find_domain(pdev);
3628 if (old_domain) {
3629 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3630 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3631 domain_remove_one_dev_info(old_domain, pdev);
3632 else
3633 domain_remove_dev_info(old_domain);
3637 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3638 pdev->devfn);
3639 if (!iommu)
3640 return -ENODEV;
3642 /* check if this iommu agaw is sufficient for max mapped address */
3643 addr_width = agaw_to_width(iommu->agaw);
3644 if (addr_width > cap_mgaw(iommu->cap))
3645 addr_width = cap_mgaw(iommu->cap);
3647 if (dmar_domain->max_addr > (1LL << addr_width)) {
3648 printk(KERN_ERR "%s: iommu width (%d) is not "
3649 "sufficient for the mapped address (%llx)\n",
3650 __func__, addr_width, dmar_domain->max_addr);
3651 return -EFAULT;
3653 dmar_domain->gaw = addr_width;
3656 * Knock out extra levels of page tables if necessary
3658 while (iommu->agaw < dmar_domain->agaw) {
3659 struct dma_pte *pte;
3661 pte = dmar_domain->pgd;
3662 if (dma_pte_present(pte)) {
3663 dmar_domain->pgd = (struct dma_pte *)
3664 phys_to_virt(dma_pte_addr(pte));
3665 free_pgtable_page(pte);
3667 dmar_domain->agaw--;
3670 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3673 static void intel_iommu_detach_device(struct iommu_domain *domain,
3674 struct device *dev)
3676 struct dmar_domain *dmar_domain = domain->priv;
3677 struct pci_dev *pdev = to_pci_dev(dev);
3679 domain_remove_one_dev_info(dmar_domain, pdev);
3682 static int intel_iommu_map(struct iommu_domain *domain,
3683 unsigned long iova, phys_addr_t hpa,
3684 int gfp_order, int iommu_prot)
3686 struct dmar_domain *dmar_domain = domain->priv;
3687 u64 max_addr;
3688 int prot = 0;
3689 size_t size;
3690 int ret;
3692 if (iommu_prot & IOMMU_READ)
3693 prot |= DMA_PTE_READ;
3694 if (iommu_prot & IOMMU_WRITE)
3695 prot |= DMA_PTE_WRITE;
3696 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3697 prot |= DMA_PTE_SNP;
3699 size = PAGE_SIZE << gfp_order;
3700 max_addr = iova + size;
3701 if (dmar_domain->max_addr < max_addr) {
3702 u64 end;
3704 /* check if minimum agaw is sufficient for mapped address */
3705 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3706 if (end < max_addr) {
3707 printk(KERN_ERR "%s: iommu width (%d) is not "
3708 "sufficient for the mapped address (%llx)\n",
3709 __func__, dmar_domain->gaw, max_addr);
3710 return -EFAULT;
3712 dmar_domain->max_addr = max_addr;
3714 /* Round up size to next multiple of PAGE_SIZE, if it and
3715 the low bits of hpa would take us onto the next page */
3716 size = aligned_nrpages(hpa, size);
3717 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3718 hpa >> VTD_PAGE_SHIFT, size, prot);
3719 return ret;
3722 static int intel_iommu_unmap(struct iommu_domain *domain,
3723 unsigned long iova, int gfp_order)
3725 struct dmar_domain *dmar_domain = domain->priv;
3726 size_t size = PAGE_SIZE << gfp_order;
3728 dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3729 (iova + size - 1) >> VTD_PAGE_SHIFT);
3731 if (dmar_domain->max_addr == iova + size)
3732 dmar_domain->max_addr = iova;
3734 return gfp_order;
3737 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3738 unsigned long iova)
3740 struct dmar_domain *dmar_domain = domain->priv;
3741 struct dma_pte *pte;
3742 u64 phys = 0;
3744 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3745 if (pte)
3746 phys = dma_pte_addr(pte);
3748 return phys;
3751 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3752 unsigned long cap)
3754 struct dmar_domain *dmar_domain = domain->priv;
3756 if (cap == IOMMU_CAP_CACHE_COHERENCY)
3757 return dmar_domain->iommu_snooping;
3758 if (cap == IOMMU_CAP_INTR_REMAP)
3759 return intr_remapping_enabled;
3761 return 0;
3764 static struct iommu_ops intel_iommu_ops = {
3765 .domain_init = intel_iommu_domain_init,
3766 .domain_destroy = intel_iommu_domain_destroy,
3767 .attach_dev = intel_iommu_attach_device,
3768 .detach_dev = intel_iommu_detach_device,
3769 .map = intel_iommu_map,
3770 .unmap = intel_iommu_unmap,
3771 .iova_to_phys = intel_iommu_iova_to_phys,
3772 .domain_has_cap = intel_iommu_domain_has_cap,
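/*
 * Illustrative usage (not part of this file): after
 * register_iommu(&intel_iommu_ops), callers such as KVM device
 * assignment reach these callbacks through the generic IOMMU API, e.g.
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, hpa, 0, IOMMU_READ | IOMMU_WRITE);
 *
 * which end up in intel_iommu_domain_init(), intel_iommu_attach_device()
 * and intel_iommu_map() respectively.
 */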
3775 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3778 * Mobile 4 Series Chipset neglects to set RWBF capability,
3779 * but needs it:
3781 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3782 rwbf_quirk = 1;
3784 /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3785 if (dev->revision == 0x07) {
3786 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3787 dmar_map_gfx = 0;
3791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3793 #define GGC 0x52
3794 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
3795 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
3796 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
3797 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
3798 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
3799 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
3800 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
3801 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
3803 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3805 unsigned short ggc;
3807 if (pci_read_config_word(dev, GGC, &ggc))
3808 return;
3810 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3811 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3812 dmar_map_gfx = 0;
3815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3820 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3821 ISOCH DMAR unit for the Azalia sound device, but not give it any
3822 TLB entries, which causes it to deadlock. Check for that. We do
3823 this in a function called from init_dmars(), instead of in a PCI
3824 quirk, because we don't want to print the obnoxious "BIOS broken"
3825 message if VT-d is actually disabled.
3827 static void __init check_tylersburg_isoch(void)
3829 struct pci_dev *pdev;
3830 uint32_t vtisochctrl;
3832 /* If there's no Azalia in the system anyway, forget it. */
3833 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3834 if (!pdev)
3835 return;
3836 pci_dev_put(pdev);
3838 /* System Management Registers. Might be hidden, in which case
3839 we can't do the sanity check. But that's OK, because the
3840 known-broken BIOSes _don't_ actually hide it, so far. */
3841 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3842 if (!pdev)
3843 return;
3845 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3846 pci_dev_put(pdev);
3847 return;
3850 pci_dev_put(pdev);
3852 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3853 if (vtisochctrl & 1)
3854 return;
3856 /* Drop all bits other than the number of TLB entries */
3857 vtisochctrl &= 0x1c;
3859 /* If we have the recommended number of TLB entries (16), fine. */
3860 if (vtisochctrl == 0x10)
3861 return;
3863 /* Zero TLB entries? You get to ride the short bus to school. */
3864 if (!vtisochctrl) {
3865 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3866 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3867 dmi_get_system_info(DMI_BIOS_VENDOR),
3868 dmi_get_system_info(DMI_BIOS_VERSION),
3869 dmi_get_system_info(DMI_PRODUCT_VERSION));
3870 iommu_identity_mapping |= IDENTMAP_AZALIA;
3871 return;
3874 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3875 vtisochctrl);