/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 * Author: Fenghua Yu <fenghua.yu@intel.com>
 */
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define MAX_AGAW_WIDTH 64

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76 are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn
)
79 return dma_pfn
>> (PAGE_SHIFT
- VTD_PAGE_SHIFT
);
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn
)
84 return mm_pfn
<< (PAGE_SHIFT
- VTD_PAGE_SHIFT
);
/* VT-d page frame number of the first 4KiB page of @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
/* VT-d page frame number of the page backing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu
**g_iommus
;
98 static void __init
check_tylersburg_isoch(void);
99 static int rwbf_quirk
;
104 * 12-63: Context Ptr (12 - (haw-1))
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry
*root
)
114 return (root
->val
& 1);
116 static inline void set_root_present(struct root_entry
*root
)
120 static inline void set_root_value(struct root_entry
*root
, unsigned long value
)
122 root
->val
|= value
& VTD_PAGE_MASK
;
125 static inline struct context_entry
*
126 get_context_addr_from_root(struct root_entry
*root
)
128 return (struct context_entry
*)
129 (root_present(root
)?phys_to_virt(
130 root
->val
& VTD_PAGE_MASK
) :
137 * 1: fault processing disable
138 * 2-3: translation type
139 * 12-63: address space root
145 struct context_entry
{
150 static inline bool context_present(struct context_entry
*context
)
152 return (context
->lo
& 1);
154 static inline void context_set_present(struct context_entry
*context
)
159 static inline void context_set_fault_enable(struct context_entry
*context
)
161 context
->lo
&= (((u64
)-1) << 2) | 1;
164 static inline void context_set_translation_type(struct context_entry
*context
,
167 context
->lo
&= (((u64
)-1) << 4) | 3;
168 context
->lo
|= (value
& 3) << 2;
171 static inline void context_set_address_root(struct context_entry
*context
,
174 context
->lo
|= value
& VTD_PAGE_MASK
;
177 static inline void context_set_address_width(struct context_entry
*context
,
180 context
->hi
|= value
& 7;
183 static inline void context_set_domain_id(struct context_entry
*context
,
186 context
->hi
|= (value
& ((1 << 16) - 1)) << 8;
189 static inline void context_clear_entry(struct context_entry
*context
)
202 * 12-63: Host physcial address
208 static inline void dma_clear_pte(struct dma_pte
*pte
)
213 static inline void dma_set_pte_readable(struct dma_pte
*pte
)
215 pte
->val
|= DMA_PTE_READ
;
218 static inline void dma_set_pte_writable(struct dma_pte
*pte
)
220 pte
->val
|= DMA_PTE_WRITE
;
223 static inline void dma_set_pte_snp(struct dma_pte
*pte
)
225 pte
->val
|= DMA_PTE_SNP
;
228 static inline void dma_set_pte_prot(struct dma_pte
*pte
, unsigned long prot
)
230 pte
->val
= (pte
->val
& ~3) | (prot
& 3);
233 static inline u64
dma_pte_addr(struct dma_pte
*pte
)
236 return pte
->val
& VTD_PAGE_MASK
;
238 /* Must have a full atomic 64-bit read */
239 return __cmpxchg64(pte
, 0ULL, 0ULL) & VTD_PAGE_MASK
;
243 static inline void dma_set_pte_pfn(struct dma_pte
*pte
, unsigned long pfn
)
245 pte
->val
|= (uint64_t)pfn
<< VTD_PAGE_SHIFT
;
248 static inline bool dma_pte_present(struct dma_pte
*pte
)
250 return (pte
->val
& 3) != 0;
253 static inline int first_pte_in_page(struct dma_pte
*pte
)
255 return !((unsigned long)pte
& ~VTD_PAGE_MASK
);
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creats a static 1:1 mapping to all usable memory.
 * 	2. It maps to each iommu if successful.
 *	3. Each iommu mapps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/* domain represents a virtual machine, more than one devices
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)

/* si_domain contains mulitple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)
279 int id
; /* domain id */
280 int nid
; /* node id */
281 unsigned long iommu_bmp
; /* bitmap of iommus this domain uses*/
283 struct list_head devices
; /* all devices' list */
284 struct iova_domain iovad
; /* iova's that belong to this domain */
286 struct dma_pte
*pgd
; /* virtual address */
287 int gaw
; /* max guest address width */
289 /* adjusted guest address width, 0 is level 2 30-bit */
292 int flags
; /* flags to find out type of domain */
294 int iommu_coherency
;/* indicate coherency of iommu access */
295 int iommu_snooping
; /* indicate snooping control feature*/
296 int iommu_count
; /* reference count of iommu */
297 spinlock_t iommu_lock
; /* protect iommu set in domain */
298 u64 max_addr
; /* maximum mapped address */
301 /* PCI domain-device relationship */
302 struct device_domain_info
{
303 struct list_head link
; /* link to domain siblings */
304 struct list_head global
; /* link to global list */
305 int segment
; /* PCI domain */
306 u8 bus
; /* PCI bus number */
307 u8 devfn
; /* PCI devfn number */
308 struct pci_dev
*dev
; /* it's NULL for PCIe-to-PCI bridge */
309 struct intel_iommu
*iommu
; /* IOMMU used by this device */
310 struct dmar_domain
*domain
; /* pointer to domain */
313 static void flush_unmaps_timeout(unsigned long data
);
315 DEFINE_TIMER(unmap_timer
, flush_unmaps_timeout
, 0, 0);
#define HIGH_WATER_MARK 250
/* Per-IOMMU table of IOVAs whose IOTLB flush has been deferred. */
struct deferred_flush_tables {
	int next;	/* number of queued entries */
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus
;
329 static DEFINE_SPINLOCK(async_umap_flush_lock
);
330 static LIST_HEAD(unmaps_to_do
);
333 static long list_size
;
335 static void domain_remove_dev_info(struct dmar_domain
*domain
);
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled
= 0;
340 int dmar_disabled
= 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
343 static int __initdata dmar_map_gfx
= 1;
344 static int dmar_forcedac
;
345 static int intel_iommu_strict
;
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock
);
349 static LIST_HEAD(device_domain_list
);
351 static struct iommu_ops intel_iommu_ops
;
353 static int __init
intel_iommu_setup(char *str
)
358 if (!strncmp(str
, "on", 2)) {
360 printk(KERN_INFO
"Intel-IOMMU: enabled\n");
361 } else if (!strncmp(str
, "off", 3)) {
363 printk(KERN_INFO
"Intel-IOMMU: disabled\n");
364 } else if (!strncmp(str
, "igfx_off", 8)) {
367 "Intel-IOMMU: disable GFX device mapping\n");
368 } else if (!strncmp(str
, "forcedac", 8)) {
370 "Intel-IOMMU: Forcing DAC for PCI devices\n");
372 } else if (!strncmp(str
, "strict", 6)) {
374 "Intel-IOMMU: disable batched IOTLB flush\n");
375 intel_iommu_strict
= 1;
378 str
+= strcspn(str
, ",");
384 __setup("intel_iommu=", intel_iommu_setup
);
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;
390 static inline void *alloc_pgtable_page(int node
)
395 page
= alloc_pages_node(node
, GFP_ATOMIC
| __GFP_ZERO
, 0);
397 vaddr
= page_address(page
);
/* Release a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
406 static inline void *alloc_domain_mem(void)
408 return kmem_cache_alloc(iommu_domain_cache
, GFP_ATOMIC
);
411 static void free_domain_mem(void *vaddr
)
413 kmem_cache_free(iommu_domain_cache
, vaddr
);
416 static inline void * alloc_devinfo_mem(void)
418 return kmem_cache_alloc(iommu_devinfo_cache
, GFP_ATOMIC
);
421 static inline void free_devinfo_mem(void *vaddr
)
423 kmem_cache_free(iommu_devinfo_cache
, vaddr
);
426 struct iova
*alloc_iova_mem(void)
428 return kmem_cache_alloc(iommu_iova_cache
, GFP_ATOMIC
);
431 void free_iova_mem(struct iova
*iova
)
433 kmem_cache_free(iommu_iova_cache
, iova
);
437 static inline int width_to_agaw(int width
);
439 static int __iommu_calculate_agaw(struct intel_iommu
*iommu
, int max_gaw
)
444 sagaw
= cap_sagaw(iommu
->cap
);
445 for (agaw
= width_to_agaw(max_gaw
);
447 if (test_bit(agaw
, &sagaw
))
455 * Calculate max SAGAW for each iommu.
457 int iommu_calculate_max_sagaw(struct intel_iommu
*iommu
)
459 return __iommu_calculate_agaw(iommu
, MAX_AGAW_WIDTH
);
463 * calculate agaw for each iommu.
464 * "SAGAW" may be different across iommus, use a default agaw, and
465 * get a supported less agaw for iommus that don't support the default agaw.
467 int iommu_calculate_agaw(struct intel_iommu
*iommu
)
469 return __iommu_calculate_agaw(iommu
, DEFAULT_DOMAIN_ADDRESS_WIDTH
);
472 /* This functionin only returns single iommu in a domain */
473 static struct intel_iommu
*domain_get_iommu(struct dmar_domain
*domain
)
477 /* si_domain and vm domain should not get here. */
478 BUG_ON(domain
->flags
& DOMAIN_FLAG_VIRTUAL_MACHINE
);
479 BUG_ON(domain
->flags
& DOMAIN_FLAG_STATIC_IDENTITY
);
481 iommu_id
= find_first_bit(&domain
->iommu_bmp
, g_num_of_iommus
);
482 if (iommu_id
< 0 || iommu_id
>= g_num_of_iommus
)
485 return g_iommus
[iommu_id
];
488 static void domain_update_iommu_coherency(struct dmar_domain
*domain
)
492 domain
->iommu_coherency
= 1;
494 i
= find_first_bit(&domain
->iommu_bmp
, g_num_of_iommus
);
495 for (; i
< g_num_of_iommus
; ) {
496 if (!ecap_coherent(g_iommus
[i
]->ecap
)) {
497 domain
->iommu_coherency
= 0;
500 i
= find_next_bit(&domain
->iommu_bmp
, g_num_of_iommus
, i
+1);
504 static void domain_update_iommu_snooping(struct dmar_domain
*domain
)
508 domain
->iommu_snooping
= 1;
510 i
= find_first_bit(&domain
->iommu_bmp
, g_num_of_iommus
);
511 for (; i
< g_num_of_iommus
; ) {
512 if (!ecap_sc_support(g_iommus
[i
]->ecap
)) {
513 domain
->iommu_snooping
= 0;
516 i
= find_next_bit(&domain
->iommu_bmp
, g_num_of_iommus
, i
+1);
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
}
527 static struct intel_iommu
*device_to_iommu(int segment
, u8 bus
, u8 devfn
)
529 struct dmar_drhd_unit
*drhd
= NULL
;
532 for_each_drhd_unit(drhd
) {
535 if (segment
!= drhd
->segment
)
538 for (i
= 0; i
< drhd
->devices_cnt
; i
++) {
539 if (drhd
->devices
[i
] &&
540 drhd
->devices
[i
]->bus
->number
== bus
&&
541 drhd
->devices
[i
]->devfn
== devfn
)
543 if (drhd
->devices
[i
] &&
544 drhd
->devices
[i
]->subordinate
&&
545 drhd
->devices
[i
]->subordinate
->number
<= bus
&&
546 drhd
->devices
[i
]->subordinate
->subordinate
>= bus
)
550 if (drhd
->include_all
)
557 static void domain_flush_cache(struct dmar_domain
*domain
,
558 void *addr
, int size
)
560 if (!domain
->iommu_coherency
)
561 clflush_cache_range(addr
, size
);
564 /* Gets context entry for a given bus and devfn */
565 static struct context_entry
* device_to_context_entry(struct intel_iommu
*iommu
,
568 struct root_entry
*root
;
569 struct context_entry
*context
;
570 unsigned long phy_addr
;
573 spin_lock_irqsave(&iommu
->lock
, flags
);
574 root
= &iommu
->root_entry
[bus
];
575 context
= get_context_addr_from_root(root
);
577 context
= (struct context_entry
*)
578 alloc_pgtable_page(iommu
->node
);
580 spin_unlock_irqrestore(&iommu
->lock
, flags
);
583 __iommu_flush_cache(iommu
, (void *)context
, CONTEXT_SIZE
);
584 phy_addr
= virt_to_phys((void *)context
);
585 set_root_value(root
, phy_addr
);
586 set_root_present(root
);
587 __iommu_flush_cache(iommu
, root
, sizeof(*root
));
589 spin_unlock_irqrestore(&iommu
->lock
, flags
);
590 return &context
[devfn
];
593 static int device_context_mapped(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
595 struct root_entry
*root
;
596 struct context_entry
*context
;
600 spin_lock_irqsave(&iommu
->lock
, flags
);
601 root
= &iommu
->root_entry
[bus
];
602 context
= get_context_addr_from_root(root
);
607 ret
= context_present(&context
[devfn
]);
609 spin_unlock_irqrestore(&iommu
->lock
, flags
);
613 static void clear_context_table(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
615 struct root_entry
*root
;
616 struct context_entry
*context
;
619 spin_lock_irqsave(&iommu
->lock
, flags
);
620 root
= &iommu
->root_entry
[bus
];
621 context
= get_context_addr_from_root(root
);
623 context_clear_entry(&context
[devfn
]);
624 __iommu_flush_cache(iommu
, &context
[devfn
], \
627 spin_unlock_irqrestore(&iommu
->lock
, flags
);
630 static void free_context_table(struct intel_iommu
*iommu
)
632 struct root_entry
*root
;
635 struct context_entry
*context
;
637 spin_lock_irqsave(&iommu
->lock
, flags
);
638 if (!iommu
->root_entry
) {
641 for (i
= 0; i
< ROOT_ENTRY_NR
; i
++) {
642 root
= &iommu
->root_entry
[i
];
643 context
= get_context_addr_from_root(root
);
645 free_pgtable_page(context
);
647 free_pgtable_page(iommu
->root_entry
);
648 iommu
->root_entry
= NULL
;
650 spin_unlock_irqrestore(&iommu
->lock
, flags
);
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
/* Page-table depth for a given AGAW (agaw 0 => 2-level, 30-bit). */
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}
662 static inline int agaw_to_width(int agaw
)
664 return 30 + agaw
* LEVEL_STRIDE
;
668 static inline int width_to_agaw(int width
)
670 return (width
- 30) / LEVEL_STRIDE
;
673 static inline unsigned int level_to_offset_bits(int level
)
675 return (level
- 1) * LEVEL_STRIDE
;
678 static inline int pfn_level_offset(unsigned long pfn
, int level
)
680 return (pfn
>> level_to_offset_bits(level
)) & LEVEL_MASK
;
/* Mask selecting the bits of a pfn at or above level @level's boundary. */
static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}
/* Number of base pfns covered by one entry at level @level. */
static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}
/* Round @pfn up to the next boundary of level @level. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}
698 static struct dma_pte
*pfn_to_dma_pte(struct dmar_domain
*domain
,
701 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
702 struct dma_pte
*parent
, *pte
= NULL
;
703 int level
= agaw_to_level(domain
->agaw
);
706 BUG_ON(!domain
->pgd
);
707 BUG_ON(addr_width
< BITS_PER_LONG
&& pfn
>> addr_width
);
708 parent
= domain
->pgd
;
713 offset
= pfn_level_offset(pfn
, level
);
714 pte
= &parent
[offset
];
718 if (!dma_pte_present(pte
)) {
721 tmp_page
= alloc_pgtable_page(domain
->nid
);
726 domain_flush_cache(domain
, tmp_page
, VTD_PAGE_SIZE
);
727 pteval
= ((uint64_t)virt_to_dma_pfn(tmp_page
) << VTD_PAGE_SHIFT
) | DMA_PTE_READ
| DMA_PTE_WRITE
;
728 if (cmpxchg64(&pte
->val
, 0ULL, pteval
)) {
729 /* Someone else set it while we were thinking; use theirs. */
730 free_pgtable_page(tmp_page
);
733 domain_flush_cache(domain
, pte
, sizeof(*pte
));
736 parent
= phys_to_virt(dma_pte_addr(pte
));
743 /* return address's pte at specific level */
744 static struct dma_pte
*dma_pfn_level_pte(struct dmar_domain
*domain
,
748 struct dma_pte
*parent
, *pte
= NULL
;
749 int total
= agaw_to_level(domain
->agaw
);
752 parent
= domain
->pgd
;
753 while (level
<= total
) {
754 offset
= pfn_level_offset(pfn
, total
);
755 pte
= &parent
[offset
];
759 if (!dma_pte_present(pte
))
761 parent
= phys_to_virt(dma_pte_addr(pte
));
767 /* clear last level pte, a tlb flush should be followed */
768 static void dma_pte_clear_range(struct dmar_domain
*domain
,
769 unsigned long start_pfn
,
770 unsigned long last_pfn
)
772 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
773 struct dma_pte
*first_pte
, *pte
;
775 BUG_ON(addr_width
< BITS_PER_LONG
&& start_pfn
>> addr_width
);
776 BUG_ON(addr_width
< BITS_PER_LONG
&& last_pfn
>> addr_width
);
777 BUG_ON(start_pfn
> last_pfn
);
779 /* we don't need lock here; nobody else touches the iova range */
781 first_pte
= pte
= dma_pfn_level_pte(domain
, start_pfn
, 1);
783 start_pfn
= align_to_level(start_pfn
+ 1, 2);
790 } while (start_pfn
<= last_pfn
&& !first_pte_in_page(pte
));
792 domain_flush_cache(domain
, first_pte
,
793 (void *)pte
- (void *)first_pte
);
795 } while (start_pfn
&& start_pfn
<= last_pfn
);
798 /* free page table pages. last level pte should already be cleared */
799 static void dma_pte_free_pagetable(struct dmar_domain
*domain
,
800 unsigned long start_pfn
,
801 unsigned long last_pfn
)
803 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
804 struct dma_pte
*first_pte
, *pte
;
805 int total
= agaw_to_level(domain
->agaw
);
809 BUG_ON(addr_width
< BITS_PER_LONG
&& start_pfn
>> addr_width
);
810 BUG_ON(addr_width
< BITS_PER_LONG
&& last_pfn
>> addr_width
);
811 BUG_ON(start_pfn
> last_pfn
);
813 /* We don't need lock here; nobody else touches the iova range */
815 while (level
<= total
) {
816 tmp
= align_to_level(start_pfn
, level
);
818 /* If we can't even clear one PTE at this level, we're done */
819 if (tmp
+ level_size(level
) - 1 > last_pfn
)
823 first_pte
= pte
= dma_pfn_level_pte(domain
, tmp
, level
);
825 tmp
= align_to_level(tmp
+ 1, level
+ 1);
829 if (dma_pte_present(pte
)) {
830 free_pgtable_page(phys_to_virt(dma_pte_addr(pte
)));
834 tmp
+= level_size(level
);
835 } while (!first_pte_in_page(pte
) &&
836 tmp
+ level_size(level
) - 1 <= last_pfn
);
838 domain_flush_cache(domain
, first_pte
,
839 (void *)pte
- (void *)first_pte
);
841 } while (tmp
&& tmp
+ level_size(level
) - 1 <= last_pfn
);
845 if (start_pfn
== 0 && last_pfn
== DOMAIN_MAX_PFN(domain
->gaw
)) {
846 free_pgtable_page(domain
->pgd
);
852 static int iommu_alloc_root_entry(struct intel_iommu
*iommu
)
854 struct root_entry
*root
;
857 root
= (struct root_entry
*)alloc_pgtable_page(iommu
->node
);
861 __iommu_flush_cache(iommu
, root
, ROOT_SIZE
);
863 spin_lock_irqsave(&iommu
->lock
, flags
);
864 iommu
->root_entry
= root
;
865 spin_unlock_irqrestore(&iommu
->lock
, flags
);
870 static void iommu_set_root_entry(struct intel_iommu
*iommu
)
876 addr
= iommu
->root_entry
;
878 spin_lock_irqsave(&iommu
->register_lock
, flag
);
879 dmar_writeq(iommu
->reg
+ DMAR_RTADDR_REG
, virt_to_phys(addr
));
881 writel(iommu
->gcmd
| DMA_GCMD_SRTP
, iommu
->reg
+ DMAR_GCMD_REG
);
883 /* Make sure hardware complete it */
884 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
885 readl
, (sts
& DMA_GSTS_RTPS
), sts
);
887 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
890 static void iommu_flush_write_buffer(struct intel_iommu
*iommu
)
895 if (!rwbf_quirk
&& !cap_rwbf(iommu
->cap
))
898 spin_lock_irqsave(&iommu
->register_lock
, flag
);
899 writel(iommu
->gcmd
| DMA_GCMD_WBF
, iommu
->reg
+ DMAR_GCMD_REG
);
901 /* Make sure hardware complete it */
902 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
903 readl
, (!(val
& DMA_GSTS_WBFS
)), val
);
905 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
908 /* return value determine if we need a write buffer flush */
909 static void __iommu_flush_context(struct intel_iommu
*iommu
,
910 u16 did
, u16 source_id
, u8 function_mask
,
917 case DMA_CCMD_GLOBAL_INVL
:
918 val
= DMA_CCMD_GLOBAL_INVL
;
920 case DMA_CCMD_DOMAIN_INVL
:
921 val
= DMA_CCMD_DOMAIN_INVL
|DMA_CCMD_DID(did
);
923 case DMA_CCMD_DEVICE_INVL
:
924 val
= DMA_CCMD_DEVICE_INVL
|DMA_CCMD_DID(did
)
925 | DMA_CCMD_SID(source_id
) | DMA_CCMD_FM(function_mask
);
932 spin_lock_irqsave(&iommu
->register_lock
, flag
);
933 dmar_writeq(iommu
->reg
+ DMAR_CCMD_REG
, val
);
935 /* Make sure hardware complete it */
936 IOMMU_WAIT_OP(iommu
, DMAR_CCMD_REG
,
937 dmar_readq
, (!(val
& DMA_CCMD_ICC
)), val
);
939 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
942 /* return value determine if we need a write buffer flush */
943 static void __iommu_flush_iotlb(struct intel_iommu
*iommu
, u16 did
,
944 u64 addr
, unsigned int size_order
, u64 type
)
946 int tlb_offset
= ecap_iotlb_offset(iommu
->ecap
);
947 u64 val
= 0, val_iva
= 0;
951 case DMA_TLB_GLOBAL_FLUSH
:
952 /* global flush doesn't need set IVA_REG */
953 val
= DMA_TLB_GLOBAL_FLUSH
|DMA_TLB_IVT
;
955 case DMA_TLB_DSI_FLUSH
:
956 val
= DMA_TLB_DSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
958 case DMA_TLB_PSI_FLUSH
:
959 val
= DMA_TLB_PSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
960 /* Note: always flush non-leaf currently */
961 val_iva
= size_order
| addr
;
966 /* Note: set drain read/write */
969 * This is probably to be super secure.. Looks like we can
970 * ignore it without any impact.
972 if (cap_read_drain(iommu
->cap
))
973 val
|= DMA_TLB_READ_DRAIN
;
975 if (cap_write_drain(iommu
->cap
))
976 val
|= DMA_TLB_WRITE_DRAIN
;
978 spin_lock_irqsave(&iommu
->register_lock
, flag
);
979 /* Note: Only uses first TLB reg currently */
981 dmar_writeq(iommu
->reg
+ tlb_offset
, val_iva
);
982 dmar_writeq(iommu
->reg
+ tlb_offset
+ 8, val
);
984 /* Make sure hardware complete it */
985 IOMMU_WAIT_OP(iommu
, tlb_offset
+ 8,
986 dmar_readq
, (!(val
& DMA_TLB_IVT
)), val
);
988 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
990 /* check IOTLB invalidation granularity */
991 if (DMA_TLB_IAIG(val
) == 0)
992 printk(KERN_ERR
"IOMMU: flush IOTLB failed\n");
993 if (DMA_TLB_IAIG(val
) != DMA_TLB_IIRG(type
))
994 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
995 (unsigned long long)DMA_TLB_IIRG(type
),
996 (unsigned long long)DMA_TLB_IAIG(val
));
999 static struct device_domain_info
*iommu_support_dev_iotlb(
1000 struct dmar_domain
*domain
, int segment
, u8 bus
, u8 devfn
)
1003 unsigned long flags
;
1004 struct device_domain_info
*info
;
1005 struct intel_iommu
*iommu
= device_to_iommu(segment
, bus
, devfn
);
1007 if (!ecap_dev_iotlb_support(iommu
->ecap
))
1013 spin_lock_irqsave(&device_domain_lock
, flags
);
1014 list_for_each_entry(info
, &domain
->devices
, link
)
1015 if (info
->bus
== bus
&& info
->devfn
== devfn
) {
1019 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1021 if (!found
|| !info
->dev
)
1024 if (!pci_find_ext_capability(info
->dev
, PCI_EXT_CAP_ID_ATS
))
1027 if (!dmar_find_matched_atsr_unit(info
->dev
))
1030 info
->iommu
= iommu
;
1035 static void iommu_enable_dev_iotlb(struct device_domain_info
*info
)
1040 pci_enable_ats(info
->dev
, VTD_PAGE_SHIFT
);
1043 static void iommu_disable_dev_iotlb(struct device_domain_info
*info
)
1045 if (!info
->dev
|| !pci_ats_enabled(info
->dev
))
1048 pci_disable_ats(info
->dev
);
1051 static void iommu_flush_dev_iotlb(struct dmar_domain
*domain
,
1052 u64 addr
, unsigned mask
)
1055 unsigned long flags
;
1056 struct device_domain_info
*info
;
1058 spin_lock_irqsave(&device_domain_lock
, flags
);
1059 list_for_each_entry(info
, &domain
->devices
, link
) {
1060 if (!info
->dev
|| !pci_ats_enabled(info
->dev
))
1063 sid
= info
->bus
<< 8 | info
->devfn
;
1064 qdep
= pci_ats_queue_depth(info
->dev
);
1065 qi_flush_dev_iotlb(info
->iommu
, sid
, qdep
, addr
, mask
);
1067 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1070 static void iommu_flush_iotlb_psi(struct intel_iommu
*iommu
, u16 did
,
1071 unsigned long pfn
, unsigned int pages
)
1073 unsigned int mask
= ilog2(__roundup_pow_of_two(pages
));
1074 uint64_t addr
= (uint64_t)pfn
<< VTD_PAGE_SHIFT
;
1079 * Fallback to domain selective flush if no PSI support or the size is
1081 * PSI requires page size to be 2 ^ x, and the base address is naturally
1082 * aligned to the size
1084 if (!cap_pgsel_inv(iommu
->cap
) || mask
> cap_max_amask_val(iommu
->cap
))
1085 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0,
1088 iommu
->flush
.flush_iotlb(iommu
, did
, addr
, mask
,
1092 * In caching mode, domain ID 0 is reserved for non-present to present
1093 * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1095 if (!cap_caching_mode(iommu
->cap
) || did
)
1096 iommu_flush_dev_iotlb(iommu
->domains
[did
], addr
, mask
);
1099 static void iommu_disable_protect_mem_regions(struct intel_iommu
*iommu
)
1102 unsigned long flags
;
1104 spin_lock_irqsave(&iommu
->register_lock
, flags
);
1105 pmen
= readl(iommu
->reg
+ DMAR_PMEN_REG
);
1106 pmen
&= ~DMA_PMEN_EPM
;
1107 writel(pmen
, iommu
->reg
+ DMAR_PMEN_REG
);
1109 /* wait for the protected region status bit to clear */
1110 IOMMU_WAIT_OP(iommu
, DMAR_PMEN_REG
,
1111 readl
, !(pmen
& DMA_PMEN_PRS
), pmen
);
1113 spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1116 static int iommu_enable_translation(struct intel_iommu
*iommu
)
1119 unsigned long flags
;
1121 spin_lock_irqsave(&iommu
->register_lock
, flags
);
1122 iommu
->gcmd
|= DMA_GCMD_TE
;
1123 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1125 /* Make sure hardware complete it */
1126 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1127 readl
, (sts
& DMA_GSTS_TES
), sts
);
1129 spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1133 static int iommu_disable_translation(struct intel_iommu
*iommu
)
1138 spin_lock_irqsave(&iommu
->register_lock
, flag
);
1139 iommu
->gcmd
&= ~DMA_GCMD_TE
;
1140 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1142 /* Make sure hardware complete it */
1143 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1144 readl
, (!(sts
& DMA_GSTS_TES
)), sts
);
1146 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1151 static int iommu_init_domains(struct intel_iommu
*iommu
)
1153 unsigned long ndomains
;
1154 unsigned long nlongs
;
1156 ndomains
= cap_ndoms(iommu
->cap
);
1157 pr_debug("Number of Domains supportd <%ld>\n", ndomains
);
1158 nlongs
= BITS_TO_LONGS(ndomains
);
1160 spin_lock_init(&iommu
->lock
);
1162 /* TBD: there might be 64K domains,
1163 * consider other allocation for future chip
1165 iommu
->domain_ids
= kcalloc(nlongs
, sizeof(unsigned long), GFP_KERNEL
);
1166 if (!iommu
->domain_ids
) {
1167 printk(KERN_ERR
"Allocating domain id array failed\n");
1170 iommu
->domains
= kcalloc(ndomains
, sizeof(struct dmar_domain
*),
1172 if (!iommu
->domains
) {
1173 printk(KERN_ERR
"Allocating domain array failed\n");
1178 * if Caching mode is set, then invalid translations are tagged
1179 * with domainid 0. Hence we need to pre-allocate it.
1181 if (cap_caching_mode(iommu
->cap
))
1182 set_bit(0, iommu
->domain_ids
);
static void domain_exit(struct dmar_domain *domain);
static void vm_domain_exit(struct dmar_domain *domain);
1190 void free_dmar_iommu(struct intel_iommu
*iommu
)
1192 struct dmar_domain
*domain
;
1194 unsigned long flags
;
1196 if ((iommu
->domains
) && (iommu
->domain_ids
)) {
1197 i
= find_first_bit(iommu
->domain_ids
, cap_ndoms(iommu
->cap
));
1198 for (; i
< cap_ndoms(iommu
->cap
); ) {
1199 domain
= iommu
->domains
[i
];
1200 clear_bit(i
, iommu
->domain_ids
);
1202 spin_lock_irqsave(&domain
->iommu_lock
, flags
);
1203 if (--domain
->iommu_count
== 0) {
1204 if (domain
->flags
& DOMAIN_FLAG_VIRTUAL_MACHINE
)
1205 vm_domain_exit(domain
);
1207 domain_exit(domain
);
1209 spin_unlock_irqrestore(&domain
->iommu_lock
, flags
);
1211 i
= find_next_bit(iommu
->domain_ids
,
1212 cap_ndoms(iommu
->cap
), i
+1);
1216 if (iommu
->gcmd
& DMA_GCMD_TE
)
1217 iommu_disable_translation(iommu
);
1220 set_irq_data(iommu
->irq
, NULL
);
1221 /* This will mask the irq */
1222 free_irq(iommu
->irq
, iommu
);
1223 destroy_irq(iommu
->irq
);
1226 kfree(iommu
->domains
);
1227 kfree(iommu
->domain_ids
);
1229 g_iommus
[iommu
->seq_id
] = NULL
;
1231 /* if all iommus are freed, free g_iommus */
1232 for (i
= 0; i
< g_num_of_iommus
; i
++) {
1237 if (i
== g_num_of_iommus
)
1240 /* free context mapping */
1241 free_context_table(iommu
);
1244 static struct dmar_domain
*alloc_domain(void)
1246 struct dmar_domain
*domain
;
1248 domain
= alloc_domain_mem();
1253 memset(&domain
->iommu_bmp
, 0, sizeof(unsigned long));
1259 static int iommu_attach_domain(struct dmar_domain
*domain
,
1260 struct intel_iommu
*iommu
)
1263 unsigned long ndomains
;
1264 unsigned long flags
;
1266 ndomains
= cap_ndoms(iommu
->cap
);
1268 spin_lock_irqsave(&iommu
->lock
, flags
);
1270 num
= find_first_zero_bit(iommu
->domain_ids
, ndomains
);
1271 if (num
>= ndomains
) {
1272 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1273 printk(KERN_ERR
"IOMMU: no free domain ids\n");
1278 set_bit(num
, iommu
->domain_ids
);
1279 set_bit(iommu
->seq_id
, &domain
->iommu_bmp
);
1280 iommu
->domains
[num
] = domain
;
1281 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1286 static void iommu_detach_domain(struct dmar_domain
*domain
,
1287 struct intel_iommu
*iommu
)
1289 unsigned long flags
;
1293 spin_lock_irqsave(&iommu
->lock
, flags
);
1294 ndomains
= cap_ndoms(iommu
->cap
);
1295 num
= find_first_bit(iommu
->domain_ids
, ndomains
);
1296 for (; num
< ndomains
; ) {
1297 if (iommu
->domains
[num
] == domain
) {
1301 num
= find_next_bit(iommu
->domain_ids
,
1302 cap_ndoms(iommu
->cap
), num
+1);
1306 clear_bit(num
, iommu
->domain_ids
);
1307 clear_bit(iommu
->seq_id
, &domain
->iommu_bmp
);
1308 iommu
->domains
[num
] = NULL
;
1310 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1313 static struct iova_domain reserved_iova_list
;
1314 static struct lock_class_key reserved_rbtree_key
;
/*
 * dmar_init_reserved_ranges - populate the global reserved_iova_list.
 *
 * Reserves address ranges that must never be handed out as DMA virtual
 * addresses: the IOAPIC MMIO window, and every PCI device's MMIO BARs
 * (to prevent peer-to-peer ranges from aliasing DMA).  The resulting
 * list is later copied into each new domain's iova allocator by
 * domain_reserve_special_ranges().
 *
 * NOTE(review): lossy extract -- the error-check branches around the
 * reserve_iova() calls and the reserve arguments at stale line 1341+
 * are incomplete here; verify against upstream.
 */
1316 static void dmar_init_reserved_ranges(void)
1318 struct pci_dev
*pdev
= NULL
;
1322 init_iova_domain(&reserved_iova_list
, DMA_32BIT_PFN
);
/* Give the shared reserved list its own lockdep class. */
1324 lockdep_set_class(&reserved_iova_list
.iova_rbtree_lock
,
1325 &reserved_rbtree_key
);
1327 /* IOAPIC ranges shouldn't be accessed by DMA */
1328 iova
= reserve_iova(&reserved_iova_list
, IOVA_PFN(IOAPIC_RANGE_START
),
1329 IOVA_PFN(IOAPIC_RANGE_END
));
1331 printk(KERN_ERR
"Reserve IOAPIC range failed\n");
1333 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1334 for_each_pci_dev(pdev
) {
1337 for (i
= 0; i
< PCI_NUM_RESOURCES
; i
++) {
1338 r
= &pdev
->resource
[i
];
/* Skip unset or non-memory resources (I/O ports etc.). */
1339 if (!r
->flags
|| !(r
->flags
& IORESOURCE_MEM
))
1341 iova
= reserve_iova(&reserved_iova_list
,
1345 printk(KERN_ERR
"Reserve iova failed\n");
/*
 * domain_reserve_special_ranges - seed a new domain's iova allocator
 * with the globally reserved ranges (IOAPIC window, PCI MMIO) built by
 * dmar_init_reserved_ranges().
 */
1351 static void domain_reserve_special_ranges(struct dmar_domain
*domain
)
1353 copy_reserved_iova(&reserved_iova_list
, &domain
->iovad
);
/*
 * guestwidth_to_adjustwidth - round a guest address width up so it fits
 * the VT-d page-table level structure (levels cover 9 bits each above
 * the 12-bit page offset).
 *
 * NOTE(review): lossy extract -- only the remainder computation is
 * visible; the rounding (gaw + 9 - r), the 64-bit clamp and the return
 * are missing from this view.  Verify against upstream.
 */
1356 static inline int guestwidth_to_adjustwidth(int gaw
)
1359 int r
= (gaw
- 12) % 9;
/*
 * domain_init - initialize a freshly attached dmar_domain.
 *
 * Sets up the domain's iova allocator (with the global reserved ranges),
 * clamps @guest_width to what the hardware supports, derives the
 * adjusted guest address width (AGAW) -- widening it if the IOMMU does
 * not support the exact level count -- and caches per-domain coherency /
 * snoop-control capability flags taken from the managing IOMMU.
 * Finally allocates the top-level page directory on the IOMMU's node.
 *
 * NOTE(review): lossy extract -- the "no supported agaw" failure branch
 * after find_next_bit, the else-keywords pairing the coherency/snooping
 * assignments, the pgd NULL check and the return statements are missing
 * from this view; verify against upstream.
 */
1370 static int domain_init(struct dmar_domain
*domain
, int guest_width
)
1372 struct intel_iommu
*iommu
;
1373 int adjust_width
, agaw
;
1374 unsigned long sagaw
;
1376 init_iova_domain(&domain
->iovad
, DMA_32BIT_PFN
);
1377 spin_lock_init(&domain
->iommu_lock
);
1379 domain_reserve_special_ranges(domain
);
1381 /* calculate AGAW */
1382 iommu
= domain_get_iommu(domain
);
/* Never exceed the hardware's maximum guest address width. */
1383 if (guest_width
> cap_mgaw(iommu
->cap
))
1384 guest_width
= cap_mgaw(iommu
->cap
);
1385 domain
->gaw
= guest_width
;
1386 adjust_width
= guestwidth_to_adjustwidth(guest_width
);
1387 agaw
= width_to_agaw(adjust_width
);
/* Bitmap of AGAW values this IOMMU supports. */
1388 sagaw
= cap_sagaw(iommu
->cap
);
1389 if (!test_bit(agaw
, &sagaw
)) {
1390 /* hardware doesn't support it, choose a bigger one */
1391 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw
);
1392 agaw
= find_next_bit(&sagaw
, 5, agaw
);
1396 domain
->agaw
= agaw
;
1397 INIT_LIST_HEAD(&domain
->devices
);
/* Cache coherency / snoop-control capabilities of the managing IOMMU. */
1399 if (ecap_coherent(iommu
->ecap
))
1400 domain
->iommu_coherency
= 1;
1402 domain
->iommu_coherency
= 0;
1404 if (ecap_sc_support(iommu
->ecap
))
1405 domain
->iommu_snooping
= 1;
1407 domain
->iommu_snooping
= 0;
1409 domain
->iommu_count
= 1;
/* Prefer the IOMMU's NUMA node for page-table allocations. */
1410 domain
->nid
= iommu
->node
;
1412 /* always allocate the top pgd */
1413 domain
->pgd
= (struct dma_pte
*)alloc_pgtable_page(domain
->nid
);
/* Make the new (zeroed) top level visible to non-coherent hardware. */
1416 __iommu_flush_cache(iommu
, domain
->pgd
, PAGE_SIZE
);
/*
 * domain_exit - tear a domain down completely.
 *
 * Removes every device binding, releases the iova allocator, clears and
 * frees the full page-table range, detaches the domain from every IOMMU
 * it was attached to (tracked via domain->iommu_bmp), and frees the
 * domain structure itself.
 *
 * NOTE(review): lossy extract -- the NULL/"domain 0" guard mentioned by
 * the comment at stale line 1425 is missing from this view; verify
 * against upstream.
 */
1420 static void domain_exit(struct dmar_domain
*domain
)
1422 struct dmar_drhd_unit
*drhd
;
1423 struct intel_iommu
*iommu
;
1425 /* Domain 0 is reserved, so dont process it */
1429 domain_remove_dev_info(domain
);
1431 put_iova_domain(&domain
->iovad
);
/* Clear PTEs over the whole guest address range... */
1434 dma_pte_clear_range(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
1436 /* free page tables */
1437 dma_pte_free_pagetable(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
/* Detach from every IOMMU this domain was ever attached to. */
1439 for_each_active_iommu(iommu
, drhd
)
1440 if (test_bit(iommu
->seq_id
, &domain
->iommu_bmp
))
1441 iommu_detach_domain(domain
, iommu
);
1443 free_domain_mem(domain
);
1446 static int domain_context_mapping_one(struct dmar_domain
*domain
, int segment
,
1447 u8 bus
, u8 devfn
, int translation
)
1449 struct context_entry
*context
;
1450 unsigned long flags
;
1451 struct intel_iommu
*iommu
;
1452 struct dma_pte
*pgd
;
1454 unsigned long ndomains
;
1457 struct device_domain_info
*info
= NULL
;
1459 pr_debug("Set context mapping for %02x:%02x.%d\n",
1460 bus
, PCI_SLOT(devfn
), PCI_FUNC(devfn
));
1462 BUG_ON(!domain
->pgd
);
1463 BUG_ON(translation
!= CONTEXT_TT_PASS_THROUGH
&&
1464 translation
!= CONTEXT_TT_MULTI_LEVEL
);
1466 iommu
= device_to_iommu(segment
, bus
, devfn
);
1470 context
= device_to_context_entry(iommu
, bus
, devfn
);
1473 spin_lock_irqsave(&iommu
->lock
, flags
);
1474 if (context_present(context
)) {
1475 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1482 if (domain
->flags
& DOMAIN_FLAG_VIRTUAL_MACHINE
||
1483 domain
->flags
& DOMAIN_FLAG_STATIC_IDENTITY
) {
1486 /* find an available domain id for this device in iommu */
1487 ndomains
= cap_ndoms(iommu
->cap
);
1488 num
= find_first_bit(iommu
->domain_ids
, ndomains
);
1489 for (; num
< ndomains
; ) {
1490 if (iommu
->domains
[num
] == domain
) {
1495 num
= find_next_bit(iommu
->domain_ids
,
1496 cap_ndoms(iommu
->cap
), num
+1);
1500 num
= find_first_zero_bit(iommu
->domain_ids
, ndomains
);
1501 if (num
>= ndomains
) {
1502 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1503 printk(KERN_ERR
"IOMMU: no free domain ids\n");
1507 set_bit(num
, iommu
->domain_ids
);
1508 iommu
->domains
[num
] = domain
;
1512 /* Skip top levels of page tables for
1513 * iommu which has less agaw than default.
1514 * Unnecessary for PT mode.
1516 if (translation
!= CONTEXT_TT_PASS_THROUGH
) {
1517 for (agaw
= domain
->agaw
; agaw
!= iommu
->agaw
; agaw
--) {
1518 pgd
= phys_to_virt(dma_pte_addr(pgd
));
1519 if (!dma_pte_present(pgd
)) {
1520 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1527 context_set_domain_id(context
, id
);
1529 if (translation
!= CONTEXT_TT_PASS_THROUGH
) {
1530 info
= iommu_support_dev_iotlb(domain
, segment
, bus
, devfn
);
1531 translation
= info
? CONTEXT_TT_DEV_IOTLB
:
1532 CONTEXT_TT_MULTI_LEVEL
;
1535 * In pass through mode, AW must be programmed to indicate the largest
1536 * AGAW value supported by hardware. And ASR is ignored by hardware.
1538 if (unlikely(translation
== CONTEXT_TT_PASS_THROUGH
))
1539 context_set_address_width(context
, iommu
->msagaw
);
1541 context_set_address_root(context
, virt_to_phys(pgd
));
1542 context_set_address_width(context
, iommu
->agaw
);
1545 context_set_translation_type(context
, translation
);
1546 context_set_fault_enable(context
);
1547 context_set_present(context
);
1548 domain_flush_cache(domain
, context
, sizeof(*context
));
1551 * It's a non-present to present mapping. If hardware doesn't cache
1552 * non-present entry we only need to flush the write-buffer. If the
1553 * _does_ cache non-present entries, then it does so in the special
1554 * domain #0, which we have to flush:
1556 if (cap_caching_mode(iommu
->cap
)) {
1557 iommu
->flush
.flush_context(iommu
, 0,
1558 (((u16
)bus
) << 8) | devfn
,
1559 DMA_CCMD_MASK_NOBIT
,
1560 DMA_CCMD_DEVICE_INVL
);
1561 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0, DMA_TLB_DSI_FLUSH
);
1563 iommu_flush_write_buffer(iommu
);
1565 iommu_enable_dev_iotlb(info
);
1566 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1568 spin_lock_irqsave(&domain
->iommu_lock
, flags
);
1569 if (!test_and_set_bit(iommu
->seq_id
, &domain
->iommu_bmp
)) {
1570 domain
->iommu_count
++;
1571 if (domain
->iommu_count
== 1)
1572 domain
->nid
= iommu
->node
;
1573 domain_update_iommu_cap(domain
);
1575 spin_unlock_irqrestore(&domain
->iommu_lock
, flags
);
1580 domain_context_mapping(struct dmar_domain
*domain
, struct pci_dev
*pdev
,
1584 struct pci_dev
*tmp
, *parent
;
1586 ret
= domain_context_mapping_one(domain
, pci_domain_nr(pdev
->bus
),
1587 pdev
->bus
->number
, pdev
->devfn
,
1592 /* dependent device mapping */
1593 tmp
= pci_find_upstream_pcie_bridge(pdev
);
1596 /* Secondary interface's bus number and devfn 0 */
1597 parent
= pdev
->bus
->self
;
1598 while (parent
!= tmp
) {
1599 ret
= domain_context_mapping_one(domain
,
1600 pci_domain_nr(parent
->bus
),
1601 parent
->bus
->number
,
1602 parent
->devfn
, translation
);
1605 parent
= parent
->bus
->self
;
1607 if (pci_is_pcie(tmp
)) /* this is a PCIe-to-PCI bridge */
1608 return domain_context_mapping_one(domain
,
1609 pci_domain_nr(tmp
->subordinate
),
1610 tmp
->subordinate
->number
, 0,
1612 else /* this is a legacy PCI bridge */
1613 return domain_context_mapping_one(domain
,
1614 pci_domain_nr(tmp
->bus
),
1620 static int domain_context_mapped(struct pci_dev
*pdev
)
1623 struct pci_dev
*tmp
, *parent
;
1624 struct intel_iommu
*iommu
;
1626 iommu
= device_to_iommu(pci_domain_nr(pdev
->bus
), pdev
->bus
->number
,
1631 ret
= device_context_mapped(iommu
, pdev
->bus
->number
, pdev
->devfn
);
1634 /* dependent device mapping */
1635 tmp
= pci_find_upstream_pcie_bridge(pdev
);
1638 /* Secondary interface's bus number and devfn 0 */
1639 parent
= pdev
->bus
->self
;
1640 while (parent
!= tmp
) {
1641 ret
= device_context_mapped(iommu
, parent
->bus
->number
,
1645 parent
= parent
->bus
->self
;
1647 if (pci_is_pcie(tmp
))
1648 return device_context_mapped(iommu
, tmp
->subordinate
->number
,
1651 return device_context_mapped(iommu
, tmp
->bus
->number
,
1655 /* Returns a number of VTD pages, but aligned to MM page size */
/*
 * aligned_nrpages - VTD page count covering [host_addr, host_addr+size),
 * after rounding the span out to whole MM pages.
 *
 * NOTE(review): lossy extract -- the "size_t size" parameter line and
 * braces are missing from this view (size is clearly the second
 * parameter, given its use below).
 */
1656 static inline unsigned long aligned_nrpages(unsigned long host_addr
,
/* Keep only the offset within the MM page. */
1659 host_addr
&= ~PAGE_MASK
;
1660 return PAGE_ALIGN(host_addr
+ size
) >> VTD_PAGE_SHIFT
;
1663 static int __domain_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
1664 struct scatterlist
*sg
, unsigned long phys_pfn
,
1665 unsigned long nr_pages
, int prot
)
1667 struct dma_pte
*first_pte
= NULL
, *pte
= NULL
;
1668 phys_addr_t
uninitialized_var(pteval
);
1669 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
1670 unsigned long sg_res
;
1672 BUG_ON(addr_width
< BITS_PER_LONG
&& (iov_pfn
+ nr_pages
- 1) >> addr_width
);
1674 if ((prot
& (DMA_PTE_READ
|DMA_PTE_WRITE
)) == 0)
1677 prot
&= DMA_PTE_READ
| DMA_PTE_WRITE
| DMA_PTE_SNP
;
1682 sg_res
= nr_pages
+ 1;
1683 pteval
= ((phys_addr_t
)phys_pfn
<< VTD_PAGE_SHIFT
) | prot
;
1686 while (nr_pages
--) {
1690 sg_res
= aligned_nrpages(sg
->offset
, sg
->length
);
1691 sg
->dma_address
= ((dma_addr_t
)iov_pfn
<< VTD_PAGE_SHIFT
) + sg
->offset
;
1692 sg
->dma_length
= sg
->length
;
1693 pteval
= page_to_phys(sg_page(sg
)) | prot
;
1696 first_pte
= pte
= pfn_to_dma_pte(domain
, iov_pfn
);
1700 /* We don't need lock here, nobody else
1701 * touches the iova range
1703 tmp
= cmpxchg64_local(&pte
->val
, 0ULL, pteval
);
1705 static int dumps
= 5;
1706 printk(KERN_CRIT
"ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1707 iov_pfn
, tmp
, (unsigned long long)pteval
);
1710 debug_dma_dump_mappings(NULL
);
1715 if (!nr_pages
|| first_pte_in_page(pte
)) {
1716 domain_flush_cache(domain
, first_pte
,
1717 (void *)pte
- (void *)first_pte
);
1721 pteval
+= VTD_PAGE_SIZE
;
/*
 * domain_sg_mapping - map a scatterlist into the domain at @iov_pfn.
 * Thin wrapper around __domain_mapping() with phys_pfn = 0 (the
 * physical addresses come from the scatterlist entries instead).
 *
 * NOTE(review): lossy extract -- the trailing "int prot)" parameter
 * line is missing from this view.
 */
1729 static inline int domain_sg_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
1730 struct scatterlist
*sg
, unsigned long nr_pages
,
1733 return __domain_mapping(domain
, iov_pfn
, sg
, 0, nr_pages
, prot
);
/*
 * domain_pfn_mapping - map a physically contiguous pfn range into the
 * domain at @iov_pfn.  Thin wrapper around __domain_mapping() with a
 * NULL scatterlist.
 *
 * NOTE(review): lossy extract -- the trailing "int prot)" parameter
 * line is missing from this view.
 */
1736 static inline int domain_pfn_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
1737 unsigned long phys_pfn
, unsigned long nr_pages
,
1740 return __domain_mapping(domain
, iov_pfn
, NULL
, phys_pfn
, nr_pages
, prot
);
/*
 * iommu_detach_dev - clear the context entry for (bus, devfn) and
 * globally invalidate the context cache and IOTLB so the hardware
 * stops translating for that device.
 *
 * NOTE(review): lossy extract -- a guard (stale lines 1744-1747,
 * presumably a NULL-iommu check) is missing from this view; verify
 * against upstream.
 */
1743 static void iommu_detach_dev(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
1748 clear_context_table(iommu
, bus
, devfn
);
1749 iommu
->flush
.flush_context(iommu
, 0, 0, 0,
1750 DMA_CCMD_GLOBAL_INVL
);
1751 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH
);
1754 static void domain_remove_dev_info(struct dmar_domain
*domain
)
1756 struct device_domain_info
*info
;
1757 unsigned long flags
;
1758 struct intel_iommu
*iommu
;
1760 spin_lock_irqsave(&device_domain_lock
, flags
);
1761 while (!list_empty(&domain
->devices
)) {
1762 info
= list_entry(domain
->devices
.next
,
1763 struct device_domain_info
, link
);
1764 list_del(&info
->link
);
1765 list_del(&info
->global
);
1767 info
->dev
->dev
.archdata
.iommu
= NULL
;
1768 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1770 iommu_disable_dev_iotlb(info
);
1771 iommu
= device_to_iommu(info
->segment
, info
->bus
, info
->devfn
);
1772 iommu_detach_dev(iommu
, info
->bus
, info
->devfn
);
1773 free_devinfo_mem(info
);
1775 spin_lock_irqsave(&device_domain_lock
, flags
);
1777 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1782 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
/*
 * find_domain - look up the domain already associated with @pdev via
 * dev.archdata.iommu, without taking any lock (callers rely on no
 * concurrent domain exit in the normal path).
 *
 * NOTE(review): lossy extract -- the NULL check on @info and the
 * "return NULL" fallback are missing from this view; verify against
 * upstream.
 */
1784 static struct dmar_domain
*
1785 find_domain(struct pci_dev
*pdev
)
1787 struct device_domain_info
*info
;
1789 /* No lock here, assumes no domain exit in normal case */
1790 info
= pdev
->dev
.archdata
.iommu
;
1792 return info
->domain
;
1796 /* domain is initialized */
1797 static struct dmar_domain
*get_domain_for_dev(struct pci_dev
*pdev
, int gaw
)
1799 struct dmar_domain
*domain
, *found
= NULL
;
1800 struct intel_iommu
*iommu
;
1801 struct dmar_drhd_unit
*drhd
;
1802 struct device_domain_info
*info
, *tmp
;
1803 struct pci_dev
*dev_tmp
;
1804 unsigned long flags
;
1805 int bus
= 0, devfn
= 0;
1809 domain
= find_domain(pdev
);
1813 segment
= pci_domain_nr(pdev
->bus
);
1815 dev_tmp
= pci_find_upstream_pcie_bridge(pdev
);
1817 if (pci_is_pcie(dev_tmp
)) {
1818 bus
= dev_tmp
->subordinate
->number
;
1821 bus
= dev_tmp
->bus
->number
;
1822 devfn
= dev_tmp
->devfn
;
1824 spin_lock_irqsave(&device_domain_lock
, flags
);
1825 list_for_each_entry(info
, &device_domain_list
, global
) {
1826 if (info
->segment
== segment
&&
1827 info
->bus
== bus
&& info
->devfn
== devfn
) {
1828 found
= info
->domain
;
1832 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1833 /* pcie-pci bridge already has a domain, uses it */
1840 domain
= alloc_domain();
1844 /* Allocate new domain for the device */
1845 drhd
= dmar_find_matched_drhd_unit(pdev
);
1847 printk(KERN_ERR
"IOMMU: can't find DMAR for device %s\n",
1851 iommu
= drhd
->iommu
;
1853 ret
= iommu_attach_domain(domain
, iommu
);
1855 domain_exit(domain
);
1859 if (domain_init(domain
, gaw
)) {
1860 domain_exit(domain
);
1864 /* register pcie-to-pci device */
1866 info
= alloc_devinfo_mem();
1868 domain_exit(domain
);
1871 info
->segment
= segment
;
1873 info
->devfn
= devfn
;
1875 info
->domain
= domain
;
1876 /* This domain is shared by devices under p2p bridge */
1877 domain
->flags
|= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES
;
1879 /* pcie-to-pci bridge already has a domain, uses it */
1881 spin_lock_irqsave(&device_domain_lock
, flags
);
1882 list_for_each_entry(tmp
, &device_domain_list
, global
) {
1883 if (tmp
->segment
== segment
&&
1884 tmp
->bus
== bus
&& tmp
->devfn
== devfn
) {
1885 found
= tmp
->domain
;
1890 free_devinfo_mem(info
);
1891 domain_exit(domain
);
1894 list_add(&info
->link
, &domain
->devices
);
1895 list_add(&info
->global
, &device_domain_list
);
1897 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1901 info
= alloc_devinfo_mem();
1904 info
->segment
= segment
;
1905 info
->bus
= pdev
->bus
->number
;
1906 info
->devfn
= pdev
->devfn
;
1908 info
->domain
= domain
;
1909 spin_lock_irqsave(&device_domain_lock
, flags
);
1910 /* somebody is fast */
1911 found
= find_domain(pdev
);
1912 if (found
!= NULL
) {
1913 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1914 if (found
!= domain
) {
1915 domain_exit(domain
);
1918 free_devinfo_mem(info
);
1921 list_add(&info
->link
, &domain
->devices
);
1922 list_add(&info
->global
, &device_domain_list
);
1923 pdev
->dev
.archdata
.iommu
= info
;
1924 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1927 /* recheck it here, maybe others set it */
1928 return find_domain(pdev
);
1931 static int iommu_identity_mapping
;
1932 #define IDENTMAP_ALL 1
1933 #define IDENTMAP_GFX 2
1934 #define IDENTMAP_AZALIA 4
/*
 * iommu_domain_identity_map - install a 1:1 mapping for the physical
 * range [start, end] in @domain.
 *
 * Reserves the corresponding iova range (so the allocator never hands
 * it out), clears any PTEs already covering it (RMRR ranges may overlap
 * ranges mapped earlier), then maps first_vpfn..last_vpfn onto the same
 * pfns with read+write permission.
 *
 * NOTE(review): lossy extract -- the error return after the failed
 * reserve_iova() printk is missing from this view; verify against
 * upstream.
 */
1936 static int iommu_domain_identity_map(struct dmar_domain
*domain
,
1937 unsigned long long start
,
1938 unsigned long long end
)
1940 unsigned long first_vpfn
= start
>> VTD_PAGE_SHIFT
;
1941 unsigned long last_vpfn
= end
>> VTD_PAGE_SHIFT
;
/* Keep the allocator from ever handing out this iova range. */
1943 if (!reserve_iova(&domain
->iovad
, dma_to_mm_pfn(first_vpfn
),
1944 dma_to_mm_pfn(last_vpfn
))) {
1945 printk(KERN_ERR
"IOMMU: reserve iova failed\n");
1949 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1950 start
, end
, domain
->id
);
1952 * RMRR range might have overlap with physical memory range,
1955 dma_pte_clear_range(domain
, first_vpfn
, last_vpfn
);
/* 1:1: iova pfn == physical pfn, read+write. */
1957 return domain_pfn_mapping(domain
, first_vpfn
, first_vpfn
,
1958 last_vpfn
- first_vpfn
+ 1,
1959 DMA_PTE_READ
|DMA_PTE_WRITE
);
1962 static int iommu_prepare_identity_map(struct pci_dev
*pdev
,
1963 unsigned long long start
,
1964 unsigned long long end
)
1966 struct dmar_domain
*domain
;
1969 domain
= get_domain_for_dev(pdev
, DEFAULT_DOMAIN_ADDRESS_WIDTH
);
1973 /* For _hardware_ passthrough, don't bother. But for software
1974 passthrough, we do it anyway -- it may indicate a memory
1975 range which is reserved in E820, so which didn't get set
1976 up to start with in si_domain */
1977 if (domain
== si_domain
&& hw_pass_through
) {
1978 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1979 pci_name(pdev
), start
, end
);
1984 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1985 pci_name(pdev
), start
, end
);
1988 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1989 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1990 dmi_get_system_info(DMI_BIOS_VENDOR
),
1991 dmi_get_system_info(DMI_BIOS_VERSION
),
1992 dmi_get_system_info(DMI_PRODUCT_VERSION
));
1997 if (end
>> agaw_to_width(domain
->agaw
)) {
1998 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1999 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2000 agaw_to_width(domain
->agaw
),
2001 dmi_get_system_info(DMI_BIOS_VENDOR
),
2002 dmi_get_system_info(DMI_BIOS_VERSION
),
2003 dmi_get_system_info(DMI_PRODUCT_VERSION
));
2008 ret
= iommu_domain_identity_map(domain
, start
, end
);
2012 /* context entry init */
2013 ret
= domain_context_mapping(domain
, pdev
, CONTEXT_TT_MULTI_LEVEL
);
2020 domain_exit(domain
);
/*
 * iommu_prepare_rmrr_dev - identity-map an RMRR (Reserved Memory Region
 * Report) range for @pdev, so BIOS/firmware DMA into that region keeps
 * working once translation is enabled.  Devices marked with the dummy
 * domain info are skipped.
 *
 * NOTE(review): lossy extract -- the return value of the skip branch
 * (stale line 2028, presumably "return 0;") is missing from this view.
 */
2024 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit
*rmrr
,
2025 struct pci_dev
*pdev
)
2027 if (pdev
->dev
.archdata
.iommu
== DUMMY_DEVICE_DOMAIN_INFO
)
/* end_address is inclusive; +1 makes the range exclusive-end. */
2029 return iommu_prepare_identity_map(pdev
, rmrr
->base_address
,
2030 rmrr
->end_address
+ 1);
2033 #ifdef CONFIG_DMAR_FLOPPY_WA
2034 static inline void iommu_prepare_isa(void)
2036 struct pci_dev
*pdev
;
2039 pdev
= pci_get_class(PCI_CLASS_BRIDGE_ISA
<< 8, NULL
);
2043 printk(KERN_INFO
"IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2044 ret
= iommu_prepare_identity_map(pdev
, 0, 16*1024*1024);
2047 printk(KERN_ERR
"IOMMU: Failed to create 0-16MiB identity map; "
2048 "floppy might not work\n");
2052 static inline void iommu_prepare_isa(void)
2056 #endif /* !CONFIG_DMAR_FLPY_WA */
2058 static int md_domain_init(struct dmar_domain
*domain
, int guest_width
);
2060 static int __init
si_domain_work_fn(unsigned long start_pfn
,
2061 unsigned long end_pfn
, void *datax
)
2065 *ret
= iommu_domain_identity_map(si_domain
,
2066 (uint64_t)start_pfn
<< PAGE_SHIFT
,
2067 (uint64_t)end_pfn
<< PAGE_SHIFT
);
2072 static int __init
si_domain_init(int hw
)
2074 struct dmar_drhd_unit
*drhd
;
2075 struct intel_iommu
*iommu
;
2078 si_domain
= alloc_domain();
2082 pr_debug("Identity mapping domain is domain %d\n", si_domain
->id
);
2084 for_each_active_iommu(iommu
, drhd
) {
2085 ret
= iommu_attach_domain(si_domain
, iommu
);
2087 domain_exit(si_domain
);
2092 if (md_domain_init(si_domain
, DEFAULT_DOMAIN_ADDRESS_WIDTH
)) {
2093 domain_exit(si_domain
);
2097 si_domain
->flags
= DOMAIN_FLAG_STATIC_IDENTITY
;
2102 for_each_online_node(nid
) {
2103 work_with_active_regions(nid
, si_domain_work_fn
, &ret
);
2111 static void domain_remove_one_dev_info(struct dmar_domain
*domain
,
2112 struct pci_dev
*pdev
);
2113 static int identity_mapping(struct pci_dev
*pdev
)
2115 struct device_domain_info
*info
;
2117 if (likely(!iommu_identity_mapping
))
2121 list_for_each_entry(info
, &si_domain
->devices
, link
)
2122 if (info
->dev
== pdev
)
2127 static int domain_add_dev_info(struct dmar_domain
*domain
,
2128 struct pci_dev
*pdev
,
2131 struct device_domain_info
*info
;
2132 unsigned long flags
;
2135 info
= alloc_devinfo_mem();
2139 ret
= domain_context_mapping(domain
, pdev
, translation
);
2141 free_devinfo_mem(info
);
2145 info
->segment
= pci_domain_nr(pdev
->bus
);
2146 info
->bus
= pdev
->bus
->number
;
2147 info
->devfn
= pdev
->devfn
;
2149 info
->domain
= domain
;
2151 spin_lock_irqsave(&device_domain_lock
, flags
);
2152 list_add(&info
->link
, &domain
->devices
);
2153 list_add(&info
->global
, &device_domain_list
);
2154 pdev
->dev
.archdata
.iommu
= info
;
2155 spin_unlock_irqrestore(&device_domain_lock
, flags
);
2160 static int iommu_should_identity_map(struct pci_dev
*pdev
, int startup
)
2162 if ((iommu_identity_mapping
& IDENTMAP_AZALIA
) && IS_AZALIA(pdev
))
2165 if ((iommu_identity_mapping
& IDENTMAP_GFX
) && IS_GFX_DEVICE(pdev
))
2168 if (!(iommu_identity_mapping
& IDENTMAP_ALL
))
2172 * We want to start off with all devices in the 1:1 domain, and
2173 * take them out later if we find they can't access all of memory.
2175 * However, we can't do this for PCI devices behind bridges,
2176 * because all PCI devices behind the same bridge will end up
2177 * with the same source-id on their transactions.
2179 * Practically speaking, we can't change things around for these
2180 * devices at run-time, because we can't be sure there'll be no
2181 * DMA transactions in flight for any of their siblings.
2183 * So PCI devices (unless they're on the root bus) as well as
2184 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2185 * the 1:1 domain, just in _case_ one of their siblings turns out
2186 * not to be able to map all of memory.
2188 if (!pci_is_pcie(pdev
)) {
2189 if (!pci_is_root_bus(pdev
->bus
))
2191 if (pdev
->class >> 8 == PCI_CLASS_BRIDGE_PCI
)
2193 } else if (pdev
->pcie_type
== PCI_EXP_TYPE_PCI_BRIDGE
)
2197 * At boot time, we don't yet know if devices will be 64-bit capable.
2198 * Assume that they will -- if they turn out not to be, then we can
2199 * take them out of the 1:1 domain later.
2202 return pdev
->dma_mask
> DMA_BIT_MASK(32);
2207 static int __init
iommu_prepare_static_identity_mapping(int hw
)
2209 struct pci_dev
*pdev
= NULL
;
2212 ret
= si_domain_init(hw
);
2216 for_each_pci_dev(pdev
) {
2217 if (iommu_should_identity_map(pdev
, 1)) {
2218 printk(KERN_INFO
"IOMMU: %s identity mapping for device %s\n",
2219 hw
? "hardware" : "software", pci_name(pdev
));
2221 ret
= domain_add_dev_info(si_domain
, pdev
,
2222 hw
? CONTEXT_TT_PASS_THROUGH
:
2223 CONTEXT_TT_MULTI_LEVEL
);
2232 int __init
init_dmars(void)
2234 struct dmar_drhd_unit
*drhd
;
2235 struct dmar_rmrr_unit
*rmrr
;
2236 struct pci_dev
*pdev
;
2237 struct intel_iommu
*iommu
;
2243 * initialize and program root entry to not present
2246 for_each_drhd_unit(drhd
) {
2249 * lock not needed as this is only incremented in the single
2250 * threaded kernel __init code path all other access are read
2255 g_iommus
= kcalloc(g_num_of_iommus
, sizeof(struct intel_iommu
*),
2258 printk(KERN_ERR
"Allocating global iommu array failed\n");
2263 deferred_flush
= kzalloc(g_num_of_iommus
*
2264 sizeof(struct deferred_flush_tables
), GFP_KERNEL
);
2265 if (!deferred_flush
) {
2270 for_each_drhd_unit(drhd
) {
2274 iommu
= drhd
->iommu
;
2275 g_iommus
[iommu
->seq_id
] = iommu
;
2277 ret
= iommu_init_domains(iommu
);
2283 * we could share the same root & context tables
2284 * amoung all IOMMU's. Need to Split it later.
2286 ret
= iommu_alloc_root_entry(iommu
);
2288 printk(KERN_ERR
"IOMMU: allocate root entry failed\n");
2291 if (!ecap_pass_through(iommu
->ecap
))
2292 hw_pass_through
= 0;
2296 * Start from the sane iommu hardware state.
2298 for_each_drhd_unit(drhd
) {
2302 iommu
= drhd
->iommu
;
2305 * If the queued invalidation is already initialized by us
2306 * (for example, while enabling interrupt-remapping) then
2307 * we got the things already rolling from a sane state.
2313 * Clear any previous faults.
2315 dmar_fault(-1, iommu
);
2317 * Disable queued invalidation if supported and already enabled
2318 * before OS handover.
2320 dmar_disable_qi(iommu
);
2323 for_each_drhd_unit(drhd
) {
2327 iommu
= drhd
->iommu
;
2329 if (dmar_enable_qi(iommu
)) {
2331 * Queued Invalidate not enabled, use Register Based
2334 iommu
->flush
.flush_context
= __iommu_flush_context
;
2335 iommu
->flush
.flush_iotlb
= __iommu_flush_iotlb
;
2336 printk(KERN_INFO
"IOMMU 0x%Lx: using Register based "
2338 (unsigned long long)drhd
->reg_base_addr
);
2340 iommu
->flush
.flush_context
= qi_flush_context
;
2341 iommu
->flush
.flush_iotlb
= qi_flush_iotlb
;
2342 printk(KERN_INFO
"IOMMU 0x%Lx: using Queued "
2344 (unsigned long long)drhd
->reg_base_addr
);
2348 if (iommu_pass_through
)
2349 iommu_identity_mapping
|= IDENTMAP_ALL
;
2351 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2352 iommu_identity_mapping
|= IDENTMAP_GFX
;
2355 check_tylersburg_isoch();
2358 * If pass through is not set or not enabled, setup context entries for
2359 * identity mappings for rmrr, gfx, and isa and may fall back to static
2360 * identity mapping if iommu_identity_mapping is set.
2362 if (iommu_identity_mapping
) {
2363 ret
= iommu_prepare_static_identity_mapping(hw_pass_through
);
2365 printk(KERN_CRIT
"Failed to setup IOMMU pass-through\n");
2371 * for each dev attached to rmrr
2373 * locate drhd for dev, alloc domain for dev
2374 * allocate free domain
2375 * allocate page table entries for rmrr
2376 * if context not allocated for bus
2377 * allocate and init context
2378 * set present in root table for this bus
2379 * init context with domain, translation etc
2383 printk(KERN_INFO
"IOMMU: Setting RMRR:\n");
2384 for_each_rmrr_units(rmrr
) {
2385 for (i
= 0; i
< rmrr
->devices_cnt
; i
++) {
2386 pdev
= rmrr
->devices
[i
];
2388 * some BIOS lists non-exist devices in DMAR
2393 ret
= iommu_prepare_rmrr_dev(rmrr
, pdev
);
2396 "IOMMU: mapping reserved region failed\n");
2400 iommu_prepare_isa();
2405 * global invalidate context cache
2406 * global invalidate iotlb
2407 * enable translation
2409 for_each_drhd_unit(drhd
) {
2412 iommu
= drhd
->iommu
;
2414 iommu_flush_write_buffer(iommu
);
2416 ret
= dmar_set_interrupt(iommu
);
2420 iommu_set_root_entry(iommu
);
2422 iommu
->flush
.flush_context(iommu
, 0, 0, 0, DMA_CCMD_GLOBAL_INVL
);
2423 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH
);
2425 ret
= iommu_enable_translation(iommu
);
2429 iommu_disable_protect_mem_regions(iommu
);
2434 for_each_drhd_unit(drhd
) {
2437 iommu
= drhd
->iommu
;
2444 /* This takes a number of _MM_ pages, not VTD pages */
2445 static struct iova
*intel_alloc_iova(struct device
*dev
,
2446 struct dmar_domain
*domain
,
2447 unsigned long nrpages
, uint64_t dma_mask
)
2449 struct pci_dev
*pdev
= to_pci_dev(dev
);
2450 struct iova
*iova
= NULL
;
2452 /* Restrict dma_mask to the width that the iommu can handle */
2453 dma_mask
= min_t(uint64_t, DOMAIN_MAX_ADDR(domain
->gaw
), dma_mask
);
2455 if (!dmar_forcedac
&& dma_mask
> DMA_BIT_MASK(32)) {
2457 * First try to allocate an io virtual address in
2458 * DMA_BIT_MASK(32) and if that fails then try allocating
2461 iova
= alloc_iova(&domain
->iovad
, nrpages
,
2462 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2466 iova
= alloc_iova(&domain
->iovad
, nrpages
, IOVA_PFN(dma_mask
), 1);
2467 if (unlikely(!iova
)) {
2468 printk(KERN_ERR
"Allocating %ld-page iova for %s failed",
2469 nrpages
, pci_name(pdev
));
2476 static struct dmar_domain
*__get_valid_domain_for_dev(struct pci_dev
*pdev
)
2478 struct dmar_domain
*domain
;
2481 domain
= get_domain_for_dev(pdev
,
2482 DEFAULT_DOMAIN_ADDRESS_WIDTH
);
2485 "Allocating domain for %s failed", pci_name(pdev
));
2489 /* make sure context mapping is ok */
2490 if (unlikely(!domain_context_mapped(pdev
))) {
2491 ret
= domain_context_mapping(domain
, pdev
,
2492 CONTEXT_TT_MULTI_LEVEL
);
2495 "Domain context map for %s failed",
/*
 * get_valid_domain_for_dev - fast path: return the cached domain from
 * dev.archdata.iommu if present; otherwise fall back to the slow path
 * (__get_valid_domain_for_dev) which allocates the domain and ensures
 * its context mapping.
 *
 * NOTE(review): lossy extract -- the "if (likely(info))" style guard
 * between the assignment and the return is missing from this view;
 * verify against upstream.
 */
2504 static inline struct dmar_domain
*get_valid_domain_for_dev(struct pci_dev
*dev
)
2506 struct device_domain_info
*info
;
2508 /* No lock here, assumes no domain exit in normal case */
2509 info
= dev
->dev
.archdata
.iommu
;
2511 return info
->domain
;
2513 return __get_valid_domain_for_dev(dev
);
/*
 * iommu_dummy - true if @pdev was marked to bypass the IOMMU entirely
 * (its archdata.iommu is the DUMMY_DEVICE_DOMAIN_INFO sentinel).
 */
2516 static int iommu_dummy(struct pci_dev
*pdev
)
2518 return pdev
->dev
.archdata
.iommu
== DUMMY_DEVICE_DOMAIN_INFO
;
2521 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2522 static int iommu_no_mapping(struct device
*dev
)
2524 struct pci_dev
*pdev
;
2527 if (unlikely(dev
->bus
!= &pci_bus_type
))
2530 pdev
= to_pci_dev(dev
);
2531 if (iommu_dummy(pdev
))
2534 if (!iommu_identity_mapping
)
2537 found
= identity_mapping(pdev
);
2539 if (iommu_should_identity_map(pdev
, 0))
2543 * 32 bit DMA is removed from si_domain and fall back
2544 * to non-identity mapping.
2546 domain_remove_one_dev_info(si_domain
, pdev
);
2547 printk(KERN_INFO
"32bit %s uses non-identity mapping\n",
2553 * In case of a detached 64 bit DMA device from vm, the device
2554 * is put into si_domain for identity mapping.
2556 if (iommu_should_identity_map(pdev
, 0)) {
2558 ret
= domain_add_dev_info(si_domain
, pdev
,
2560 CONTEXT_TT_PASS_THROUGH
:
2561 CONTEXT_TT_MULTI_LEVEL
);
2563 printk(KERN_INFO
"64bit %s uses identity mapping\n",
2573 static dma_addr_t
__intel_map_single(struct device
*hwdev
, phys_addr_t paddr
,
2574 size_t size
, int dir
, u64 dma_mask
)
2576 struct pci_dev
*pdev
= to_pci_dev(hwdev
);
2577 struct dmar_domain
*domain
;
2578 phys_addr_t start_paddr
;
2582 struct intel_iommu
*iommu
;
2583 unsigned long paddr_pfn
= paddr
>> PAGE_SHIFT
;
2585 BUG_ON(dir
== DMA_NONE
);
2587 if (iommu_no_mapping(hwdev
))
2590 domain
= get_valid_domain_for_dev(pdev
);
2594 iommu
= domain_get_iommu(domain
);
2595 size
= aligned_nrpages(paddr
, size
);
2597 iova
= intel_alloc_iova(hwdev
, domain
, dma_to_mm_pfn(size
),
2603 * Check if DMAR supports zero-length reads on write only
2606 if (dir
== DMA_TO_DEVICE
|| dir
== DMA_BIDIRECTIONAL
|| \
2607 !cap_zlr(iommu
->cap
))
2608 prot
|= DMA_PTE_READ
;
2609 if (dir
== DMA_FROM_DEVICE
|| dir
== DMA_BIDIRECTIONAL
)
2610 prot
|= DMA_PTE_WRITE
;
2612 * paddr - (paddr + size) might be partial page, we should map the whole
2613 * page. Note: if two part of one page are separately mapped, we
2614 * might have two guest_addr mapping to the same host paddr, but this
2615 * is not a big problem
2617 ret
= domain_pfn_mapping(domain
, mm_to_dma_pfn(iova
->pfn_lo
),
2618 mm_to_dma_pfn(paddr_pfn
), size
, prot
);
2622 /* it's a non-present to present mapping. Only flush if caching mode */
2623 if (cap_caching_mode(iommu
->cap
))
2624 iommu_flush_iotlb_psi(iommu
, 0, mm_to_dma_pfn(iova
->pfn_lo
), size
);
2626 iommu_flush_write_buffer(iommu
);
2628 start_paddr
= (phys_addr_t
)iova
->pfn_lo
<< PAGE_SHIFT
;
2629 start_paddr
+= paddr
& ~PAGE_MASK
;
2634 __free_iova(&domain
->iovad
, iova
);
2635 printk(KERN_ERR
"Device %s request: %zx@%llx dir %d --- failed\n",
2636 pci_name(pdev
), size
, (unsigned long long)paddr
, dir
);
/*
 * intel_map_page - dma_map_ops .map_page hook.
 * Converts (page, offset) to a physical address and delegates to
 * __intel_map_single() with the device's streaming dma_mask.
 * @dir and @attrs follow the DMA-API; attrs are unused here.
 */
2640 static dma_addr_t
intel_map_page(struct device
*dev
, struct page
*page
,
2641 unsigned long offset
, size_t size
,
2642 enum dma_data_direction dir
,
2643 struct dma_attrs
*attrs
)
2645 return __intel_map_single(dev
, page_to_phys(page
) + offset
, size
,
2646 dir
, to_pci_dev(dev
)->dma_mask
);
2649 static void flush_unmaps(void)
2655 /* just flush them all */
2656 for (i
= 0; i
< g_num_of_iommus
; i
++) {
2657 struct intel_iommu
*iommu
= g_iommus
[i
];
2661 if (!deferred_flush
[i
].next
)
2664 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0,
2665 DMA_TLB_GLOBAL_FLUSH
);
2666 for (j
= 0; j
< deferred_flush
[i
].next
; j
++) {
2668 struct iova
*iova
= deferred_flush
[i
].iova
[j
];
2670 mask
= ilog2(mm_to_dma_pfn(iova
->pfn_hi
- iova
->pfn_lo
+ 1));
2671 iommu_flush_dev_iotlb(deferred_flush
[i
].domain
[j
],
2672 (uint64_t)iova
->pfn_lo
<< PAGE_SHIFT
, mask
);
2673 __free_iova(&deferred_flush
[i
].domain
[j
]->iovad
, iova
);
2675 deferred_flush
[i
].next
= 0;
/*
 * flush_unmaps_timeout - timer callback for the deferred-unmap batching
 * machinery: takes async_umap_flush_lock and drains the deferred flush
 * queues (the flush_unmaps() call between lock/unlock is missing from
 * this lossy extract -- verify against upstream).
 */
2681 static void flush_unmaps_timeout(unsigned long data
)
2683 unsigned long flags
;
2685 spin_lock_irqsave(&async_umap_flush_lock
, flags
);
2687 spin_unlock_irqrestore(&async_umap_flush_lock
, flags
);
2690 static void add_unmap(struct dmar_domain
*dom
, struct iova
*iova
)
2692 unsigned long flags
;
2694 struct intel_iommu
*iommu
;
2696 spin_lock_irqsave(&async_umap_flush_lock
, flags
);
2697 if (list_size
== HIGH_WATER_MARK
)
2700 iommu
= domain_get_iommu(dom
);
2701 iommu_id
= iommu
->seq_id
;
2703 next
= deferred_flush
[iommu_id
].next
;
2704 deferred_flush
[iommu_id
].domain
[next
] = dom
;
2705 deferred_flush
[iommu_id
].iova
[next
] = iova
;
2706 deferred_flush
[iommu_id
].next
++;
2709 mod_timer(&unmap_timer
, jiffies
+ msecs_to_jiffies(10));
2713 spin_unlock_irqrestore(&async_umap_flush_lock
, flags
);
2716 static void intel_unmap_page(struct device
*dev
, dma_addr_t dev_addr
,
2717 size_t size
, enum dma_data_direction dir
,
2718 struct dma_attrs
*attrs
)
2720 struct pci_dev
*pdev
= to_pci_dev(dev
);
2721 struct dmar_domain
*domain
;
2722 unsigned long start_pfn
, last_pfn
;
2724 struct intel_iommu
*iommu
;
2726 if (iommu_no_mapping(dev
))
2729 domain
= find_domain(pdev
);
2732 iommu
= domain_get_iommu(domain
);
2734 iova
= find_iova(&domain
->iovad
, IOVA_PFN(dev_addr
));
2735 if (WARN_ONCE(!iova
, "Driver unmaps unmatched page at PFN %llx\n",
2736 (unsigned long long)dev_addr
))
2739 start_pfn
= mm_to_dma_pfn(iova
->pfn_lo
);
2740 last_pfn
= mm_to_dma_pfn(iova
->pfn_hi
+ 1) - 1;
2742 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2743 pci_name(pdev
), start_pfn
, last_pfn
);
2745 /* clear the whole page */
2746 dma_pte_clear_range(domain
, start_pfn
, last_pfn
);
2748 /* free page tables */
2749 dma_pte_free_pagetable(domain
, start_pfn
, last_pfn
);
2751 if (intel_iommu_strict
) {
2752 iommu_flush_iotlb_psi(iommu
, domain
->id
, start_pfn
,
2753 last_pfn
- start_pfn
+ 1);
2755 __free_iova(&domain
->iovad
, iova
);
2757 add_unmap(domain
, iova
);
2759 * queue up the release of the unmap to save the 1/6th of the
2760 * cpu used up by the iotlb flush operation...
2765 static void *intel_alloc_coherent(struct device
*hwdev
, size_t size
,
2766 dma_addr_t
*dma_handle
, gfp_t flags
)
2771 size
= PAGE_ALIGN(size
);
2772 order
= get_order(size
);
2774 if (!iommu_no_mapping(hwdev
))
2775 flags
&= ~(GFP_DMA
| GFP_DMA32
);
2776 else if (hwdev
->coherent_dma_mask
< dma_get_required_mask(hwdev
)) {
2777 if (hwdev
->coherent_dma_mask
< DMA_BIT_MASK(32))
2783 vaddr
= (void *)__get_free_pages(flags
, order
);
2786 memset(vaddr
, 0, size
);
2788 *dma_handle
= __intel_map_single(hwdev
, virt_to_bus(vaddr
), size
,
2790 hwdev
->coherent_dma_mask
);
2793 free_pages((unsigned long)vaddr
, order
);
2797 static void intel_free_coherent(struct device
*hwdev
, size_t size
, void *vaddr
,
2798 dma_addr_t dma_handle
)
2802 size
= PAGE_ALIGN(size
);
2803 order
= get_order(size
);
2805 intel_unmap_page(hwdev
, dma_handle
, size
, DMA_BIDIRECTIONAL
, NULL
);
2806 free_pages((unsigned long)vaddr
, order
);
2809 static void intel_unmap_sg(struct device
*hwdev
, struct scatterlist
*sglist
,
2810 int nelems
, enum dma_data_direction dir
,
2811 struct dma_attrs
*attrs
)
2813 struct pci_dev
*pdev
= to_pci_dev(hwdev
);
2814 struct dmar_domain
*domain
;
2815 unsigned long start_pfn
, last_pfn
;
2817 struct intel_iommu
*iommu
;
2819 if (iommu_no_mapping(hwdev
))
2822 domain
= find_domain(pdev
);
2825 iommu
= domain_get_iommu(domain
);
2827 iova
= find_iova(&domain
->iovad
, IOVA_PFN(sglist
[0].dma_address
));
2828 if (WARN_ONCE(!iova
, "Driver unmaps unmatched sglist at PFN %llx\n",
2829 (unsigned long long)sglist
[0].dma_address
))
2832 start_pfn
= mm_to_dma_pfn(iova
->pfn_lo
);
2833 last_pfn
= mm_to_dma_pfn(iova
->pfn_hi
+ 1) - 1;
2835 /* clear the whole page */
2836 dma_pte_clear_range(domain
, start_pfn
, last_pfn
);
2838 /* free page tables */
2839 dma_pte_free_pagetable(domain
, start_pfn
, last_pfn
);
2841 if (intel_iommu_strict
) {
2842 iommu_flush_iotlb_psi(iommu
, domain
->id
, start_pfn
,
2843 last_pfn
- start_pfn
+ 1);
2845 __free_iova(&domain
->iovad
, iova
);
2847 add_unmap(domain
, iova
);
2849 * queue up the release of the unmap to save the 1/6th of the
2850 * cpu used up by the iotlb flush operation...
2855 static int intel_nontranslate_map_sg(struct device
*hddev
,
2856 struct scatterlist
*sglist
, int nelems
, int dir
)
2859 struct scatterlist
*sg
;
2861 for_each_sg(sglist
, sg
, nelems
, i
) {
2862 BUG_ON(!sg_page(sg
));
2863 sg
->dma_address
= page_to_phys(sg_page(sg
)) + sg
->offset
;
2864 sg
->dma_length
= sg
->length
;
2869 static int intel_map_sg(struct device
*hwdev
, struct scatterlist
*sglist
, int nelems
,
2870 enum dma_data_direction dir
, struct dma_attrs
*attrs
)
2873 struct pci_dev
*pdev
= to_pci_dev(hwdev
);
2874 struct dmar_domain
*domain
;
2877 size_t offset_pfn
= 0;
2878 struct iova
*iova
= NULL
;
2880 struct scatterlist
*sg
;
2881 unsigned long start_vpfn
;
2882 struct intel_iommu
*iommu
;
2884 BUG_ON(dir
== DMA_NONE
);
2885 if (iommu_no_mapping(hwdev
))
2886 return intel_nontranslate_map_sg(hwdev
, sglist
, nelems
, dir
);
2888 domain
= get_valid_domain_for_dev(pdev
);
2892 iommu
= domain_get_iommu(domain
);
2894 for_each_sg(sglist
, sg
, nelems
, i
)
2895 size
+= aligned_nrpages(sg
->offset
, sg
->length
);
2897 iova
= intel_alloc_iova(hwdev
, domain
, dma_to_mm_pfn(size
),
2900 sglist
->dma_length
= 0;
2905 * Check if DMAR supports zero-length reads on write only
2908 if (dir
== DMA_TO_DEVICE
|| dir
== DMA_BIDIRECTIONAL
|| \
2909 !cap_zlr(iommu
->cap
))
2910 prot
|= DMA_PTE_READ
;
2911 if (dir
== DMA_FROM_DEVICE
|| dir
== DMA_BIDIRECTIONAL
)
2912 prot
|= DMA_PTE_WRITE
;
2914 start_vpfn
= mm_to_dma_pfn(iova
->pfn_lo
);
2916 ret
= domain_sg_mapping(domain
, start_vpfn
, sglist
, size
, prot
);
2917 if (unlikely(ret
)) {
2918 /* clear the page */
2919 dma_pte_clear_range(domain
, start_vpfn
,
2920 start_vpfn
+ size
- 1);
2921 /* free page tables */
2922 dma_pte_free_pagetable(domain
, start_vpfn
,
2923 start_vpfn
+ size
- 1);
2925 __free_iova(&domain
->iovad
, iova
);
2929 /* it's a non-present to present mapping. Only flush if caching mode */
2930 if (cap_caching_mode(iommu
->cap
))
2931 iommu_flush_iotlb_psi(iommu
, 0, start_vpfn
, offset_pfn
);
2933 iommu_flush_write_buffer(iommu
);
2938 static int intel_mapping_error(struct device
*dev
, dma_addr_t dma_addr
)
2943 struct dma_map_ops intel_dma_ops
= {
2944 .alloc_coherent
= intel_alloc_coherent
,
2945 .free_coherent
= intel_free_coherent
,
2946 .map_sg
= intel_map_sg
,
2947 .unmap_sg
= intel_unmap_sg
,
2948 .map_page
= intel_map_page
,
2949 .unmap_page
= intel_unmap_page
,
2950 .mapping_error
= intel_mapping_error
,
2953 static inline int iommu_domain_cache_init(void)
2957 iommu_domain_cache
= kmem_cache_create("iommu_domain",
2958 sizeof(struct dmar_domain
),
2963 if (!iommu_domain_cache
) {
2964 printk(KERN_ERR
"Couldn't create iommu_domain cache\n");
2971 static inline int iommu_devinfo_cache_init(void)
2975 iommu_devinfo_cache
= kmem_cache_create("iommu_devinfo",
2976 sizeof(struct device_domain_info
),
2980 if (!iommu_devinfo_cache
) {
2981 printk(KERN_ERR
"Couldn't create devinfo cache\n");
2988 static inline int iommu_iova_cache_init(void)
2992 iommu_iova_cache
= kmem_cache_create("iommu_iova",
2993 sizeof(struct iova
),
2997 if (!iommu_iova_cache
) {
2998 printk(KERN_ERR
"Couldn't create iova cache\n");
3005 static int __init
iommu_init_mempool(void)
3008 ret
= iommu_iova_cache_init();
3012 ret
= iommu_domain_cache_init();
3016 ret
= iommu_devinfo_cache_init();
3020 kmem_cache_destroy(iommu_domain_cache
);
3022 kmem_cache_destroy(iommu_iova_cache
);
3027 static void __init
iommu_exit_mempool(void)
3029 kmem_cache_destroy(iommu_devinfo_cache
);
3030 kmem_cache_destroy(iommu_domain_cache
);
3031 kmem_cache_destroy(iommu_iova_cache
);
3035 static void __init
init_no_remapping_devices(void)
3037 struct dmar_drhd_unit
*drhd
;
3039 for_each_drhd_unit(drhd
) {
3040 if (!drhd
->include_all
) {
3042 for (i
= 0; i
< drhd
->devices_cnt
; i
++)
3043 if (drhd
->devices
[i
] != NULL
)
3045 /* ignore DMAR unit if no pci devices exist */
3046 if (i
== drhd
->devices_cnt
)
3054 for_each_drhd_unit(drhd
) {
3056 if (drhd
->ignored
|| drhd
->include_all
)
3059 for (i
= 0; i
< drhd
->devices_cnt
; i
++)
3060 if (drhd
->devices
[i
] &&
3061 !IS_GFX_DEVICE(drhd
->devices
[i
]))
3064 if (i
< drhd
->devices_cnt
)
3067 /* bypass IOMMU if it is just for gfx devices */
3069 for (i
= 0; i
< drhd
->devices_cnt
; i
++) {
3070 if (!drhd
->devices
[i
])
3072 drhd
->devices
[i
]->dev
.archdata
.iommu
= DUMMY_DEVICE_DOMAIN_INFO
;
3077 #ifdef CONFIG_SUSPEND
3078 static int init_iommu_hw(void)
3080 struct dmar_drhd_unit
*drhd
;
3081 struct intel_iommu
*iommu
= NULL
;
3083 for_each_active_iommu(iommu
, drhd
)
3085 dmar_reenable_qi(iommu
);
3087 for_each_active_iommu(iommu
, drhd
) {
3088 iommu_flush_write_buffer(iommu
);
3090 iommu_set_root_entry(iommu
);
3092 iommu
->flush
.flush_context(iommu
, 0, 0, 0,
3093 DMA_CCMD_GLOBAL_INVL
);
3094 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0,
3095 DMA_TLB_GLOBAL_FLUSH
);
3096 iommu_enable_translation(iommu
);
3097 iommu_disable_protect_mem_regions(iommu
);
3103 static void iommu_flush_all(void)
3105 struct dmar_drhd_unit
*drhd
;
3106 struct intel_iommu
*iommu
;
3108 for_each_active_iommu(iommu
, drhd
) {
3109 iommu
->flush
.flush_context(iommu
, 0, 0, 0,
3110 DMA_CCMD_GLOBAL_INVL
);
3111 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0,
3112 DMA_TLB_GLOBAL_FLUSH
);
3116 static int iommu_suspend(struct sys_device
*dev
, pm_message_t state
)
3118 struct dmar_drhd_unit
*drhd
;
3119 struct intel_iommu
*iommu
= NULL
;
3122 for_each_active_iommu(iommu
, drhd
) {
3123 iommu
->iommu_state
= kzalloc(sizeof(u32
) * MAX_SR_DMAR_REGS
,
3125 if (!iommu
->iommu_state
)
3131 for_each_active_iommu(iommu
, drhd
) {
3132 iommu_disable_translation(iommu
);
3134 spin_lock_irqsave(&iommu
->register_lock
, flag
);
3136 iommu
->iommu_state
[SR_DMAR_FECTL_REG
] =
3137 readl(iommu
->reg
+ DMAR_FECTL_REG
);
3138 iommu
->iommu_state
[SR_DMAR_FEDATA_REG
] =
3139 readl(iommu
->reg
+ DMAR_FEDATA_REG
);
3140 iommu
->iommu_state
[SR_DMAR_FEADDR_REG
] =
3141 readl(iommu
->reg
+ DMAR_FEADDR_REG
);
3142 iommu
->iommu_state
[SR_DMAR_FEUADDR_REG
] =
3143 readl(iommu
->reg
+ DMAR_FEUADDR_REG
);
3145 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
3150 for_each_active_iommu(iommu
, drhd
)
3151 kfree(iommu
->iommu_state
);
3156 static int iommu_resume(struct sys_device
*dev
)
3158 struct dmar_drhd_unit
*drhd
;
3159 struct intel_iommu
*iommu
= NULL
;
3162 if (init_iommu_hw()) {
3163 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3167 for_each_active_iommu(iommu
, drhd
) {
3169 spin_lock_irqsave(&iommu
->register_lock
, flag
);
3171 writel(iommu
->iommu_state
[SR_DMAR_FECTL_REG
],
3172 iommu
->reg
+ DMAR_FECTL_REG
);
3173 writel(iommu
->iommu_state
[SR_DMAR_FEDATA_REG
],
3174 iommu
->reg
+ DMAR_FEDATA_REG
);
3175 writel(iommu
->iommu_state
[SR_DMAR_FEADDR_REG
],
3176 iommu
->reg
+ DMAR_FEADDR_REG
);
3177 writel(iommu
->iommu_state
[SR_DMAR_FEUADDR_REG
],
3178 iommu
->reg
+ DMAR_FEUADDR_REG
);
3180 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
3183 for_each_active_iommu(iommu
, drhd
)
3184 kfree(iommu
->iommu_state
);
3189 static struct sysdev_class iommu_sysclass
= {
3191 .resume
= iommu_resume
,
3192 .suspend
= iommu_suspend
,
3195 static struct sys_device device_iommu
= {
3196 .cls
= &iommu_sysclass
,
3199 static int __init
init_iommu_sysfs(void)
3203 error
= sysdev_class_register(&iommu_sysclass
);
3207 error
= sysdev_register(&device_iommu
);
3209 sysdev_class_unregister(&iommu_sysclass
);
3215 static int __init
init_iommu_sysfs(void)
3219 #endif /* CONFIG_PM */
3222 * Here we only respond to action of unbound device from driver.
3224 * Added device is not attached to its DMAR domain here yet. That will happen
3225 * when mapping the device to iova.
3227 static int device_notifier(struct notifier_block
*nb
,
3228 unsigned long action
, void *data
)
3230 struct device
*dev
= data
;
3231 struct pci_dev
*pdev
= to_pci_dev(dev
);
3232 struct dmar_domain
*domain
;
3234 if (iommu_no_mapping(dev
))
3237 domain
= find_domain(pdev
);
3241 if (action
== BUS_NOTIFY_UNBOUND_DRIVER
&& !iommu_pass_through
)
3242 domain_remove_one_dev_info(domain
, pdev
);
3247 static struct notifier_block device_nb
= {
3248 .notifier_call
= device_notifier
,
3251 int __init
intel_iommu_init(void)
3256 /* VT-d is required for a TXT/tboot launch, so enforce that */
3257 force_on
= tboot_force_iommu();
3259 if (dmar_table_init()) {
3261 panic("tboot: Failed to initialize DMAR table\n");
3265 if (dmar_dev_scope_init()) {
3267 panic("tboot: Failed to initialize DMAR device scope\n");
3272 * Check the need for DMA-remapping initialization now.
3273 * Above initialization will also be used by Interrupt-remapping.
3275 if (no_iommu
|| dmar_disabled
)
3278 iommu_init_mempool();
3279 dmar_init_reserved_ranges();
3281 init_no_remapping_devices();
3286 panic("tboot: Failed to initialize DMARs\n");
3287 printk(KERN_ERR
"IOMMU: dmar init failed\n");
3288 put_iova_domain(&reserved_iova_list
);
3289 iommu_exit_mempool();
3293 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3295 init_timer(&unmap_timer
);
3296 #ifdef CONFIG_SWIOTLB
3299 dma_ops
= &intel_dma_ops
;
3303 register_iommu(&intel_iommu_ops
);
3305 bus_register_notifier(&pci_bus_type
, &device_nb
);
3310 static void iommu_detach_dependent_devices(struct intel_iommu
*iommu
,
3311 struct pci_dev
*pdev
)
3313 struct pci_dev
*tmp
, *parent
;
3315 if (!iommu
|| !pdev
)
3318 /* dependent device detach */
3319 tmp
= pci_find_upstream_pcie_bridge(pdev
);
3320 /* Secondary interface's bus number and devfn 0 */
3322 parent
= pdev
->bus
->self
;
3323 while (parent
!= tmp
) {
3324 iommu_detach_dev(iommu
, parent
->bus
->number
,
3326 parent
= parent
->bus
->self
;
3328 if (pci_is_pcie(tmp
)) /* this is a PCIe-to-PCI bridge */
3329 iommu_detach_dev(iommu
,
3330 tmp
->subordinate
->number
, 0);
3331 else /* this is a legacy PCI bridge */
3332 iommu_detach_dev(iommu
, tmp
->bus
->number
,
3337 static void domain_remove_one_dev_info(struct dmar_domain
*domain
,
3338 struct pci_dev
*pdev
)
3340 struct device_domain_info
*info
;
3341 struct intel_iommu
*iommu
;
3342 unsigned long flags
;
3344 struct list_head
*entry
, *tmp
;
3346 iommu
= device_to_iommu(pci_domain_nr(pdev
->bus
), pdev
->bus
->number
,
3351 spin_lock_irqsave(&device_domain_lock
, flags
);
3352 list_for_each_safe(entry
, tmp
, &domain
->devices
) {
3353 info
= list_entry(entry
, struct device_domain_info
, link
);
3354 /* No need to compare PCI domain; it has to be the same */
3355 if (info
->bus
== pdev
->bus
->number
&&
3356 info
->devfn
== pdev
->devfn
) {
3357 list_del(&info
->link
);
3358 list_del(&info
->global
);
3360 info
->dev
->dev
.archdata
.iommu
= NULL
;
3361 spin_unlock_irqrestore(&device_domain_lock
, flags
);
3363 iommu_disable_dev_iotlb(info
);
3364 iommu_detach_dev(iommu
, info
->bus
, info
->devfn
);
3365 iommu_detach_dependent_devices(iommu
, pdev
);
3366 free_devinfo_mem(info
);
3368 spin_lock_irqsave(&device_domain_lock
, flags
);
3376 /* if there is no other devices under the same iommu
3377 * owned by this domain, clear this iommu in iommu_bmp
3378 * update iommu count and coherency
3380 if (iommu
== device_to_iommu(info
->segment
, info
->bus
,
3386 unsigned long tmp_flags
;
3387 spin_lock_irqsave(&domain
->iommu_lock
, tmp_flags
);
3388 clear_bit(iommu
->seq_id
, &domain
->iommu_bmp
);
3389 domain
->iommu_count
--;
3390 domain_update_iommu_cap(domain
);
3391 spin_unlock_irqrestore(&domain
->iommu_lock
, tmp_flags
);
3394 spin_unlock_irqrestore(&device_domain_lock
, flags
);
3397 static void vm_domain_remove_all_dev_info(struct dmar_domain
*domain
)
3399 struct device_domain_info
*info
;
3400 struct intel_iommu
*iommu
;
3401 unsigned long flags1
, flags2
;
3403 spin_lock_irqsave(&device_domain_lock
, flags1
);
3404 while (!list_empty(&domain
->devices
)) {
3405 info
= list_entry(domain
->devices
.next
,
3406 struct device_domain_info
, link
);
3407 list_del(&info
->link
);
3408 list_del(&info
->global
);
3410 info
->dev
->dev
.archdata
.iommu
= NULL
;
3412 spin_unlock_irqrestore(&device_domain_lock
, flags1
);
3414 iommu_disable_dev_iotlb(info
);
3415 iommu
= device_to_iommu(info
->segment
, info
->bus
, info
->devfn
);
3416 iommu_detach_dev(iommu
, info
->bus
, info
->devfn
);
3417 iommu_detach_dependent_devices(iommu
, info
->dev
);
3419 /* clear this iommu in iommu_bmp, update iommu count
3422 spin_lock_irqsave(&domain
->iommu_lock
, flags2
);
3423 if (test_and_clear_bit(iommu
->seq_id
,
3424 &domain
->iommu_bmp
)) {
3425 domain
->iommu_count
--;
3426 domain_update_iommu_cap(domain
);
3428 spin_unlock_irqrestore(&domain
->iommu_lock
, flags2
);
3430 free_devinfo_mem(info
);
3431 spin_lock_irqsave(&device_domain_lock
, flags1
);
3433 spin_unlock_irqrestore(&device_domain_lock
, flags1
);
3436 /* domain id for virtual machine, it won't be set in context */
3437 static unsigned long vm_domid
;
3439 static int vm_domain_min_agaw(struct dmar_domain
*domain
)
3442 int min_agaw
= domain
->agaw
;
3444 i
= find_first_bit(&domain
->iommu_bmp
, g_num_of_iommus
);
3445 for (; i
< g_num_of_iommus
; ) {
3446 if (min_agaw
> g_iommus
[i
]->agaw
)
3447 min_agaw
= g_iommus
[i
]->agaw
;
3449 i
= find_next_bit(&domain
->iommu_bmp
, g_num_of_iommus
, i
+1);
3455 static struct dmar_domain
*iommu_alloc_vm_domain(void)
3457 struct dmar_domain
*domain
;
3459 domain
= alloc_domain_mem();
3463 domain
->id
= vm_domid
++;
3465 memset(&domain
->iommu_bmp
, 0, sizeof(unsigned long));
3466 domain
->flags
= DOMAIN_FLAG_VIRTUAL_MACHINE
;
3471 static int md_domain_init(struct dmar_domain
*domain
, int guest_width
)
3475 init_iova_domain(&domain
->iovad
, DMA_32BIT_PFN
);
3476 spin_lock_init(&domain
->iommu_lock
);
3478 domain_reserve_special_ranges(domain
);
3480 /* calculate AGAW */
3481 domain
->gaw
= guest_width
;
3482 adjust_width
= guestwidth_to_adjustwidth(guest_width
);
3483 domain
->agaw
= width_to_agaw(adjust_width
);
3485 INIT_LIST_HEAD(&domain
->devices
);
3487 domain
->iommu_count
= 0;
3488 domain
->iommu_coherency
= 0;
3489 domain
->iommu_snooping
= 0;
3490 domain
->max_addr
= 0;
3493 /* always allocate the top pgd */
3494 domain
->pgd
= (struct dma_pte
*)alloc_pgtable_page(domain
->nid
);
3497 domain_flush_cache(domain
, domain
->pgd
, PAGE_SIZE
);
3501 static void iommu_free_vm_domain(struct dmar_domain
*domain
)
3503 unsigned long flags
;
3504 struct dmar_drhd_unit
*drhd
;
3505 struct intel_iommu
*iommu
;
3507 unsigned long ndomains
;
3509 for_each_drhd_unit(drhd
) {
3512 iommu
= drhd
->iommu
;
3514 ndomains
= cap_ndoms(iommu
->cap
);
3515 i
= find_first_bit(iommu
->domain_ids
, ndomains
);
3516 for (; i
< ndomains
; ) {
3517 if (iommu
->domains
[i
] == domain
) {
3518 spin_lock_irqsave(&iommu
->lock
, flags
);
3519 clear_bit(i
, iommu
->domain_ids
);
3520 iommu
->domains
[i
] = NULL
;
3521 spin_unlock_irqrestore(&iommu
->lock
, flags
);
3524 i
= find_next_bit(iommu
->domain_ids
, ndomains
, i
+1);
3529 static void vm_domain_exit(struct dmar_domain
*domain
)
3531 /* Domain 0 is reserved, so dont process it */
3535 vm_domain_remove_all_dev_info(domain
);
3537 put_iova_domain(&domain
->iovad
);
3540 dma_pte_clear_range(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
3542 /* free page tables */
3543 dma_pte_free_pagetable(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
3545 iommu_free_vm_domain(domain
);
3546 free_domain_mem(domain
);
3549 static int intel_iommu_domain_init(struct iommu_domain
*domain
)
3551 struct dmar_domain
*dmar_domain
;
3553 dmar_domain
= iommu_alloc_vm_domain();
3556 "intel_iommu_domain_init: dmar_domain == NULL\n");
3559 if (md_domain_init(dmar_domain
, DEFAULT_DOMAIN_ADDRESS_WIDTH
)) {
3561 "intel_iommu_domain_init() failed\n");
3562 vm_domain_exit(dmar_domain
);
3565 domain
->priv
= dmar_domain
;
3570 static void intel_iommu_domain_destroy(struct iommu_domain
*domain
)
3572 struct dmar_domain
*dmar_domain
= domain
->priv
;
3574 domain
->priv
= NULL
;
3575 vm_domain_exit(dmar_domain
);
3578 static int intel_iommu_attach_device(struct iommu_domain
*domain
,
3581 struct dmar_domain
*dmar_domain
= domain
->priv
;
3582 struct pci_dev
*pdev
= to_pci_dev(dev
);
3583 struct intel_iommu
*iommu
;
3587 /* normally pdev is not mapped */
3588 if (unlikely(domain_context_mapped(pdev
))) {
3589 struct dmar_domain
*old_domain
;
3591 old_domain
= find_domain(pdev
);
3593 if (dmar_domain
->flags
& DOMAIN_FLAG_VIRTUAL_MACHINE
||
3594 dmar_domain
->flags
& DOMAIN_FLAG_STATIC_IDENTITY
)
3595 domain_remove_one_dev_info(old_domain
, pdev
);
3597 domain_remove_dev_info(old_domain
);
3601 iommu
= device_to_iommu(pci_domain_nr(pdev
->bus
), pdev
->bus
->number
,
3606 /* check if this iommu agaw is sufficient for max mapped address */
3607 addr_width
= agaw_to_width(iommu
->agaw
);
3608 end
= DOMAIN_MAX_ADDR(addr_width
);
3609 end
= end
& VTD_PAGE_MASK
;
3610 if (end
< dmar_domain
->max_addr
) {
3611 printk(KERN_ERR
"%s: iommu agaw (%d) is not "
3612 "sufficient for the mapped address (%llx)\n",
3613 __func__
, iommu
->agaw
, dmar_domain
->max_addr
);
3617 return domain_add_dev_info(dmar_domain
, pdev
, CONTEXT_TT_MULTI_LEVEL
);
3620 static void intel_iommu_detach_device(struct iommu_domain
*domain
,
3623 struct dmar_domain
*dmar_domain
= domain
->priv
;
3624 struct pci_dev
*pdev
= to_pci_dev(dev
);
3626 domain_remove_one_dev_info(dmar_domain
, pdev
);
3629 static int intel_iommu_map_range(struct iommu_domain
*domain
,
3630 unsigned long iova
, phys_addr_t hpa
,
3631 size_t size
, int iommu_prot
)
3633 struct dmar_domain
*dmar_domain
= domain
->priv
;
3639 if (iommu_prot
& IOMMU_READ
)
3640 prot
|= DMA_PTE_READ
;
3641 if (iommu_prot
& IOMMU_WRITE
)
3642 prot
|= DMA_PTE_WRITE
;
3643 if ((iommu_prot
& IOMMU_CACHE
) && dmar_domain
->iommu_snooping
)
3644 prot
|= DMA_PTE_SNP
;
3646 max_addr
= iova
+ size
;
3647 if (dmar_domain
->max_addr
< max_addr
) {
3651 /* check if minimum agaw is sufficient for mapped address */
3652 min_agaw
= vm_domain_min_agaw(dmar_domain
);
3653 addr_width
= agaw_to_width(min_agaw
);
3654 end
= DOMAIN_MAX_ADDR(addr_width
);
3655 end
= end
& VTD_PAGE_MASK
;
3656 if (end
< max_addr
) {
3657 printk(KERN_ERR
"%s: iommu agaw (%d) is not "
3658 "sufficient for the mapped address (%llx)\n",
3659 __func__
, min_agaw
, max_addr
);
3662 dmar_domain
->max_addr
= max_addr
;
3664 /* Round up size to next multiple of PAGE_SIZE, if it and
3665 the low bits of hpa would take us onto the next page */
3666 size
= aligned_nrpages(hpa
, size
);
3667 ret
= domain_pfn_mapping(dmar_domain
, iova
>> VTD_PAGE_SHIFT
,
3668 hpa
>> VTD_PAGE_SHIFT
, size
, prot
);
3672 static void intel_iommu_unmap_range(struct iommu_domain
*domain
,
3673 unsigned long iova
, size_t size
)
3675 struct dmar_domain
*dmar_domain
= domain
->priv
;
3680 dma_pte_clear_range(dmar_domain
, iova
>> VTD_PAGE_SHIFT
,
3681 (iova
+ size
- 1) >> VTD_PAGE_SHIFT
);
3683 if (dmar_domain
->max_addr
== iova
+ size
)
3684 dmar_domain
->max_addr
= iova
;
3687 static phys_addr_t
intel_iommu_iova_to_phys(struct iommu_domain
*domain
,
3690 struct dmar_domain
*dmar_domain
= domain
->priv
;
3691 struct dma_pte
*pte
;
3694 pte
= pfn_to_dma_pte(dmar_domain
, iova
>> VTD_PAGE_SHIFT
);
3696 phys
= dma_pte_addr(pte
);
3701 static int intel_iommu_domain_has_cap(struct iommu_domain
*domain
,
3704 struct dmar_domain
*dmar_domain
= domain
->priv
;
3706 if (cap
== IOMMU_CAP_CACHE_COHERENCY
)
3707 return dmar_domain
->iommu_snooping
;
3712 static struct iommu_ops intel_iommu_ops
= {
3713 .domain_init
= intel_iommu_domain_init
,
3714 .domain_destroy
= intel_iommu_domain_destroy
,
3715 .attach_dev
= intel_iommu_attach_device
,
3716 .detach_dev
= intel_iommu_detach_device
,
3717 .map
= intel_iommu_map_range
,
3718 .unmap
= intel_iommu_unmap_range
,
3719 .iova_to_phys
= intel_iommu_iova_to_phys
,
3720 .domain_has_cap
= intel_iommu_domain_has_cap
,
3723 static void __devinit
quirk_iommu_rwbf(struct pci_dev
*dev
)
3726 * Mobile 4 Series Chipset neglects to set RWBF capability,
3729 printk(KERN_INFO
"DMAR: Forcing write-buffer flush capability\n");
3733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2a40, quirk_iommu_rwbf
);
3735 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3736 ISOCH DMAR unit for the Azalia sound device, but not give it any
3737 TLB entries, which causes it to deadlock. Check for that. We do
3738 this in a function called from init_dmars(), instead of in a PCI
3739 quirk, because we don't want to print the obnoxious "BIOS broken"
3740 message if VT-d is actually disabled.
3742 static void __init
check_tylersburg_isoch(void)
3744 struct pci_dev
*pdev
;
3745 uint32_t vtisochctrl
;
3747 /* If there's no Azalia in the system anyway, forget it. */
3748 pdev
= pci_get_device(PCI_VENDOR_ID_INTEL
, 0x3a3e, NULL
);
3753 /* System Management Registers. Might be hidden, in which case
3754 we can't do the sanity check. But that's OK, because the
3755 known-broken BIOSes _don't_ actually hide it, so far. */
3756 pdev
= pci_get_device(PCI_VENDOR_ID_INTEL
, 0x342e, NULL
);
3760 if (pci_read_config_dword(pdev
, 0x188, &vtisochctrl
)) {
3767 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3768 if (vtisochctrl
& 1)
3771 /* Drop all bits other than the number of TLB entries */
3772 vtisochctrl
&= 0x1c;
3774 /* If we have the recommended number of TLB entries (16), fine. */
3775 if (vtisochctrl
== 0x10)
3778 /* Zero TLB entries? You get to ride the short bus to school. */
3780 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3781 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3782 dmi_get_system_info(DMI_BIOS_VENDOR
),
3783 dmi_get_system_info(DMI_BIOS_VERSION
),
3784 dmi_get_system_info(DMI_PRODUCT_VERSION
));
3785 iommu_identity_mapping
|= IDENTMAP_AZALIA
;
3789 printk(KERN_WARNING
"DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",