/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 */

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/sysdev.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/timer.h>

#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64 */
#include <asm/cacheflush.h>

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
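
/*
 * Illustrative note (added for clarity, not in the original source): with
 * the default 48-bit domain address width, DOMAIN_MAX_ADDR(48) is
 * (1ULL << 48) - 1, i.e. a 256TB guest address space, and
 * DMAR_OPERATION_TIMEOUT is the number of TSC cycles corresponding to
 * roughly 10 seconds (tsc_khz * 10 * 1000 cycles).
 */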

static void flush_unmaps_timeout(unsigned long data);

DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);

static struct intel_iommu *g_iommus;

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static long list_size;
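
/*
 * Note (added for clarity, not in the original): unmapped IOVAs are not
 * flushed from the IOTLB one at a time.  They are queued per IOMMU in
 * deferred_flush[] and released in batches, either when HIGH_WATER_MARK
 * entries accumulate or when unmap_timer fires (see add_unmap() and
 * flush_unmaps() below).  Booting with intel_iommu=strict disables this
 * batching.
 */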

static void domain_remove_dev_info(struct dmar_domain *domain);

static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
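
/*
 * Example (illustrative, not part of the original file): options are comma
 * separated on the kernel command line, so something like
 *
 *	intel_iommu=igfx_off,strict
 *
 * disables GFX device mapping and the batched IOTLB flush, since the
 * parser above advances past each ',' with strcspn().
 */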

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}
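
/*
 * Comment added for clarity (not in the original): the PF_MEMALLOC dance
 * above lets these GFP_ATOMIC allocations dip into the emergency memory
 * reserves.  "flags" holds the caller's previous PF_MEMALLOC bit, so
 * "current->flags &= (~PF_MEMALLOC | flags)" clears the bit only if it
 * was not already set on entry.
 */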

static inline void *alloc_pgtable_page(void)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_domain_cache);
}

static inline void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_devinfo_cache);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_iova_cache);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}

static inline void __iommu_flush_cache(
	struct intel_iommu *iommu, void *addr, int size)
{
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(addr, size);
}

/* Gets the context entry for a given bus and devfn */
static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)alloc_pgtable_page();
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		__iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
	struct root_entry *root;
	struct context_entry *context;
	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	ret = context_present(context[devfn]);
	spin_unlock_irqrestore(&iommu->lock, flags);

static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
	struct root_entry *root;
	struct context_entry *context;
	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	context_clear_entry(context[devfn]);
	__iommu_flush_cache(iommu, &context[devfn], sizeof(*context));
	spin_unlock_irqrestore(&iommu->lock, flags);

static void free_context_table(struct intel_iommu *iommu)
	struct root_entry *root;
	struct context_entry *context;
	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		root = &iommu->root_entry[i];
		context = get_context_addr_from_root(root);
		free_pgtable_page(context);
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
	spin_unlock_irqrestore(&iommu->lock, flags);

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (12 + (level - 1) * LEVEL_STRIDE);
}

static inline int address_level_offset(u64 addr, int level)
{
	return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
}

static inline u64 level_mask(int level)
{
	return ((u64)-1 << level_to_offset_bits(level));
}

static inline u64 level_size(int level)
{
	return ((u64)1 << level_to_offset_bits(level));
}

static inline u64 align_to_level(u64 addr, int level)
{
	return ((addr + level_size(level) - 1) & level_mask(level));
}
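
/*
 * Worked example (added for clarity, not in the original): with the
 * default 48-bit address width, width_to_agaw(48) = (48 - 30) / 9 = 2,
 * giving a 4-level page table via agaw_to_level().  Level 1 indexes
 * address bits 12..20, level 2 bits 21..29, and so on, so
 * address_level_offset() extracts the 9-bit table index for a level and
 * level_size() is 4KB for level 1, 2MB for level 2 and 1GB for level 3.
 */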

static struct dma_pte *addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	BUG_ON(!domain->pgd);
	addr &= (((u64)1) << addr_width) - 1;
	parent = domain->pgd;
	spin_lock_irqsave(&domain->mapping_lock, flags);
		offset = address_level_offset(addr, level);
		pte = &parent[offset];
		if (!dma_pte_present(*pte)) {
			tmp_page = alloc_pgtable_page();
				spin_unlock_irqrestore(&domain->mapping_lock, flags);
			__iommu_flush_cache(domain->iommu, tmp_page, PAGE_SIZE_4K);
			dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
			/*
			 * higher level tables always set r/w; the last level
			 * page table controls read/write.
			 */
			dma_set_pte_readable(*pte);
			dma_set_pte_writable(*pte);
			__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
		parent = phys_to_virt(dma_pte_addr(*pte));
	spin_unlock_irqrestore(&domain->mapping_lock, flags);

/* return the address's pte at a specific level */
static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
		int level)
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	parent = domain->pgd;
	while (level <= total) {
		offset = address_level_offset(addr, total);
		pte = &parent[offset];
		if (!dma_pte_present(*pte))
		parent = phys_to_virt(dma_pte_addr(*pte));

/* clear one page's page table */
static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
	struct dma_pte *pte = NULL;
	/* get last level pte */
	pte = dma_addr_level_pte(domain, addr, 1);
	__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));

/* clear last level ptes; a tlb flush should follow */
static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
	int addr_width = agaw_to_width(domain->agaw);
	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;
	/* in case it's a partial page */
	start = PAGE_ALIGN_4K(start);
	/* we don't need a lock here, nobody else touches the iova range */
	while (start < end) {
		dma_pte_clear_one(domain, start);
		start += PAGE_SIZE_4K;

/* free page table pages. last level ptes should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
	u64 start, u64 end)
	int addr_width = agaw_to_width(domain->agaw);
	int total = agaw_to_level(domain->agaw);
	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;
	/* we don't need a lock here, nobody else touches the iova range */
	while (level <= total) {
		tmp = align_to_level(start, level);
		if (tmp >= end || (tmp + level_size(level) > end))
			pte = dma_addr_level_pte(domain, tmp, level);
				free_pgtable_page(
					phys_to_virt(dma_pte_addr(*pte)));
				__iommu_flush_cache(domain->iommu,
					pte, sizeof(*pte));
			tmp += level_size(level);
	if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
		free_pgtable_page(domain->pgd);

static int iommu_alloc_root_entry(struct intel_iommu *iommu)
	struct root_entry *root;
	root = (struct root_entry *)alloc_pgtable_page();
	__iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
{\
	cycles_t start_time = get_cycles();\
	while (1) {\
		sts = op(iommu->reg + offset);\
		if (cond)\
			break;\
		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
			panic("DMAR hardware is malfunctioning\n");\
		cpu_relax();\
	}\
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
	addr = iommu->root_entry;
	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
	cmd = iommu->gcmd | DMA_GCMD_SRTP;
	writel(cmd, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_RTPS), sts);
	spin_unlock_irqrestore(&iommu->register_lock, flag);

static void iommu_flush_write_buffer(struct intel_iommu *iommu)
	if (!cap_rwbf(iommu->cap))
	val = iommu->gcmd | DMA_GCMD_WBF;
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(val, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(val & DMA_GSTS_WBFS)), val);
	spin_unlock_irqrestore(&iommu->register_lock, flag);

/* return value determines whether we need a write buffer flush */
static int __iommu_flush_context(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask, u64 type,
	int non_present_entry_flush)
	/*
	 * In the non-present entry flush case, if the hardware doesn't cache
	 * non-present entries we do nothing; if it does, we flush the entries
	 * of domain 0 (the domain id used to cache any non-present entries).
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
	/* flushing a context entry will implicitly flush the write buffer */

static int inline iommu_flush_context_global(struct intel_iommu *iommu,
	int non_present_entry_flush)
	return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
		non_present_entry_flush);

static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
	int non_present_entry_flush)
	return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
		non_present_entry_flush);

static int inline iommu_flush_context_device(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
	return __iommu_flush_context(iommu, did, source_id, function_mask,
		DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
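
/*
 * Note (added for clarity, not in the original): the three wrappers above
 * correspond to the three context-cache invalidation granularities the
 * hardware understands -- global, domain-selective (by domain id) and
 * device-selective (by source-id plus function mask) -- all funnelled
 * through __iommu_flush_context().
 */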

/* return value determines whether we need a write buffer flush */
static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int size_order, u64 type,
	int non_present_entry_flush)
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	/*
	 * In the non-present entry flush case, if the hardware doesn't cache
	 * non-present entries we do nothing; if it does, we flush the entries
	 * of domain 0 (the domain id used to cache any non-present entries).
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
	/* Note: set drain read/write */
	/*
	 * This is probably for extra safety; it looks like we could
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;
	spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses the first TLB reg currently */
	dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);
	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
	/* flushing a context entry will implicitly flush the write buffer */

static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
	int non_present_entry_flush)
	return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
		non_present_entry_flush);

static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
	int non_present_entry_flush)
	return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
		non_present_entry_flush);

static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int pages, int non_present_entry_flush)
	BUG_ON(addr & (~PAGE_MASK_4K));
	/* Fall back to domain selective flush if there is no PSI support */
	if (!cap_pgsel_inv(iommu->cap))
		return iommu_flush_iotlb_dsi(iommu, did,
			non_present_entry_flush);
	/*
	 * PSI requires the page count to be a power of two, and the base
	 * address to be naturally aligned to the size.
	 */
	mask = ilog2(__roundup_pow_of_two(pages));
	/* Fall back to domain selective flush if the size is too big */
	if (mask > cap_max_amask_val(iommu->cap))
		return iommu_flush_iotlb_dsi(iommu, did,
			non_present_entry_flush);
	return __iommu_flush_iotlb(iommu, did, addr, mask,
		DMA_TLB_PSI_FLUSH, non_present_entry_flush);
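
/*
 * Worked example (added for clarity, not in the original): a request to
 * flush 5 pages gives mask = ilog2(roundup_pow_of_two(5)) = 3, so the
 * hardware invalidates a naturally aligned 2^3 = 8 page (32KB) region
 * containing the address; if that mask exceeds cap_max_amask_val(), the
 * code falls back to a domain-selective flush instead.
 */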

static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
	spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);
	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);
	spin_unlock_irqrestore(&iommu->register_lock, flags);

static int iommu_enable_translation(struct intel_iommu *iommu)
	spin_lock_irqsave(&iommu->register_lock, flags);
	writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_TES), sts);
	iommu->gcmd |= DMA_GCMD_TE;
	spin_unlock_irqrestore(&iommu->register_lock, flags);

static int iommu_disable_translation(struct intel_iommu *iommu)
	spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(sts & DMA_GSTS_TES)), sts);
	spin_unlock_irqrestore(&iommu->register_lock, flag);

/* iommu interrupt handling. Most of it is MSI-like. */

static const char *fault_reason_strings[] =
{
	"Present bit in root entry is clear",
	"Present bit in context entry is clear",
	"Invalid context entry",
	"Access beyond MGAW",
	"PTE Write access is not set",
	"PTE Read access is not set",
	"Next page table ptr is invalid",
	"Root table address invalid",
	"Context table ptr is invalid",
	"non-zero reserved fields in RTP",
	"non-zero reserved fields in CTP",
	"non-zero reserved fields in PTE",
};
#define MAX_FAULT_REASON_IDX	(ARRAY_SIZE(fault_reason_strings) - 1)

const char *dmar_get_fault_reason(u8 fault_reason)
	if (fault_reason > MAX_FAULT_REASON_IDX)
	return fault_reason_strings[fault_reason];

void dmar_msi_unmask(unsigned int irq)
	struct intel_iommu *iommu = get_irq_data(irq);
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(0, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush of the posted write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);

void dmar_msi_mask(unsigned int irq)
	struct intel_iommu *iommu = get_irq_data(irq);
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush of the posted write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);

void dmar_msi_write(int irq, struct msi_msg *msg)
	struct intel_iommu *iommu = get_irq_data(irq);
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
	writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
	writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);

void dmar_msi_read(int irq, struct msi_msg *msg)
	struct intel_iommu *iommu = get_irq_data(irq);
	spin_lock_irqsave(&iommu->register_lock, flag);
	msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
	msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
	msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);

static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
	u8 fault_reason, u16 source_id, u64 addr)
	reason = dmar_get_fault_reason(fault_reason);
	printk(KERN_ERR
		"DMAR:[%s] Request device [%02x:%02x.%d] "
		"fault addr %llx \n"
		"DMAR:[fault reason %02d] %s\n",
		(type ? "DMA Read" : "DMA Write"),
		(source_id >> 8), PCI_SLOT(source_id & 0xFF),
		PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);

#define PRIMARY_FAULT_REG_LEN (16)
static irqreturn_t iommu_page_fault(int irq, void *dev_id)
	struct intel_iommu *iommu = dev_id;
	int reg, fault_index;
	spin_lock_irqsave(&iommu->register_lock, flag);
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
	/* TBD: ignore advanced fault log currently */
	if (!(fault_status & DMA_FSTS_PPF))
	fault_index = dma_fsts_fault_record_index(fault_status);
	reg = cap_fault_reg_offset(iommu->cap);
		/* highest 32 bits */
		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 12);
		if (!(data & DMA_FRCD_F))
		fault_reason = dma_frcd_fault_reason(data);
		type = dma_frcd_type(data);
		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 8);
		source_id = dma_frcd_source_id(data);
		guest_addr = dmar_readq(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN);
		guest_addr = dma_frcd_page_addr(guest_addr);
		/* clear the fault */
		writel(DMA_FRCD_F, iommu->reg + reg +
			fault_index * PRIMARY_FAULT_REG_LEN + 12);
		spin_unlock_irqrestore(&iommu->register_lock, flag);
		iommu_page_fault_do_one(iommu, type, fault_reason,
				source_id, guest_addr);
		if (fault_index > cap_num_fault_regs(iommu->cap))
		spin_lock_irqsave(&iommu->register_lock, flag);
	/* clear primary fault overflow */
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
	if (fault_status & DMA_FSTS_PFO)
		writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);

int dmar_set_interrupt(struct intel_iommu *iommu)
		printk(KERN_ERR "IOMMU: no free vectors\n");
	set_irq_data(irq, iommu);
	ret = arch_setup_dmar_msi(irq);
		set_irq_data(irq, NULL);
	/* Make sure the fault register is cleared */
	iommu_page_fault(irq, iommu);
	ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
		printk(KERN_ERR "IOMMU: can't request irq\n");

static int iommu_init_domains(struct intel_iommu *iommu)
	unsigned long ndomains;
	unsigned long nlongs;
	ndomains = cap_ndoms(iommu->cap);
	pr_debug("Number of Domains supported <%ld>\n", ndomains);
	nlongs = BITS_TO_LONGS(ndomains);
	/*
	 * TBD: there might be 64K domains,
	 * consider a different allocation for future chips.
	 */
	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		printk(KERN_ERR "Allocating domain id array failed\n");
	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
			GFP_KERNEL);
	if (!iommu->domains) {
		printk(KERN_ERR "Allocating domain array failed\n");
		kfree(iommu->domain_ids);
	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain id 0. Hence we need to pre-allocate it.
	 */
	if (cap_caching_mode(iommu->cap))
		set_bit(0, iommu->domain_ids);

static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
	struct dmar_drhd_unit *drhd)
	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
		printk(KERN_ERR "IOMMU: can't map the region\n");
	iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
	iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
	/* the registers might be more than one page */
	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
		cap_max_fault_reg_offset(iommu->cap));
	map_size = PAGE_ALIGN_4K(map_size);
	if (map_size > PAGE_SIZE_4K) {
		iounmap(iommu->reg);
		iommu->reg = ioremap(drhd->reg_base_addr, map_size);
			printk(KERN_ERR "IOMMU: can't map the region\n");
	ver = readl(iommu->reg + DMAR_VER_REG);
	pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
		drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
		iommu->cap, iommu->ecap);
	ret = iommu_init_domains(iommu);
	spin_lock_init(&iommu->lock);
	spin_lock_init(&iommu->register_lock);
	drhd->iommu = iommu;
	iounmap(iommu->reg);

static void domain_exit(struct dmar_domain *domain);
static void free_iommu(struct intel_iommu *iommu)
	struct dmar_domain *domain;
	i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
	for (; i < cap_ndoms(iommu->cap); ) {
		domain = iommu->domains[i];
		clear_bit(i, iommu->domain_ids);
		domain_exit(domain);
		i = find_next_bit(iommu->domain_ids,
			cap_ndoms(iommu->cap), i+1);
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
		set_irq_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	kfree(iommu->domains);
	kfree(iommu->domain_ids);
	/* free context mapping */
	free_context_table(iommu);
		iounmap(iommu->reg);

static struct dmar_domain *iommu_alloc_domain(struct intel_iommu *iommu)
	unsigned long ndomains;
	struct dmar_domain *domain;
	unsigned long flags;
	domain = alloc_domain_mem();
	ndomains = cap_ndoms(iommu->cap);
	spin_lock_irqsave(&iommu->lock, flags);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		free_domain_mem(domain);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
	set_bit(num, iommu->domain_ids);
	domain->iommu = iommu;
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

static void iommu_free_domain(struct dmar_domain *domain)
	unsigned long flags;
	spin_lock_irqsave(&domain->iommu->lock, flags);
	clear_bit(domain->id, domain->iommu->domain_ids);
	spin_unlock_irqrestore(&domain->iommu->lock, flags);

static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_alloc_key;
static struct lock_class_key reserved_rbtree_key;

static void dmar_init_reserved_ranges(void)
	struct pci_dev *pdev = NULL;
	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
	lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
		&reserved_alloc_key);
	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);
	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
		printk(KERN_ERR "Reserve IOAPIC range failed\n");
	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
			addr &= PAGE_MASK_4K;
			size = r->end - addr;
			size = PAGE_ALIGN_4K(size);
			iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
				IOVA_PFN(size + addr) - 1);
				printk(KERN_ERR "Reserve iova failed\n");

static void domain_reserve_special_ranges(struct dmar_domain *domain)
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);

static inline int guestwidth_to_adjustwidth(int gaw)
	int r = (gaw - 12) % 9;

static int domain_init(struct dmar_domain *domain, int guest_width)
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;
	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->mapping_lock);
	domain_reserve_special_ranges(domain);
	/* calculate AGAW */
	iommu = domain->iommu;
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);
	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);

static void domain_exit(struct dmar_domain *domain)
	/* Domain 0 is reserved, so don't process it */
	domain_remove_dev_info(domain);
	put_iova_domain(&domain->iovad);
	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & (~PAGE_MASK_4K);
	dma_pte_clear_range(domain, 0, end);
	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);
	iommu_free_domain(domain);
	free_domain_mem(domain);

static int domain_context_mapping_one(struct dmar_domain *domain,
		u8 bus, u8 devfn)
	struct context_entry *context;
	struct intel_iommu *iommu = domain->iommu;
	unsigned long flags;
	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
	BUG_ON(!domain->pgd);
	context = device_to_context_entry(iommu, bus, devfn);
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(*context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
	context_set_domain_id(*context, domain->id);
	context_set_address_width(*context, domain->agaw);
	context_set_address_root(*context, virt_to_phys(domain->pgd));
	context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(*context);
	context_set_present(*context);
	__iommu_flush_cache(iommu, context, sizeof(*context));
	/* it's a non-present to present mapping */
	if (iommu_flush_context_device(iommu, domain->id,
		(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
		iommu_flush_write_buffer(iommu);
		iommu_flush_iotlb_dsi(iommu, 0, 0);
	spin_unlock_irqrestore(&iommu->lock, flags);

static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
	struct pci_dev *tmp, *parent;
	ret = domain_context_mapping_one(domain, pdev->bus->number,
		pdev->devfn);
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain, parent->bus->number,
			parent->devfn);
		parent = parent->bus->self;
	if (tmp->is_pcie) /* this is a PCIe-to-PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->subordinate->number, 0);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->bus->number, tmp->devfn);
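
/*
 * Note (added for clarity, not in the original): devices behind a
 * PCIe-to-PCI bridge issue requests that carry the bridge's source-id
 * rather than their own, which is why the walk above also programs a
 * context entry for every bridge on the path (secondary bus/devfn 0 for
 * a PCIe bridge, the bridge's own bus/devfn for a legacy PCI bridge).
 */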

static int domain_context_mapped(struct dmar_domain *domain,
	struct pci_dev *pdev)
	struct pci_dev *tmp, *parent;
	ret = device_context_mapped(domain->iommu,
		pdev->bus->number, pdev->devfn);
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = device_context_mapped(domain->iommu, parent->bus->number,
			parent->devfn);
		parent = parent->bus->self;
		return device_context_mapped(domain->iommu,
			tmp->subordinate->number, 0);
		return device_context_mapped(domain->iommu,
			tmp->bus->number, tmp->devfn);

static int
domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
	u64 hpa, size_t size, int prot)
	u64 start_pfn, end_pfn;
	struct dma_pte *pte;
	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
	iova &= PAGE_MASK_4K;
	start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
	end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
	while (start_pfn < end_pfn) {
		pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
		/*
		 * We don't need a lock here, nobody else
		 * touches the iova range.
		 */
		BUG_ON(dma_pte_addr(*pte));
		dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
		dma_set_pte_prot(*pte, prot);
		__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
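
/*
 * Worked example (added for clarity, not in the original): mapping
 * hpa 0x234567 with size 0x2000 gives start_pfn 0x234 and end_pfn 0x237,
 * so three 4K PTEs are written, covering the partially used first and
 * last pages of the buffer.
 */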

static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
	clear_context_table(domain->iommu, bus, devfn);
	iommu_flush_context_global(domain->iommu, 0);
	iommu_flush_iotlb_global(domain->iommu, 0);

static void domain_remove_dev_info(struct dmar_domain *domain)
	struct device_domain_info *info;
	unsigned long flags;
	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
			info->dev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);
		detach_domain_for_dev(info->domain, info->bus, info->devfn);
		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags);
	spin_unlock_irqrestore(&device_domain_lock, flags);

/*
 * Note: struct pci_dev->dev.archdata.iommu stores the device's domain info.
 */
struct dmar_domain *
find_domain(struct pci_dev *pdev)
	struct device_domain_info *info;
	/* No lock here; assumes no domain exit in the normal case */
	info = pdev->dev.archdata.iommu;
		return info->domain;

static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
	struct pci_dev *dev)
	for (index = 0; index < cnt; index++)
		if (dev == devices[index])
	/* Check our parent */
	dev = dev->bus->self;

static struct dmar_drhd_unit *
dmar_find_matched_drhd_unit(struct pci_dev *dev)
	struct dmar_drhd_unit *drhd = NULL;
	list_for_each_entry(drhd, &dmar_drhd_units, list) {
		if (drhd->include_all || dmar_pci_device_match(drhd->devices,
			drhd->devices_cnt, dev))

/* the returned domain is initialized */
static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
	struct dmar_domain *domain, *found = NULL;
	struct intel_iommu *iommu;
	struct dmar_drhd_unit *drhd;
	struct device_domain_info *info, *tmp;
	struct pci_dev *dev_tmp;
	unsigned long flags;
	int bus = 0, devfn = 0;
	domain = find_domain(pdev);
	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
		if (dev_tmp->is_pcie) {
			bus = dev_tmp->subordinate->number;
			bus = dev_tmp->bus->number;
			devfn = dev_tmp->devfn;
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(info, &device_domain_list, global) {
			if (info->bus == bus && info->devfn == devfn) {
				found = info->domain;
		spin_unlock_irqrestore(&device_domain_lock, flags);
		/* the pcie-pci bridge already has a domain, use it */
	/* Allocate a new domain for the device */
	drhd = dmar_find_matched_drhd_unit(pdev);
		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
			pci_name(pdev));
	iommu = drhd->iommu;
	domain = iommu_alloc_domain(iommu);
	if (domain_init(domain, gaw)) {
		domain_exit(domain);
	/* register the pcie-to-pci device */
		info = alloc_devinfo_mem();
			domain_exit(domain);
		info->devfn = devfn;
		info->domain = domain;
		/* This domain is shared by devices under the p2p bridge */
		domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
		/* the pcie-to-pci bridge already has a domain, use it */
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(tmp, &device_domain_list, global) {
			if (tmp->bus == bus && tmp->devfn == devfn) {
				found = tmp->domain;
			free_devinfo_mem(info);
			domain_exit(domain);
			list_add(&info->link, &domain->devices);
			list_add(&info->global, &device_domain_list);
		spin_unlock_irqrestore(&device_domain_lock, flags);
	info = alloc_devinfo_mem();
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->domain = domain;
	spin_lock_irqsave(&device_domain_lock, flags);
	/* somebody is fast */
	found = find_domain(pdev);
	if (found != NULL) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		if (found != domain) {
			domain_exit(domain);
		free_devinfo_mem(info);
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);
	/* recheck it here, maybe others set it */
	return find_domain(pdev);

static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
	struct dmar_domain *domain;
	printk(KERN_INFO
		"IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
		pci_name(pdev), start, end);
	/* page table init */
	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	/* The address might not be aligned */
	base = start & PAGE_MASK_4K;
	size = end - base;
	size = PAGE_ALIGN_4K(size);
	if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
			IOVA_PFN(base + size) - 1)) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
	pr_debug("Mapping reserved region %lx@%llx for %s\n",
		size, base, pci_name(pdev));
	/*
	 * The RMRR range might overlap with a physical memory range,
	 * so clear it first.
	 */
	dma_pte_clear_range(domain, base, base + size);
	ret = domain_page_mapping(domain, base, base, size,
		DMA_PTE_READ|DMA_PTE_WRITE);
	/* context entry init */
	ret = domain_context_mapping(domain, pdev);
	domain_exit(domain);

static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
	struct pci_dev *pdev)
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
	return iommu_prepare_identity_map(pdev, rmrr->base_address,
		rmrr->end_address + 1);

#ifdef CONFIG_DMAR_GFX_WA
extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
static void __init iommu_prepare_gfx_mapping(void)
	struct pci_dev *pdev = NULL;
	for_each_pci_dev(pdev) {
		if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
				!IS_GFX_DEVICE(pdev))
		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
			pci_name(pdev));
		slot = arch_get_ram_range(0, &base, &size);
			ret = iommu_prepare_identity_map(pdev,
					base, base + size);
			slot = arch_get_ram_range(slot, &base, &size);
			printk(KERN_ERR "IOMMU: mapping reserved region failed\n");

#ifdef CONFIG_DMAR_FLOPPY_WA
static inline void iommu_prepare_isa(void)
	struct pci_dev *pdev;
	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
		printk("IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");
#else
static inline void iommu_prepare_isa(void)
#endif /* !CONFIG_DMAR_FLOPPY_WA */

int __init init_dmars(void)
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	struct pci_dev *pdev;
	struct intel_iommu *iommu;
	int i, ret, unit = 0;
	/* initialize and program the root entry of each drhd to not present */
	for_each_drhd_unit(drhd) {
	/*
	 * lock not needed as this is only incremented in the single-threaded
	 * kernel __init code path; all other access is read-only.
	 */
	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
	deferred_flush = kzalloc(g_num_of_iommus *
		sizeof(struct deferred_flush_tables), GFP_KERNEL);
	if (!deferred_flush) {
	for_each_drhd_unit(drhd) {
		iommu = alloc_iommu(&g_iommus[i], drhd);
		/*
		 * we could share the same root & context tables
		 * among all IOMMUs.  Need to split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *       allocate and init context
	 *       set present in root table for this bus
	 *     init context with domain, translation etc.
	 */
	for_each_rmrr_units(rmrr) {
		for (i = 0; i < rmrr->devices_cnt; i++) {
			pdev = rmrr->devices[i];
			/* some BIOSes list non-existent devices in the DMAR table */
			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
				printk(KERN_ERR
					"IOMMU: mapping reserved region failed\n");
	iommu_prepare_gfx_mapping();
	iommu_prepare_isa();
	/*
	 * for each drhd
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_drhd_unit(drhd) {
		iommu = drhd->iommu;
		sprintf(iommu->name, "dmar%d", unit++);
		iommu_flush_write_buffer(iommu);
		ret = dmar_set_interrupt(iommu);
		iommu_set_root_entry(iommu);
		iommu_flush_context_global(iommu, 0);
		iommu_flush_iotlb_global(iommu, 0);
		iommu_disable_protect_mem_regions(iommu);
		ret = iommu_enable_translation(iommu);
	for_each_drhd_unit(drhd) {
		iommu = drhd->iommu;

static inline u64 aligned_size(u64 host_addr, size_t size)
{
	u64 addr;
	addr = (host_addr & (~PAGE_MASK_4K)) + size;
	return PAGE_ALIGN_4K(addr);
}
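
/*
 * Worked example (added for clarity, not in the original): for
 * host_addr 0x12345 and size 0x100, the in-page offset is 0x345, so
 * aligned_size() returns PAGE_ALIGN_4K(0x445) = 0x1000 -- one full page
 * is mapped even though the buffer touches only part of it.
 */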

static struct iova *
iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
	/* Make sure it's in range */
	end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
	if (!size || (IOVA_START_ADDR + size > end))
	piova = alloc_iova(&domain->iovad,
			size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);

static struct iova *
__intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
		size_t size)
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iova *iova = NULL;
	if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
		iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
		/*
		 * First try to allocate an io virtual address in
		 * DMA_32BIT_MASK and if that fails then try allocating
		 * from the higher range.
		 */
		iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
			iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
		printk(KERN_ERR "Allocating iova for %s failed", pci_name(pdev));

static struct dmar_domain *
get_valid_domain_for_dev(struct pci_dev *pdev)
	struct dmar_domain *domain;
	domain = get_domain_for_dev(pdev,
			DEFAULT_DOMAIN_ADDRESS_WIDTH);
		printk(KERN_ERR
			"Allocating domain for %s failed", pci_name(pdev));
	/* make sure the context mapping is ok */
	if (unlikely(!domain_context_mapped(domain, pdev))) {
		ret = domain_context_mapping(domain, pdev);
			printk(KERN_ERR
				"Domain context map for %s failed",
				pci_name(pdev));

static dma_addr_t
intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_paddr;
	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
	domain = get_valid_domain_for_dev(pdev);
	size = aligned_size((u64)paddr, size);
	iova = __intel_alloc_iova(hwdev, domain, size);
	start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
	/*
	 * Check if DMAR supports zero-length reads on write-only mappings.
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
			!cap_zlr(domain->iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be a partial page, so we should map the
	 * whole page.  Note: if two parts of one page are separately mapped,
	 * we might have two guest addresses mapping to the same host paddr,
	 * but this is not a big problem.
	 */
	ret = domain_page_mapping(domain, start_paddr,
		((u64)paddr) & PAGE_MASK_4K, size, prot);
	pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
		pci_name(pdev), size, (u64)paddr,
		size, (u64)start_paddr, dir);
	/* it's a non-present to present mapping */
	ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
			start_paddr, size >> PAGE_SHIFT_4K, 1);
		iommu_flush_write_buffer(domain->iommu);
	return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));

	__free_iova(&domain->iovad, iova);
	printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (u64)paddr, dir);

static void flush_unmaps(void)
	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		if (deferred_flush[i].next) {
			iommu_flush_iotlb_global(&g_iommus[i], 0);
			for (j = 0; j < deferred_flush[i].next; j++) {
				__free_iova(&deferred_flush[i].domain[j]->iovad,
						deferred_flush[i].iova[j]);
			deferred_flush[i].next = 0;

static void flush_unmaps_timeout(unsigned long data)
	unsigned long flags;
	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);

static void add_unmap(struct dmar_domain *dom, struct iova *iova)
	unsigned long flags;
	spin_lock_irqsave(&async_umap_flush_lock, flags);
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();
	iommu_id = dom->iommu - g_iommus;
	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;
	mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);

static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
	size_t size, int dir)
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
	domain = find_domain(pdev);
	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
	size = aligned_size((u64)dev_addr, size);
	pr_debug("Device %s unmapping: %lx@%llx\n",
		pci_name(pdev), size, (u64)start_addr);
	/* clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
	if (intel_iommu_strict) {
		if (iommu_flush_iotlb_psi(domain->iommu,
			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
			iommu_flush_write_buffer(domain->iommu);
		__free_iova(&domain->iovad, iova);
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */

static void *intel_alloc_coherent(struct device *hwdev, size_t size,
	dma_addr_t *dma_handle, gfp_t flags)
	size = PAGE_ALIGN_4K(size);
	order = get_order(size);
	flags &= ~(GFP_DMA | GFP_DMA32);
	vaddr = (void *)__get_free_pages(flags, order);
	memset(vaddr, 0, size);
	*dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
	free_pages((unsigned long)vaddr, order);

static void intel_free_coherent(struct device *hwdev, size_t size,
	void *vaddr, dma_addr_t dma_handle)
	size = PAGE_ALIGN_4K(size);
	order = get_order(size);
	intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
	free_pages((unsigned long)vaddr, order);

#define SG_ENT_VIRT_ADDRESS(sg)	(sg_virt((sg)))
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
	int nelems, int dir)
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct scatterlist *sg;
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
	domain = find_domain(pdev);
	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		size += aligned_size((u64)addr, sg->length);
	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
	/* clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
			size >> PAGE_SHIFT_4K, 0))
		iommu_flush_write_buffer(domain->iommu);
	__free_iova(&domain->iovad, iova);

static int intel_nontranslate_map_sg(struct device *hddev,
	struct scatterlist *sglist, int nelems, int dir)
	struct scatterlist *sg;
	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
		sg->dma_length = sg->length;

static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
	int nelems, int dir)
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	struct iova *iova = NULL;
	struct scatterlist *sg;
	unsigned long start_addr;
	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
	domain = get_valid_domain_for_dev(pdev);
	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size += aligned_size((u64)addr, sg->length);
	iova = __intel_alloc_iova(hwdev, domain, size);
		sglist->dma_length = 0;
	/*
	 * Check if DMAR supports zero-length reads on write-only mappings.
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
			!cap_zlr(domain->iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size = aligned_size((u64)addr, sg->length);
		ret = domain_page_mapping(domain, start_addr + offset,
			((u64)addr) & PAGE_MASK_4K,
			size, prot);
			/* clear the page */
			dma_pte_clear_range(domain, start_addr,
					start_addr + offset);
			/* free page tables */
			dma_pte_free_pagetable(domain, start_addr,
					start_addr + offset);
			__free_iova(&domain->iovad, iova);
		sg->dma_address = start_addr + offset +
				((u64)addr & (~PAGE_MASK_4K));
		sg->dma_length = sg->length;
	/* it's a non-present to present mapping */
	if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
			start_addr, offset >> PAGE_SHIFT_4K, 1))
		iommu_flush_write_buffer(domain->iommu);

static struct dma_mapping_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_single = intel_map_single,
	.unmap_single = intel_unmap_single,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
};
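
/*
 * Usage sketch (illustrative, not part of the original file): once dma_ops
 * points at intel_dma_ops (see intel_iommu_init() below), an ordinary
 * driver call such as
 *
 *	dma_addr_t h = pci_map_single(pdev, buf, len, PCI_DMA_TODEVICE);
 *
 * ends up in intel_map_single(), which allocates an IOVA, programs the
 * domain's page tables and returns the bus address to hand to the device.
 */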

static inline int iommu_domain_cache_init(void)
	iommu_domain_cache = kmem_cache_create("iommu_domain",
		sizeof(struct dmar_domain),
		0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");

static inline int iommu_devinfo_cache_init(void)
	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
		sizeof(struct device_domain_info),
		0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");

static inline int iommu_iova_cache_init(void)
	iommu_iova_cache = kmem_cache_create("iommu_iova",
		sizeof(struct iova),
		0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");

static int __init iommu_init_mempool(void)
	ret = iommu_iova_cache_init();
	ret = iommu_domain_cache_init();
	ret = iommu_devinfo_cache_init();
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);

static void __init iommu_exit_mempool(void)
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);

void __init detect_intel_iommu(void)
	if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
	if (early_dmar_detect()) {

static void __init init_no_remapping_devices(void)
	struct dmar_drhd_unit *drhd;
	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
			/* ignore the DMAR unit if no PCI devices exist */
			if (i == drhd->devices_cnt)
	for_each_drhd_unit(drhd) {
		if (drhd->ignored || drhd->include_all)
		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
				!IS_GFX_DEVICE(drhd->devices[i]))
		if (i < drhd->devices_cnt)
		/* bypass the IOMMU if it is just for gfx devices */
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;

int __init intel_iommu_init(void)
	if (no_iommu || swiotlb || dmar_disabled)
	if (dmar_table_init())
	iommu_init_mempool();
	dmar_init_reserved_ranges();
	init_no_remapping_devices();
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
	printk(KERN_INFO
		"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
	init_timer(&unmap_timer);
	dma_ops = &intel_dma_ops;