/*
 * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <joerg.roedel@amd.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <linux/mmu_notifier.h>
#include <linux/amd-iommu.h>
#include <linux/mm_types.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/iommu.h>
#include <linux/wait.h>
#include <linux/pci.h>
#include <linux/gfp.h>

#include "amd_iommu_types.h"
#include "amd_iommu_proto.h"

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Joerg Roedel <joerg.roedel@amd.com>");

#define MAX_DEVICES     0x10000
#define PRI_QUEUE_SIZE  512
struct pasid_state {
        struct list_head list;                  /* For global state-list */
        atomic_t count;                         /* Reference count */
        unsigned mmu_notifier_count;            /* Counting nested mmu_notifier
                                                   calls */
        struct mm_struct *mm;                   /* mm_struct for the faults */
        struct mmu_notifier mn;                 /* mmu_notifier handle */
        struct pri_queue pri[PRI_QUEUE_SIZE];   /* PRI tag states */
        struct device_state *device_state;      /* Link to our device_state */
        int pasid;                              /* PASID index */
        bool invalid;                           /* Used during setup and
                                                   teardown of the pasid */
        spinlock_t lock;                        /* Protect pri_queues and
                                                   mmu_notifier_count */
        wait_queue_head_t wq;                   /* To wait for count == 0 */
};
struct device_state {
        struct list_head list;
        u16 devid;
        atomic_t count;
        struct pci_dev *pdev;
        struct pasid_state **states;
        struct iommu_domain *domain;
        int pasid_levels;
        int max_pasids;
        amd_iommu_invalid_ppr_cb inv_ppr_cb;
        amd_iommu_invalidate_ctx inv_ctx_cb;
        spinlock_t lock;
        wait_queue_head_t wq;
};
struct fault {
        struct work_struct work;
        struct device_state *dev_state;
        struct pasid_state *state;
        u64 address;
        u16 pasid;
        u16 tag;
        u16 finish;
        u16 flags;
};
static LIST_HEAD(state_list);
static spinlock_t state_lock;

static struct workqueue_struct *iommu_wq;

static void free_pasid_states(struct device_state *dev_state);
static u16 device_id(struct pci_dev *pdev)
{
        u16 devid;

        devid = pdev->bus->number;
        devid = (devid << 8) | pdev->devfn;

        return devid;
}
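/*
 * For illustration: the 16-bit requestor ID packs the PCI bus number and
 * devfn as (bus << 8) | devfn.  A hypothetical device at 0000:40:00.1 has
 * bus = 0x40 and devfn = 0x01, so device_id() returns 0x4001; the same
 * encoding appears as the device_id field of the PPR faults handled in
 * ppr_notifier() below.
 */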
static struct device_state *__get_device_state(u16 devid)
{
        struct device_state *dev_state;

        list_for_each_entry(dev_state, &state_list, list) {
                if (dev_state->devid == devid)
                        return dev_state;
        }

        return NULL;
}
static struct device_state *get_device_state(u16 devid)
{
        struct device_state *dev_state;
        unsigned long flags;

        spin_lock_irqsave(&state_lock, flags);
        dev_state = __get_device_state(devid);
        if (dev_state != NULL)
                atomic_inc(&dev_state->count);
        spin_unlock_irqrestore(&state_lock, flags);

        return dev_state;
}
static void free_device_state(struct device_state *dev_state)
{
        /*
         * First detach the device from the domain - no more PRI requests
         * will arrive from that device after it is unbound from the
         * IOMMUv2 domain.
         */
        iommu_detach_device(dev_state->domain, &dev_state->pdev->dev);

        /* Everything is down now, free the IOMMUv2 domain */
        iommu_domain_free(dev_state->domain);

        /* Finally get rid of the device-state */
        kfree(dev_state);
}
static void put_device_state(struct device_state *dev_state)
{
        if (atomic_dec_and_test(&dev_state->count))
                wake_up(&dev_state->wq);
}
static void put_device_state_wait(struct device_state *dev_state)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&dev_state->wq, &wait, TASK_UNINTERRUPTIBLE);
        if (!atomic_dec_and_test(&dev_state->count))
                schedule();
        finish_wait(&dev_state->wq, &wait);

        free_device_state(dev_state);
}
/* Must be called under dev_state->lock */
static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
                                                  int pasid, bool alloc)
{
        struct pasid_state **root, **ptr;
        int level, index;

        level = dev_state->pasid_levels;
        root  = dev_state->states;

        while (true) {

                index = (pasid >> (9 * level)) & 0x1ff;
                ptr   = &root[index];

                if (level == 0)
                        break;

                if (*ptr == NULL) {
                        if (!alloc)
                                return NULL;

                        *ptr = (void *)get_zeroed_page(GFP_ATOMIC);
                        if (*ptr == NULL)
                                return NULL;
                }

                root   = (struct pasid_state **)*ptr;
                level -= 1;
        }

        return ptr;
}
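/*
 * For illustration: each table level resolves 9 bits of the PASID, so every
 * level holds 512 pointers in one zeroed page.  Assuming pasid_levels == 1
 * and pasid == 0x12345, the level-1 index is (0x12345 >> 9) & 0x1ff = 0x91
 * and the level-0 index is 0x12345 & 0x1ff = 0x145; the level-1 entry points
 * to the page that holds the final pasid_state pointer.
 */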
static int set_pasid_state(struct device_state *dev_state,
                           struct pasid_state *pasid_state,
                           int pasid)
{
        struct pasid_state **ptr;
        unsigned long flags;
        int ret;

        spin_lock_irqsave(&dev_state->lock, flags);
        ptr = __get_pasid_state_ptr(dev_state, pasid, true);

        ret = -ENOMEM;
        if (ptr == NULL)
                goto out_unlock;

        ret = -EBUSY;
        if (*ptr != NULL)
                goto out_unlock;

        *ptr = pasid_state;

        ret = 0;

out_unlock:
        spin_unlock_irqrestore(&dev_state->lock, flags);

        return ret;
}
static void clear_pasid_state(struct device_state *dev_state, int pasid)
{
        struct pasid_state **ptr;
        unsigned long flags;

        spin_lock_irqsave(&dev_state->lock, flags);
        ptr = __get_pasid_state_ptr(dev_state, pasid, true);

        if (ptr == NULL)
                goto out_unlock;

        *ptr = NULL;

out_unlock:
        spin_unlock_irqrestore(&dev_state->lock, flags);
}
static struct pasid_state *get_pasid_state(struct device_state *dev_state,
                                           int pasid)
{
        struct pasid_state **ptr, *ret = NULL;
        unsigned long flags;

        spin_lock_irqsave(&dev_state->lock, flags);
        ptr = __get_pasid_state_ptr(dev_state, pasid, false);

        if (ptr == NULL)
                goto out_unlock;

        ret = *ptr;
        if (ret)
                atomic_inc(&ret->count);

out_unlock:
        spin_unlock_irqrestore(&dev_state->lock, flags);

        return ret;
}
static void free_pasid_state(struct pasid_state *pasid_state)
{
        kfree(pasid_state);
}
static void put_pasid_state(struct pasid_state *pasid_state)
{
        if (atomic_dec_and_test(&pasid_state->count))
                wake_up(&pasid_state->wq);
}
static void put_pasid_state_wait(struct pasid_state *pasid_state)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&pasid_state->wq, &wait, TASK_UNINTERRUPTIBLE);

        if (!atomic_dec_and_test(&pasid_state->count))
                schedule();

        finish_wait(&pasid_state->wq, &wait);
        free_pasid_state(pasid_state);
}
static void unbind_pasid(struct pasid_state *pasid_state)
{
        struct iommu_domain *domain;

        domain = pasid_state->device_state->domain;

        /*
         * Mark pasid_state as invalid; no new faults will be added to the
         * work queue after this is visible everywhere.
         */
        pasid_state->invalid = true;

        /* Make sure this is visible */
        smp_wmb();

        /* After this the device/pasid can't access the mm anymore */
        amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);

        /* Make sure no more pending faults are in the queue */
        flush_workqueue(iommu_wq);
}
static void free_pasid_states_level1(struct pasid_state **tbl)
{
        int i;

        for (i = 0; i < 512; ++i) {
                if (tbl[i] == NULL)
                        continue;

                free_page((unsigned long)tbl[i]);
        }
}
static void free_pasid_states_level2(struct pasid_state **tbl)
{
        struct pasid_state **ptr;
        int i;

        for (i = 0; i < 512; ++i) {
                if (tbl[i] == NULL)
                        continue;

                ptr = (struct pasid_state **)tbl[i];
                free_pasid_states_level1(ptr);
        }
}
static void free_pasid_states(struct device_state *dev_state)
{
        struct pasid_state *pasid_state;
        int i;

        for (i = 0; i < dev_state->max_pasids; ++i) {
                pasid_state = get_pasid_state(dev_state, i);
                if (pasid_state == NULL)
                        continue;

                put_pasid_state(pasid_state);

                /*
                 * This will call the mn_release function and
                 * unbind the PASID
                 */
                mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);

                /* Reference taken in amd_iommu_bind_pasid */
                put_pasid_state_wait(pasid_state);

                /* Drop reference taken in amd_iommu_bind_pasid */
                put_device_state(dev_state);
        }

        if (dev_state->pasid_levels == 2)
                free_pasid_states_level2(dev_state->states);
        else if (dev_state->pasid_levels == 1)
                free_pasid_states_level1(dev_state->states);
        else if (dev_state->pasid_levels != 0)
                BUG();

        free_page((unsigned long)dev_state->states);
}
static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
{
        return container_of(mn, struct pasid_state, mn);
}
static void __mn_flush_page(struct mmu_notifier *mn,
                            unsigned long address)
{
        struct pasid_state *pasid_state;
        struct device_state *dev_state;

        pasid_state = mn_to_state(mn);
        dev_state   = pasid_state->device_state;

        amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, address);
}
static int mn_clear_flush_young(struct mmu_notifier *mn,
                                struct mm_struct *mm,
                                unsigned long start,
                                unsigned long end)
{
        for (; start < end; start += PAGE_SIZE)
                __mn_flush_page(mn, start);

        return 0;
}
static void mn_invalidate_page(struct mmu_notifier *mn,
                               struct mm_struct *mm,
                               unsigned long address)
{
        __mn_flush_page(mn, address);
}
static void mn_invalidate_range(struct mmu_notifier *mn,
                                struct mm_struct *mm,
                                unsigned long start, unsigned long end)
{
        struct pasid_state *pasid_state;
        struct device_state *dev_state;

        pasid_state = mn_to_state(mn);
        dev_state   = pasid_state->device_state;

        if ((start ^ (end - 1)) < PAGE_SIZE)
                amd_iommu_flush_page(dev_state->domain, pasid_state->pasid,
                                     start);
        else
                amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid);
}
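/*
 * For illustration: (start ^ (end - 1)) < PAGE_SIZE holds only when start
 * and end - 1 agree in every bit above the page offset, i.e. the invalidated
 * range lies within one page.  Assuming 4 KiB pages, start = 0x1000 and
 * end = 0x2000 give 0x1000 ^ 0x1fff = 0xfff, so a single
 * amd_iommu_flush_page() is enough; any larger range falls back to flushing
 * the whole IOTLB for that PASID.
 */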
static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
        struct pasid_state *pasid_state;
        struct device_state *dev_state;
        bool run_inv_ctx_cb;

        pasid_state    = mn_to_state(mn);
        dev_state      = pasid_state->device_state;
        run_inv_ctx_cb = !pasid_state->invalid;

        if (run_inv_ctx_cb && pasid_state->device_state->inv_ctx_cb)
                dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid);

        unbind_pasid(pasid_state);
}
static struct mmu_notifier_ops iommu_mn = {
        .release           = mn_release,
        .clear_flush_young = mn_clear_flush_young,
        .invalidate_page   = mn_invalidate_page,
        .invalidate_range  = mn_invalidate_range,
};
static void set_pri_tag_status(struct pasid_state *pasid_state,
                               u16 tag, int status)
{
        unsigned long flags;

        spin_lock_irqsave(&pasid_state->lock, flags);
        pasid_state->pri[tag].status = status;
        spin_unlock_irqrestore(&pasid_state->lock, flags);
}
static void finish_pri_tag(struct device_state *dev_state,
                           struct pasid_state *pasid_state,
                           u16 tag)
{
        unsigned long flags;

        spin_lock_irqsave(&pasid_state->lock, flags);
        if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
            pasid_state->pri[tag].finish) {
                amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
                                       pasid_state->pri[tag].status, tag);
                pasid_state->pri[tag].finish = false;
                pasid_state->pri[tag].status = PPR_SUCCESS;
        }
        spin_unlock_irqrestore(&pasid_state->lock, flags);
}
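/*
 * For illustration of the completion protocol: ppr_notifier() bumps
 * pri[tag].inflight for each queued fault with this tag and records in
 * pri[tag].finish whether the device expects a response.  The caller whose
 * decrement brings inflight back to zero sends the accumulated status via
 * amd_iommu_complete_ppr() and resets the tag state for reuse.
 */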
static void handle_fault_error(struct fault *fault)
{
        int status;

        if (!fault->dev_state->inv_ppr_cb) {
                set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
                return;
        }

        status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
                                              fault->pasid,
                                              fault->address,
                                              fault->flags);
        switch (status) {
        case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
                set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
                break;
        case AMD_IOMMU_INV_PRI_RSP_INVALID:
                set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
                break;
        case AMD_IOMMU_INV_PRI_RSP_FAIL:
                set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
                break;
        default:
                BUG();
        }
}
static void do_fault(struct work_struct *work)
{
        struct fault *fault = container_of(work, struct fault, work);
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        u64 address;
        int ret, write;

        write = !!(fault->flags & PPR_FAULT_WRITE);

        mm      = fault->state->mm;
        address = fault->address;

        down_read(&mm->mmap_sem);
        vma = find_extend_vma(mm, address);
        if (!vma || address < vma->vm_start) {
                /* failed to get a vma in the right range */
                up_read(&mm->mmap_sem);
                handle_fault_error(fault);
                goto out;
        }

        ret = handle_mm_fault(mm, vma, address, write);
        if (ret & VM_FAULT_ERROR) {
                /* failed to service fault */
                up_read(&mm->mmap_sem);
                handle_fault_error(fault);
                goto out;
        }

        up_read(&mm->mmap_sem);

out:
        finish_pri_tag(fault->dev_state, fault->state, fault->tag);

        put_pasid_state(fault->state);

        kfree(fault);
}
static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
{
        struct amd_iommu_fault *iommu_fault;
        struct pasid_state *pasid_state;
        struct device_state *dev_state;
        unsigned long flags;
        struct fault *fault;
        bool finish;
        u16 tag;
        int ret;

        iommu_fault = data;
        tag         = iommu_fault->tag & 0x1ff;
        finish      = (iommu_fault->tag >> 9) & 1;

        ret = NOTIFY_DONE;
        dev_state = get_device_state(iommu_fault->device_id);
        if (dev_state == NULL)
                goto out;

        pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
        if (pasid_state == NULL || pasid_state->invalid) {
                /* We know the device but not the PASID -> send INVALID */
                amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
                                       PPR_INVALID, tag);
                goto out_drop_state;
        }

        spin_lock_irqsave(&pasid_state->lock, flags);
        atomic_inc(&pasid_state->pri[tag].inflight);
        if (finish)
                pasid_state->pri[tag].finish = true;
        spin_unlock_irqrestore(&pasid_state->lock, flags);

        fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
        if (fault == NULL) {
                /* We are OOM - send success and let the device re-fault */
                finish_pri_tag(dev_state, pasid_state, tag);
                goto out_drop_state;
        }

        fault->dev_state = dev_state;
        fault->address   = iommu_fault->address;
        fault->state     = pasid_state;
        fault->tag       = tag;
        fault->finish    = finish;
        fault->pasid     = iommu_fault->pasid;
        fault->flags     = iommu_fault->flags;
        INIT_WORK(&fault->work, do_fault);

        queue_work(iommu_wq, &fault->work);

        ret = NOTIFY_OK;

out_drop_state:
        if (ret != NOTIFY_OK && pasid_state)
                put_pasid_state(pasid_state);

        put_device_state(dev_state);

out:
        return ret;
}
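/*
 * For illustration: in the fault's tag field, bits 0-8 carry the 9-bit PRI
 * tag (hence the 512 pri_queue slots per PASID, PRI_QUEUE_SIZE) and bit 9
 * records whether the device expects a PPR response once the request has
 * been handled.
 */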
static struct notifier_block ppr_nb = {
        .notifier_call = ppr_notifier,
};
int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
                         struct task_struct *task)
{
        struct pasid_state *pasid_state;
        struct device_state *dev_state;
        struct mm_struct *mm;
        u16 devid;
        int ret;

        if (!amd_iommu_v2_supported())
                return -ENODEV;

        devid     = device_id(pdev);
        dev_state = get_device_state(devid);

        if (dev_state == NULL)
                return -EINVAL;

        ret = -EINVAL;
        if (pasid < 0 || pasid >= dev_state->max_pasids)
                goto out;

        ret = -ENOMEM;
        pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
        if (pasid_state == NULL)
                goto out;

        atomic_set(&pasid_state->count, 1);
        init_waitqueue_head(&pasid_state->wq);
        spin_lock_init(&pasid_state->lock);

        mm                        = get_task_mm(task);
        pasid_state->mm           = mm;
        pasid_state->device_state = dev_state;
        pasid_state->pasid        = pasid;
        pasid_state->invalid      = true; /* Mark as valid only if we are
                                             done with setting up the pasid */
        pasid_state->mn.ops       = &iommu_mn;

        if (pasid_state->mm == NULL)
                goto out_free;

        mmu_notifier_register(&pasid_state->mn, mm);

        ret = set_pasid_state(dev_state, pasid_state, pasid);
        if (ret)
                goto out_unregister;

        ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
                                        __pa(pasid_state->mm->pgd));
        if (ret)
                goto out_clear_state;

        /* Now we are ready to handle faults */
        pasid_state->invalid = false;

        /*
         * Drop the reference to the mm_struct here. We rely on the
         * mmu_notifier release call-back to inform us when the mm
         * is going away.
         */
        mmput(mm);

        return 0;

out_clear_state:
        clear_pasid_state(dev_state, pasid);

out_unregister:
        mmu_notifier_unregister(&pasid_state->mn, mm);

out_free:
        free_pasid_state(pasid_state);

out:
        put_device_state(dev_state);

        return ret;
}
EXPORT_SYMBOL(amd_iommu_bind_pasid);
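/*
 * Illustrative call sequence for a hypothetical IOMMUv2-capable device
 * driver (sketch only; error handling, locking and the callback bodies are
 * assumed):
 *
 *      ret = amd_iommu_init_device(pdev, pasid_limit);
 *      if (ret)
 *              return ret;
 *      amd_iommu_set_invalid_ppr_cb(pdev, my_invalid_ppr_cb);
 *      ret = amd_iommu_bind_pasid(pdev, pasid, current);
 *      ...
 *      amd_iommu_unbind_pasid(pdev, pasid);
 *      amd_iommu_free_device(pdev);
 *
 * my_invalid_ppr_cb and pasid_limit are placeholders supplied by the
 * calling driver.
 */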
void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid)
{
        struct pasid_state *pasid_state;
        struct device_state *dev_state;
        u16 devid;

        if (!amd_iommu_v2_supported())
                return;

        devid = device_id(pdev);
        dev_state = get_device_state(devid);
        if (dev_state == NULL)
                return;

        if (pasid < 0 || pasid >= dev_state->max_pasids)
                goto out;

        pasid_state = get_pasid_state(dev_state, pasid);
        if (pasid_state == NULL)
                goto out;

        /*
         * Drop reference taken here. We are safe because we still hold
         * the reference taken in the amd_iommu_bind_pasid function.
         */
        put_pasid_state(pasid_state);

        /* Clear the pasid state so that the pasid can be re-used */
        clear_pasid_state(dev_state, pasid_state->pasid);

        /*
         * Call mmu_notifier_unregister to drop our reference
         * to pasid_state->mm
         */
        mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);

        /* Reference taken in amd_iommu_bind_pasid */
        put_pasid_state_wait(pasid_state);

out:
        /* Drop reference taken in this function */
        put_device_state(dev_state);

        /* Drop reference taken in amd_iommu_bind_pasid */
        put_device_state(dev_state);
}
EXPORT_SYMBOL(amd_iommu_unbind_pasid);
int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
{
        struct device_state *dev_state;
        unsigned long flags;
        int ret, tmp;
        u16 devid;

        if (!amd_iommu_v2_supported())
                return -ENODEV;

        if (pasids <= 0 || pasids > (PASID_MASK + 1))
                return -EINVAL;

        devid = device_id(pdev);

        dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
        if (dev_state == NULL)
                return -ENOMEM;

        spin_lock_init(&dev_state->lock);
        init_waitqueue_head(&dev_state->wq);
        dev_state->pdev  = pdev;
        dev_state->devid = devid;

        tmp = pasids;
        for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
                dev_state->pasid_levels += 1;

        atomic_set(&dev_state->count, 1);
        dev_state->max_pasids = pasids;

        ret = -ENOMEM;
        dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
        if (dev_state->states == NULL)
                goto out_free_dev_state;

        dev_state->domain = iommu_domain_alloc(&pci_bus_type);
        if (dev_state->domain == NULL)
                goto out_free_states;

        amd_iommu_domain_direct_map(dev_state->domain);

        ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
        if (ret)
                goto out_free_domain;

        ret = iommu_attach_device(dev_state->domain, &pdev->dev);
        if (ret)
                goto out_free_domain;

        spin_lock_irqsave(&state_lock, flags);

        if (__get_device_state(devid) != NULL) {
                spin_unlock_irqrestore(&state_lock, flags);
                ret = -EBUSY;
                goto out_free_domain;
        }

        list_add_tail(&dev_state->list, &state_list);

        spin_unlock_irqrestore(&state_lock, flags);

        return 0;

out_free_domain:
        iommu_domain_free(dev_state->domain);

out_free_states:
        free_page((unsigned long)dev_state->states);

out_free_dev_state:
        kfree(dev_state);

        return ret;
}
EXPORT_SYMBOL(amd_iommu_init_device);
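/*
 * For illustration of the pasid_levels computation above: each table level
 * resolves 9 bits of the PASID.  Assuming pasids = 512 (2^9), tmp - 1 =
 * 0x1ff has no bits above bit 8, so pasid_levels stays 0 and one page of
 * pointers is enough; assuming pasids = 65536 (2^16), the loop runs once and
 * pasid_levels becomes 1, giving the two-level table that
 * __get_pasid_state_ptr() walks.
 */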
void amd_iommu_free_device(struct pci_dev *pdev)
{
        struct device_state *dev_state;
        unsigned long flags;
        u16 devid;

        if (!amd_iommu_v2_supported())
                return;

        devid = device_id(pdev);

        spin_lock_irqsave(&state_lock, flags);

        dev_state = __get_device_state(devid);
        if (dev_state == NULL) {
                spin_unlock_irqrestore(&state_lock, flags);
                return;
        }

        list_del(&dev_state->list);

        spin_unlock_irqrestore(&state_lock, flags);

        /* Get rid of any remaining pasid states */
        free_pasid_states(dev_state);

        put_device_state_wait(dev_state);
}
EXPORT_SYMBOL(amd_iommu_free_device);
int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
                                 amd_iommu_invalid_ppr_cb cb)
{
        struct device_state *dev_state;
        unsigned long flags;
        u16 devid;
        int ret;

        if (!amd_iommu_v2_supported())
                return -ENODEV;

        devid = device_id(pdev);

        spin_lock_irqsave(&state_lock, flags);

        ret = -EINVAL;
        dev_state = __get_device_state(devid);
        if (dev_state == NULL)
                goto out_unlock;

        dev_state->inv_ppr_cb = cb;

        ret = 0;

out_unlock:
        spin_unlock_irqrestore(&state_lock, flags);

        return ret;
}
EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);
int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
                                    amd_iommu_invalidate_ctx cb)
{
        struct device_state *dev_state;
        unsigned long flags;
        u16 devid;
        int ret;

        if (!amd_iommu_v2_supported())
                return -ENODEV;

        devid = device_id(pdev);

        spin_lock_irqsave(&state_lock, flags);

        ret = -EINVAL;
        dev_state = __get_device_state(devid);
        if (dev_state == NULL)
                goto out_unlock;

        dev_state->inv_ctx_cb = cb;

        ret = 0;

out_unlock:
        spin_unlock_irqrestore(&state_lock, flags);

        return ret;
}
EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);
static int __init amd_iommu_v2_init(void)
{
        int ret;

        pr_info("AMD IOMMUv2 driver by Joerg Roedel <joerg.roedel@amd.com>\n");

        if (!amd_iommu_v2_supported()) {
                pr_info("AMD IOMMUv2 functionality not available on this system\n");
                /*
                 * Load anyway to provide the symbols to other modules
                 * which may use AMD IOMMUv2 optionally.
                 */
                return 0;
        }

        spin_lock_init(&state_lock);

        ret = -ENOMEM;
        iommu_wq = create_workqueue("amd_iommu_v2");
        if (iommu_wq == NULL)
                goto out;

        amd_iommu_register_ppr_notifier(&ppr_nb);

        return 0;

out:
        return ret;
}
static void __exit amd_iommu_v2_exit(void)
{
        struct device_state *dev_state;
        int i;

        if (!amd_iommu_v2_supported())
                return;

        amd_iommu_unregister_ppr_notifier(&ppr_nb);

        flush_workqueue(iommu_wq);

        /*
         * The loop below might call flush_workqueue(), so call
         * destroy_workqueue() after it
         */
        for (i = 0; i < MAX_DEVICES; ++i) {
                dev_state = get_device_state(i);

                if (dev_state == NULL)
                        continue;

                put_device_state(dev_state);
                amd_iommu_free_device(dev_state->pdev);
        }

        destroy_workqueue(iommu_wq);
}
module_init(amd_iommu_v2_init);
module_exit(amd_iommu_v2_exit);