/*
 * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <jroedel@suse.de>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */
19 #include <linux/mmu_notifier.h>
20 #include <linux/amd-iommu.h>
21 #include <linux/mm_types.h>
22 #include <linux/profile.h>
23 #include <linux/module.h>
24 #include <linux/sched.h>
25 #include <linux/iommu.h>
26 #include <linux/wait.h>
27 #include <linux/pci.h>
28 #include <linux/gfp.h>
30 #include "amd_iommu_types.h"
31 #include "amd_iommu_proto.h"
33 MODULE_LICENSE("GPL v2");
34 MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>");
36 #define MAX_DEVICES 0x10000
37 #define PRI_QUEUE_SIZE 512
46 struct list_head list
; /* For global state-list */
47 atomic_t count
; /* Reference count */
48 unsigned mmu_notifier_count
; /* Counting nested mmu_notifier
50 struct mm_struct
*mm
; /* mm_struct for the faults */
51 struct mmu_notifier mn
; /* mmu_notifier handle */
52 struct pri_queue pri
[PRI_QUEUE_SIZE
]; /* PRI tag states */
53 struct device_state
*device_state
; /* Link to our device_state */
54 int pasid
; /* PASID index */
55 bool invalid
; /* Used during setup and
56 teardown of the pasid */
57 spinlock_t lock
; /* Protect pri_queues and
59 wait_queue_head_t wq
; /* To wait for count == 0 */
63 struct list_head list
;
67 struct pasid_state
**states
;
68 struct iommu_domain
*domain
;
71 amd_iommu_invalid_ppr_cb inv_ppr_cb
;
72 amd_iommu_invalidate_ctx inv_ctx_cb
;
78 struct work_struct work
;
79 struct device_state
*dev_state
;
80 struct pasid_state
*state
;
90 static LIST_HEAD(state_list
);
91 static spinlock_t state_lock
;
93 static struct workqueue_struct
*iommu_wq
;
95 static void free_pasid_states(struct device_state
*dev_state
);
97 static u16
device_id(struct pci_dev
*pdev
)
101 devid
= pdev
->bus
->number
;
102 devid
= (devid
<< 8) | pdev
->devfn
;
107 static struct device_state
*__get_device_state(u16 devid
)
109 struct device_state
*dev_state
;
111 list_for_each_entry(dev_state
, &state_list
, list
) {
112 if (dev_state
->devid
== devid
)
119 static struct device_state
*get_device_state(u16 devid
)
121 struct device_state
*dev_state
;
124 spin_lock_irqsave(&state_lock
, flags
);
125 dev_state
= __get_device_state(devid
);
126 if (dev_state
!= NULL
)
127 atomic_inc(&dev_state
->count
);
128 spin_unlock_irqrestore(&state_lock
, flags
);
133 static void free_device_state(struct device_state
*dev_state
)
136 * First detach device from domain - No more PRI requests will arrive
137 * from that device after it is unbound from the IOMMUv2 domain.
139 iommu_detach_device(dev_state
->domain
, &dev_state
->pdev
->dev
);
141 /* Everything is down now, free the IOMMUv2 domain */
142 iommu_domain_free(dev_state
->domain
);
144 /* Finally get rid of the device-state */
148 static void put_device_state(struct device_state
*dev_state
)
150 if (atomic_dec_and_test(&dev_state
->count
))
151 wake_up(&dev_state
->wq
);
154 /* Must be called under dev_state->lock */
155 static struct pasid_state
**__get_pasid_state_ptr(struct device_state
*dev_state
,
156 int pasid
, bool alloc
)
158 struct pasid_state
**root
, **ptr
;
161 level
= dev_state
->pasid_levels
;
162 root
= dev_state
->states
;
166 index
= (pasid
>> (9 * level
)) & 0x1ff;
176 *ptr
= (void *)get_zeroed_page(GFP_ATOMIC
);
181 root
= (struct pasid_state
**)*ptr
;
188 static int set_pasid_state(struct device_state
*dev_state
,
189 struct pasid_state
*pasid_state
,
192 struct pasid_state
**ptr
;
196 spin_lock_irqsave(&dev_state
->lock
, flags
);
197 ptr
= __get_pasid_state_ptr(dev_state
, pasid
, true);
212 spin_unlock_irqrestore(&dev_state
->lock
, flags
);
217 static void clear_pasid_state(struct device_state
*dev_state
, int pasid
)
219 struct pasid_state
**ptr
;
222 spin_lock_irqsave(&dev_state
->lock
, flags
);
223 ptr
= __get_pasid_state_ptr(dev_state
, pasid
, true);
231 spin_unlock_irqrestore(&dev_state
->lock
, flags
);
234 static struct pasid_state
*get_pasid_state(struct device_state
*dev_state
,
237 struct pasid_state
**ptr
, *ret
= NULL
;
240 spin_lock_irqsave(&dev_state
->lock
, flags
);
241 ptr
= __get_pasid_state_ptr(dev_state
, pasid
, false);
248 atomic_inc(&ret
->count
);
251 spin_unlock_irqrestore(&dev_state
->lock
, flags
);
256 static void free_pasid_state(struct pasid_state
*pasid_state
)
261 static void put_pasid_state(struct pasid_state
*pasid_state
)
263 if (atomic_dec_and_test(&pasid_state
->count
))
264 wake_up(&pasid_state
->wq
);
267 static void put_pasid_state_wait(struct pasid_state
*pasid_state
)
269 atomic_dec(&pasid_state
->count
);
270 wait_event(pasid_state
->wq
, !atomic_read(&pasid_state
->count
));
271 free_pasid_state(pasid_state
);
274 static void unbind_pasid(struct pasid_state
*pasid_state
)
276 struct iommu_domain
*domain
;
278 domain
= pasid_state
->device_state
->domain
;
281 * Mark pasid_state as invalid, no more faults will we added to the
282 * work queue after this is visible everywhere.
284 pasid_state
->invalid
= true;
286 /* Make sure this is visible */
289 /* After this the device/pasid can't access the mm anymore */
290 amd_iommu_domain_clear_gcr3(domain
, pasid_state
->pasid
);
292 /* Make sure no more pending faults are in the queue */
293 flush_workqueue(iommu_wq
);
296 static void free_pasid_states_level1(struct pasid_state
**tbl
)
300 for (i
= 0; i
< 512; ++i
) {
304 free_page((unsigned long)tbl
[i
]);
308 static void free_pasid_states_level2(struct pasid_state
**tbl
)
310 struct pasid_state
**ptr
;
313 for (i
= 0; i
< 512; ++i
) {
317 ptr
= (struct pasid_state
**)tbl
[i
];
318 free_pasid_states_level1(ptr
);
322 static void free_pasid_states(struct device_state
*dev_state
)
324 struct pasid_state
*pasid_state
;
327 for (i
= 0; i
< dev_state
->max_pasids
; ++i
) {
328 pasid_state
= get_pasid_state(dev_state
, i
);
329 if (pasid_state
== NULL
)
332 put_pasid_state(pasid_state
);
335 * This will call the mn_release function and
338 mmu_notifier_unregister(&pasid_state
->mn
, pasid_state
->mm
);
340 put_pasid_state_wait(pasid_state
); /* Reference taken in
341 amd_iommu_bind_pasid */
343 /* Drop reference taken in amd_iommu_bind_pasid */
344 put_device_state(dev_state
);
347 if (dev_state
->pasid_levels
== 2)
348 free_pasid_states_level2(dev_state
->states
);
349 else if (dev_state
->pasid_levels
== 1)
350 free_pasid_states_level1(dev_state
->states
);
351 else if (dev_state
->pasid_levels
!= 0)
354 free_page((unsigned long)dev_state
->states
);
357 static struct pasid_state
*mn_to_state(struct mmu_notifier
*mn
)
359 return container_of(mn
, struct pasid_state
, mn
);
362 static void __mn_flush_page(struct mmu_notifier
*mn
,
363 unsigned long address
)
365 struct pasid_state
*pasid_state
;
366 struct device_state
*dev_state
;
368 pasid_state
= mn_to_state(mn
);
369 dev_state
= pasid_state
->device_state
;
371 amd_iommu_flush_page(dev_state
->domain
, pasid_state
->pasid
, address
);
374 static int mn_clear_flush_young(struct mmu_notifier
*mn
,
375 struct mm_struct
*mm
,
379 for (; start
< end
; start
+= PAGE_SIZE
)
380 __mn_flush_page(mn
, start
);
/*
 * mmu_notifier invalidate_page callback: flush the page at @address for
 * the PASID behind @mn.  @mm is unused.
 */
static void mn_invalidate_page(struct mmu_notifier *mn,
			       struct mm_struct *mm,
			       unsigned long address)
{
	__mn_flush_page(mn, address);
}
392 static void mn_invalidate_range(struct mmu_notifier
*mn
,
393 struct mm_struct
*mm
,
394 unsigned long start
, unsigned long end
)
396 struct pasid_state
*pasid_state
;
397 struct device_state
*dev_state
;
399 pasid_state
= mn_to_state(mn
);
400 dev_state
= pasid_state
->device_state
;
402 if ((start
^ (end
- 1)) < PAGE_SIZE
)
403 amd_iommu_flush_page(dev_state
->domain
, pasid_state
->pasid
,
406 amd_iommu_flush_tlb(dev_state
->domain
, pasid_state
->pasid
);
409 static void mn_release(struct mmu_notifier
*mn
, struct mm_struct
*mm
)
411 struct pasid_state
*pasid_state
;
412 struct device_state
*dev_state
;
417 pasid_state
= mn_to_state(mn
);
418 dev_state
= pasid_state
->device_state
;
419 run_inv_ctx_cb
= !pasid_state
->invalid
;
421 if (run_inv_ctx_cb
&& dev_state
->inv_ctx_cb
)
422 dev_state
->inv_ctx_cb(dev_state
->pdev
, pasid_state
->pasid
);
424 unbind_pasid(pasid_state
);
427 static struct mmu_notifier_ops iommu_mn
= {
428 .release
= mn_release
,
429 .clear_flush_young
= mn_clear_flush_young
,
430 .invalidate_page
= mn_invalidate_page
,
431 .invalidate_range
= mn_invalidate_range
,
434 static void set_pri_tag_status(struct pasid_state
*pasid_state
,
439 spin_lock_irqsave(&pasid_state
->lock
, flags
);
440 pasid_state
->pri
[tag
].status
= status
;
441 spin_unlock_irqrestore(&pasid_state
->lock
, flags
);
444 static void finish_pri_tag(struct device_state
*dev_state
,
445 struct pasid_state
*pasid_state
,
450 spin_lock_irqsave(&pasid_state
->lock
, flags
);
451 if (atomic_dec_and_test(&pasid_state
->pri
[tag
].inflight
) &&
452 pasid_state
->pri
[tag
].finish
) {
453 amd_iommu_complete_ppr(dev_state
->pdev
, pasid_state
->pasid
,
454 pasid_state
->pri
[tag
].status
, tag
);
455 pasid_state
->pri
[tag
].finish
= false;
456 pasid_state
->pri
[tag
].status
= PPR_SUCCESS
;
458 spin_unlock_irqrestore(&pasid_state
->lock
, flags
);
461 static void handle_fault_error(struct fault
*fault
)
465 if (!fault
->dev_state
->inv_ppr_cb
) {
466 set_pri_tag_status(fault
->state
, fault
->tag
, PPR_INVALID
);
470 status
= fault
->dev_state
->inv_ppr_cb(fault
->dev_state
->pdev
,
475 case AMD_IOMMU_INV_PRI_RSP_SUCCESS
:
476 set_pri_tag_status(fault
->state
, fault
->tag
, PPR_SUCCESS
);
478 case AMD_IOMMU_INV_PRI_RSP_INVALID
:
479 set_pri_tag_status(fault
->state
, fault
->tag
, PPR_INVALID
);
481 case AMD_IOMMU_INV_PRI_RSP_FAIL
:
482 set_pri_tag_status(fault
->state
, fault
->tag
, PPR_FAILURE
);
489 static void do_fault(struct work_struct
*work
)
491 struct fault
*fault
= container_of(work
, struct fault
, work
);
492 struct mm_struct
*mm
;
493 struct vm_area_struct
*vma
;
497 write
= !!(fault
->flags
& PPR_FAULT_WRITE
);
499 mm
= fault
->state
->mm
;
500 address
= fault
->address
;
502 down_read(&mm
->mmap_sem
);
503 vma
= find_extend_vma(mm
, address
);
504 if (!vma
|| address
< vma
->vm_start
) {
505 /* failed to get a vma in the right range */
506 up_read(&mm
->mmap_sem
);
507 handle_fault_error(fault
);
511 ret
= handle_mm_fault(mm
, vma
, address
, write
);
512 if (ret
& VM_FAULT_ERROR
) {
513 /* failed to service fault */
514 up_read(&mm
->mmap_sem
);
515 handle_fault_error(fault
);
519 up_read(&mm
->mmap_sem
);
522 finish_pri_tag(fault
->dev_state
, fault
->state
, fault
->tag
);
524 put_pasid_state(fault
->state
);
529 static int ppr_notifier(struct notifier_block
*nb
, unsigned long e
, void *data
)
531 struct amd_iommu_fault
*iommu_fault
;
532 struct pasid_state
*pasid_state
;
533 struct device_state
*dev_state
;
541 tag
= iommu_fault
->tag
& 0x1ff;
542 finish
= (iommu_fault
->tag
>> 9) & 1;
545 dev_state
= get_device_state(iommu_fault
->device_id
);
546 if (dev_state
== NULL
)
549 pasid_state
= get_pasid_state(dev_state
, iommu_fault
->pasid
);
550 if (pasid_state
== NULL
|| pasid_state
->invalid
) {
551 /* We know the device but not the PASID -> send INVALID */
552 amd_iommu_complete_ppr(dev_state
->pdev
, iommu_fault
->pasid
,
557 spin_lock_irqsave(&pasid_state
->lock
, flags
);
558 atomic_inc(&pasid_state
->pri
[tag
].inflight
);
560 pasid_state
->pri
[tag
].finish
= true;
561 spin_unlock_irqrestore(&pasid_state
->lock
, flags
);
563 fault
= kzalloc(sizeof(*fault
), GFP_ATOMIC
);
565 /* We are OOM - send success and let the device re-fault */
566 finish_pri_tag(dev_state
, pasid_state
, tag
);
570 fault
->dev_state
= dev_state
;
571 fault
->address
= iommu_fault
->address
;
572 fault
->state
= pasid_state
;
574 fault
->finish
= finish
;
575 fault
->pasid
= iommu_fault
->pasid
;
576 fault
->flags
= iommu_fault
->flags
;
577 INIT_WORK(&fault
->work
, do_fault
);
579 queue_work(iommu_wq
, &fault
->work
);
585 if (ret
!= NOTIFY_OK
&& pasid_state
)
586 put_pasid_state(pasid_state
);
588 put_device_state(dev_state
);
594 static struct notifier_block ppr_nb
= {
595 .notifier_call
= ppr_notifier
,
598 int amd_iommu_bind_pasid(struct pci_dev
*pdev
, int pasid
,
599 struct task_struct
*task
)
601 struct pasid_state
*pasid_state
;
602 struct device_state
*dev_state
;
603 struct mm_struct
*mm
;
609 if (!amd_iommu_v2_supported())
612 devid
= device_id(pdev
);
613 dev_state
= get_device_state(devid
);
615 if (dev_state
== NULL
)
619 if (pasid
< 0 || pasid
>= dev_state
->max_pasids
)
623 pasid_state
= kzalloc(sizeof(*pasid_state
), GFP_KERNEL
);
624 if (pasid_state
== NULL
)
628 atomic_set(&pasid_state
->count
, 1);
629 init_waitqueue_head(&pasid_state
->wq
);
630 spin_lock_init(&pasid_state
->lock
);
632 mm
= get_task_mm(task
);
633 pasid_state
->mm
= mm
;
634 pasid_state
->device_state
= dev_state
;
635 pasid_state
->pasid
= pasid
;
636 pasid_state
->invalid
= true; /* Mark as valid only if we are
637 done with setting up the pasid */
638 pasid_state
->mn
.ops
= &iommu_mn
;
640 if (pasid_state
->mm
== NULL
)
643 mmu_notifier_register(&pasid_state
->mn
, mm
);
645 ret
= set_pasid_state(dev_state
, pasid_state
, pasid
);
649 ret
= amd_iommu_domain_set_gcr3(dev_state
->domain
, pasid
,
650 __pa(pasid_state
->mm
->pgd
));
652 goto out_clear_state
;
654 /* Now we are ready to handle faults */
655 pasid_state
->invalid
= false;
658 * Drop the reference to the mm_struct here. We rely on the
659 * mmu_notifier release call-back to inform us when the mm
667 clear_pasid_state(dev_state
, pasid
);
670 mmu_notifier_unregister(&pasid_state
->mn
, mm
);
674 free_pasid_state(pasid_state
);
677 put_device_state(dev_state
);
681 EXPORT_SYMBOL(amd_iommu_bind_pasid
);
683 void amd_iommu_unbind_pasid(struct pci_dev
*pdev
, int pasid
)
685 struct pasid_state
*pasid_state
;
686 struct device_state
*dev_state
;
691 if (!amd_iommu_v2_supported())
694 devid
= device_id(pdev
);
695 dev_state
= get_device_state(devid
);
696 if (dev_state
== NULL
)
699 if (pasid
< 0 || pasid
>= dev_state
->max_pasids
)
702 pasid_state
= get_pasid_state(dev_state
, pasid
);
703 if (pasid_state
== NULL
)
706 * Drop reference taken here. We are safe because we still hold
707 * the reference taken in the amd_iommu_bind_pasid function.
709 put_pasid_state(pasid_state
);
711 /* Clear the pasid state so that the pasid can be re-used */
712 clear_pasid_state(dev_state
, pasid_state
->pasid
);
715 * Call mmu_notifier_unregister to drop our reference
718 mmu_notifier_unregister(&pasid_state
->mn
, pasid_state
->mm
);
720 put_pasid_state_wait(pasid_state
); /* Reference taken in
721 amd_iommu_bind_pasid */
723 /* Drop reference taken in this function */
724 put_device_state(dev_state
);
726 /* Drop reference taken in amd_iommu_bind_pasid */
727 put_device_state(dev_state
);
729 EXPORT_SYMBOL(amd_iommu_unbind_pasid
);
731 int amd_iommu_init_device(struct pci_dev
*pdev
, int pasids
)
733 struct device_state
*dev_state
;
740 if (!amd_iommu_v2_supported())
743 if (pasids
<= 0 || pasids
> (PASID_MASK
+ 1))
746 devid
= device_id(pdev
);
748 dev_state
= kzalloc(sizeof(*dev_state
), GFP_KERNEL
);
749 if (dev_state
== NULL
)
752 spin_lock_init(&dev_state
->lock
);
753 init_waitqueue_head(&dev_state
->wq
);
754 dev_state
->pdev
= pdev
;
755 dev_state
->devid
= devid
;
758 for (dev_state
->pasid_levels
= 0; (tmp
- 1) & ~0x1ff; tmp
>>= 9)
759 dev_state
->pasid_levels
+= 1;
761 atomic_set(&dev_state
->count
, 1);
762 dev_state
->max_pasids
= pasids
;
765 dev_state
->states
= (void *)get_zeroed_page(GFP_KERNEL
);
766 if (dev_state
->states
== NULL
)
767 goto out_free_dev_state
;
769 dev_state
->domain
= iommu_domain_alloc(&pci_bus_type
);
770 if (dev_state
->domain
== NULL
)
771 goto out_free_states
;
773 amd_iommu_domain_direct_map(dev_state
->domain
);
775 ret
= amd_iommu_domain_enable_v2(dev_state
->domain
, pasids
);
777 goto out_free_domain
;
779 ret
= iommu_attach_device(dev_state
->domain
, &pdev
->dev
);
781 goto out_free_domain
;
783 spin_lock_irqsave(&state_lock
, flags
);
785 if (__get_device_state(devid
) != NULL
) {
786 spin_unlock_irqrestore(&state_lock
, flags
);
788 goto out_free_domain
;
791 list_add_tail(&dev_state
->list
, &state_list
);
793 spin_unlock_irqrestore(&state_lock
, flags
);
798 iommu_domain_free(dev_state
->domain
);
801 free_page((unsigned long)dev_state
->states
);
808 EXPORT_SYMBOL(amd_iommu_init_device
);
810 void amd_iommu_free_device(struct pci_dev
*pdev
)
812 struct device_state
*dev_state
;
816 if (!amd_iommu_v2_supported())
819 devid
= device_id(pdev
);
821 spin_lock_irqsave(&state_lock
, flags
);
823 dev_state
= __get_device_state(devid
);
824 if (dev_state
== NULL
) {
825 spin_unlock_irqrestore(&state_lock
, flags
);
829 list_del(&dev_state
->list
);
831 spin_unlock_irqrestore(&state_lock
, flags
);
833 /* Get rid of any remaining pasid states */
834 free_pasid_states(dev_state
);
836 put_device_state(dev_state
);
838 * Wait until the last reference is dropped before freeing
841 wait_event(dev_state
->wq
, !atomic_read(&dev_state
->count
));
842 free_device_state(dev_state
);
844 EXPORT_SYMBOL(amd_iommu_free_device
);
846 int amd_iommu_set_invalid_ppr_cb(struct pci_dev
*pdev
,
847 amd_iommu_invalid_ppr_cb cb
)
849 struct device_state
*dev_state
;
854 if (!amd_iommu_v2_supported())
857 devid
= device_id(pdev
);
859 spin_lock_irqsave(&state_lock
, flags
);
862 dev_state
= __get_device_state(devid
);
863 if (dev_state
== NULL
)
866 dev_state
->inv_ppr_cb
= cb
;
871 spin_unlock_irqrestore(&state_lock
, flags
);
875 EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb
);
877 int amd_iommu_set_invalidate_ctx_cb(struct pci_dev
*pdev
,
878 amd_iommu_invalidate_ctx cb
)
880 struct device_state
*dev_state
;
885 if (!amd_iommu_v2_supported())
888 devid
= device_id(pdev
);
890 spin_lock_irqsave(&state_lock
, flags
);
893 dev_state
= __get_device_state(devid
);
894 if (dev_state
== NULL
)
897 dev_state
->inv_ctx_cb
= cb
;
902 spin_unlock_irqrestore(&state_lock
, flags
);
906 EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb
);
908 static int __init
amd_iommu_v2_init(void)
912 pr_info("AMD IOMMUv2 driver by Joerg Roedel <jroedel@suse.de>\n");
914 if (!amd_iommu_v2_supported()) {
915 pr_info("AMD IOMMUv2 functionality not available on this system\n");
917 * Load anyway to provide the symbols to other modules
918 * which may use AMD IOMMUv2 optionally.
923 spin_lock_init(&state_lock
);
926 iommu_wq
= create_workqueue("amd_iommu_v2");
927 if (iommu_wq
== NULL
)
930 amd_iommu_register_ppr_notifier(&ppr_nb
);
938 static void __exit
amd_iommu_v2_exit(void)
940 struct device_state
*dev_state
;
943 if (!amd_iommu_v2_supported())
946 amd_iommu_unregister_ppr_notifier(&ppr_nb
);
948 flush_workqueue(iommu_wq
);
951 * The loop below might call flush_workqueue(), so call
952 * destroy_workqueue() after it
954 for (i
= 0; i
< MAX_DEVICES
; ++i
) {
955 dev_state
= get_device_state(i
);
957 if (dev_state
== NULL
)
962 put_device_state(dev_state
);
963 amd_iommu_free_device(dev_state
->pdev
);
966 destroy_workqueue(iommu_wq
);
969 module_init(amd_iommu_v2_init
);
970 module_exit(amd_iommu_v2_exit
);