target/i386/kvm/xen-emu.c
1 /*
2 * Xen HVM emulation support in KVM
4 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
5 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
7 * This work is licensed under the terms of the GNU GPL, version 2 or later.
8 * See the COPYING file in the top-level directory.
12 #include "qemu/osdep.h"
13 #include "qemu/log.h"
14 #include "qemu/main-loop.h"
15 #include "qemu/error-report.h"
16 #include "hw/xen/xen.h"
17 #include "sysemu/kvm_int.h"
18 #include "sysemu/kvm_xen.h"
19 #include "kvm/kvm_i386.h"
20 #include "exec/address-spaces.h"
21 #include "xen-emu.h"
22 #include "trace.h"
23 #include "sysemu/runstate.h"
25 #include "hw/pci/msi.h"
26 #include "hw/i386/apic-msidef.h"
27 #include "hw/i386/e820_memory_layout.h"
28 #include "hw/i386/kvm/xen_overlay.h"
29 #include "hw/i386/kvm/xen_evtchn.h"
30 #include "hw/i386/kvm/xen_gnttab.h"
31 #include "hw/i386/kvm/xen_primary_console.h"
32 #include "hw/i386/kvm/xen_xenstore.h"
34 #include "hw/xen/interface/version.h"
35 #include "hw/xen/interface/sched.h"
36 #include "hw/xen/interface/memory.h"
37 #include "hw/xen/interface/hvm/hvm_op.h"
38 #include "hw/xen/interface/hvm/params.h"
39 #include "hw/xen/interface/vcpu.h"
40 #include "hw/xen/interface/event_channel.h"
41 #include "hw/xen/interface/grant_table.h"
43 #include "xen-compat.h"
45 static void xen_vcpu_singleshot_timer_event(void *opaque);
46 static void xen_vcpu_periodic_timer_event(void *opaque);
47 static int vcpuop_stop_singleshot_timer(CPUState *cs);
49 #ifdef TARGET_X86_64
50 #define hypercall_compat32(longmode) (!(longmode))
51 #else
52 #define hypercall_compat32(longmode) (false)
53 #endif
55 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
56 size_t *len, bool is_write)
58 struct kvm_translation tr = {
59 .linear_address = gva,
62 if (len) {
63 *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
66 if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
67 (is_write && !tr.writeable)) {
68 return false;
70 *gpa = tr.physical_address;
71 return true;
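/*
 * Worked example (illustrative): with 4KiB pages, a gva whose page offset
 * is 0xf80 yields *len = TARGET_PAGE_SIZE - 0xf80 = 0x80, i.e. only the 128
 * bytes up to the page boundary; kvm_gva_rw() below relies on this to split
 * copies that straddle pages.
 */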
74 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
75 bool is_write)
77 uint8_t *buf = (uint8_t *)_buf;
78 uint64_t gpa;
79 size_t len;
81 while (sz) {
82 if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
83 return -EFAULT;
85 if (len > sz) {
86 len = sz;
89 cpu_physical_memory_rw(gpa, buf, len, is_write);
91 buf += len;
92 sz -= len;
93 gva += len;
96 return 0;
99 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
100 size_t sz)
102 return kvm_gva_rw(cs, gva, buf, sz, false);
105 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
106 size_t sz)
108 return kvm_gva_rw(cs, gva, buf, sz, true);
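/*
 * Usage sketch (illustrative; it mirrors the hypercall handlers later in
 * this file): guest argument structures are copied in from a guest virtual
 * address, operated on, and copied back out on success:
 *
 *     struct evtchn_unmask op;
 *     if (kvm_copy_from_gva(cs, arg, &op, sizeof(op))) {
 *         return -EFAULT;
 *     }
 *     err = xen_evtchn_unmask_op(&op);
 */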
111 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
113 const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
114 KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
115 struct kvm_xen_hvm_config cfg = {
116 .msr = hypercall_msr,
117 .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
119 int xen_caps, ret;
121 xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
122 if (required_caps & ~xen_caps) {
123 error_report("kvm: Xen HVM guest support not present or insufficient");
124 return -ENOSYS;
127 if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
128 struct kvm_xen_hvm_attr ha = {
129 .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
130 .u.xen_version = s->xen_version,
132 (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);
134 cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
137 ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
138 if (ret < 0) {
139 error_report("kvm: Failed to enable Xen HVM support: %s",
140 strerror(-ret));
141 return ret;
144 /* If called a second time, don't repeat the rest of the setup. */
145 if (s->xen_caps) {
146 return 0;
150 * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
151 * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
153 * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
154 * such things to be polled at precisely the right time. We *could* do
155 * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
156 * the moment the IRQ is acked, and see if it should be reasserted.
158 * But the in-kernel irqchip is deprecated, so we're unlikely to add
159 * that support in the kernel. Insist on using the split irqchip mode
160 * instead.
162 * This leaves us polling for the level going low in QEMU, which lacks
163 * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
164 * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
165 * the device (for which it has to unmap the device and trap access, for
166 * some period after an IRQ!!). In the Xen case, we do it on exit from
167 * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
168 * Which is kind of icky, but less so than the VFIO one. I may fix them
169 * both later...
171 if (!kvm_kernel_irqchip_split()) {
172 error_report("kvm: Xen support requires kernel-irqchip=split");
173 return -EINVAL;
176 s->xen_caps = xen_caps;
178 /* Tell fw_cfg to notify the BIOS to reserve the range. */
179 e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE, E820_RESERVED);
181 /* The pages couldn't be overlaid until KVM was initialized */
182 xen_primary_console_reset();
183 xen_xenstore_reset();
185 return 0;
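/*
 * Usage sketch (illustrative, spelling per QEMU's -accel documentation):
 * this init path runs when the KVM accelerator is given a Xen version, e.g.
 *
 *     qemu-system-x86_64 -accel kvm,xen-version=0x40011,kernel-irqchip=split
 *
 * kernel-irqchip=split is required, as enforced above.
 */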
188 int kvm_xen_init_vcpu(CPUState *cs)
190 X86CPU *cpu = X86_CPU(cs);
191 CPUX86State *env = &cpu->env;
192 int err;
195 * The kernel needs to know the Xen/ACPI vCPU ID because that's
196 * what the guest uses in hypercalls such as timers. It doesn't
197 * match the APIC ID which is generally used for talking to the
198 * kernel about vCPUs. And if vCPU threads race with creating
199 * their KVM vCPUs out of order, it doesn't necessarily match
200 * with the kernel's internal vCPU indices either.
202 if (kvm_xen_has_cap(EVTCHN_SEND)) {
203 struct kvm_xen_vcpu_attr va = {
204 .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
205 .u.vcpu_id = cs->cpu_index,
207 err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
208 if (err) {
209 error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
210 strerror(-err));
211 return err;
215 env->xen_vcpu_info_gpa = INVALID_GPA;
216 env->xen_vcpu_info_default_gpa = INVALID_GPA;
217 env->xen_vcpu_time_info_gpa = INVALID_GPA;
218 env->xen_vcpu_runstate_gpa = INVALID_GPA;
220 qemu_mutex_init(&env->xen_timers_lock);
221 env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
222 xen_vcpu_singleshot_timer_event,
223 cpu);
224 if (!env->xen_singleshot_timer) {
225 return -ENOMEM;
227 env->xen_singleshot_timer->opaque = cs;
229 env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
230 xen_vcpu_periodic_timer_event,
231 cpu);
232 if (!env->xen_periodic_timer) {
233 return -ENOMEM;
235 env->xen_periodic_timer->opaque = cs;
237 return 0;
240 uint32_t kvm_xen_get_caps(void)
242 return kvm_state->xen_caps;
245 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
246 int cmd, uint64_t arg)
248 int err = 0;
250 switch (cmd) {
251 case XENVER_get_features: {
252 struct xen_feature_info fi;
254 /* No need for 32/64 compat handling */
255 qemu_build_assert(sizeof(fi) == 8);
257 err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
258 if (err) {
259 break;
262 fi.submap = 0;
263 if (fi.submap_idx == 0) {
264 fi.submap |= 1 << XENFEAT_writable_page_tables |
265 1 << XENFEAT_writable_descriptor_tables |
266 1 << XENFEAT_auto_translated_physmap |
267 1 << XENFEAT_hvm_callback_vector |
268 1 << XENFEAT_hvm_safe_pvclock |
269 1 << XENFEAT_hvm_pirqs;
272 err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
273 break;
276 default:
277 return false;
280 exit->u.hcall.result = err;
281 return true;
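/*
 * Worked value (illustrative, assuming the standard Xen public-header bit
 * numbers 0, 1, 2, 8, 9 and 10 for the six features above): a guest asking
 * for submap_idx 0 gets fi.submap == 0x707.
 */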
284 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
286 struct kvm_xen_vcpu_attr xhsi;
288 xhsi.type = type;
289 xhsi.u.gpa = gpa;
291 trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);
293 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
296 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
298 uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
299 struct kvm_xen_vcpu_attr xva;
301 xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
302 xva.u.vector = vector;
304 trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);
306 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
309 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
311 X86CPU *cpu = X86_CPU(cs);
312 CPUX86State *env = &cpu->env;
314 env->xen_vcpu_callback_vector = data.host_int;
316 if (kvm_xen_has_cap(EVTCHN_SEND)) {
317 kvm_xen_set_vcpu_callback_vector(cs);
321 static int set_vcpu_info(CPUState *cs, uint64_t gpa)
323 X86CPU *cpu = X86_CPU(cs);
324 CPUX86State *env = &cpu->env;
325 MemoryRegionSection mrs = { .mr = NULL };
326 void *vcpu_info_hva = NULL;
327 int ret;
329 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
330 if (ret || gpa == INVALID_GPA) {
331 goto out;
334 mrs = memory_region_find(get_system_memory(), gpa,
335 sizeof(struct vcpu_info));
336 if (mrs.mr && mrs.mr->ram_block &&
337 !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
338 vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
339 mrs.offset_within_region);
341 if (!vcpu_info_hva) {
342 if (mrs.mr) {
343 memory_region_unref(mrs.mr);
344 mrs.mr = NULL;
346 ret = -EINVAL;
349 out:
350 if (env->xen_vcpu_info_mr) {
351 memory_region_unref(env->xen_vcpu_info_mr);
353 env->xen_vcpu_info_hva = vcpu_info_hva;
354 env->xen_vcpu_info_mr = mrs.mr;
355 return ret;
358 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
360 X86CPU *cpu = X86_CPU(cs);
361 CPUX86State *env = &cpu->env;
363 env->xen_vcpu_info_default_gpa = data.host_ulong;
365 /* Changing the default does nothing if a vcpu_info was explicitly set. */
366 if (env->xen_vcpu_info_gpa == INVALID_GPA) {
367 set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
371 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
373 X86CPU *cpu = X86_CPU(cs);
374 CPUX86State *env = &cpu->env;
376 env->xen_vcpu_info_gpa = data.host_ulong;
378 set_vcpu_info(cs, env->xen_vcpu_info_gpa);
381 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
383 CPUState *cs = qemu_get_cpu(vcpu_id);
384 if (!cs) {
385 return NULL;
388 return X86_CPU(cs)->env.xen_vcpu_info_hva;
391 void kvm_xen_maybe_deassert_callback(CPUState *cs)
393 CPUX86State *env = &X86_CPU(cs)->env;
394 struct vcpu_info *vi = env->xen_vcpu_info_hva;
395 if (!vi) {
396 return;
399 /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
400 if (!vi->evtchn_upcall_pending) {
401 bql_lock();
403 * Check again now we have the lock, because it may have been
404 * asserted in the interim. And we don't want to take the lock
405 * every time because this is a fast path.
407 if (!vi->evtchn_upcall_pending) {
408 X86_CPU(cs)->env.xen_callback_asserted = false;
409 xen_evtchn_set_callback_level(0);
411 bql_unlock();
415 void kvm_xen_set_callback_asserted(void)
417 CPUState *cs = qemu_get_cpu(0);
419 if (cs) {
420 X86_CPU(cs)->env.xen_callback_asserted = true;
424 bool kvm_xen_has_vcpu_callback_vector(void)
426 CPUState *cs = qemu_get_cpu(0);
428 return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector;
431 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
433 CPUState *cs = qemu_get_cpu(vcpu_id);
434 uint8_t vector;
436 if (!cs) {
437 return;
440 vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
441 if (vector) {
443                 * The per-vCPU callback vector is injected via the local APIC. Just
444 * deliver it as an MSI.
446 MSIMessage msg = {
447 .address = APIC_DEFAULT_ADDRESS |
448 (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT),
449 .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
451 kvm_irqchip_send_msi(kvm_state, msg);
452 return;
455 switch (type) {
456 case HVM_PARAM_CALLBACK_TYPE_VECTOR:
458 * If the evtchn_upcall_pending field in the vcpu_info is set, then
459 * KVM will automatically deliver the vector on entering the vCPU
460 * so all we have to do is kick it out.
462 qemu_cpu_kick(cs);
463 break;
465 case HVM_PARAM_CALLBACK_TYPE_GSI:
466 case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
467 if (vcpu_id == 0) {
468 xen_evtchn_set_callback_level(1);
470 break;
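/*
 * Worked example (illustrative): for apic_id 3 and callback vector 0xf3,
 * and assuming APIC_DEFAULT_ADDRESS 0xfee00000, MSI_ADDR_DEST_ID_SHIFT 12
 * and MSI_DATA_LEVEL_SHIFT 14 as in the x86 MSI layout, the message built
 * above is address 0xfee03000, data 0x40f3.
 */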
474 /* Must always be called with xen_timers_lock held */
475 static int kvm_xen_set_vcpu_timer(CPUState *cs)
477 X86CPU *cpu = X86_CPU(cs);
478 CPUX86State *env = &cpu->env;
480 struct kvm_xen_vcpu_attr va = {
481 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
482 .u.timer.port = env->xen_virq[VIRQ_TIMER],
483 .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
484 .u.timer.expires_ns = env->xen_singleshot_timer_ns,
487 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
490 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
492 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
493 kvm_xen_set_vcpu_timer(cs);
496 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
498 CPUState *cs = qemu_get_cpu(vcpu_id);
500 if (!cs) {
501 return -ENOENT;
504 /* cpu.h doesn't include the actual Xen header. */
505 qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);
507 if (virq >= NR_VIRQS) {
508 return -EINVAL;
511 if (port && X86_CPU(cs)->env.xen_virq[virq]) {
512 return -EEXIST;
515 X86_CPU(cs)->env.xen_virq[virq] = port;
516 if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
517 async_run_on_cpu(cs, do_set_vcpu_timer_virq,
518 RUN_ON_CPU_HOST_INT(port));
520 return 0;
523 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
525 X86CPU *cpu = X86_CPU(cs);
526 CPUX86State *env = &cpu->env;
528 env->xen_vcpu_time_info_gpa = data.host_ulong;
530 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
531 env->xen_vcpu_time_info_gpa);
534 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
536 X86CPU *cpu = X86_CPU(cs);
537 CPUX86State *env = &cpu->env;
539 env->xen_vcpu_runstate_gpa = data.host_ulong;
541 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
542 env->xen_vcpu_runstate_gpa);
545 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
547 X86CPU *cpu = X86_CPU(cs);
548 CPUX86State *env = &cpu->env;
550 env->xen_vcpu_info_gpa = INVALID_GPA;
551 env->xen_vcpu_info_default_gpa = INVALID_GPA;
552 env->xen_vcpu_time_info_gpa = INVALID_GPA;
553 env->xen_vcpu_runstate_gpa = INVALID_GPA;
554 env->xen_vcpu_callback_vector = 0;
555 memset(env->xen_virq, 0, sizeof(env->xen_virq));
557 set_vcpu_info(cs, INVALID_GPA);
558 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
559 INVALID_GPA);
560 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
561 INVALID_GPA);
562 if (kvm_xen_has_cap(EVTCHN_SEND)) {
563 kvm_xen_set_vcpu_callback_vector(cs);
565 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
566 env->xen_singleshot_timer_ns = 0;
567 kvm_xen_set_vcpu_timer(cs);
568 } else {
569 vcpuop_stop_singleshot_timer(cs);
574 static int xen_set_shared_info(uint64_t gfn)
576 uint64_t gpa = gfn << TARGET_PAGE_BITS;
577 int i, err;
579 BQL_LOCK_GUARD();
582 * The xen_overlay device tells KVM about it too, since it had to
583 * do that on migration load anyway (unless we're going to jump
584 * through lots of hoops to maintain the fiction that this isn't
585      * KVM-specific).
587 err = xen_overlay_map_shinfo_page(gpa);
588 if (err) {
589 return err;
592 trace_kvm_xen_set_shared_info(gfn);
594 for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
595 CPUState *cpu = qemu_get_cpu(i);
596 if (cpu) {
597 async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
598 RUN_ON_CPU_HOST_ULONG(gpa));
600 gpa += sizeof(vcpu_info_t);
603 return err;
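/*
 * Layout note (illustrative): the legacy per-vCPU vcpu_info structures sit
 * back to back at the start of the shared_info page, so vCPU i's default
 * vcpu_info is at gfn * PAGE_SIZE + i * sizeof(vcpu_info_t); with the
 * 64-byte vcpu_info asserted elsewhere in this file, vCPU 2's copy begins
 * at offset 128.
 */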
606 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
608 switch (space) {
609 case XENMAPSPACE_shared_info:
610 if (idx > 0) {
611 return -EINVAL;
613 return xen_set_shared_info(gfn);
615 case XENMAPSPACE_grant_table:
616 return xen_gnttab_map_page(idx, gfn);
618 case XENMAPSPACE_gmfn:
619 case XENMAPSPACE_gmfn_range:
620 return -ENOTSUP;
622 case XENMAPSPACE_gmfn_foreign:
623 case XENMAPSPACE_dev_mmio:
624 return -EPERM;
626 default:
627 return -EINVAL;
631 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
632 uint64_t arg)
634 struct xen_add_to_physmap xatp;
635 CPUState *cs = CPU(cpu);
637 if (hypercall_compat32(exit->u.hcall.longmode)) {
638 struct compat_xen_add_to_physmap xatp32;
640 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
641 if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
642 return -EFAULT;
644 xatp.domid = xatp32.domid;
645 xatp.size = xatp32.size;
646 xatp.space = xatp32.space;
647 xatp.idx = xatp32.idx;
648 xatp.gpfn = xatp32.gpfn;
649 } else {
650 if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
651 return -EFAULT;
655 if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
656 return -ESRCH;
659 return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
662 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
663 uint64_t arg)
665 struct xen_add_to_physmap_batch xatpb;
666 unsigned long idxs_gva, gpfns_gva, errs_gva;
667 CPUState *cs = CPU(cpu);
668 size_t op_sz;
670 if (hypercall_compat32(exit->u.hcall.longmode)) {
671 struct compat_xen_add_to_physmap_batch xatpb32;
673 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
674 if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
675 return -EFAULT;
677 xatpb.domid = xatpb32.domid;
678 xatpb.space = xatpb32.space;
679 xatpb.size = xatpb32.size;
681 idxs_gva = xatpb32.idxs.c;
682 gpfns_gva = xatpb32.gpfns.c;
683 errs_gva = xatpb32.errs.c;
684 op_sz = sizeof(uint32_t);
685 } else {
686 if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
687 return -EFAULT;
689 op_sz = sizeof(unsigned long);
690 idxs_gva = (unsigned long)xatpb.idxs.p;
691 gpfns_gva = (unsigned long)xatpb.gpfns.p;
692 errs_gva = (unsigned long)xatpb.errs.p;
695 if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
696 return -ESRCH;
699 /* Explicitly invalid for the batch op. Not that we implement it anyway. */
700 if (xatpb.space == XENMAPSPACE_gmfn_range) {
701 return -EINVAL;
704 while (xatpb.size--) {
705 unsigned long idx = 0;
706 unsigned long gpfn = 0;
707 int err;
709 /* For 32-bit compat this only copies the low 32 bits of each */
710 if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
711 kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
712 return -EFAULT;
714 idxs_gva += op_sz;
715 gpfns_gva += op_sz;
717 err = add_to_physmap_one(xatpb.space, idx, gpfn);
719 if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
720 return -EFAULT;
722 errs_gva += sizeof(err);
724 return 0;
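/*
 * Compat note (illustrative): idx and gpfn above are zero-initialised
 * unsigned longs, so with op_sz == sizeof(uint32_t) only their low 32 bits
 * are filled from the 32-bit guest arrays, matching the "low 32 bits"
 * comment. The errs entries are plain ints in both ABIs, which is why the
 * write-back always advances by sizeof(err).
 */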
727 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
728 int cmd, uint64_t arg)
730 int err;
732 switch (cmd) {
733 case XENMEM_add_to_physmap:
734 err = do_add_to_physmap(exit, cpu, arg);
735 break;
737 case XENMEM_add_to_physmap_batch:
738 err = do_add_to_physmap_batch(exit, cpu, arg);
739 break;
741 default:
742 return false;
745 exit->u.hcall.result = err;
746 return true;
749 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
750 uint64_t arg)
752 CPUState *cs = CPU(cpu);
753 struct xen_hvm_param hp;
754 int err = 0;
756 /* No need for 32/64 compat handling */
757 qemu_build_assert(sizeof(hp) == 16);
759 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
760 err = -EFAULT;
761 goto out;
764 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
765 err = -ESRCH;
766 goto out;
769 switch (hp.index) {
770 case HVM_PARAM_CALLBACK_IRQ:
771 bql_lock();
772 err = xen_evtchn_set_callback_param(hp.value);
773 bql_unlock();
774 xen_set_long_mode(exit->u.hcall.longmode);
775 break;
776 default:
777 return false;
780 out:
781 exit->u.hcall.result = err;
782 return true;
785 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
786 uint64_t arg)
788 CPUState *cs = CPU(cpu);
789 struct xen_hvm_param hp;
790 int err = 0;
792 /* No need for 32/64 compat handling */
793 qemu_build_assert(sizeof(hp) == 16);
795 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
796 err = -EFAULT;
797 goto out;
800 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
801 err = -ESRCH;
802 goto out;
805 switch (hp.index) {
806 case HVM_PARAM_STORE_PFN:
807 hp.value = XEN_SPECIAL_PFN(XENSTORE);
808 break;
809 case HVM_PARAM_STORE_EVTCHN:
810 hp.value = xen_xenstore_get_port();
811 break;
812 case HVM_PARAM_CONSOLE_PFN:
813 hp.value = xen_primary_console_get_pfn();
814 if (!hp.value) {
815 err = -EINVAL;
817 break;
818 case HVM_PARAM_CONSOLE_EVTCHN:
819 hp.value = xen_primary_console_get_port();
820 if (!hp.value) {
821 err = -EINVAL;
823 break;
824 default:
825 return false;
828 if (!err && kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
829 err = -EFAULT;
831 out:
832 exit->u.hcall.result = err;
833 return true;
836 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
837 X86CPU *cpu, uint64_t arg)
839 struct xen_hvm_evtchn_upcall_vector up;
840 CPUState *target_cs;
842 /* No need for 32/64 compat handling */
843 qemu_build_assert(sizeof(up) == 8);
845 if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
846 return -EFAULT;
849 if (up.vector < 0x10) {
850 return -EINVAL;
853 target_cs = qemu_get_cpu(up.vcpu);
854 if (!target_cs) {
855 return -EINVAL;
858 async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
859 RUN_ON_CPU_HOST_INT(up.vector));
860 return 0;
863 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
864 int cmd, uint64_t arg)
866 int ret = -ENOSYS;
867 switch (cmd) {
868 case HVMOP_set_evtchn_upcall_vector:
869 ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg);
870 break;
872 case HVMOP_pagetable_dying:
873 ret = -ENOSYS;
874 break;
876 case HVMOP_set_param:
877 return handle_set_param(exit, cpu, arg);
879 case HVMOP_get_param:
880 return handle_get_param(exit, cpu, arg);
882 default:
883 return false;
886 exit->u.hcall.result = ret;
887 return true;
890 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
891 uint64_t arg)
893 struct vcpu_register_vcpu_info rvi;
894 uint64_t gpa;
896 /* No need for 32/64 compat handling */
897 qemu_build_assert(sizeof(rvi) == 16);
898 qemu_build_assert(sizeof(struct vcpu_info) == 64);
900 if (!target) {
901 return -ENOENT;
904 if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
905 return -EFAULT;
908 if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
909 return -EINVAL;
912 gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
913 async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
914 return 0;
917 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
918 uint64_t arg)
920 struct vcpu_register_time_memory_area tma;
921 uint64_t gpa;
922 size_t len;
924 /* No need for 32/64 compat handling */
925 qemu_build_assert(sizeof(tma) == 8);
926 qemu_build_assert(sizeof(struct vcpu_time_info) == 32);
928 if (!target) {
929 return -ENOENT;
932 if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
933 return -EFAULT;
937 * Xen actually uses the GVA and does the translation through the guest
938 * page tables each time. But Linux/KVM uses the GPA, on the assumption
939 * that guests only ever use *global* addresses (kernel virtual addresses)
940 * for it. If Linux is changed to redo the GVA→GPA translation each time,
941 * it will offer a new vCPU attribute for that, and we'll use it instead.
943 if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
944 len < sizeof(struct vcpu_time_info)) {
945 return -EFAULT;
948 async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
949 RUN_ON_CPU_HOST_ULONG(gpa));
950 return 0;
953 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
954 uint64_t arg)
956 struct vcpu_register_runstate_memory_area rma;
957 uint64_t gpa;
958 size_t len;
960 /* No need for 32/64 compat handling */
961 qemu_build_assert(sizeof(rma) == 8);
962 /* The runstate area actually does change size, but Linux copes. */
964 if (!target) {
965 return -ENOENT;
968 if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
969 return -EFAULT;
972 /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
973 if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
974 return -EFAULT;
977 async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
978 RUN_ON_CPU_HOST_ULONG(gpa));
979 return 0;
982 static uint64_t kvm_get_current_ns(void)
984 struct kvm_clock_data data;
985 int ret;
987 ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
988 if (ret < 0) {
989 fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
990 abort();
993 return data.clock;
996 static void xen_vcpu_singleshot_timer_event(void *opaque)
998 CPUState *cpu = opaque;
999 CPUX86State *env = &X86_CPU(cpu)->env;
1000 uint16_t port = env->xen_virq[VIRQ_TIMER];
1002 if (likely(port)) {
1003 xen_evtchn_set_port(port);
1006 qemu_mutex_lock(&env->xen_timers_lock);
1007 env->xen_singleshot_timer_ns = 0;
1008 qemu_mutex_unlock(&env->xen_timers_lock);
1011 static void xen_vcpu_periodic_timer_event(void *opaque)
1013 CPUState *cpu = opaque;
1014 CPUX86State *env = &X86_CPU(cpu)->env;
1015 uint16_t port = env->xen_virq[VIRQ_TIMER];
1016 int64_t qemu_now;
1018 if (likely(port)) {
1019 xen_evtchn_set_port(port);
1022 qemu_mutex_lock(&env->xen_timers_lock);
1024 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1025 timer_mod_ns(env->xen_periodic_timer,
1026 qemu_now + env->xen_periodic_timer_period);
1028 qemu_mutex_unlock(&env->xen_timers_lock);
1031 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
1033 CPUX86State *tenv = &X86_CPU(target)->env;
1034 int64_t qemu_now;
1036 timer_del(tenv->xen_periodic_timer);
1038 qemu_mutex_lock(&tenv->xen_timers_lock);
1040 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1041 timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
1042 tenv->xen_periodic_timer_period = period_ns;
1044 qemu_mutex_unlock(&tenv->xen_timers_lock);
1045 return 0;
1048 #define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL))
1049 #define MICROSECS(_us) ((int64_t)((_us) * 1000ULL))
1050 #define STIME_MAX ((time_t)((int64_t)~0ull >> 1))
1051 /* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
1052 #define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))
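/*
 * Worked values (illustrative): MILLISECS(1) is 1,000,000 ns, the minimum
 * periodic timer period accepted below, and STIME_DELTA_MAX is (2^62 - 1) ns,
 * roughly 4.6e9 seconds.
 */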
1054 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
1055 uint64_t arg)
1057 struct vcpu_set_periodic_timer spt;
1059 qemu_build_assert(sizeof(spt) == 8);
1060 if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
1061 return -EFAULT;
1064 if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
1065 return -EINVAL;
1068 return do_set_periodic_timer(target, spt.period_ns);
1071 static int vcpuop_stop_periodic_timer(CPUState *target)
1073 CPUX86State *tenv = &X86_CPU(target)->env;
1075 qemu_mutex_lock(&tenv->xen_timers_lock);
1077 timer_del(tenv->xen_periodic_timer);
1078 tenv->xen_periodic_timer_period = 0;
1080 qemu_mutex_unlock(&tenv->xen_timers_lock);
1081 return 0;
1085 * Userspace handling of timer, for older kernels.
1086 * Must always be called with xen_timers_lock held.
1088 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
1089 bool linux_wa)
1091 CPUX86State *env = &X86_CPU(cs)->env;
1092 int64_t now = kvm_get_current_ns();
1093 int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1094 int64_t delta = timeout_abs - now;
1096 if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
1097 (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
1099 * Xen has a 'Linux workaround' in do_set_timer_op() which checks
1100 * for negative absolute timeout values (caused by integer
1101 * overflow), and for values about 13 days in the future (2^50ns)
1102 * which would be caused by jiffies overflow. For those cases, it
1103 * sets the timeout 100ms in the future (not *too* soon, since if
1104 * a guest really did set a long timeout on purpose we don't want
1105 * to keep churning CPU time by waking it up).
1107 delta = (100 * SCALE_MS);
1108 timeout_abs = now + delta;
1111 timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
1112 env->xen_singleshot_timer_ns = now + delta;
1113 return 0;
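/*
 * Worked numbers for the workaround above (illustrative): 2^50 ns is about
 * 1.13e15 ns, i.e. roughly 13 days, so (uint32_t)(delta >> 50) != 0 trips
 * for any requested timeout more than ~13 days out, and the timer is
 * rewritten to fire 100ms from now instead.
 */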
1116 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
1118 struct vcpu_set_singleshot_timer sst = { 0 };
1121 * The struct is a uint64_t followed by a uint32_t. On 32-bit that
1122 * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
1123 * that get used are identical, and there's four bytes of padding
1124 * unused at the end. For true Xen compatibility we should attempt
1125 * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
1126 * if we can't get the padding too. But that's daft. Just copy what
1127 * we need.
1129 qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
1130 qemu_build_assert(sizeof(sst) >= 12);
1132 if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
1133 return -EFAULT;
1136 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1139 * We ignore the VCPU_SSHOTTMR_future flag, just as Xen now does.
1140 * The only guest that ever used it, got it wrong.
1141 * https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909
1143 return do_set_singleshot_timer(cs, sst.timeout_abs_ns, false);
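/*
 * Layout sketch for the 12-byte copy above (illustrative):
 *     offset 0:  uint64_t timeout_abs_ns
 *     offset 8:  uint32_t flags
 * The 64-bit ABI pads the structure out to 16 bytes after 'flags'; that
 * trailing padding is the part deliberately left uncopied.
 */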
1146 static int vcpuop_stop_singleshot_timer(CPUState *cs)
1148 CPUX86State *env = &X86_CPU(cs)->env;
1150 qemu_mutex_lock(&env->xen_timers_lock);
1152 timer_del(env->xen_singleshot_timer);
1153 env->xen_singleshot_timer_ns = 0;
1155 qemu_mutex_unlock(&env->xen_timers_lock);
1156 return 0;
1159 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1160 uint64_t timeout)
1162 int err;
1164 if (unlikely(timeout == 0)) {
1165 err = vcpuop_stop_singleshot_timer(CPU(cpu));
1166 } else {
1167 QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock);
1168 err = do_set_singleshot_timer(CPU(cpu), timeout, true);
1170 exit->u.hcall.result = err;
1171 return true;
1174 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1175 int cmd, int vcpu_id, uint64_t arg)
1177 CPUState *cs = CPU(cpu);
1178 CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
1179 int err;
1181 if (!dest) {
1182 err = -ENOENT;
1183 goto out;
1186 switch (cmd) {
1187 case VCPUOP_register_runstate_memory_area:
1188 err = vcpuop_register_runstate_info(cs, dest, arg);
1189 break;
1190 case VCPUOP_register_vcpu_time_memory_area:
1191 err = vcpuop_register_vcpu_time_info(cs, dest, arg);
1192 break;
1193 case VCPUOP_register_vcpu_info:
1194 err = vcpuop_register_vcpu_info(cs, dest, arg);
1195 break;
1196 case VCPUOP_set_singleshot_timer: {
1197 if (cs->cpu_index == vcpu_id) {
1198 err = vcpuop_set_singleshot_timer(dest, arg);
1199 } else {
1200 err = -EINVAL;
1202 break;
1204 case VCPUOP_stop_singleshot_timer:
1205 if (cs->cpu_index == vcpu_id) {
1206 err = vcpuop_stop_singleshot_timer(dest);
1207 } else {
1208 err = -EINVAL;
1210 break;
1211 case VCPUOP_set_periodic_timer: {
1212 err = vcpuop_set_periodic_timer(cs, dest, arg);
1213 break;
1215 case VCPUOP_stop_periodic_timer:
1216 err = vcpuop_stop_periodic_timer(dest);
1217 break;
1219 default:
1220 return false;
1223 out:
1224 exit->u.hcall.result = err;
1225 return true;
1228 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1229 int cmd, uint64_t arg)
1231 CPUState *cs = CPU(cpu);
1232 int err = -ENOSYS;
1234 switch (cmd) {
1235 case EVTCHNOP_init_control:
1236 case EVTCHNOP_expand_array:
1237 case EVTCHNOP_set_priority:
1238 /* We do not support FIFO channels at this point */
1239 err = -ENOSYS;
1240 break;
1242 case EVTCHNOP_status: {
1243 struct evtchn_status status;
1245 qemu_build_assert(sizeof(status) == 24);
1246 if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
1247 err = -EFAULT;
1248 break;
1251 err = xen_evtchn_status_op(&status);
1252 if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
1253 err = -EFAULT;
1255 break;
1257 case EVTCHNOP_close: {
1258 struct evtchn_close close;
1260 qemu_build_assert(sizeof(close) == 4);
1261 if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
1262 err = -EFAULT;
1263 break;
1266 err = xen_evtchn_close_op(&close);
1267 break;
1269 case EVTCHNOP_unmask: {
1270 struct evtchn_unmask unmask;
1272 qemu_build_assert(sizeof(unmask) == 4);
1273 if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
1274 err = -EFAULT;
1275 break;
1278 err = xen_evtchn_unmask_op(&unmask);
1279 break;
1281 case EVTCHNOP_bind_virq: {
1282 struct evtchn_bind_virq virq;
1284 qemu_build_assert(sizeof(virq) == 12);
1285 if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
1286 err = -EFAULT;
1287 break;
1290 err = xen_evtchn_bind_virq_op(&virq);
1291 if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
1292 err = -EFAULT;
1294 break;
1296 case EVTCHNOP_bind_pirq: {
1297 struct evtchn_bind_pirq pirq;
1299 qemu_build_assert(sizeof(pirq) == 12);
1300 if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
1301 err = -EFAULT;
1302 break;
1305 err = xen_evtchn_bind_pirq_op(&pirq);
1306 if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
1307 err = -EFAULT;
1309 break;
1311 case EVTCHNOP_bind_ipi: {
1312 struct evtchn_bind_ipi ipi;
1314 qemu_build_assert(sizeof(ipi) == 8);
1315 if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
1316 err = -EFAULT;
1317 break;
1320 err = xen_evtchn_bind_ipi_op(&ipi);
1321 if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
1322 err = -EFAULT;
1324 break;
1326 case EVTCHNOP_send: {
1327 struct evtchn_send send;
1329 qemu_build_assert(sizeof(send) == 4);
1330 if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
1331 err = -EFAULT;
1332 break;
1335 err = xen_evtchn_send_op(&send);
1336 break;
1338 case EVTCHNOP_alloc_unbound: {
1339 struct evtchn_alloc_unbound alloc;
1341 qemu_build_assert(sizeof(alloc) == 8);
1342 if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
1343 err = -EFAULT;
1344 break;
1347 err = xen_evtchn_alloc_unbound_op(&alloc);
1348 if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
1349 err = -EFAULT;
1351 break;
1353 case EVTCHNOP_bind_interdomain: {
1354 struct evtchn_bind_interdomain interdomain;
1356 qemu_build_assert(sizeof(interdomain) == 12);
1357 if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1358 err = -EFAULT;
1359 break;
1362 err = xen_evtchn_bind_interdomain_op(&interdomain);
1363 if (!err &&
1364 kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1365 err = -EFAULT;
1367 break;
1369 case EVTCHNOP_bind_vcpu: {
1370 struct evtchn_bind_vcpu vcpu;
1372 qemu_build_assert(sizeof(vcpu) == 8);
1373 if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
1374 err = -EFAULT;
1375 break;
1378 err = xen_evtchn_bind_vcpu_op(&vcpu);
1379 break;
1381 case EVTCHNOP_reset: {
1382 struct evtchn_reset reset;
1384 qemu_build_assert(sizeof(reset) == 2);
1385 if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
1386 err = -EFAULT;
1387 break;
1390 err = xen_evtchn_reset_op(&reset);
1391 break;
1393 default:
1394 return false;
1397 exit->u.hcall.result = err;
1398 return true;
1401 int kvm_xen_soft_reset(void)
1403 CPUState *cpu;
1404 int err;
1406 assert(bql_locked());
1408 trace_kvm_xen_soft_reset();
1410 err = xen_evtchn_soft_reset();
1411 if (err) {
1412 return err;
1416 * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
1417 * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
1418      * deliver to the timer interrupt and treats that as 'disabled'.
1420 err = xen_evtchn_set_callback_param(0);
1421 if (err) {
1422 return err;
1425 CPU_FOREACH(cpu) {
1426 async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
1429 err = xen_overlay_map_shinfo_page(INVALID_GFN);
1430 if (err) {
1431 return err;
1434 err = xen_gnttab_reset();
1435 if (err) {
1436 return err;
1439 err = xen_primary_console_reset();
1440 if (err) {
1441 return err;
1444 err = xen_xenstore_reset();
1445 if (err) {
1446 return err;
1449 return 0;
1452 static int schedop_shutdown(CPUState *cs, uint64_t arg)
1454 struct sched_shutdown shutdown;
1455 int ret = 0;
1457 /* No need for 32/64 compat handling */
1458 qemu_build_assert(sizeof(shutdown) == 4);
1460 if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
1461 return -EFAULT;
1464 switch (shutdown.reason) {
1465 case SHUTDOWN_crash:
1466 cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
1467 qemu_system_guest_panicked(NULL);
1468 break;
1470 case SHUTDOWN_reboot:
1471 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1472 break;
1474 case SHUTDOWN_poweroff:
1475 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
1476 break;
1478 case SHUTDOWN_soft_reset:
1479 bql_lock();
1480 ret = kvm_xen_soft_reset();
1481 bql_unlock();
1482 break;
1484 default:
1485 ret = -EINVAL;
1486 break;
1489 return ret;
1492 static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1493 int cmd, uint64_t arg)
1495 CPUState *cs = CPU(cpu);
1496 int err = -ENOSYS;
1498 switch (cmd) {
1499 case SCHEDOP_shutdown:
1500 err = schedop_shutdown(cs, arg);
1501 break;
1503 case SCHEDOP_poll:
1505 * Linux will panic if this doesn't work. Just yield; it's not
1506 * worth overthinking it because with event channel handling
1507 * in KVM, the kernel will intercept this and it will never
1508          * reach QEMU anyway. The semantics of the hypercall explicitly
1509 * permit spurious wakeups.
1511 case SCHEDOP_yield:
1512 sched_yield();
1513 err = 0;
1514 break;
1516 default:
1517 return false;
1520 exit->u.hcall.result = err;
1521 return true;
1524 static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1525 int cmd, uint64_t arg, int count)
1527 CPUState *cs = CPU(cpu);
1528 int err;
1530 switch (cmd) {
1531 case GNTTABOP_set_version: {
1532 struct gnttab_set_version set;
1534 qemu_build_assert(sizeof(set) == 4);
1535 if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
1536 err = -EFAULT;
1537 break;
1540 err = xen_gnttab_set_version_op(&set);
1541 if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
1542 err = -EFAULT;
1544 break;
1546 case GNTTABOP_get_version: {
1547 struct gnttab_get_version get;
1549 qemu_build_assert(sizeof(get) == 8);
1550 if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1551 err = -EFAULT;
1552 break;
1555 err = xen_gnttab_get_version_op(&get);
1556 if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1557 err = -EFAULT;
1559 break;
1561 case GNTTABOP_query_size: {
1562 struct gnttab_query_size size;
1564 qemu_build_assert(sizeof(size) == 16);
1565 if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
1566 err = -EFAULT;
1567 break;
1570 err = xen_gnttab_query_size_op(&size);
1571 if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
1572 err = -EFAULT;
1574 break;
1576 case GNTTABOP_setup_table:
1577 case GNTTABOP_copy:
1578 case GNTTABOP_map_grant_ref:
1579 case GNTTABOP_unmap_grant_ref:
1580 case GNTTABOP_swap_grant_ref:
1581 return false;
1583 default:
1584 /* Xen explicitly returns -ENOSYS to HVM guests for all others */
1585 err = -ENOSYS;
1586 break;
1589 exit->u.hcall.result = err;
1590 return true;
1593 static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1594 int cmd, uint64_t arg)
1596 CPUState *cs = CPU(cpu);
1597 int err;
1599 switch (cmd) {
1600 case PHYSDEVOP_map_pirq: {
1601 struct physdev_map_pirq map;
1603 if (hypercall_compat32(exit->u.hcall.longmode)) {
1604 struct compat_physdev_map_pirq *map32 = (void *)&map;
1606 if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
1607 return -EFAULT;
1611 * The only thing that's different is the alignment of the
1612 * uint64_t table_base at the end, which gets padding to make
1613 * it 64-bit aligned in the 64-bit version.
1615 qemu_build_assert(sizeof(*map32) == 36);
1616 qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
1617 offsetof(struct compat_physdev_map_pirq, entry_nr));
1618 memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
1619 } else {
1620 if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
1621 err = -EFAULT;
1622 break;
1625 err = xen_physdev_map_pirq(&map);
1627 * Since table_base is an IN parameter and won't be changed, just
1628 * copy the size of the compat structure back to the guest.
1630 if (!err && kvm_copy_to_gva(cs, arg, &map,
1631 sizeof(struct compat_physdev_map_pirq))) {
1632 err = -EFAULT;
1634 break;
1636 case PHYSDEVOP_unmap_pirq: {
1637 struct physdev_unmap_pirq unmap;
1639 qemu_build_assert(sizeof(unmap) == 8);
1640 if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
1641 err = -EFAULT;
1642 break;
1645 err = xen_physdev_unmap_pirq(&unmap);
1646 if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
1647 err = -EFAULT;
1649 break;
1651 case PHYSDEVOP_eoi: {
1652 struct physdev_eoi eoi;
1654 qemu_build_assert(sizeof(eoi) == 4);
1655 if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
1656 err = -EFAULT;
1657 break;
1660 err = xen_physdev_eoi_pirq(&eoi);
1661 if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
1662 err = -EFAULT;
1664 break;
1666 case PHYSDEVOP_irq_status_query: {
1667 struct physdev_irq_status_query query;
1669 qemu_build_assert(sizeof(query) == 8);
1670 if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
1671 err = -EFAULT;
1672 break;
1675 err = xen_physdev_query_pirq(&query);
1676 if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
1677 err = -EFAULT;
1679 break;
1681 case PHYSDEVOP_get_free_pirq: {
1682 struct physdev_get_free_pirq get;
1684 qemu_build_assert(sizeof(get) == 8);
1685 if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1686 err = -EFAULT;
1687 break;
1690 err = xen_physdev_get_free_pirq(&get);
1691 if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1692 err = -EFAULT;
1694 break;
1696 case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
1697 err = -ENOSYS;
1698 break;
1700 default:
1701 return false;
1704 exit->u.hcall.result = err;
1705 return true;
1708 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1710 uint16_t code = exit->u.hcall.input;
1712 if (exit->u.hcall.cpl > 0) {
1713 exit->u.hcall.result = -EPERM;
1714 return true;
1717 switch (code) {
1718 case __HYPERVISOR_set_timer_op:
1719 if (exit->u.hcall.longmode) {
1720 return kvm_xen_hcall_set_timer_op(exit, cpu,
1721 exit->u.hcall.params[0]);
1722 } else {
1723 /* In 32-bit mode, the 64-bit timer value is in two args. */
1724 uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
1725 (uint32_t)exit->u.hcall.params[0];
1726 return kvm_xen_hcall_set_timer_op(exit, cpu, val);
1728 case __HYPERVISOR_grant_table_op:
1729 return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
1730 exit->u.hcall.params[1],
1731 exit->u.hcall.params[2]);
1732 case __HYPERVISOR_sched_op:
1733 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
1734 exit->u.hcall.params[1]);
1735 case __HYPERVISOR_event_channel_op:
1736 return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
1737 exit->u.hcall.params[1]);
1738 case __HYPERVISOR_vcpu_op:
1739 return kvm_xen_hcall_vcpu_op(exit, cpu,
1740 exit->u.hcall.params[0],
1741 exit->u.hcall.params[1],
1742 exit->u.hcall.params[2]);
1743 case __HYPERVISOR_hvm_op:
1744 return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
1745 exit->u.hcall.params[1]);
1746 case __HYPERVISOR_memory_op:
1747 return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
1748 exit->u.hcall.params[1]);
1749 case __HYPERVISOR_physdev_op:
1750 return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
1751 exit->u.hcall.params[1]);
1752 case __HYPERVISOR_xen_version:
1753 return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
1754 exit->u.hcall.params[1]);
1755 default:
1756 return false;
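/*
 * Worked example for the 32-bit set_timer_op case above (illustrative):
 * with params[0] = 0x9abcdef0 and params[1] = 0x12345678, the reassembled
 * 64-bit timeout is 0x123456789abcdef0 ns.
 */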
1760 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1762 if (exit->type != KVM_EXIT_XEN_HCALL) {
1763 return -1;
1767 * The kernel latches the guest 32/64 mode when the MSR is used to fill
1768 * the hypercall page. So if we see a hypercall in a mode that doesn't
1769 * match our own idea of the guest mode, fetch the kernel's idea of the
1770 * "long mode" to remain in sync.
1772 if (exit->u.hcall.longmode != xen_is_long_mode()) {
1773 xen_sync_long_mode();
1776 if (!do_kvm_xen_handle_exit(cpu, exit)) {
1778 * Some hypercalls will be deliberately "implemented" by returning
1779 * -ENOSYS. This case is for hypercalls which are unexpected.
1781 exit->u.hcall.result = -ENOSYS;
1782 qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
1783 PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
1784 (uint64_t)exit->u.hcall.input,
1785 (uint64_t)exit->u.hcall.params[0],
1786 (uint64_t)exit->u.hcall.params[1],
1787 (uint64_t)exit->u.hcall.params[2]);
1790 trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
1791 exit->u.hcall.input, exit->u.hcall.params[0],
1792 exit->u.hcall.params[1], exit->u.hcall.params[2],
1793 exit->u.hcall.result);
1794 return 0;
1797 uint16_t kvm_xen_get_gnttab_max_frames(void)
1799 KVMState *s = KVM_STATE(current_accel());
1800 return s->xen_gnttab_max_frames;
1803 uint16_t kvm_xen_get_evtchn_max_pirq(void)
1805 KVMState *s = KVM_STATE(current_accel());
1806 return s->xen_evtchn_max_pirq;
1809 int kvm_put_xen_state(CPUState *cs)
1811 X86CPU *cpu = X86_CPU(cs);
1812 CPUX86State *env = &cpu->env;
1813 uint64_t gpa;
1814 int ret;
1816 gpa = env->xen_vcpu_info_gpa;
1817 if (gpa == INVALID_GPA) {
1818 gpa = env->xen_vcpu_info_default_gpa;
1821 if (gpa != INVALID_GPA) {
1822 ret = set_vcpu_info(cs, gpa);
1823 if (ret < 0) {
1824 return ret;
1828 gpa = env->xen_vcpu_time_info_gpa;
1829 if (gpa != INVALID_GPA) {
1830 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
1831 gpa);
1832 if (ret < 0) {
1833 return ret;
1837 gpa = env->xen_vcpu_runstate_gpa;
1838 if (gpa != INVALID_GPA) {
1839 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
1840 gpa);
1841 if (ret < 0) {
1842 return ret;
1846 if (env->xen_periodic_timer_period) {
1847 ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
1848 if (ret < 0) {
1849 return ret;
1853 if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1855 * If the kernel has EVTCHN_SEND support then it handles timers too,
1856 * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
1858 QEMU_LOCK_GUARD(&env->xen_timers_lock);
1859 if (env->xen_singleshot_timer_ns) {
1860 ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
1861 false);
1862 if (ret < 0) {
1863 return ret;
1866 return 0;
1869 if (env->xen_vcpu_callback_vector) {
1870 ret = kvm_xen_set_vcpu_callback_vector(cs);
1871 if (ret < 0) {
1872 return ret;
1876 if (env->xen_virq[VIRQ_TIMER]) {
1877 do_set_vcpu_timer_virq(cs,
1878 RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
1880 return 0;
1883 int kvm_get_xen_state(CPUState *cs)
1885 X86CPU *cpu = X86_CPU(cs);
1886 CPUX86State *env = &cpu->env;
1887 uint64_t gpa;
1888 int ret;
1891 * The kernel does not mark vcpu_info as dirty when it delivers interrupts
1892 * to it. It's up to userspace to *assume* that any page shared thus is
1893 * always considered dirty. The shared_info page is different since it's
1894 * an overlay and migrated separately anyway.
1896 gpa = env->xen_vcpu_info_gpa;
1897 if (gpa == INVALID_GPA) {
1898 gpa = env->xen_vcpu_info_default_gpa;
1900 if (gpa != INVALID_GPA) {
1901 MemoryRegionSection mrs = memory_region_find(get_system_memory(),
1902 gpa,
1903 sizeof(struct vcpu_info));
1904 if (mrs.mr &&
1905 !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
1906 memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
1907 sizeof(struct vcpu_info));
1911 if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1912 return 0;
1916 * If the kernel is accelerating timers, read out the current value of the
1917 * singleshot timer deadline.
1919 if (env->xen_virq[VIRQ_TIMER]) {
1920 struct kvm_xen_vcpu_attr va = {
1921 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
1923 ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
1924 if (ret < 0) {
1925 return ret;
1929 * This locking is fairly pointless, and is here to appease Coverity.
1930 * There is an unavoidable race condition if a different vCPU sets a
1931 * timer for this vCPU after the value has been read out. But that's
1932 * OK in practice because *all* the vCPUs need to be stopped before
1933 * we set about migrating their state.
1935 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1936 env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
1939 return 0;