ppc: Update cpu_model in MachineState
[qemu/ar7.git] / hw / ppc / spapr.c
blob5ca817c40595d44d6db40f00f07daa0c3f784dac
/*
 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
 *
 * Copyright (c) 2004-2007 Fabrice Bellard
 * Copyright (c) 2007 Jocelyn Mayer
 * Copyright (c) 2010 David Gibson, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

27 #include "sysemu/sysemu.h"
28 #include "sysemu/numa.h"
29 #include "hw/hw.h"
30 #include "hw/fw-path-provider.h"
31 #include "elf.h"
32 #include "net/net.h"
33 #include "sysemu/block-backend.h"
34 #include "sysemu/cpus.h"
35 #include "sysemu/kvm.h"
36 #include "kvm_ppc.h"
37 #include "mmu-hash64.h"
38 #include "qom/cpu.h"
40 #include "hw/boards.h"
41 #include "hw/ppc/ppc.h"
42 #include "hw/loader.h"
44 #include "hw/ppc/spapr.h"
45 #include "hw/ppc/spapr_vio.h"
46 #include "hw/pci-host/spapr.h"
47 #include "hw/ppc/xics.h"
48 #include "hw/pci/msi.h"
50 #include "hw/pci/pci.h"
51 #include "hw/scsi/scsi.h"
52 #include "hw/virtio/virtio-scsi.h"
54 #include "exec/address-spaces.h"
55 #include "hw/usb.h"
56 #include "qemu/config-file.h"
57 #include "qemu/error-report.h"
58 #include "trace.h"
59 #include "hw/nmi.h"
61 #include "hw/compat.h"
63 #include <libfdt.h>
/* SLOF memory layout:
 *
 * SLOF raw image loaded at 0, copies its romfs right below the flat
 * device-tree, then position SLOF itself 31M below that
 *
 * So we set FW_OVERHEAD to 40MB which should account for all of that
 * and more
 *
 * We load our kernel at 4M, leaving space for SLOF initial image
 */
#define FDT_MAX_SIZE            0x40000     /* size of the FDT buffers */
#define RTAS_MAX_SIZE           0x10000
#define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
#define FW_MAX_SIZE             0x400000    /* 4M */
#define FW_FILE_NAME            "slof.bin"
#define FW_OVERHEAD             0x2800000   /* 40MB, see layout note above */
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE /* kernel goes right after 4M */

#define MIN_RMA_SLOF            128UL

#define TIMEBASE_FREQ           512000000ULL

#define MAX_CPUS                255

/* phandle given to the interrupt-controller node in the device tree */
#define PHANDLE_XICP            0x00001111

/* Hash page table size in bytes: 2 to the power of htab_shift */
#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))

93 static XICSState *try_create_xics(const char *type, int nr_servers,
94 int nr_irqs, Error **errp)
96 Error *err = NULL;
97 DeviceState *dev;
99 dev = qdev_create(NULL, type);
100 qdev_prop_set_uint32(dev, "nr_servers", nr_servers);
101 qdev_prop_set_uint32(dev, "nr_irqs", nr_irqs);
102 object_property_set_bool(OBJECT(dev), true, "realized", &err);
103 if (err) {
104 error_propagate(errp, err);
105 object_unparent(OBJECT(dev));
106 return NULL;
108 return XICS_COMMON(dev);
111 static XICSState *xics_system_init(MachineState *machine,
112 int nr_servers, int nr_irqs)
114 XICSState *icp = NULL;
116 if (kvm_enabled()) {
117 Error *err = NULL;
119 if (machine_kernel_irqchip_allowed(machine)) {
120 icp = try_create_xics(TYPE_KVM_XICS, nr_servers, nr_irqs, &err);
122 if (machine_kernel_irqchip_required(machine) && !icp) {
123 error_report("kernel_irqchip requested but unavailable: %s",
124 error_get_pretty(err));
128 if (!icp) {
129 icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs, &error_abort);
132 return icp;
135 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
136 int smt_threads)
138 int i, ret = 0;
139 uint32_t servers_prop[smt_threads];
140 uint32_t gservers_prop[smt_threads * 2];
141 int index = ppc_get_vcpu_dt_id(cpu);
143 if (cpu->cpu_version) {
144 ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->cpu_version);
145 if (ret < 0) {
146 return ret;
150 /* Build interrupt servers and gservers properties */
151 for (i = 0; i < smt_threads; i++) {
152 servers_prop[i] = cpu_to_be32(index + i);
153 /* Hack, direct the group queues back to cpu 0 */
154 gservers_prop[i*2] = cpu_to_be32(index + i);
155 gservers_prop[i*2 + 1] = 0;
157 ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
158 servers_prop, sizeof(servers_prop));
159 if (ret < 0) {
160 return ret;
162 ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
163 gservers_prop, sizeof(gservers_prop));
165 return ret;
168 static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, CPUState *cs)
170 int ret = 0;
171 PowerPCCPU *cpu = POWERPC_CPU(cs);
172 int index = ppc_get_vcpu_dt_id(cpu);
173 uint32_t associativity[] = {cpu_to_be32(0x5),
174 cpu_to_be32(0x0),
175 cpu_to_be32(0x0),
176 cpu_to_be32(0x0),
177 cpu_to_be32(cs->numa_node),
178 cpu_to_be32(index)};
180 /* Advertise NUMA via ibm,associativity */
181 if (nb_numa_nodes > 1) {
182 ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
183 sizeof(associativity));
186 return ret;
189 static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
191 int ret = 0, offset, cpus_offset;
192 CPUState *cs;
193 char cpu_model[32];
194 int smt = kvmppc_smt_threads();
195 uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
197 CPU_FOREACH(cs) {
198 PowerPCCPU *cpu = POWERPC_CPU(cs);
199 DeviceClass *dc = DEVICE_GET_CLASS(cs);
200 int index = ppc_get_vcpu_dt_id(cpu);
202 if ((index % smt) != 0) {
203 continue;
206 snprintf(cpu_model, 32, "%s@%x", dc->fw_name, index);
208 cpus_offset = fdt_path_offset(fdt, "/cpus");
209 if (cpus_offset < 0) {
210 cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
211 "cpus");
212 if (cpus_offset < 0) {
213 return cpus_offset;
216 offset = fdt_subnode_offset(fdt, cpus_offset, cpu_model);
217 if (offset < 0) {
218 offset = fdt_add_subnode(fdt, cpus_offset, cpu_model);
219 if (offset < 0) {
220 return offset;
224 ret = fdt_setprop(fdt, offset, "ibm,pft-size",
225 pft_size_prop, sizeof(pft_size_prop));
226 if (ret < 0) {
227 return ret;
230 ret = spapr_fixup_cpu_numa_dt(fdt, offset, cs);
231 if (ret < 0) {
232 return ret;
235 ret = spapr_fixup_cpu_smt_dt(fdt, offset, cpu,
236 ppc_get_compat_smt_threads(cpu));
237 if (ret < 0) {
238 return ret;
241 return ret;
/*
 * Serialize the segment/page size table in env->sps into @prop as
 * big-endian cells.
 *
 * For each segment size with a non-zero page_shift the output holds:
 * page_shift, slb_enc, the number of encodings that follow, then one
 * (page_shift, pte_enc) pair per encoding.  Both the segment list and each
 * encoding list end at the first entry whose page_shift is 0.
 *
 * @env:     CPU state providing the sps table
 * @prop:    output buffer
 * @maxsize: size of @prop in bytes
 *
 * Returns the number of bytes written to @prop.
 */
static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop,
                                     size_t maxsize)
{
    size_t maxcells = maxsize / sizeof(uint32_t);
    int i, j, count;
    uint32_t *p = prop;

    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
        struct ppc_one_seg_page_size *sps = &env->sps.sps[i];

        if (!sps->page_shift) {
            break;
        }
        /* Count the valid encodings of this segment size */
        for (count = 0; count < PPC_PAGE_SIZES_MAX_SZ; count++) {
            if (sps->enc[count].page_shift == 0) {
                break;
            }
        }
        /* Stop if the 3 header cells plus 2 cells per encoding won't fit */
        if ((p - prop) >= (maxcells - 3 - count * 2)) {
            break;
        }
        *(p++) = cpu_to_be32(sps->page_shift);
        *(p++) = cpu_to_be32(sps->slb_enc);
        *(p++) = cpu_to_be32(count);
        for (j = 0; j < count; j++) {
            *(p++) = cpu_to_be32(sps->enc[j].page_shift);
            *(p++) = cpu_to_be32(sps->enc[j].pte_enc);
        }
    }

    return (p - prop) * sizeof(uint32_t);
}

278 static hwaddr spapr_node0_size(void)
280 MachineState *machine = MACHINE(qdev_get_machine());
282 if (nb_numa_nodes) {
283 int i;
284 for (i = 0; i < nb_numa_nodes; ++i) {
285 if (numa_info[i].node_mem) {
286 return MIN(pow2floor(numa_info[i].node_mem),
287 machine->ram_size);
291 return machine->ram_size;
/*
 * Evaluate the libfdt expression @exp exactly once.  If the result is
 * negative, print the stringified expression together with
 * fdt_strerror() of the result to stderr and terminate with exit(1).
 * Note that the macro declares its own local "ret" inside the do/while.
 */
#define _FDT(exp) \
    do { \
        int ret = (exp);                                           \
        if (ret < 0) {                                             \
            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
                    #exp, fdt_strerror(ret));                      \
            exit(1);                                               \
        }                                                          \
    } while (0)

304 static void add_str(GString *s, const gchar *s1)
306 g_string_append_len(s, s1, strlen(s1) + 1);
/*
 * Build the skeleton flattened device tree using libfdt's sequential-write
 * API.
 *
 * The tree contains, in this order: the memory reserve map (kernel and
 * initrd, when their size is non-zero), the root node properties,
 * /chosen, /rtas, /interrupt-controller, /vdevice, whatever
 * spapr_events_fdt_skel() adds, and, when KVM is enabled, /hypervisor.
 *
 * @initrd_base:    load address of the initrd
 * @initrd_size:    size of the initrd; 0 skips its reserve map entry
 * @kernel_size:    size of the kernel; 0 skips its reserve map entry and
 *                  the "qemu,boot-kernel" property
 * @little_endian:  adds "qemu,boot-kernel-le" when a kernel is present
 * @kernel_cmdline: value of the "bootargs" property
 * @epow_irq:       passed through to spapr_events_fdt_skel()
 *
 * Any libfdt failure terminates the process via _FDT().
 *
 * Returns a zero-initialized buffer of FDT_MAX_SIZE bytes, allocated with
 * g_malloc0(), holding the finished tree.
 */
static void *spapr_create_fdt_skel(hwaddr initrd_base,
                                   hwaddr initrd_size,
                                   hwaddr kernel_size,
                                   bool little_endian,
                                   const char *kernel_cmdline,
                                   uint32_t epow_irq)
{
    void *fdt;
    uint32_t start_prop = cpu_to_be32(initrd_base);
    uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
    GString *hypertas = g_string_sized_new(256);
    GString *qemu_hypertas = g_string_sized_new(256);
    uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
    uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(max_cpus)};
    unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
    char *buf;

    /* NUL-separated list of hypercall groups advertised to the guest */
    add_str(hypertas, "hcall-pft");
    add_str(hypertas, "hcall-term");
    add_str(hypertas, "hcall-dabr");
    add_str(hypertas, "hcall-interrupt");
    add_str(hypertas, "hcall-tce");
    add_str(hypertas, "hcall-vio");
    add_str(hypertas, "hcall-splpar");
    add_str(hypertas, "hcall-bulk");
    add_str(hypertas, "hcall-set-mode");
    add_str(qemu_hypertas, "hcall-memop1");

    fdt = g_malloc0(FDT_MAX_SIZE);
    _FDT((fdt_create(fdt, FDT_MAX_SIZE)));

    if (kernel_size) {
        _FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
    }
    if (initrd_size) {
        _FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
    }
    _FDT((fdt_finish_reservemap(fdt)));

    /* Root node */
    _FDT((fdt_begin_node(fdt, "")));
    _FDT((fdt_property_string(fdt, "device_type", "chrp")));
    _FDT((fdt_property_string(fdt, "model", "IBM pSeries (emulated by qemu)")));
    _FDT((fdt_property_string(fdt, "compatible", "qemu,pseries")));

    /*
     * Add info to guest to identify which host it is being run on
     * and what the uuid of the guest is
     */
    if (kvmppc_get_host_model(&buf)) {
        _FDT((fdt_property_string(fdt, "host-model", buf)));
        g_free(buf);
    }
    if (kvmppc_get_host_serial(&buf)) {
        _FDT((fdt_property_string(fdt, "host-serial", buf)));
        g_free(buf);
    }

    buf = g_strdup_printf(UUID_FMT, qemu_uuid[0], qemu_uuid[1],
                          qemu_uuid[2], qemu_uuid[3], qemu_uuid[4],
                          qemu_uuid[5], qemu_uuid[6], qemu_uuid[7],
                          qemu_uuid[8], qemu_uuid[9], qemu_uuid[10],
                          qemu_uuid[11], qemu_uuid[12], qemu_uuid[13],
                          qemu_uuid[14], qemu_uuid[15]);

    _FDT((fdt_property_string(fdt, "vm,uuid", buf)));
    g_free(buf);

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x2)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x2)));

    /* /chosen */
    _FDT((fdt_begin_node(fdt, "chosen")));

    /* Set Form1_affinity */
    _FDT((fdt_property(fdt, "ibm,architecture-vec-5", vec5, sizeof(vec5))));

    _FDT((fdt_property_string(fdt, "bootargs", kernel_cmdline)));
    _FDT((fdt_property(fdt, "linux,initrd-start",
                       &start_prop, sizeof(start_prop))));
    _FDT((fdt_property(fdt, "linux,initrd-end",
                       &end_prop, sizeof(end_prop))));
    if (kernel_size) {
        uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
                              cpu_to_be64(kernel_size) };

        _FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
        if (little_endian) {
            _FDT((fdt_property(fdt, "qemu,boot-kernel-le", NULL, 0)));
        }
    }
    if (boot_menu) {
        _FDT((fdt_property_cell(fdt, "qemu,boot-menu", boot_menu)));
    }
    _FDT((fdt_property_cell(fdt, "qemu,graphic-width", graphic_width)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-height", graphic_height)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-depth", graphic_depth)));

    _FDT((fdt_end_node(fdt)));

    /* RTAS */
    _FDT((fdt_begin_node(fdt, "rtas")));

    /* Always advertised without KVM; with KVM only if it supports it */
    if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
        add_str(hypertas, "hcall-multi-tce");
    }
    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas->str,
                       hypertas->len)));
    g_string_free(hypertas, TRUE);
    _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas->str,
                       qemu_hypertas->len)));
    g_string_free(qemu_hypertas, TRUE);

    _FDT((fdt_property(fdt, "ibm,associativity-reference-points",
                       refpoints, sizeof(refpoints))));

    _FDT((fdt_property_cell(fdt, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)));
    _FDT((fdt_property_cell(fdt, "rtas-event-scan-rate",
                            RTAS_EVENT_SCAN_RATE)));

    /*
     * According to PAPR, rtas ibm,os-term does not guarantee a return
     * back to the guest cpu.
     *
     * While an additional ibm,extended-os-term property indicates that
     * rtas call return will always occur. Set this property.
     */
    _FDT((fdt_property(fdt, "ibm,extended-os-term", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* interrupt controller */
    _FDT((fdt_begin_node(fdt, "interrupt-controller")));

    _FDT((fdt_property_string(fdt, "device_type",
                              "PowerPC-External-Interrupt-Presentation")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
    _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
                       interrupt_server_ranges_prop,
                       sizeof(interrupt_server_ranges_prop))));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 2)));
    _FDT((fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)));
    _FDT((fdt_property_cell(fdt, "phandle", PHANDLE_XICP)));

    _FDT((fdt_end_node(fdt)));

    /* vdevice */
    _FDT((fdt_begin_node(fdt, "vdevice")));

    _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* event-sources */
    spapr_events_fdt_skel(fdt, epow_irq);

    /* /hypervisor node */
    if (kvm_enabled()) {
        uint8_t hypercall[16];

        /* indicate KVM hypercall interface */
        _FDT((fdt_begin_node(fdt, "hypervisor")));
        _FDT((fdt_property_string(fdt, "compatible", "linux,kvm")));
        if (kvmppc_has_cap_fixup_hcalls()) {
            /*
             * Older KVM versions with older guest kernels were broken with the
             * magic page, don't allow the guest to map it.
             */
            kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
                                 sizeof(hypercall));
            _FDT((fdt_property(fdt, "hcall-instructions", hypercall,
                               sizeof(hypercall))));
        }
        _FDT((fdt_end_node(fdt)));
    }

    _FDT((fdt_end_node(fdt))); /* close root node */
    _FDT((fdt_finish(fdt)));

    return fdt;
}

497 int spapr_h_cas_compose_response(sPAPRMachineState *spapr,
498 target_ulong addr, target_ulong size)
500 void *fdt, *fdt_skel;
501 sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
503 size -= sizeof(hdr);
505 /* Create sceleton */
506 fdt_skel = g_malloc0(size);
507 _FDT((fdt_create(fdt_skel, size)));
508 _FDT((fdt_begin_node(fdt_skel, "")));
509 _FDT((fdt_end_node(fdt_skel)));
510 _FDT((fdt_finish(fdt_skel)));
511 fdt = g_malloc0(size);
512 _FDT((fdt_open_into(fdt_skel, fdt, size)));
513 g_free(fdt_skel);
515 /* Fix skeleton up */
516 _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
518 /* Pack resulting tree */
519 _FDT((fdt_pack(fdt)));
521 if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
522 trace_spapr_cas_failed(size);
523 return -1;
526 cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
527 cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
528 trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
529 g_free(fdt);
531 return 0;
534 static void spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start,
535 hwaddr size)
537 uint32_t associativity[] = {
538 cpu_to_be32(0x4), /* length */
539 cpu_to_be32(0x0), cpu_to_be32(0x0),
540 cpu_to_be32(0x0), cpu_to_be32(nodeid)
542 char mem_name[32];
543 uint64_t mem_reg_property[2];
544 int off;
546 mem_reg_property[0] = cpu_to_be64(start);
547 mem_reg_property[1] = cpu_to_be64(size);
549 sprintf(mem_name, "memory@" TARGET_FMT_lx, start);
550 off = fdt_add_subnode(fdt, 0, mem_name);
551 _FDT(off);
552 _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
553 _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
554 sizeof(mem_reg_property))));
555 _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
556 sizeof(associativity))));
/*
 * Create the memory nodes of @fdt.
 *
 * RAM is handed out to the NUMA nodes in order, each node getting at most
 * its node_mem and never more than what is left of machine->ram_size.
 * The very first range is the RMA (spapr->rma_size, starting at 0); the
 * remainder of every node is split into power-of-two sized chunks, each
 * no larger than the alignment of its start address.
 *
 * Always returns 0; failures inside spapr_populate_memory_node()
 * terminate the process.
 */
static int spapr_populate_memory(sPAPRMachineState *spapr, void *fdt)
{
    MachineState *machine = MACHINE(spapr);
    hwaddr mem_start, node_size;
    int i, nb_nodes = nb_numa_nodes;
    NodeInfo *nodes = numa_info;
    NodeInfo ramnode;

    /* No NUMA nodes, assume there is just one node with whole RAM */
    if (!nb_numa_nodes) {
        nb_nodes = 1;
        ramnode.node_mem = machine->ram_size;
        nodes = &ramnode;
    }

    for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
        if (!nodes[i].node_mem) {
            continue;
        }
        /* Clamp the node to the RAM that has not been handed out yet */
        if (mem_start >= machine->ram_size) {
            node_size = 0;
        } else {
            node_size = nodes[i].node_mem;
            if (node_size > machine->ram_size - mem_start) {
                node_size = machine->ram_size - mem_start;
            }
        }
        if (!mem_start) {
            /* ppc_spapr_init() checks for rma_size <= node0_size already */
            spapr_populate_memory_node(fdt, i, 0, spapr->rma_size);
            mem_start += spapr->rma_size;
            node_size -= spapr->rma_size;
        }
        for ( ; node_size; ) {
            hwaddr sizetmp = pow2floor(node_size);

            /* mem_start != 0 here */
            /* Shrink the chunk to the alignment of its start address */
            if (ctzl(mem_start) < ctzl(sizetmp)) {
                sizetmp = 1ULL << ctzl(mem_start);
            }

            spapr_populate_memory_node(fdt, i, mem_start, sizetmp);
            node_size -= sizetmp;
            mem_start += sizetmp;
        }
    }

    return 0;
}

609 static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
610 sPAPRMachineState *spapr)
612 PowerPCCPU *cpu = POWERPC_CPU(cs);
613 CPUPPCState *env = &cpu->env;
614 PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
615 int index = ppc_get_vcpu_dt_id(cpu);
616 uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
617 0xffffffff, 0xffffffff};
618 uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ;
619 uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
620 uint32_t page_sizes_prop[64];
621 size_t page_sizes_prop_size;
622 QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL);
623 unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0;
624 uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1;
625 uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
627 _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
628 _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
630 _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
631 _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
632 env->dcache_line_size)));
633 _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
634 env->dcache_line_size)));
635 _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
636 env->icache_line_size)));
637 _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
638 env->icache_line_size)));
640 if (pcc->l1_dcache_size) {
641 _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
642 pcc->l1_dcache_size)));
643 } else {
644 fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n");
646 if (pcc->l1_icache_size) {
647 _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
648 pcc->l1_icache_size)));
649 } else {
650 fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n");
653 _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
654 _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
655 _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", env->slb_nr)));
656 _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
657 _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));
659 if (env->spr_cb[SPR_PURR].oea_read) {
660 _FDT((fdt_setprop(fdt, offset, "ibm,purr", NULL, 0)));
663 if (env->mmu_model & POWERPC_MMU_1TSEG) {
664 _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
665 segs, sizeof(segs))));
668 /* Advertise VMX/VSX (vector extensions) if available
669 * 0 / no property == no vector extensions
670 * 1 == VMX / Altivec available
671 * 2 == VSX available */
672 if (env->insns_flags & PPC_ALTIVEC) {
673 uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;
675 _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", vmx)));
678 /* Advertise DFP (Decimal Floating Point) if available
679 * 0 / no property == no DFP
680 * 1 == DFP available */
681 if (env->insns_flags2 & PPC2_DFP) {
682 _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
685 page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop,
686 sizeof(page_sizes_prop));
687 if (page_sizes_prop_size) {
688 _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
689 page_sizes_prop, page_sizes_prop_size)));
692 _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
693 cs->cpu_index / cpus_per_socket)));
695 _FDT((fdt_setprop(fdt, offset, "ibm,pft-size",
696 pft_size_prop, sizeof(pft_size_prop))));
698 _FDT(spapr_fixup_cpu_numa_dt(fdt, offset, cs));
700 _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu,
701 ppc_get_compat_smt_threads(cpu)));
704 static void spapr_populate_cpus_dt_node(void *fdt, sPAPRMachineState *spapr)
706 CPUState *cs;
707 int cpus_offset;
708 char *nodename;
709 int smt = kvmppc_smt_threads();
711 cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
712 _FDT(cpus_offset);
713 _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1)));
714 _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0)));
717 * We walk the CPUs in reverse order to ensure that CPU DT nodes
718 * created by fdt_add_subnode() end up in the right order in FDT
719 * for the guest kernel the enumerate the CPUs correctly.
721 CPU_FOREACH_REVERSE(cs) {
722 PowerPCCPU *cpu = POWERPC_CPU(cs);
723 int index = ppc_get_vcpu_dt_id(cpu);
724 DeviceClass *dc = DEVICE_GET_CLASS(cs);
725 int offset;
727 if ((index % smt) != 0) {
728 continue;
731 nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
732 offset = fdt_add_subnode(fdt, cpus_offset, nodename);
733 g_free(nodename);
734 _FDT(offset);
735 spapr_populate_cpu_dt(cs, fdt, offset, spapr);
740 static void spapr_finalize_fdt(sPAPRMachineState *spapr,
741 hwaddr fdt_addr,
742 hwaddr rtas_addr,
743 hwaddr rtas_size)
745 MachineState *machine = MACHINE(qdev_get_machine());
746 const char *boot_device = machine->boot_order;
747 int ret, i;
748 size_t cb = 0;
749 char *bootlist;
750 void *fdt;
751 sPAPRPHBState *phb;
753 fdt = g_malloc(FDT_MAX_SIZE);
755 /* open out the base tree into a temp buffer for the final tweaks */
756 _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));
758 ret = spapr_populate_memory(spapr, fdt);
759 if (ret < 0) {
760 fprintf(stderr, "couldn't setup memory nodes in fdt\n");
761 exit(1);
764 ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
765 if (ret < 0) {
766 fprintf(stderr, "couldn't setup vio devices in fdt\n");
767 exit(1);
770 QLIST_FOREACH(phb, &spapr->phbs, list) {
771 ret = spapr_populate_pci_dt(phb, PHANDLE_XICP, fdt);
774 if (ret < 0) {
775 fprintf(stderr, "couldn't setup PCI devices in fdt\n");
776 exit(1);
779 /* RTAS */
780 ret = spapr_rtas_device_tree_setup(fdt, rtas_addr, rtas_size);
781 if (ret < 0) {
782 fprintf(stderr, "Couldn't set up RTAS device tree properties\n");
785 /* cpus */
786 spapr_populate_cpus_dt_node(fdt, spapr);
788 bootlist = get_boot_devices_list(&cb, true);
789 if (cb && bootlist) {
790 int offset = fdt_path_offset(fdt, "/chosen");
791 if (offset < 0) {
792 exit(1);
794 for (i = 0; i < cb; i++) {
795 if (bootlist[i] == '\n') {
796 bootlist[i] = ' ';
800 ret = fdt_setprop_string(fdt, offset, "qemu,boot-list", bootlist);
803 if (boot_device && strlen(boot_device)) {
804 int offset = fdt_path_offset(fdt, "/chosen");
806 if (offset < 0) {
807 exit(1);
809 fdt_setprop_string(fdt, offset, "qemu,boot-device", boot_device);
812 if (!spapr->has_graphics) {
813 spapr_populate_chosen_stdout(fdt, spapr->vio_bus);
816 _FDT((fdt_pack(fdt)));
818 if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
819 error_report("FDT too big ! 0x%x bytes (max is 0x%x)",
820 fdt_totalsize(fdt), FDT_MAX_SIZE);
821 exit(1);
824 cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
826 g_free(bootlist);
827 g_free(fdt);
830 static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
832 return (addr & 0x0fffffff) + KERNEL_LOAD_ADDR;
835 static void emulate_spapr_hypercall(PowerPCCPU *cpu)
837 CPUPPCState *env = &cpu->env;
839 if (msr_pr) {
840 hcall_dprintf("Hypercall made with MSR[PR]=1\n");
841 env->gpr[3] = H_PRIVILEGE;
842 } else {
843 env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
/*
 * Helpers for walking the hash page table.
 *
 * HPTE(_table, _i)  - pointer to entry _i; the table is treated as an
 *                     array of uint64_t with two words per entry
 * HPTE_VALID(_hpte) - non-zero if HPTE64_V_VALID is set in the first word
 * HPTE_DIRTY(_hpte) - non-zero if HPTE64_V_HPTE_DIRTY is set in the first
 *                     word
 * CLEAN_HPTE(_hpte) - clear HPTE64_V_HPTE_DIRTY in the first word
 * DIRTY_HPTE(_hpte) - set HPTE64_V_HPTE_DIRTY in the first word
 *
 * The first word is converted with tswap64() before being tested, and the
 * mask is converted with tswap64() before being applied.
 */
#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
#define DIRTY_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) |= tswap64(HPTE64_V_HPTE_DIRTY))

853 static void spapr_reset_htab(sPAPRMachineState *spapr)
855 long shift;
856 int index;
858 /* allocate hash page table. For now we always make this 16mb,
859 * later we should probably make it scale to the size of guest
860 * RAM */
862 shift = kvmppc_reset_htab(spapr->htab_shift);
864 if (shift > 0) {
865 /* Kernel handles htab, we don't need to allocate one */
866 spapr->htab_shift = shift;
867 kvmppc_kern_htab = true;
869 /* Tell readers to update their file descriptor */
870 if (spapr->htab_fd >= 0) {
871 spapr->htab_fd_stale = true;
873 } else {
874 if (!spapr->htab) {
875 /* Allocate an htab if we don't yet have one */
876 spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
879 /* And clear it */
880 memset(spapr->htab, 0, HTAB_SIZE(spapr));
882 for (index = 0; index < HTAB_SIZE(spapr) / HASH_PTE_SIZE_64; index++) {
883 DIRTY_HPTE(HPTE(spapr->htab, index));
887 /* Update the RMA size if necessary */
888 if (spapr->vrma_adjust) {
889 spapr->rma_size = kvmppc_rma_size(spapr_node0_size(),
890 spapr->htab_shift);
894 static int find_unknown_sysbus_device(SysBusDevice *sbdev, void *opaque)
896 bool matched = false;
898 if (object_dynamic_cast(OBJECT(sbdev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
899 matched = true;
902 if (!matched) {
903 error_report("Device %s is not supported by this machine yet.",
904 qdev_fw_name(DEVICE(sbdev)));
905 exit(1);
908 return 0;
912 * A guest reset will cause spapr->htab_fd to become stale if being used.
913 * Reopen the file descriptor to make sure the whole HTAB is properly read.
915 static int spapr_check_htab_fd(sPAPRMachineState *spapr)
917 int rc = 0;
919 if (spapr->htab_fd_stale) {
920 close(spapr->htab_fd);
921 spapr->htab_fd = kvmppc_get_htab_fd(false);
922 if (spapr->htab_fd < 0) {
923 error_report("Unable to open fd for reading hash table from KVM: "
924 "%s", strerror(errno));
925 rc = -1;
927 spapr->htab_fd_stale = false;
930 return rc;
/*
 * Machine-level reset.
 *
 * Rejects unsupported dynamic sysbus devices, resets the hash table,
 * resets all devices, recomputes where RTAS and the device tree live,
 * rebuilds the device tree, copies the RTAS blob into guest memory and
 * sets up the first CPU to start executing at SPAPR_ENTRY_POINT.
 */
static void ppc_spapr_reset(void)
{
    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
    PowerPCCPU *first_ppc_cpu;
    uint32_t rtas_limit;

    /* Check for unknown sysbus devices */
    foreach_dynamic_sysbus_device(find_unknown_sysbus_device, NULL);

    /* Reset the hash table & recalc the RMA */
    spapr_reset_htab(spapr);

    qemu_devices_reset();

    /*
     * We place the device tree and RTAS just below either the top of the RMA,
     * or just below 2GB, whichever is lower, so that it can be
     * processed with 32-bit real mode code if necessary
     */
    rtas_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR);
    spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
    spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;

    /* Load the fdt */
    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
                       spapr->rtas_size);

    /* Copy RTAS over */
    cpu_physical_memory_write(spapr->rtas_addr, spapr->rtas_blob,
                              spapr->rtas_size);

    /* Set up the entry state */
    first_ppc_cpu = POWERPC_CPU(first_cpu);
    first_ppc_cpu->env.gpr[3] = spapr->fdt_addr; /* r3 = device tree address */
    first_ppc_cpu->env.gpr[5] = 0;
    first_cpu->halted = 0;                       /* only the first CPU runs */
    first_ppc_cpu->env.nip = SPAPR_ENTRY_POINT;

}

/*
 * Per-CPU reset handler; @opaque is the PowerPCCPU being reset.
 *
 * Resets the CPU, leaves it halted and points its MMU state at the
 * machine's hash page table.
 */
static void spapr_cpu_reset(void *opaque)
{
    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
    PowerPCCPU *cpu = opaque;
    CPUState *cs = CPU(cpu);
    CPUPPCState *env = &cpu->env;

    cpu_reset(cs);

    /* All CPUs start halted.  CPU0 is unhalted from the machine level
     * reset code and the rest are explicitly started up by the guest
     * using an RTAS call */
    cs->halted = 1;

    env->spr[SPR_HIOR] = 0;

    env->external_htab = (uint8_t *)spapr->htab;
    if (kvm_enabled() && !env->external_htab) {
        /*
         * HV KVM, set external_htab to 1 so our ppc_hash64_load_hpte*
         * functions do the right thing.
         */
        env->external_htab = (void *)1;
    }
    env->htab_base = -1;
    /*
     * htab_mask is the mask used to normalize hash value to PTEG index.
     * htab_shift is log2 of hash table size.
     * We have 8 hpte per group, and each hpte is 16 bytes.
     * ie have 128 bytes per group, hence the shift by 7 below.
     */
    env->htab_mask = (1ULL << (spapr->htab_shift - 7)) - 1;
    env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab |
        (spapr->htab_shift - 18);
}

1009 static void spapr_create_nvram(sPAPRMachineState *spapr)
1011 DeviceState *dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
1012 DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
1014 if (dinfo) {
1015 qdev_prop_set_drive_nofail(dev, "drive", blk_by_legacy_dinfo(dinfo));
1018 qdev_init_nofail(dev);
1020 spapr->nvram = (struct sPAPRNVRAM *)dev;
1023 static void spapr_rtc_create(sPAPRMachineState *spapr)
1025 DeviceState *dev = qdev_create(NULL, TYPE_SPAPR_RTC);
1027 qdev_init_nofail(dev);
1028 spapr->rtc = dev;
1030 object_property_add_alias(qdev_get_machine(), "rtc-time",
1031 OBJECT(spapr->rtc), "date", NULL);
1034 /* Returns whether we want to use VGA or not */
1035 static int spapr_vga_init(PCIBus *pci_bus)
1037 switch (vga_interface_type) {
1038 case VGA_NONE:
1039 return false;
1040 case VGA_DEVICE:
1041 return true;
1042 case VGA_STD:
1043 return pci_vga_init(pci_bus) != NULL;
1044 default:
1045 fprintf(stderr, "This vga model is not supported,"
1046 "currently it only supports -vga std\n");
1047 exit(0);
1051 static int spapr_post_load(void *opaque, int version_id)
1053 sPAPRMachineState *spapr = (sPAPRMachineState *)opaque;
1054 int err = 0;
1056 /* In earlier versions, there was no separate qdev for the PAPR
1057 * RTC, so the RTC offset was stored directly in sPAPREnvironment.
1058 * So when migrating from those versions, poke the incoming offset
1059 * value into the RTC device */
1060 if (version_id < 3) {
1061 err = spapr_rtc_import_offset(spapr->rtc, spapr->rtc_offset);
1064 return err;
/*
 * vmstate field test: true when the incoming stream's version is older
 * than 3.  @opaque is unused.
 */
static bool version_before_3(void *opaque, int version_id)
{
    const int first_new_version = 3;

    return version_id < first_new_version;
}

/*
 * Migration description of the machine state.
 *
 * Current version is 3, streams back to version 1 are accepted.  The two
 * fields guarded by version_before_3() are only present in streams older
 * than version 3; spapr_post_load() forwards the old RTC offset to the
 * RTC device.  The timebase field exists from version 2 on.
 */
static const VMStateDescription vmstate_spapr = {
    .name = "spapr",
    .version_id = 3,
    .minimum_version_id = 1,
    .post_load = spapr_post_load,
    .fields = (VMStateField[]) {
        /* used to be @next_irq */
        VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),

        /* RTC offset */
        VMSTATE_UINT64_TEST(rtc_offset, sPAPRMachineState, version_before_3),

        VMSTATE_PPC_TIMEBASE_V(tb, sPAPRMachineState, 2),
        VMSTATE_END_OF_LIST()
    },
};

1089 static int htab_save_setup(QEMUFile *f, void *opaque)
1091 sPAPRMachineState *spapr = opaque;
1093 /* "Iteration" header */
1094 qemu_put_be32(f, spapr->htab_shift);
1096 if (spapr->htab) {
1097 spapr->htab_save_index = 0;
1098 spapr->htab_first_pass = true;
1099 } else {
1100 assert(kvm_enabled());
1102 spapr->htab_fd = kvmppc_get_htab_fd(false);
1103 spapr->htab_fd_stale = false;
1104 if (spapr->htab_fd < 0) {
1105 fprintf(stderr, "Unable to open fd for reading hash table from KVM: %s\n",
1106 strerror(errno));
1107 return -1;
1112 return 0;
1115 static void htab_save_first_pass(QEMUFile *f, sPAPRMachineState *spapr,
1116 int64_t max_ns)
1118 int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
1119 int index = spapr->htab_save_index;
1120 int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1122 assert(spapr->htab_first_pass);
1124 do {
1125 int chunkstart;
1127 /* Consume invalid HPTEs */
1128 while ((index < htabslots)
1129 && !HPTE_VALID(HPTE(spapr->htab, index))) {
1130 index++;
1131 CLEAN_HPTE(HPTE(spapr->htab, index));
1134 /* Consume valid HPTEs */
1135 chunkstart = index;
1136 while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
1137 && HPTE_VALID(HPTE(spapr->htab, index))) {
1138 index++;
1139 CLEAN_HPTE(HPTE(spapr->htab, index));
1142 if (index > chunkstart) {
1143 int n_valid = index - chunkstart;
1145 qemu_put_be32(f, chunkstart);
1146 qemu_put_be16(f, n_valid);
1147 qemu_put_be16(f, 0);
1148 qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
1149 HASH_PTE_SIZE_64 * n_valid);
1151 if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
1152 break;
1155 } while ((index < htabslots) && !qemu_file_rate_limit(f));
1157 if (index >= htabslots) {
1158 assert(index == htabslots);
1159 index = 0;
1160 spapr->htab_first_pass = false;
1162 spapr->htab_save_index = index;
1165 static int htab_save_later_pass(QEMUFile *f, sPAPRMachineState *spapr,
1166 int64_t max_ns)
1168 bool final = max_ns < 0;
1169 int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
1170 int examined = 0, sent = 0;
1171 int index = spapr->htab_save_index;
1172 int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1174 assert(!spapr->htab_first_pass);
1176 do {
1177 int chunkstart, invalidstart;
1179 /* Consume non-dirty HPTEs */
1180 while ((index < htabslots)
1181 && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
1182 index++;
1183 examined++;
1186 chunkstart = index;
1187 /* Consume valid dirty HPTEs */
1188 while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
1189 && HPTE_DIRTY(HPTE(spapr->htab, index))
1190 && HPTE_VALID(HPTE(spapr->htab, index))) {
1191 CLEAN_HPTE(HPTE(spapr->htab, index));
1192 index++;
1193 examined++;
1196 invalidstart = index;
1197 /* Consume invalid dirty HPTEs */
1198 while ((index < htabslots) && (index - invalidstart < USHRT_MAX)
1199 && HPTE_DIRTY(HPTE(spapr->htab, index))
1200 && !HPTE_VALID(HPTE(spapr->htab, index))) {
1201 CLEAN_HPTE(HPTE(spapr->htab, index));
1202 index++;
1203 examined++;
1206 if (index > chunkstart) {
1207 int n_valid = invalidstart - chunkstart;
1208 int n_invalid = index - invalidstart;
1210 qemu_put_be32(f, chunkstart);
1211 qemu_put_be16(f, n_valid);
1212 qemu_put_be16(f, n_invalid);
1213 qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
1214 HASH_PTE_SIZE_64 * n_valid);
1215 sent += index - chunkstart;
1217 if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
1218 break;
1222 if (examined >= htabslots) {
1223 break;
1226 if (index >= htabslots) {
1227 assert(index == htabslots);
1228 index = 0;
1230 } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
1232 if (index >= htabslots) {
1233 assert(index == htabslots);
1234 index = 0;
1237 spapr->htab_save_index = index;
1239 return (examined >= htabslots) && (sent == 0) ? 1 : 0;
1242 #define MAX_ITERATION_NS 5000000 /* 5 ms */
1243 #define MAX_KVM_BUF_SIZE 2048
1245 static int htab_save_iterate(QEMUFile *f, void *opaque)
1247 sPAPRMachineState *spapr = opaque;
1248 int rc = 0;
1250 /* Iteration header */
1251 qemu_put_be32(f, 0);
1253 if (!spapr->htab) {
1254 assert(kvm_enabled());
1256 rc = spapr_check_htab_fd(spapr);
1257 if (rc < 0) {
1258 return rc;
1261 rc = kvmppc_save_htab(f, spapr->htab_fd,
1262 MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
1263 if (rc < 0) {
1264 return rc;
1266 } else if (spapr->htab_first_pass) {
1267 htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
1268 } else {
1269 rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
1272 /* End marker */
1273 qemu_put_be32(f, 0);
1274 qemu_put_be16(f, 0);
1275 qemu_put_be16(f, 0);
1277 return rc;
1280 static int htab_save_complete(QEMUFile *f, void *opaque)
1282 sPAPRMachineState *spapr = opaque;
1284 /* Iteration header */
1285 qemu_put_be32(f, 0);
1287 if (!spapr->htab) {
1288 int rc;
1290 assert(kvm_enabled());
1292 rc = spapr_check_htab_fd(spapr);
1293 if (rc < 0) {
1294 return rc;
1297 rc = kvmppc_save_htab(f, spapr->htab_fd, MAX_KVM_BUF_SIZE, -1);
1298 if (rc < 0) {
1299 return rc;
1301 close(spapr->htab_fd);
1302 spapr->htab_fd = -1;
1303 } else {
1304 htab_save_later_pass(f, spapr, -1);
1307 /* End marker */
1308 qemu_put_be32(f, 0);
1309 qemu_put_be16(f, 0);
1310 qemu_put_be16(f, 0);
1312 return 0;
1315 static int htab_load(QEMUFile *f, void *opaque, int version_id)
1317 sPAPRMachineState *spapr = opaque;
1318 uint32_t section_hdr;
1319 int fd = -1;
1321 if (version_id < 1 || version_id > 1) {
1322 fprintf(stderr, "htab_load() bad version\n");
1323 return -EINVAL;
1326 section_hdr = qemu_get_be32(f);
1328 if (section_hdr) {
1329 /* First section, just the hash shift */
1330 if (spapr->htab_shift != section_hdr) {
1331 return -EINVAL;
1333 return 0;
1336 if (!spapr->htab) {
1337 assert(kvm_enabled());
1339 fd = kvmppc_get_htab_fd(true);
1340 if (fd < 0) {
1341 fprintf(stderr, "Unable to open fd to restore KVM hash table: %s\n",
1342 strerror(errno));
1346 while (true) {
1347 uint32_t index;
1348 uint16_t n_valid, n_invalid;
1350 index = qemu_get_be32(f);
1351 n_valid = qemu_get_be16(f);
1352 n_invalid = qemu_get_be16(f);
1354 if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
1355 /* End of Stream */
1356 break;
1359 if ((index + n_valid + n_invalid) >
1360 (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
1361 /* Bad index in stream */
1362 fprintf(stderr, "htab_load() bad index %d (%hd+%hd entries) "
1363 "in htab stream (htab_shift=%d)\n", index, n_valid, n_invalid,
1364 spapr->htab_shift);
1365 return -EINVAL;
1368 if (spapr->htab) {
1369 if (n_valid) {
1370 qemu_get_buffer(f, HPTE(spapr->htab, index),
1371 HASH_PTE_SIZE_64 * n_valid);
1373 if (n_invalid) {
1374 memset(HPTE(spapr->htab, index + n_valid), 0,
1375 HASH_PTE_SIZE_64 * n_invalid);
1377 } else {
1378 int rc;
1380 assert(fd >= 0);
1382 rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid);
1383 if (rc < 0) {
1384 return rc;
1389 if (!spapr->htab) {
1390 assert(fd >= 0);
1391 close(fd);
1394 return 0;
1397 static SaveVMHandlers savevm_htab_handlers = {
1398 .save_live_setup = htab_save_setup,
1399 .save_live_iterate = htab_save_iterate,
1400 .save_live_complete = htab_save_complete,
1401 .load_state = htab_load,
1404 static void spapr_boot_set(void *opaque, const char *boot_device,
1405 Error **errp)
1407 MachineState *machine = MACHINE(qdev_get_machine());
1408 machine->boot_order = g_strdup(boot_device);
1411 static void spapr_cpu_init(sPAPRMachineState *spapr, PowerPCCPU *cpu)
1413 CPUPPCState *env = &cpu->env;
1415 /* Set time-base frequency to 512 MHz */
1416 cpu_ppc_tb_init(env, TIMEBASE_FREQ);
1418 /* PAPR always has exception vectors in RAM not ROM. To ensure this,
1419 * MSR[IP] should never be set.
1421 env->msr_mask &= ~(1 << 6);
1423 /* Tell KVM that we're in PAPR mode */
1424 if (kvm_enabled()) {
1425 kvmppc_set_papr(cpu);
1428 if (cpu->max_compat) {
1429 if (ppc_set_compat(cpu, cpu->max_compat) < 0) {
1430 exit(1);
1434 xics_cpu_setup(spapr->icp, cpu);
1436 qemu_register_reset(spapr_cpu_reset, cpu);
1439 /* pSeries LPAR / sPAPR hardware init */
1440 static void ppc_spapr_init(MachineState *machine)
1442 sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
1443 const char *kernel_filename = machine->kernel_filename;
1444 const char *kernel_cmdline = machine->kernel_cmdline;
1445 const char *initrd_filename = machine->initrd_filename;
1446 PowerPCCPU *cpu;
1447 PCIHostState *phb;
1448 int i;
1449 MemoryRegion *sysmem = get_system_memory();
1450 MemoryRegion *ram = g_new(MemoryRegion, 1);
1451 MemoryRegion *rma_region;
1452 void *rma = NULL;
1453 hwaddr rma_alloc_size;
1454 hwaddr node0_size = spapr_node0_size();
1455 uint32_t initrd_base = 0;
1456 long kernel_size = 0, initrd_size = 0;
1457 long load_limit, fw_size;
1458 bool kernel_le = false;
1459 char *filename;
1461 msi_supported = true;
1463 QLIST_INIT(&spapr->phbs);
1465 cpu_ppc_hypercall = emulate_spapr_hypercall;
1467 /* Allocate RMA if necessary */
1468 rma_alloc_size = kvmppc_alloc_rma(&rma);
1470 if (rma_alloc_size == -1) {
1471 error_report("Unable to create RMA");
1472 exit(1);
1475 if (rma_alloc_size && (rma_alloc_size < node0_size)) {
1476 spapr->rma_size = rma_alloc_size;
1477 } else {
1478 spapr->rma_size = node0_size;
1480 /* With KVM, we don't actually know whether KVM supports an
1481 * unbounded RMA (PR KVM) or is limited by the hash table size
1482 * (HV KVM using VRMA), so we always assume the latter
1484 * In that case, we also limit the initial allocations for RTAS
1485 * etc... to 256M since we have no way to know what the VRMA size
1486 * is going to be as it depends on the size of the hash table
1487 * isn't determined yet.
1489 if (kvm_enabled()) {
1490 spapr->vrma_adjust = 1;
1491 spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
1495 if (spapr->rma_size > node0_size) {
1496 fprintf(stderr, "Error: Numa node 0 has to span the RMA (%#08"HWADDR_PRIx")\n",
1497 spapr->rma_size);
1498 exit(1);
1501 /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
1502 load_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FW_OVERHEAD;
1504 /* We aim for a hash table of size 1/128 the size of RAM. The
1505 * normal rule of thumb is 1/64 the size of RAM, but that's much
1506 * more than needed for the Linux guests we support. */
1507 spapr->htab_shift = 18; /* Minimum architected size */
1508 while (spapr->htab_shift <= 46) {
1509 if ((1ULL << (spapr->htab_shift + 7)) >= machine->ram_size) {
1510 break;
1512 spapr->htab_shift++;
1515 /* Set up Interrupt Controller before we create the VCPUs */
1516 spapr->icp = xics_system_init(machine,
1517 DIV_ROUND_UP(max_cpus * kvmppc_smt_threads(),
1518 smp_threads),
1519 XICS_IRQS);
1521 /* init CPUs */
1522 if (machine->cpu_model == NULL) {
1523 machine->cpu_model = kvm_enabled() ? "host" : "POWER7";
1525 for (i = 0; i < smp_cpus; i++) {
1526 cpu = cpu_ppc_init(machine->cpu_model);
1527 if (cpu == NULL) {
1528 fprintf(stderr, "Unable to find PowerPC CPU definition\n");
1529 exit(1);
1531 spapr_cpu_init(spapr, cpu);
1534 if (kvm_enabled()) {
1535 /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
1536 kvmppc_enable_logical_ci_hcalls();
1539 /* allocate RAM */
1540 memory_region_allocate_system_memory(ram, NULL, "ppc_spapr.ram",
1541 machine->ram_size);
1542 memory_region_add_subregion(sysmem, 0, ram);
1544 if (rma_alloc_size && rma) {
1545 rma_region = g_new(MemoryRegion, 1);
1546 memory_region_init_ram_ptr(rma_region, NULL, "ppc_spapr.rma",
1547 rma_alloc_size, rma);
1548 vmstate_register_ram_global(rma_region);
1549 memory_region_add_subregion(sysmem, 0, rma_region);
1552 filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
1553 if (!filename) {
1554 error_report("Could not find LPAR rtas '%s'", "spapr-rtas.bin");
1555 exit(1);
1557 spapr->rtas_size = get_image_size(filename);
1558 spapr->rtas_blob = g_malloc(spapr->rtas_size);
1559 if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
1560 error_report("Could not load LPAR rtas '%s'", filename);
1561 exit(1);
1563 if (spapr->rtas_size > RTAS_MAX_SIZE) {
1564 error_report("RTAS too big ! 0x%zx bytes (max is 0x%x)",
1565 (size_t)spapr->rtas_size, RTAS_MAX_SIZE);
1566 exit(1);
1568 g_free(filename);
1570 /* Set up EPOW events infrastructure */
1571 spapr_events_init(spapr);
1573 /* Set up the RTC RTAS interfaces */
1574 spapr_rtc_create(spapr);
1576 /* Set up VIO bus */
1577 spapr->vio_bus = spapr_vio_bus_init();
1579 for (i = 0; i < MAX_SERIAL_PORTS; i++) {
1580 if (serial_hds[i]) {
1581 spapr_vty_create(spapr->vio_bus, serial_hds[i]);
1585 /* We always have at least the nvram device on VIO */
1586 spapr_create_nvram(spapr);
1588 /* Set up PCI */
1589 spapr_pci_rtas_init();
1591 phb = spapr_create_phb(spapr, 0);
1593 for (i = 0; i < nb_nics; i++) {
1594 NICInfo *nd = &nd_table[i];
1596 if (!nd->model) {
1597 nd->model = g_strdup("ibmveth");
1600 if (strcmp(nd->model, "ibmveth") == 0) {
1601 spapr_vlan_create(spapr->vio_bus, nd);
1602 } else {
1603 pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
1607 for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
1608 spapr_vscsi_create(spapr->vio_bus);
1611 /* Graphics */
1612 if (spapr_vga_init(phb->bus)) {
1613 spapr->has_graphics = true;
1614 machine->usb |= defaults_enabled() && !machine->usb_disabled;
1617 if (machine->usb) {
1618 pci_create_simple(phb->bus, -1, "pci-ohci");
1620 if (spapr->has_graphics) {
1621 USBBus *usb_bus = usb_bus_find(-1);
1623 usb_create_simple(usb_bus, "usb-kbd");
1624 usb_create_simple(usb_bus, "usb-mouse");
1628 if (spapr->rma_size < (MIN_RMA_SLOF << 20)) {
1629 fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
1630 "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
1631 exit(1);
1634 if (kernel_filename) {
1635 uint64_t lowaddr = 0;
1637 kernel_size = load_elf(kernel_filename, translate_kernel_address, NULL,
1638 NULL, &lowaddr, NULL, 1, ELF_MACHINE, 0);
1639 if (kernel_size == ELF_LOAD_WRONG_ENDIAN) {
1640 kernel_size = load_elf(kernel_filename,
1641 translate_kernel_address, NULL,
1642 NULL, &lowaddr, NULL, 0, ELF_MACHINE, 0);
1643 kernel_le = kernel_size > 0;
1645 if (kernel_size < 0) {
1646 fprintf(stderr, "qemu: error loading %s: %s\n",
1647 kernel_filename, load_elf_strerror(kernel_size));
1648 exit(1);
1651 /* load initrd */
1652 if (initrd_filename) {
1653 /* Try to locate the initrd in the gap between the kernel
1654 * and the firmware. Add a bit of space just in case
1656 initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
1657 initrd_size = load_image_targphys(initrd_filename, initrd_base,
1658 load_limit - initrd_base);
1659 if (initrd_size < 0) {
1660 fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
1661 initrd_filename);
1662 exit(1);
1664 } else {
1665 initrd_base = 0;
1666 initrd_size = 0;
1670 if (bios_name == NULL) {
1671 bios_name = FW_FILE_NAME;
1673 filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
1674 if (!filename) {
1675 error_report("Could not find LPAR firmware '%s'", bios_name);
1676 exit(1);
1678 fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
1679 if (fw_size <= 0) {
1680 error_report("Could not load LPAR firmware '%s'", filename);
1681 exit(1);
1683 g_free(filename);
1685 /* FIXME: Should register things through the MachineState's qdev
1686 * interface, this is a legacy from the sPAPREnvironment structure
1687 * which predated MachineState but had a similar function */
1688 vmstate_register(NULL, 0, &vmstate_spapr, spapr);
1689 register_savevm_live(NULL, "spapr/htab", -1, 1,
1690 &savevm_htab_handlers, spapr);
1692 /* Prepare the device tree */
1693 spapr->fdt_skel = spapr_create_fdt_skel(initrd_base, initrd_size,
1694 kernel_size, kernel_le,
1695 kernel_cmdline,
1696 spapr->check_exception_irq);
1697 assert(spapr->fdt_skel != NULL);
1699 /* used by RTAS */
1700 QTAILQ_INIT(&spapr->ccs_list);
1701 qemu_register_reset(spapr_ccs_reset_hook, spapr);
1703 qemu_register_boot_set(spapr_boot_set, spapr);
/*
 * Translate the "kvm-type" machine option into a numeric KVM type:
 * no value -> 0, "HV" -> 1, "PR" -> 2.  Any other string is reported as
 * an error and terminates QEMU.
 */
static int spapr_kvm_type(const char *vm_type)
{
    if (vm_type == NULL) {
        return 0;
    }

    if (strcmp(vm_type, "HV") == 0) {
        return 1;
    }

    if (strcmp(vm_type, "PR") == 0) {
        return 2;
    }

    error_report("Unknown kvm-type specified '%s'", vm_type);
    exit(1);
}
1725 * Implementation of an interface to adjust firmware path
1726 * for the bootindex property handling.
1728 static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
1729 DeviceState *dev)
1731 #define CAST(type, obj, name) \
1732 ((type *)object_dynamic_cast(OBJECT(obj), (name)))
1733 SCSIDevice *d = CAST(SCSIDevice, dev, TYPE_SCSI_DEVICE);
1734 sPAPRPHBState *phb = CAST(sPAPRPHBState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
1736 if (d) {
1737 void *spapr = CAST(void, bus->parent, "spapr-vscsi");
1738 VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
1739 USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
1741 if (spapr) {
1743 * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
1744 * We use SRP luns of the form 8000 | (bus << 8) | (id << 5) | lun
1745 * in the top 16 bits of the 64-bit LUN
1747 unsigned id = 0x8000 | (d->id << 8) | d->lun;
1748 return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
1749 (uint64_t)id << 48);
1750 } else if (virtio) {
1752 * We use SRP luns of the form 01000000 | (target << 8) | lun
1753 * in the top 32 bits of the 64-bit LUN
1754 * Note: the quote above is from SLOF and it is wrong,
1755 * the actual binding is:
1756 * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
1758 unsigned id = 0x1000000 | (d->id << 16) | d->lun;
1759 return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
1760 (uint64_t)id << 32);
1761 } else if (usb) {
1763 * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
1764 * in the top 32 bits of the 64-bit LUN
1766 unsigned usb_port = atoi(usb->port->path);
1767 unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
1768 return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
1769 (uint64_t)id << 32);
1773 if (phb) {
1774 /* Replace "pci" with "pci@800000020000000" */
1775 return g_strdup_printf("pci@%"PRIX64, phb->buid);
1778 return NULL;
1781 static char *spapr_get_kvm_type(Object *obj, Error **errp)
1783 sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
1785 return g_strdup(spapr->kvm_type);
1788 static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
1790 sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
1792 g_free(spapr->kvm_type);
1793 spapr->kvm_type = g_strdup(value);
1796 static void spapr_machine_initfn(Object *obj)
1798 object_property_add_str(obj, "kvm-type",
1799 spapr_get_kvm_type, spapr_set_kvm_type, NULL);
1800 object_property_set_description(obj, "kvm-type",
1801 "Specifies the KVM virtualization mode (HV, PR)",
1802 NULL);
1805 static void ppc_cpu_do_nmi_on_cpu(void *arg)
1807 CPUState *cs = arg;
1809 cpu_synchronize_state(cs);
1810 ppc_cpu_do_system_reset(cs);
1813 static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
1815 CPUState *cs;
1817 CPU_FOREACH(cs) {
1818 async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, cs);
1822 static void spapr_machine_class_init(ObjectClass *oc, void *data)
1824 MachineClass *mc = MACHINE_CLASS(oc);
1825 FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
1826 NMIClass *nc = NMI_CLASS(oc);
1828 mc->init = ppc_spapr_init;
1829 mc->reset = ppc_spapr_reset;
1830 mc->block_default_type = IF_SCSI;
1831 mc->max_cpus = MAX_CPUS;
1832 mc->no_parallel = 1;
1833 mc->default_boot_order = "";
1834 mc->default_ram_size = 512 * M_BYTE;
1835 mc->kvm_type = spapr_kvm_type;
1836 mc->has_dynamic_sysbus = true;
1838 fwc->get_dev_path = spapr_get_fw_dev_path;
1839 nc->nmi_monitor_handler = spapr_nmi;
1842 static const TypeInfo spapr_machine_info = {
1843 .name = TYPE_SPAPR_MACHINE,
1844 .parent = TYPE_MACHINE,
1845 .abstract = true,
1846 .instance_size = sizeof(sPAPRMachineState),
1847 .instance_init = spapr_machine_initfn,
1848 .class_size = sizeof(sPAPRMachineClass),
1849 .class_init = spapr_machine_class_init,
1850 .interfaces = (InterfaceInfo[]) {
1851 { TYPE_FW_PATH_PROVIDER },
1852 { TYPE_NMI },
/*
 * Compatibility property lists for older machine versions.  Each list
 * starts with the entries of the next newer version, so an older machine
 * inherits every newer compat setting.  The macros expand to
 * GlobalProperty initializers and are used inside the compat_props[]
 * arrays of the versioned class_init functions.
 */
#define SPAPR_COMPAT_2_3 \
        HW_COMPAT_2_3 \
        {\
            .driver   = "spapr-pci-host-bridge",\
            .property = "dynamic-reconfiguration",\
            .value    = "off",\
        },

#define SPAPR_COMPAT_2_2 \
        SPAPR_COMPAT_2_3 \
        HW_COMPAT_2_2 \
        {\
            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
            .property = "mem_win_size",\
            .value    = "0x20000000",\
        },

#define SPAPR_COMPAT_2_1 \
        SPAPR_COMPAT_2_2 \
        HW_COMPAT_2_1
1878 static void spapr_compat_2_3(Object *obj)
1882 static void spapr_compat_2_2(Object *obj)
1884 spapr_compat_2_3(obj);
1887 static void spapr_compat_2_1(Object *obj)
1889 spapr_compat_2_2(obj);
1892 static void spapr_machine_2_3_instance_init(Object *obj)
1894 spapr_compat_2_3(obj);
1895 spapr_machine_initfn(obj);
1898 static void spapr_machine_2_2_instance_init(Object *obj)
1900 spapr_compat_2_2(obj);
1901 spapr_machine_initfn(obj);
1904 static void spapr_machine_2_1_instance_init(Object *obj)
1906 spapr_compat_2_1(obj);
1907 spapr_machine_initfn(obj);
1910 static void spapr_machine_2_1_class_init(ObjectClass *oc, void *data)
1912 MachineClass *mc = MACHINE_CLASS(oc);
1913 static GlobalProperty compat_props[] = {
1914 SPAPR_COMPAT_2_1
1915 { /* end of list */ }
1918 mc->name = "pseries-2.1";
1919 mc->desc = "pSeries Logical Partition (PAPR compliant) v2.1";
1920 mc->compat_props = compat_props;
1923 static const TypeInfo spapr_machine_2_1_info = {
1924 .name = TYPE_SPAPR_MACHINE "2.1",
1925 .parent = TYPE_SPAPR_MACHINE,
1926 .class_init = spapr_machine_2_1_class_init,
1927 .instance_init = spapr_machine_2_1_instance_init,
1930 static void spapr_machine_2_2_class_init(ObjectClass *oc, void *data)
1932 static GlobalProperty compat_props[] = {
1933 SPAPR_COMPAT_2_2
1934 { /* end of list */ }
1936 MachineClass *mc = MACHINE_CLASS(oc);
1938 mc->name = "pseries-2.2";
1939 mc->desc = "pSeries Logical Partition (PAPR compliant) v2.2";
1940 mc->compat_props = compat_props;
1943 static const TypeInfo spapr_machine_2_2_info = {
1944 .name = TYPE_SPAPR_MACHINE "2.2",
1945 .parent = TYPE_SPAPR_MACHINE,
1946 .class_init = spapr_machine_2_2_class_init,
1947 .instance_init = spapr_machine_2_2_instance_init,
1950 static void spapr_machine_2_3_class_init(ObjectClass *oc, void *data)
1952 static GlobalProperty compat_props[] = {
1953 SPAPR_COMPAT_2_3
1954 { /* end of list */ }
1956 MachineClass *mc = MACHINE_CLASS(oc);
1958 mc->name = "pseries-2.3";
1959 mc->desc = "pSeries Logical Partition (PAPR compliant) v2.3";
1960 mc->compat_props = compat_props;
1963 static const TypeInfo spapr_machine_2_3_info = {
1964 .name = TYPE_SPAPR_MACHINE "2.3",
1965 .parent = TYPE_SPAPR_MACHINE,
1966 .class_init = spapr_machine_2_3_class_init,
1967 .instance_init = spapr_machine_2_3_instance_init,
1970 static void spapr_machine_2_4_class_init(ObjectClass *oc, void *data)
1972 MachineClass *mc = MACHINE_CLASS(oc);
1974 mc->name = "pseries-2.4";
1975 mc->desc = "pSeries Logical Partition (PAPR compliant) v2.4";
1976 mc->alias = "pseries";
1977 mc->is_default = 1;
1980 static const TypeInfo spapr_machine_2_4_info = {
1981 .name = TYPE_SPAPR_MACHINE "2.4",
1982 .parent = TYPE_SPAPR_MACHINE,
1983 .class_init = spapr_machine_2_4_class_init,
1986 static void spapr_machine_register_types(void)
1988 type_register_static(&spapr_machine_info);
1989 type_register_static(&spapr_machine_2_1_info);
1990 type_register_static(&spapr_machine_2_2_info);
1991 type_register_static(&spapr_machine_2_3_info);
1992 type_register_static(&spapr_machine_2_4_info);
1995 type_init(spapr_machine_register_types)