exec.c

   1 /*
   2  *  Virtual page mapping
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19 #include "qemu/osdep.h"
  20 #include "qapi/error.h"
  21 #ifndef _WIN32
  22 #endif
  23
  24 #include "qemu/cutils.h"
  25 #include "cpu.h"
  26 #include "exec/exec-all.h"
  27 #include "exec/target_page.h"
  28 #include "tcg.h"
  29 #include "hw/qdev-core.h"
  30 #if !defined(CONFIG_USER_ONLY)
  31 #include "hw/boards.h"
  32 #include "hw/xen/xen.h"
  33 #endif
  34 #include "sysemu/kvm.h"
  35 #include "sysemu/sysemu.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/config-file.h"
  38 #include "qemu/error-report.h"
  39 #if defined(CONFIG_USER_ONLY)
  40 #include "qemu.h"
  41 #else /* !CONFIG_USER_ONLY */
  42 #include "hw/hw.h"
  43 #include "exec/memory.h"
  44 #include "exec/ioport.h"
  45 #include "sysemu/dma.h"
  46 #include "sysemu/numa.h"
  47 #include "sysemu/hw_accel.h"
  48 #include "exec/address-spaces.h"
  49 #include "sysemu/xen-mapcache.h"
  50 #include "trace-root.h"
  51
  52 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
  53 #include <fcntl.h>
  54 #include <linux/falloc.h>
  55 #endif
  56
  57 #endif
  58 #include "exec/cpu-all.h"
  59 #include "qemu/rcu_queue.h"
  60 #include "qemu/main-loop.h"
  61 #include "translate-all.h"
  62 #include "sysemu/replay.h"
  63
  64 #include "exec/memory-internal.h"
  65 #include "exec/ram_addr.h"
  66 #include "exec/log.h"
  67
  68 #include "migration/vmstate.h"
  69
  70 #include "qemu/range.h"
  71 #ifndef _WIN32
  72 #include "qemu/mmap-alloc.h"
  73 #endif
  74
  75 #include "monitor/monitor.h"
  76
  77 //#define DEBUG_SUBPAGE
  78
  79 #if !defined(CONFIG_USER_ONLY)
/* List of RAM blocks.  ram_list is read under
 * rcu_read_lock()/rcu_read_unlock().  Writes are protected by the
 * ramlist lock.
 */
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

/* system_memory is the default target of a softmmu CPU's "memory" link
 * (see cpu_exec_initfn()).
 */
static MemoryRegion *system_memory;
static MemoryRegion *system_io;

AddressSpace address_space_io;
AddressSpace address_space_memory;

/* Special regions that are recognised by comparing their address;
 * io_mem_unassigned is what a failed translation resolves to.
 */
MemoryRegion io_mem_rom, io_mem_notdirty;
static MemoryRegion io_mem_unassigned;

/* RAM block flag bits. */

/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
#define RAM_PREALLOC   (1 << 0)

/* RAM is mmap-ed with MAP_SHARED */
#define RAM_SHARED     (1 << 1)

/* Only a portion of RAM (used_length) is actually used, and migrated.
 * This used_length size can change across reboots.
 */
#define RAM_RESIZEABLE (1 << 2)
 104
 105 #endif
 106
#ifdef TARGET_PAGE_BITS_VARY
/* Number of bits in a target page; 0 while no size has been chosen yet. */
int target_page_bits;
/* Set by finalize_target_page_bits(); once true the size cannot shrink. */
bool target_page_bits_decided;
#endif

/* Head of the list of CPUs. */
struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
/* current CPU in the current thread. It is only valid inside
   cpu_exec() */
__thread CPUState *current_cpu;
/* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
int use_icount;

/* Host page size and mask globals; no initialiser is given here, so they
 * start out as zero.
 */
uintptr_t qemu_host_page_size;
intptr_t qemu_host_page_mask;
uintptr_t qemu_real_host_page_size;
intptr_t qemu_real_host_page_mask;
 125
 126 bool set_preferred_target_page_bits(int bits)
 127 {
 128     /* The target page size is the lowest common denominator for all
 129      * the CPUs in the system, so we can only make it smaller, never
 130      * larger. And we can't make it smaller once we've committed to
 131      * a particular size.
 132      */
 133 #ifdef TARGET_PAGE_BITS_VARY
 134     assert(bits >= TARGET_PAGE_BITS_MIN);
 135     if (target_page_bits == 0 || target_page_bits > bits) {
 136         if (target_page_bits_decided) {
 137             return false;
 138         }
 139         target_page_bits = bits;
 140     }
 141 #endif
 142     return true;
 143 }
 144
 145 #if !defined(CONFIG_USER_ONLY)
 146
 147 static void finalize_target_page_bits(void)
 148 {
 149 #ifdef TARGET_PAGE_BITS_VARY
 150     if (target_page_bits == 0) {
 151         target_page_bits = TARGET_PAGE_BITS_MIN;
 152     }
 153     target_page_bits_decided = true;
 154 #endif
 155 }
 156
typedef struct PhysPageEntry PhysPageEntry;

/* One entry of the multi-level physical page map (6 + 26 bits). */
struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
    uint32_t skip : 6;
    /* index into phys_sections (!skip) or phys_map_nodes (skip) */
    uint32_t ptr : 26;
};

/* "No node" marker for PhysPageEntry.ptr: all 26 bits set. */
#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
 167
/* Width, in bits, of the address space covered by the page map. */
#define ADDR_SPACE_BITS 64

/* Size of the L2 (and L3, etc) page tables.  */
#define P_L2_BITS 9
#define P_L2_SIZE (1 << P_L2_BITS)

/* Number of table levels needed to resolve the
 * (ADDR_SPACE_BITS - TARGET_PAGE_BITS) page-index bits at P_L2_BITS bits
 * per level, rounded up.
 */
#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)

/* One table node: P_L2_SIZE entries. */
typedef PhysPageEntry Node[P_L2_SIZE];
 177
/* Backing storage for a physical page map: the table nodes and the
 * MemoryRegionSections the leaf entries index into.
 */
typedef struct PhysPageMap {
    struct rcu_head rcu;

    /* NOTE(review): presumably used/allocated counts for @sections,
     * mirroring nodes_nb/nodes_nb_alloc -- confirm against the section
     * allocation code.
     */
    unsigned sections_nb;
    unsigned sections_nb_alloc;
    /* Number of entries of @nodes in use (see phys_map_node_alloc()). */
    unsigned nodes_nb;
    /* Number of entries @nodes has room for (see phys_map_node_reserve()). */
    unsigned nodes_nb_alloc;
    Node *nodes;
    MemoryRegionSection *sections;
} PhysPageMap;
 188
/* Lookup structure used to resolve an address to a MemoryRegionSection. */
struct AddressSpaceDispatch {
    struct rcu_head rcu;

    /* Most recently looked-up section; accessed with atomic_read() and
     * atomic_set() by address_space_lookup_region().
     */
    MemoryRegionSection *mru_section;
    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    /* Nodes and sections referenced by phys_map. */
    PhysPageMap map;
    AddressSpace *as;
};
 200
/* Offset of @addr within its target page. */
#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
/* A page whose bytes may belong to different sections. */
typedef struct subpage_t {
    MemoryRegion iomem;
    AddressSpace *as;
    hwaddr base;
    /* Indexed by SUBPAGE_IDX(addr); each element is an index into the
     * dispatch's sections array (see address_space_lookup_region()).
     */
    uint16_t sub_section[];
} subpage_t;
 208
/* Fixed section indices.  PHYS_SECTION_UNASSIGNED is what lookups return
 * for addresses nothing is mapped at; it is also the initial value of
 * every entry of a freshly allocated leaf node.
 */
#define PHYS_SECTION_UNASSIGNED 0
#define PHYS_SECTION_NOTDIRTY 1
#define PHYS_SECTION_ROM 2
#define PHYS_SECTION_WATCH 3

/* Forward declarations of functions defined later in this file. */
static void io_mem_init(void);
static void memory_map_init(void);
static void tcg_commit(MemoryListener *listener);

static MemoryRegion io_mem_watch;
 219
/**
 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 * @cpu: the CPU whose AddressSpace this is
 * @as: the AddressSpace itself
 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 *
 * cpu_address_space_init() fills in @cpu and @as and, only when TCG is
 * enabled, registers @tcg_as_listener with tcg_commit() as its commit
 * callback.
 */
struct CPUAddressSpace {
    CPUState *cpu;
    AddressSpace *as;
    struct AddressSpaceDispatch *memory_dispatch;
    MemoryListener tcg_as_listener;
};
 233
/* NOTE(review): presumably a copy of the dirty bits for the ram_addr_t
 * range delimited by @start and @end, stored in the trailing @dirty
 * array -- the code using it is not shown here; confirm.
 */
struct DirtyBitmapSnapshot {
    ram_addr_t start;
    ram_addr_t end;
    unsigned long dirty[];
};
 239
 240 #endif
 241
 242 #if !defined(CONFIG_USER_ONLY)
 243
 244 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 245 {
 246     static unsigned alloc_hint = 16;
 247     if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 248         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
 249         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
 250         map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 251         alloc_hint = map->nodes_nb_alloc;
 252     }
 253 }
 254
 255 static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
 256 {
 257     unsigned i;
 258     uint32_t ret;
 259     PhysPageEntry e;
 260     PhysPageEntry *p;
 261
 262     ret = map->nodes_nb++;
 263     p = map->nodes[ret];
 264     assert(ret != PHYS_MAP_NODE_NIL);
 265     assert(ret != map->nodes_nb_alloc);
 266
 267     e.skip = leaf ? 0 : 1;
 268     e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
 269     for (i = 0; i < P_L2_SIZE; ++i) {
 270         memcpy(&p[i], &e, sizeof(e));
 271     }
 272     return ret;
 273 }
 274
/*
 * Point *nb pages starting at page index *index at section @leaf, working
 * on the table of level @level that is reached through entry @lp.
 *
 * *index and *nb are updated in place as pages are consumed, so the caller
 * (including a recursive parent) continues where this call stopped.  The
 * function returns when *nb reaches zero or the end of the current node
 * is reached.
 */
static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
                                hwaddr *index, hwaddr *nb, uint16_t leaf,
                                int level)
{
    PhysPageEntry *p;
    /* Number of pages covered by a single entry at this level. */
    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);

    /* Allocate the node on first use; only level 0 nodes start as leaves. */
    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
        lp->ptr = phys_map_node_alloc(map, level == 0);
    }
    p = map->nodes[lp->ptr];
    /* First entry of this node that the range touches. */
    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];

    while (*nb && lp < &p[P_L2_SIZE]) {
        if ((*index & (step - 1)) == 0 && *nb >= step) {
            /* The range covers this entry completely: make it a leaf. */
            lp->skip = 0;
            lp->ptr = leaf;
            *index += step;
            *nb -= step;
        } else {
            /* Partial coverage: handle it one level further down. */
            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
        }
        ++lp;
    }
}
 300
/*
 * Map @nb pages starting at page index @index to section number @leaf in
 * the page map of @d, starting from the top level of the map.
 */
static void phys_page_set(AddressSpaceDispatch *d,
                          hwaddr index, hwaddr nb,
                          uint16_t leaf)
{
    /* Wildly overreserve - it doesn't matter much. */
    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);

    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
}
 310
/* Compact a non leaf page entry. Simply detect that the entry has a single child,
 * and update our entry so we can skip it and go directly to the destination.
 *
 * @lp:    the entry to compact; modified in place.
 * @nodes: the node array that entry pointers index into.
 *
 * Children that are themselves non-leaf entries are compacted first
 * (recursively), whether or not @lp itself ends up being compacted.
 */
static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
{
    /* Index of the last non-NIL child seen; P_L2_SIZE means "none". */
    unsigned valid_ptr = P_L2_SIZE;
    /* Number of non-NIL children. */
    int valid = 0;
    PhysPageEntry *p;
    int i;

    if (lp->ptr == PHYS_MAP_NODE_NIL) {
        return;
    }

    p = nodes[lp->ptr];
    for (i = 0; i < P_L2_SIZE; i++) {
        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
            continue;
        }

        valid_ptr = i;
        valid++;
        if (p[i].skip) {
            phys_page_compact(&p[i], nodes);
        }
    }

    /* We can only compress if there's only one child. */
    if (valid != 1) {
        return;
    }

    assert(valid_ptr < P_L2_SIZE);

    /* Don't compress if it won't fit in the # of bits we have. */
    /* NOTE(review): the limit used here is 1 << 3 although the skip field
     * is declared 6 bits wide; the check is therefore stricter than the
     * field width requires -- confirm this is intended.
     */
    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
        return;
    }

    /* Bypass the single-child node: point straight at the grandchild. */
    lp->ptr = p[valid_ptr].ptr;
    if (!p[valid_ptr].skip) {
        /* If our only child is a leaf, make this a leaf. */
        /* By design, we should have made this node a leaf to begin with so we
         * should never reach here.
         * But since it's so simple to handle this, let's do it just in case we
         * change this rule.
         */
        lp->skip = 0;
    } else {
        lp->skip += p[valid_ptr].skip;
    }
}
 363
 364 static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
 365 {
 366     if (d->phys_map.skip) {
 367         phys_page_compact(&d->phys_map, d->map.nodes);
 368     }
 369 }
 370
 371 static inline bool section_covers_addr(const MemoryRegionSection *section,
 372                                        hwaddr addr)
 373 {
 374     /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
 375      * the section must cover the entire address space.
 376      */
 377     return int128_gethi(section->size) ||
 378            range_covers_byte(section->offset_within_address_space,
 379                              int128_getlo(section->size), addr);
 380 }
 381
/*
 * Look up the section that the page map of @d associates with @addr.
 *
 * Returns a pointer into d->map.sections; the PHYS_SECTION_UNASSIGNED
 * section is returned when the walk hits an unallocated node or when the
 * section found does not actually cover @addr.
 */
static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
{
    PhysPageEntry lp = d->phys_map, *p;
    Node *nodes = d->map.nodes;
    MemoryRegionSection *sections = d->map.sections;
    hwaddr index = addr >> TARGET_PAGE_BITS;
    int i;

    /* i is the level of the table being indexed.  Each hop moves down by
     * lp.skip levels; the walk ends at a leaf (skip == 0) or once i would
     * drop below zero.
     */
    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
        if (lp.ptr == PHYS_MAP_NODE_NIL) {
            return &sections[PHYS_SECTION_UNASSIGNED];
        }
        p = nodes[lp.ptr];
        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }

    /* NOTE(review): presumably needed because index bits belonging to
     * skipped (compacted) levels are never compared during the walk --
     * confirm.
     */
    if (section_covers_addr(&sections[lp.ptr], addr)) {
        return &sections[lp.ptr];
    } else {
        return &sections[PHYS_SECTION_UNASSIGNED];
    }
}
 404
 405 bool memory_region_is_unassigned(MemoryRegion *mr)
 406 {
 407     return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
 408         && mr != &io_mem_watch;
 409 }
 410
/* Called from RCU critical section */
/*
 * Find the section of @d that contains @addr.
 *
 * The most recently used section is tried first and the page map is only
 * walked on a miss.  With @resolve_subpage set, a hit on a subpage
 * container is resolved further to the section registered for @addr's
 * offset within the page.
 */
static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
                                                        hwaddr addr,
                                                        bool resolve_subpage)
{
    MemoryRegionSection *section = atomic_read(&d->mru_section);
    subpage_t *subpage;
    bool update;

    /* The unassigned section is never served from the cache. */
    if (section && section != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
        section_covers_addr(section, addr)) {
        update = false;
    } else {
        section = phys_page_find(d, addr);
        update = true;
    }
    if (resolve_subpage && section->mr->subpage) {
        subpage = container_of(section->mr, subpage_t, iomem);
        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
    }
    /* On a miss, remember the result (after any subpage resolution). */
    if (update) {
        atomic_set(&d->mru_section, section);
    }
    return section;
}
 436
/* Called from RCU critical section */
/*
 * Translate @addr within dispatch @d, without following IOMMUs.
 *
 * @xlat: receives the offset of @addr within the section's MemoryRegion.
 * @plen: in: requested length; out: for RAM regions, clamped so that the
 *        access does not run past the end of the section.  Left untouched
 *        for every other kind of region.
 * @resolve_subpage: passed through to address_space_lookup_region().
 *
 * Returns the section containing @addr.
 */
static MemoryRegionSection *
address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
                                 hwaddr *plen, bool resolve_subpage)
{
    MemoryRegionSection *section;
    MemoryRegion *mr;
    Int128 diff;

    section = address_space_lookup_region(d, addr, resolve_subpage);
    /* Compute offset within MemoryRegionSection */
    addr -= section->offset_within_address_space;

    /* Compute offset within MemoryRegion */
    *xlat = addr + section->offset_within_region;

    mr = section->mr;

    /* MMIO registers can be expected to perform full-width accesses based only
     * on their address, without considering adjacent registers that could
     * decode to completely different MemoryRegions.  When such registers
     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
     * regions overlap wildly.  For this reason we cannot clamp the accesses
     * here.
     *
     * If the length is small (as is the case for address_space_ldl/stl),
     * everything works fine.  If the incoming length is large, however,
     * the caller really has to do the clamping through memory_access_size.
     */
    if (memory_region_is_ram(mr)) {
        /* Bytes left between the offset and the end of the section. */
        diff = int128_sub(section->size, int128_make64(addr));
        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
    }
    return section;
}
 472
/* Called from RCU critical section */
/*
 * Translate @addr in @as, following IOMMU regions until a region without
 * iommu_ops is reached.
 *
 * @xlat: receives the offset within the final MemoryRegion (only written
 *        on success).
 * @plen: in/out length; it may be reduced at every translation step.
 * @is_write: selects the access type requested from each IOMMU and the
 *        permission bit that is checked afterwards.
 * @is_mmio: forwarded as resolve_subpage to the per-dispatch lookup.
 *
 * Returns a copy of the final section.  If an IOMMU refuses the access,
 * the returned section has only .mr set, to io_mem_unassigned.
 */
static MemoryRegionSection address_space_do_translate(AddressSpace *as,
                                                      hwaddr addr,
                                                      hwaddr *xlat,
                                                      hwaddr *plen,
                                                      bool is_write,
                                                      bool is_mmio)
{
    IOMMUTLBEntry iotlb;
    MemoryRegionSection *section;
    MemoryRegion *mr;

    for (;;) {
        AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
        section = address_space_translate_internal(d, addr, &addr, plen, is_mmio);
        mr = section->mr;

        if (!mr->iommu_ops) {
            break;
        }

        iotlb = mr->iommu_ops->translate(mr, addr, is_write ?
                                         IOMMU_WO : IOMMU_RO);
        /* Page bits come from the IOMMU, offset bits from the input. */
        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
        /* Do not let the access cross the end of the translated page. */
        *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
        /* Permission bit 0 is tested for reads, bit 1 for writes. */
        if (!(iotlb.perm & (1 << is_write))) {
            goto translate_fail;
        }

        /* Continue the lookup in the address space the IOMMU targets. */
        as = iotlb.target_as;
    }

    *xlat = addr;

    return *section;

translate_fail:
    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
}
 513
/* Called from RCU critical section */
/*
 * Build an IOMMUTLBEntry describing the translation of @addr in @as.
 *
 * Returns an all-zero entry if the translation fails; otherwise an entry
 * with IOMMU_RW permission whose iova/translated_addr have the bits of
 * addr_mask cleared.
 */
IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
                                            bool is_write)
{
    MemoryRegionSection section;
    hwaddr xlat, plen;

    /* Try to get maximum page mask during translation. */
    plen = (hwaddr)-1;

    /* This can never be MMIO. */
    section = address_space_do_translate(as, addr, &xlat, &plen,
                                         is_write, false);

    /* Illegal translation */
    if (section.mr == &io_mem_unassigned) {
        goto iotlb_fail;
    }

    /* Convert memory region offset into address space offset */
    xlat += section.offset_within_address_space -
        section.offset_within_region;

    if (plen == (hwaddr)-1) {
        /*
         * We use default page size here. Logically it only happens
         * for identity mappings.
         */
        plen = TARGET_PAGE_SIZE;
    }

    /* Convert to address mask */
    /* NOTE(review): when the translation ends in RAM,
     * address_space_translate_internal() clamps plen to the number of
     * bytes left in the section, which need not be a power of two; in
     * that case plen - 1 is not a contiguous low-bit mask.  Confirm
     * whether callers rely on addr_mask having the form 2^n - 1.
     */
    plen -= 1;

    return (IOMMUTLBEntry) {
        .target_as = section.address_space,
        .iova = addr & ~plen,
        .translated_addr = xlat & ~plen,
        .addr_mask = plen,
        /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
        .perm = IOMMU_RW,
    };

iotlb_fail:
    return (IOMMUTLBEntry) {0};
}
 560
 561 /* Called from RCU critical section */
 562 MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
 563                                       hwaddr *xlat, hwaddr *plen,
 564                                       bool is_write)
 565 {
 566     MemoryRegion *mr;
 567     MemoryRegionSection section;
 568
 569     /* This can be MMIO, so setup MMIO bit. */
 570     section = address_space_do_translate(as, addr, xlat, plen, is_write, true);
 571     mr = section.mr;
 572
 573     if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 574         hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 575         *plen = MIN(page, *plen);
 576     }
 577
 578     return mr;
 579 }
 580
 581 /* Called from RCU critical section */
 582 MemoryRegionSection *
 583 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
 584                                   hwaddr *xlat, hwaddr *plen)
 585 {
 586     MemoryRegionSection *section;
 587     AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
 588
 589     section = address_space_translate_internal(d, addr, xlat, plen, false);
 590
 591     assert(!section->mr->iommu_ops);
 592     return section;
 593 }
 594 #endif
 595
 596 #if !defined(CONFIG_USER_ONLY)
 597
 598 static int cpu_common_post_load(void *opaque, int version_id)
 599 {
 600     CPUState *cpu = opaque;
 601
 602     /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 603        version_id is increased. */
 604     cpu->interrupt_request &= ~0x01;
 605     tlb_flush(cpu);
 606
 607     return 0;
 608 }
 609
 610 static int cpu_common_pre_load(void *opaque)
 611 {
 612     CPUState *cpu = opaque;
 613
 614     cpu->exception_index = -1;
 615
 616     return 0;
 617 }
 618
 619 static bool cpu_common_exception_index_needed(void *opaque)
 620 {
 621     CPUState *cpu = opaque;
 622
 623     return tcg_enabled() && cpu->exception_index != -1;
 624 }
 625
/* Optional subsection carrying CPUState.exception_index; included only
 * when cpu_common_exception_index_needed() returns true.
 */
static const VMStateDescription vmstate_cpu_common_exception_index = {
    .name = "cpu_common/exception_index",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_exception_index_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT32(exception_index, CPUState),
        VMSTATE_END_OF_LIST()
    }
};
 636
 637 static bool cpu_common_crash_occurred_needed(void *opaque)
 638 {
 639     CPUState *cpu = opaque;
 640
 641     return cpu->crash_occurred;
 642 }
 643
/* Optional subsection carrying CPUState.crash_occurred; included only
 * when cpu_common_crash_occurred_needed() returns true.
 */
static const VMStateDescription vmstate_cpu_common_crash_occurred = {
    .name = "cpu_common/crash_occurred",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_crash_occurred_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(crash_occurred, CPUState),
        VMSTATE_END_OF_LIST()
    }
};
 654
/* State description shared by all CPUs: the halted and interrupt_request
 * fields plus the two optional subsections defined above.  It is
 * registered by cpu_exec_realizefn() for CPUs whose device has no vmsd of
 * its own.
 */
const VMStateDescription vmstate_cpu_common = {
    .name = "cpu_common",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = cpu_common_pre_load,
    .post_load = cpu_common_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(halted, CPUState),
        VMSTATE_UINT32(interrupt_request, CPUState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &vmstate_cpu_common_exception_index,
        &vmstate_cpu_common_crash_occurred,
        NULL
    }
};
 672
 673 #endif
 674
 675 CPUState *qemu_get_cpu(int index)
 676 {
 677     CPUState *cpu;
 678
 679     CPU_FOREACH(cpu) {
 680         if (cpu->cpu_index == index) {
 681             return cpu;
 682         }
 683     }
 684
 685     return NULL;
 686 }
 687
 688 #if !defined(CONFIG_USER_ONLY)
 689 void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
 690 {
 691     CPUAddressSpace *newas;
 692
 693     /* Target code should have set num_ases before calling us */
 694     assert(asidx < cpu->num_ases);
 695
 696     if (asidx == 0) {
 697         /* address space 0 gets the convenience alias */
 698         cpu->as = as;
 699     }
 700
 701     /* KVM cannot currently support multiple address spaces. */
 702     assert(asidx == 0 || !kvm_enabled());
 703
 704     if (!cpu->cpu_ases) {
 705         cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
 706     }
 707
 708     newas = &cpu->cpu_ases[asidx];
 709     newas->cpu = cpu;
 710     newas->as = as;
 711     if (tcg_enabled()) {
 712         newas->tcg_as_listener.commit = tcg_commit;
 713         memory_listener_register(&newas->tcg_as_listener, as);
 714     }
 715 }
 716
 717 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 718 {
 719     /* Return the AddressSpace corresponding to the specified index */
 720     return cpu->cpu_ases[asidx].as;
 721 }
 722 #endif
 723
 724 void cpu_exec_unrealizefn(CPUState *cpu)
 725 {
 726     CPUClass *cc = CPU_GET_CLASS(cpu);
 727
 728     cpu_list_remove(cpu);
 729
 730     if (cc->vmsd != NULL) {
 731         vmstate_unregister(NULL, cc->vmsd, cpu);
 732     }
 733     if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 734         vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
 735     }
 736 }
 737
/*
 * Instance-init part of the CPU setup done in this file: clear the
 * address space bookkeeping of @cpu and, for softmmu builds, record the
 * thread id and create the "memory" link property.
 */
void cpu_exec_initfn(CPUState *cpu)
{
    cpu->as = NULL;
    cpu->num_ases = 0;

#ifndef CONFIG_USER_ONLY
    cpu->thread_id = qemu_get_thread_id();

    /* This is a softmmu CPU object, so create a property for it
     * so users can wire up its memory. (This can't go in qom/cpu.c
     * because that file is compiled only once for both user-mode
     * and system builds.) The default if no link is set up is to use
     * the system address space.
     */
    object_property_add_link(OBJECT(cpu), "memory", TYPE_MEMORY_REGION,
                             (Object **)&cpu->memory,
                             qdev_prop_allow_set_link_before_realize,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
                             &error_abort);
    /* Default link target; take a reference on it. */
    cpu->memory = system_memory;
    object_ref(OBJECT(cpu->memory));
#endif
}
 761
/*
 * Realize part of the CPU setup done in this file: add @cpu to the CPU
 * list and, for softmmu builds, register its vmstate descriptions.
 * @errp is not written by this function.
 */
void cpu_exec_realizefn(CPUState *cpu, Error **errp)
{
    /* Only used in softmmu builds, hence ATTRIBUTE_UNUSED. */
    CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu);

    cpu_list_add(cpu);

#ifndef CONFIG_USER_ONLY
    /* Use the common description only if the device has no vmsd. */
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
    }
    if (cc->vmsd != NULL) {
        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
    }
#endif
}
 777
 778 #if defined(CONFIG_USER_ONLY)
/*
 * User-mode variant: invalidate translated code for the one-byte range
 * starting at @pc.  mmap_lock is taken before tb_lock and the two are
 * released in the reverse order.  @cpu is not used.
 */
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    mmap_lock();
    tb_lock();
    tb_invalidate_phys_page_range(pc, pc + 1, 0);
    tb_unlock();
    mmap_unlock();
}
 787 #else
 788 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 789 {
 790     MemTxAttrs attrs;
 791     hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
 792     int asidx = cpu_asidx_from_attrs(cpu, attrs);
 793     if (phys != -1) {
 794         /* Locks grabbed by tb_invalidate_phys_addr */
 795         tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
 796                                 phys | (pc & ~TARGET_PAGE_MASK));
 797     }
 798 }
 799 #endif
 800
 801 #if defined(CONFIG_USER_ONLY)
/* User-mode stub: does nothing. */
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)

{
}
 806
/* User-mode stub: always fails with -ENOSYS. */
int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
                          int flags)
{
    return -ENOSYS;
}
 812
/* User-mode stub: does nothing. */
void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
}
 816
/* User-mode stub: always fails with -ENOSYS; *watchpoint is not written. */
int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
                          int flags, CPUWatchpoint **watchpoint)
{
    return -ENOSYS;
}
 822 #else
 823 /* Add a watchpoint.  */
 824 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 825                           int flags, CPUWatchpoint **watchpoint)
 826 {
 827     CPUWatchpoint *wp;
 828
 829     /* forbid ranges which are empty or run off the end of the address space */
 830     if (len == 0 || (addr + len - 1) < addr) {
 831         error_report("tried to set invalid watchpoint at %"
 832                      VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
 833         return -EINVAL;
 834     }
 835     wp = g_malloc(sizeof(*wp));
 836
 837     wp->vaddr = addr;
 838     wp->len = len;
 839     wp->flags = flags;
 840
 841     /* keep all GDB-injected watchpoints in front */
 842     if (flags & BP_GDB) {
 843         QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
 844     } else {
 845         QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
 846     }
 847
 848     tlb_flush_page(cpu, addr);
 849
 850     if (watchpoint)
 851         *watchpoint = wp;
 852     return 0;
 853 }
 854
 855 /* Remove a specific watchpoint.  */
 856 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 857                           int flags)
 858 {
 859     CPUWatchpoint *wp;
 860
 861     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 862         if (addr == wp->vaddr && len == wp->len
 863                 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
 864             cpu_watchpoint_remove_by_ref(cpu, wp);
 865             return 0;
 866         }
 867     }
 868     return -ENOENT;
 869 }
 870
/* Remove a specific watchpoint by reference.
 *
 * @watchpoint is unlinked from @cpu's list, the TLB entry for its address
 * is flushed, and the watchpoint is freed; the pointer must not be used
 * afterwards.
 */
void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);

    /* Must read watchpoint->vaddr before the structure is freed. */
    tlb_flush_page(cpu, watchpoint->vaddr);

    g_free(watchpoint);
}
 880
 881 /* Remove all matching watchpoints.  */
 882 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 883 {
 884     CPUWatchpoint *wp, *next;
 885
 886     QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
 887         if (wp->flags & mask) {
 888             cpu_watchpoint_remove_by_ref(cpu, wp);
 889         }
 890     }
 891 }
 892
 893 /* Return true if this watchpoint address matches the specified
 894  * access (ie the address range covered by the watchpoint overlaps
 895  * partially or completely with the address range covered by the
 896  * access).
 897  */
 898 static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
 899                                                   vaddr addr,
 900                                                   vaddr len)
 901 {
 902     /* We know the lengths are non-zero, but a little caution is
 903      * required to avoid errors in the case where the range ends
 904      * exactly at the top of the address space and so addr + len
 905      * wraps round to zero.
 906      */
 907     vaddr wpend = wp->vaddr + wp->len - 1;
 908     vaddr addrend = addr + len - 1;
 909
 910     return !(addr > wpend || wp->vaddr > addrend);
 911 }
 912
 913 #endif
 914
 915 /* Add a breakpoint.  */
 916 int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
 917                           CPUBreakpoint **breakpoint)
 918 {
 919     CPUBreakpoint *bp;
 920
 921     bp = g_malloc(sizeof(*bp));
 922
 923     bp->pc = pc;
 924     bp->flags = flags;
 925
 926     /* keep all GDB-injected breakpoints in front */
 927     if (flags & BP_GDB) {
 928         QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
 929     } else {
 930         QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
 931     }
 932
 933     breakpoint_invalidate(cpu, pc);
 934
 935     if (breakpoint) {
 936         *breakpoint = bp;
 937     }
 938     return 0;
 939 }
 940
 941 /* Remove a specific breakpoint.  */
 942 int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
 943 {
 944     CPUBreakpoint *bp;
 945
 946     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
 947         if (bp->pc == pc && bp->flags == flags) {
 948             cpu_breakpoint_remove_by_ref(cpu, bp);
 949             return 0;
 950         }
 951     }
 952     return -ENOENT;
 953 }
 954
/* Remove a specific breakpoint by reference.
 *
 * @breakpoint is unlinked from @cpu's list, the translated code at its pc
 * is invalidated, and the breakpoint is freed; the pointer must not be
 * used afterwards.
 */
void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
{
    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);

    /* Must read breakpoint->pc before the structure is freed. */
    breakpoint_invalidate(cpu, breakpoint->pc);

    g_free(breakpoint);
}
 964
 965 /* Remove all matching breakpoints. */
 966 void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
 967 {
 968     CPUBreakpoint *bp, *next;
 969
 970     QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
 971         if (bp->flags & mask) {
 972             cpu_breakpoint_remove_by_ref(cpu, bp);
 973         }
 974     }
 975 }
 976
 977 /* enable or disable single step mode. EXCP_DEBUG is returned by the
 978    CPU loop after each instruction */
 979 void cpu_single_step(CPUState *cpu, int enabled)
 980 {
 981     if (cpu->singlestep_enabled != enabled) {
 982         cpu->singlestep_enabled = enabled;
 983         if (kvm_enabled()) {
 984             kvm_update_guest_debug(cpu, 0);
 985         } else {
 986             /* must flush all the translated code to avoid inconsistencies */
 987             /* XXX: only flush what is necessary */
 988             tb_flush(cpu);
 989         }
 990     }
 991 }
 992
 993 void cpu_abort(CPUState *cpu, const char *fmt, ...)
 994 {
 995     va_list ap;
 996     va_list ap2;
 997
 998     va_start(ap, fmt);
 999     va_copy(ap2, ap);
1000     fprintf(stderr, "qemu: fatal: ");
1001     vfprintf(stderr, fmt, ap);
1002     fprintf(stderr, "\n");
1003     cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1004     if (qemu_log_separate()) {
1005         qemu_log_lock();
1006         qemu_log("qemu: fatal: ");
1007         qemu_log_vprintf(fmt, ap2);
1008         qemu_log("\n");
1009         log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1010         qemu_log_flush();
1011         qemu_log_unlock();
1012         qemu_log_close();
1013     }
1014     va_end(ap2);
1015     va_end(ap);
1016     replay_finish();
1017 #if defined(CONFIG_USER_ONLY)
1018     {
1019         struct sigaction act;
1020         sigfillset(&act.sa_mask);
1021         act.sa_handler = SIG_DFL;
1022         sigaction(SIGABRT, &act, NULL);
1023     }
1024 #endif
1025     abort();
1026 }
1027
1028 #if !defined(CONFIG_USER_ONLY)
1029 /* Called from RCU critical section */
1030 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
1031 {
1032     RAMBlock *block;
1033
1034     block = atomic_rcu_read(&ram_list.mru_block);
1035     if (block && addr - block->offset < block->max_length) {
1036         return block;
1037     }
1038     RAMBLOCK_FOREACH(block) {
1039         if (addr - block->offset < block->max_length) {
1040             goto found;
1041         }
1042     }
1043
1044     fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1045     abort();
1046
1047 found:
1048     /* It is safe to write mru_block outside the iothread lock.  This
1049      * is what happens:
1050      *
1051      *     mru_block = xxx
1052      *     rcu_read_unlock()
1053      *                                        xxx removed from list
1054      *                  rcu_read_lock()
1055      *                  read mru_block
1056      *                                        mru_block = NULL;
1057      *                                        call_rcu(reclaim_ramblock, xxx);
1058      *                  rcu_read_unlock()
1059      *
1060      * atomic_rcu_set is not needed here.  The block was already published
1061      * when it was placed into the list.  Here we're just making an extra
1062      * copy of the pointer.
1063      */
1064     ram_list.mru_block = block;
1065     return block;
1066 }
1067
1068 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
1069 {
1070     CPUState *cpu;
1071     ram_addr_t start1;
1072     RAMBlock *block;
1073     ram_addr_t end;
1074
1075     end = TARGET_PAGE_ALIGN(start + length);
1076     start &= TARGET_PAGE_MASK;
1077
1078     rcu_read_lock();
1079     block = qemu_get_ram_block(start);
1080     assert(block == qemu_get_ram_block(end - 1));
1081     start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
1082     CPU_FOREACH(cpu) {
1083         tlb_reset_dirty(cpu, start1, length);
1084     }
1085     rcu_read_unlock();
1086 }
1087
1088 /* Note: start and end must be within the same ram block.  */
1089 bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
1090                                               ram_addr_t length,
1091                                               unsigned client)
1092 {
1093     DirtyMemoryBlocks *blocks;
1094     unsigned long end, page;
1095     bool dirty = false;
1096
1097     if (length == 0) {
1098         return false;
1099     }
1100
1101     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1102     page = start >> TARGET_PAGE_BITS;
1103
1104     rcu_read_lock();
1105
1106     blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1107
1108     while (page < end) {
1109         unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1110         unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1111         unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1112
1113         dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1114                                               offset, num);
1115         page += num;
1116     }
1117
1118     rcu_read_unlock();
1119
1120     if (dirty && tcg_enabled()) {
1121         tlb_reset_dirty_range_all(start, length);
1122     }
1123
1124     return dirty;
1125 }
1126
/*
 * Copy the dirty bits of @client for the range [start, start + length)
 * into a newly allocated snapshot and clear them in the live bitmap.
 *
 * The covered range is widened to a multiple of
 * 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL) bytes; the widened bounds are
 * stored in snap->start and snap->end.  When TCG is enabled the TLB dirty
 * state for the original range is reset as well.
 *
 * Returns the snapshot, allocated with g_malloc0().
 */
DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
     (ram_addr_t start, ram_addr_t length, unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
    ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
    DirtyBitmapSnapshot *snap;
    unsigned long page, end, dest;

    /* One bit per target page: bytes / page size / 8 bytes of bitmap. */
    snap = g_malloc0(sizeof(*snap) +
                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
    snap->start = first;
    snap->end   = last;

    page = first >> TARGET_PAGE_BITS;
    end  = last  >> TARGET_PAGE_BITS;
    dest = 0;

    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    /* Walk the range one dirty-memory block at a time. */
    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
        assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
        /* Convert the bit offset into an index for the pointer arithmetic. */
        offset >>= BITS_PER_LEVEL;

        bitmap_copy_and_clear_atomic(snap->dirty + dest,
                                     blocks->blocks[idx] + offset,
                                     num);
        page += num;
        dest += num >> BITS_PER_LEVEL;
    }

    rcu_read_unlock();

    if (tcg_enabled()) {
        tlb_reset_dirty_range_all(start, length);
    }

    return snap;
}
1174
1175 bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
1176                                             ram_addr_t start,
1177                                             ram_addr_t length)
1178 {
1179     unsigned long page, end;
1180
1181     assert(start >= snap->start);
1182     assert(start + length <= snap->end);
1183
1184     end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
1185     page = (start - snap->start) >> TARGET_PAGE_BITS;
1186
1187     while (page < end) {
1188         if (test_bit(page, snap->dirty)) {
1189             return true;
1190         }
1191         page++;
1192     }
1193     return false;
1194 }
1195
1196 /* Called from RCU critical section */
1197 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1198                                        MemoryRegionSection *section,
1199                                        target_ulong vaddr,
1200                                        hwaddr paddr, hwaddr xlat,
1201                                        int prot,
1202                                        target_ulong *address)
1203 {
1204     hwaddr iotlb;
1205     CPUWatchpoint *wp;
1206
1207     if (memory_region_is_ram(section->mr)) {
1208         /* Normal RAM.  */
1209         iotlb = memory_region_get_ram_addr(section->mr) + xlat;
1210         if (!section->readonly) {
1211             iotlb |= PHYS_SECTION_NOTDIRTY;
1212         } else {
1213             iotlb |= PHYS_SECTION_ROM;
1214         }
1215     } else {
1216         AddressSpaceDispatch *d;
1217
1218         d = atomic_rcu_read(&section->address_space->dispatch);
1219         iotlb = section - d->map.sections;
1220         iotlb += xlat;
1221     }
1222
1223     /* Make accesses to pages with watchpoints go via the
1224        watchpoint trap routines.  */
1225     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1226         if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
1227             /* Avoid trapping reads of pages with a write breakpoint. */
1228             if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
1229                 iotlb = PHYS_SECTION_WATCH + paddr;
1230                 *address |= TLB_MMIO;
1231                 break;
1232             }
1233         }
1234     }
1235
1236     return iotlb;
1237 }
#endif /* !defined(CONFIG_USER_ONLY) */
1239
1240 #if !defined(CONFIG_USER_ONLY)
1241
/* Forward declarations of the subpage helpers used by register_subpage(). */
static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                             uint16_t section);
static subpage_t *subpage_init(AddressSpace *as, hwaddr base);

/*
 * Allocator used for guest RAM that has no host pointer yet; defaults to
 * qemu_anon_ram_alloc and can be replaced with phys_mem_set_alloc().
 */
static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
                               qemu_anon_ram_alloc;
1248
/*
 * Set a custom physical guest memory allocator.
 * Accelerators with unusual needs may need this.  Hopefully, we can
 * get rid of it eventually.
 *
 * @alloc replaces the function pointer stored in phys_mem_alloc.
 */
void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
{
    phys_mem_alloc = alloc;
}
1258
1259 static uint16_t phys_section_add(PhysPageMap *map,
1260                                  MemoryRegionSection *section)
1261 {
1262     /* The physical section number is ORed with a page-aligned
1263      * pointer to produce the iotlb entries.  Thus it should
1264      * never overflow into the page-aligned value.
1265      */
1266     assert(map->sections_nb < TARGET_PAGE_SIZE);
1267
1268     if (map->sections_nb == map->sections_nb_alloc) {
1269         map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1270         map->sections = g_renew(MemoryRegionSection, map->sections,
1271                                 map->sections_nb_alloc);
1272     }
1273     map->sections[map->sections_nb] = *section;
1274     memory_region_ref(section->mr);
1275     return map->sections_nb++;
1276 }
1277
1278 static void phys_section_destroy(MemoryRegion *mr)
1279 {
1280     bool have_sub_page = mr->subpage;
1281
1282     memory_region_unref(mr);
1283
1284     if (have_sub_page) {
1285         subpage_t *subpage = container_of(mr, subpage_t, iomem);
1286         object_unref(OBJECT(&subpage->iomem));
1287         g_free(subpage);
1288     }
1289 }
1290
1291 static void phys_sections_free(PhysPageMap *map)
1292 {
1293     while (map->sections_nb > 0) {
1294         MemoryRegionSection *section = &map->sections[--map->sections_nb];
1295         phys_section_destroy(section->mr);
1296     }
1297     g_free(map->sections);
1298     g_free(map->nodes);
1299 }
1300
/*
 * Register @section, which does not cover a whole target page, in @d.
 *
 * The page containing the section is dispatched through a subpage_t: one
 * is created and installed if the page currently maps to
 * io_mem_unassigned, otherwise the existing subpage is reused.  @section
 * is then added to the section map and registered with the subpage for
 * its byte range within the page.
 */
static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
{
    subpage_t *subpage;
    /* Address of the start of the page that contains the section. */
    hwaddr base = section->offset_within_address_space
        & TARGET_PAGE_MASK;
    MemoryRegionSection *existing = phys_page_find(d, base);
    MemoryRegionSection subsection = {
        .offset_within_address_space = base,
        .size = int128_make64(TARGET_PAGE_SIZE),
    };
    hwaddr start, end;

    /* The page must be either unassigned or already a subpage. */
    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);

    if (!(existing->mr->subpage)) {
        subpage = subpage_init(d->as, base);
        subsection.address_space = d->as;
        subsection.mr = &subpage->iomem;
        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
                      phys_section_add(&d->map, &subsection));
    } else {
        subpage = container_of(existing->mr, subpage_t, iomem);
    }
    /* start and end are inclusive byte offsets within the page. */
    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    end = start + int128_get64(section->size) - 1;
    subpage_register(subpage, start, end,
                     phys_section_add(&d->map, section));
}
1329
1330
1331 static void register_multipage(AddressSpaceDispatch *d,
1332                                MemoryRegionSection *section)
1333 {
1334     hwaddr start_addr = section->offset_within_address_space;
1335     uint16_t section_index = phys_section_add(&d->map, section);
1336     uint64_t num_pages = int128_get64(int128_rshift(section->size,
1337                                                     TARGET_PAGE_BITS));
1338
1339     assert(num_pages);
1340     phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1341 }
1342
/*
 * MemoryListener callback: add @section to the address space's dispatch
 * table under construction (as->next_dispatch).
 *
 * The section is cut into pieces.  Pieces that do not start on a page
 * boundary or are shorter than a page go through register_subpage(); runs
 * of whole pages go through register_multipage().
 */
static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
    AddressSpaceDispatch *d = as->next_dispatch;
    /* 'now' is the piece being registered, 'remain' what is still left. */
    MemoryRegionSection now = *section, remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
        /* Unaligned head: register up to the next page boundary. */
        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
                       - now.offset_within_address_space;

        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(d, &now);
    } else {
        now.size = int128_zero();
    }
    /* Loop until the piece just registered was all that remained. */
    while (int128_ne(remain.size, now.size)) {
        /* Advance past the piece registered in the previous step. */
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
        now = remain;
        if (int128_lt(remain.size, page_size)) {
            /* Less than a page is left: register it all as a subpage. */
            register_subpage(d, &now);
        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
            now.size = page_size;
            register_subpage(d, &now);
        } else {
            /* Round the size down to a whole number of pages. */
            now.size = int128_and(now.size, int128_neg(page_size));
            register_multipage(d, &now);
        }
    }
}
1375
/* Flush the coalesced MMIO buffer; this is a no-op unless KVM is enabled. */
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (!kvm_enabled()) {
        return;
    }
    kvm_flush_coalesced_mmio_buffer();
}
1381
/* Acquire ram_list.mutex. */
void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}
1386
/* Release ram_list.mutex. */
void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}
1391
1392 void ram_block_dump(Monitor *mon)
1393 {
1394     RAMBlock *block;
1395     char *psize;
1396
1397     rcu_read_lock();
1398     monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
1399                    "Block Name", "PSize", "Offset", "Used", "Total");
1400     RAMBLOCK_FOREACH(block) {
1401         psize = size_to_str(block->page_size);
1402         monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
1403                        " 0x%016" PRIx64 "\n", block->idstr, psize,
1404                        (uint64_t)block->offset,
1405                        (uint64_t)block->used_length,
1406                        (uint64_t)block->max_length);
1407         g_free(psize);
1408     }
1409     rcu_read_unlock();
1410 }
1411
1412 #ifdef __linux__
1413 /*
1414  * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
1415  * may or may not name the same files / on the same filesystem now as
1416  * when we actually open and map them.  Iterate over the file
1417  * descriptors instead, and use qemu_fd_getpagesize().
1418  */
1419 static int find_max_supported_pagesize(Object *obj, void *opaque)
1420 {
1421     char *mem_path;
1422     long *hpsize_min = opaque;
1423
1424     if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1425         mem_path = object_property_get_str(obj, "mem-path", NULL);
1426         if (mem_path) {
1427             long hpsize = qemu_mempath_getpagesize(mem_path);
1428             if (hpsize < *hpsize_min) {
1429                 *hpsize_min = hpsize;
1430             }
1431         } else {
1432             *hpsize_min = getpagesize();
1433         }
1434     }
1435
1436     return 0;
1437 }
1438
1439 long qemu_getrampagesize(void)
1440 {
1441     long hpsize = LONG_MAX;
1442     long mainrampagesize;
1443     Object *memdev_root;
1444
1445     if (mem_path) {
1446         mainrampagesize = qemu_mempath_getpagesize(mem_path);
1447     } else {
1448         mainrampagesize = getpagesize();
1449     }
1450
1451     /* it's possible we have memory-backend objects with
1452      * hugepage-backed RAM. these may get mapped into system
1453      * address space via -numa parameters or memory hotplug
1454      * hooks. we want to take these into account, but we
1455      * also want to make sure these supported hugepage
1456      * sizes are applicable across the entire range of memory
1457      * we may boot from, so we take the min across all
1458      * backends, and assume normal pages in cases where a
1459      * backend isn't backed by hugepages.
1460      */
1461     memdev_root = object_resolve_path("/objects", NULL);
1462     if (memdev_root) {
1463         object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
1464     }
1465     if (hpsize == LONG_MAX) {
1466         /* No additional memory regions found ==> Report main RAM page size */
1467         return mainrampagesize;
1468     }
1469
1470     /* If NUMA is disabled or the NUMA nodes are not backed with a
1471      * memory-backend, then there is at least one node using "normal" RAM,
1472      * so if its page size is smaller we have got to report that size instead.
1473      */
1474     if (hpsize > mainrampagesize &&
1475         (nb_numa_nodes == 0 || numa_info[0].node_memdev == NULL)) {
1476         static bool warned;
1477         if (!warned) {
1478             error_report("Huge page support disabled (n/a for main memory).");
1479             warned = true;
1480         }
1481         return mainrampagesize;
1482     }
1483
1484     return hpsize;
1485 }
1486 #else
/* Non-Linux variant: simply report the value returned by getpagesize(). */
long qemu_getrampagesize(void)
{
    return getpagesize();
}
1491 #endif
1492
1493 #ifdef __linux__
/*
 * Return the size of the file behind @fd, found by seeking to its end,
 * or -errno if the seek fails.  The file offset is left at the end.
 */
static int64_t get_file_size(int fd)
{
    int64_t end_pos = lseek(fd, 0, SEEK_END);

    if (end_pos >= 0) {
        return end_pos;
    }
    return -errno;
}
1502
/*
 * Open the backing file for a guest RAM region.
 *
 * @path:        an existing file, a file to be created, or a directory in
 *               which a temporary file is created
 * @region_name: used to build the temporary file name in the directory case
 * @created:     set to true only when @path itself was created by this
 *               call; it stays false for an existing file and for the
 *               temporary file made with mkstemp()
 * @errp:        receives the error on failure
 *
 * Returns the open file descriptor, or -1 with @errp set.
 */
static int file_ram_open(const char *path,
                         const char *region_name,
                         bool *created,
                         Error **errp)
{
    char *filename;
    char *sanitized_name;
    char *c;
    int fd = -1;

    *created = false;
    for (;;) {
        fd = open(path, O_RDWR);
        if (fd >= 0) {
            /* @path names an existing file, use it */
            break;
        }
        if (errno == ENOENT) {
            /* @path names a file that doesn't exist, create it */
            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
            if (fd >= 0) {
                *created = true;
                break;
            }
        } else if (errno == EISDIR) {
            /* @path names a directory, create a file there */
            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
            sanitized_name = g_strdup(region_name);
            for (c = sanitized_name; *c != '\0'; c++) {
                if (*c == '/') {
                    *c = '_';
                }
            }

            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
                                       sanitized_name);
            g_free(sanitized_name);

            fd = mkstemp(filename);
            if (fd >= 0) {
                /* Remove the name right away; only the descriptor is kept. */
                unlink(filename);
                g_free(filename);
                break;
            }
            g_free(filename);
        }
        /* errno here comes from whichever attempt above failed last. */
        if (errno != EEXIST && errno != EINTR) {
            error_setg_errno(errp, errno,
                             "can't open backing store %s for guest RAM",
                             path);
            return -1;
        }
        /*
         * Try again on EINTR and EEXIST.  The latter happens when
         * something else creates the file between our two open().
         */
    }

    return fd;
}
1563
1564 static void *file_ram_alloc(RAMBlock *block,
1565                             ram_addr_t memory,
1566                             int fd,
1567                             bool truncate,
1568                             Error **errp)
1569 {
1570     void *area;
1571
1572     block->page_size = qemu_fd_getpagesize(fd);
1573     block->mr->align = block->page_size;
1574 #if defined(__s390x__)
1575     if (kvm_enabled()) {
1576         block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1577     }
1578 #endif
1579
1580     if (memory < block->page_size) {
1581         error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1582                    "or larger than page size 0x%zx",
1583                    memory, block->page_size);
1584         return NULL;
1585     }
1586
1587     memory = ROUND_UP(memory, block->page_size);
1588
1589     /*
1590      * ftruncate is not supported by hugetlbfs in older
1591      * hosts, so don't bother bailing out on errors.
1592      * If anything goes wrong with it under other filesystems,
1593      * mmap will fail.
1594      *
1595      * Do not truncate the non-empty backend file to avoid corrupting
1596      * the existing data in the file. Disabling shrinking is not
1597      * enough. For example, the current vNVDIMM implementation stores
1598      * the guest NVDIMM labels at the end of the backend file. If the
1599      * backend file is later extended, QEMU will not be able to find
1600      * those labels. Therefore, extending the non-empty backend file
1601      * is disabled as well.
1602      */
1603     if (truncate && ftruncate(fd, memory)) {
1604         perror("ftruncate");
1605     }
1606
1607     area = qemu_ram_mmap(fd, memory, block->mr->align,
1608                          block->flags & RAM_SHARED);
1609     if (area == MAP_FAILED) {
1610         error_setg_errno(errp, errno,
1611                          "unable to map backing store for guest RAM");
1612         return NULL;
1613     }
1614
1615     if (mem_prealloc) {
1616         os_mem_prealloc(fd, area, memory, smp_cpus, errp);
1617         if (errp && *errp) {
1618             qemu_ram_munmap(area, memory);
1619             return NULL;
1620         }
1621     }
1622
1623     block->fd = fd;
1624     return area;
1625 }
1626 #endif
1627
1628 /* Called with the ramlist lock held.  */
1629 static ram_addr_t find_ram_offset(ram_addr_t size)
1630 {
1631     RAMBlock *block, *next_block;
1632     ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1633
1634     assert(size != 0); /* it would hand out same offset multiple times */
1635
1636     if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1637         return 0;
1638     }
1639
1640     RAMBLOCK_FOREACH(block) {
1641         ram_addr_t end, next = RAM_ADDR_MAX;
1642
1643         end = block->offset + block->max_length;
1644
1645         RAMBLOCK_FOREACH(next_block) {
1646             if (next_block->offset >= end) {
1647                 next = MIN(next, next_block->offset);
1648             }
1649         }
1650         if (next - end >= size && next - end < mingap) {
1651             offset = end;
1652             mingap = next - end;
1653         }
1654     }
1655
1656     if (offset == RAM_ADDR_MAX) {
1657         fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1658                 (uint64_t)size);
1659         abort();
1660     }
1661
1662     return offset;
1663 }
1664
1665 unsigned long last_ram_page(void)
1666 {
1667     RAMBlock *block;
1668     ram_addr_t last = 0;
1669
1670     rcu_read_lock();
1671     RAMBLOCK_FOREACH(block) {
1672         last = MAX(last, block->offset + block->max_length);
1673     }
1674     rcu_read_unlock();
1675     return last >> TARGET_PAGE_BITS;
1676 }
1677
1678 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1679 {
1680     int ret;
1681
1682     /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1683     if (!machine_dump_guest_core(current_machine)) {
1684         ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1685         if (ret) {
1686             perror("qemu_madvise");
1687             fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1688                             "but dump_guest_core=off specified\n");
1689         }
1690     }
1691 }
1692
/* Return the id string of @rb; the pointer refers to rb->idstr itself. */
const char *qemu_ram_get_idstr(RAMBlock *rb)
{
    return rb->idstr;
}
1697
/* Return true if @rb has the RAM_SHARED flag set. */
bool qemu_ram_is_shared(RAMBlock *rb)
{
    return rb->flags & RAM_SHARED;
}
1702
1703 /* Called with iothread lock held.  */
1704 void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
1705 {
1706     RAMBlock *block;
1707
1708     assert(new_block);
1709     assert(!new_block->idstr[0]);
1710
1711     if (dev) {
1712         char *id = qdev_get_dev_path(dev);
1713         if (id) {
1714             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1715             g_free(id);
1716         }
1717     }
1718     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1719
1720     rcu_read_lock();
1721     RAMBLOCK_FOREACH(block) {
1722         if (block != new_block &&
1723             !strcmp(block->idstr, new_block->idstr)) {
1724             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1725                     new_block->idstr);
1726             abort();
1727         }
1728     }
1729     rcu_read_unlock();
1730 }
1731
1732 /* Called with iothread lock held.  */
1733 void qemu_ram_unset_idstr(RAMBlock *block)
1734 {
1735     /* FIXME: arch_init.c assumes that this is not called throughout
1736      * migration.  Ignore the problem since hot-unplug during migration
1737      * does not work anyway.
1738      */
1739     if (block) {
1740         memset(block->idstr, 0, sizeof(block->idstr));
1741     }
1742 }
1743
/* Return the page size recorded for @rb (its page_size field). */
size_t qemu_ram_pagesize(RAMBlock *rb)
{
    return rb->page_size;
}
1748
1749 /* Returns the largest size of page in use */
1750 size_t qemu_ram_pagesize_largest(void)
1751 {
1752     RAMBlock *block;
1753     size_t largest = 0;
1754
1755     RAMBLOCK_FOREACH(block) {
1756         largest = MAX(largest, qemu_ram_pagesize(block));
1757     }
1758
1759     return largest;
1760 }
1761
1762 static int memory_try_enable_merging(void *addr, size_t len)
1763 {
1764     if (!machine_mem_merge(current_machine)) {
1765         /* disabled by the user */
1766         return 0;
1767     }
1768
1769     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1770 }
1771
/* Only legal before guest might have detected the memory size: e.g. on
 * incoming migration, or right after reset.
 *
 * As memory core doesn't know how is memory accessed, it is up to
 * resize callback to update device state and/or add assertions to detect
 * misuse, if necessary.
 *
 * Change the used length of @block to @newsize (rounded up to the host
 * page size).  Returns 0 on success, including when the size is already
 * current; returns -EINVAL with @errp set when the block is not
 * RAM_RESIZEABLE or when the new size exceeds block->max_length.
 */
int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
{
    assert(block);

    newsize = HOST_PAGE_ALIGN(newsize);

    if (block->used_length == newsize) {
        return 0;
    }

    if (!(block->flags & RAM_RESIZEABLE)) {
        error_setg_errno(errp, EINVAL,
                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
                         " in != 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->used_length);
        return -EINVAL;
    }

    if (block->max_length < newsize) {
        error_setg_errno(errp, EINVAL,
                         "Length too large: %s: 0x" RAM_ADDR_FMT
                         " > 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->max_length);
        return -EINVAL;
    }

    /* Clear dirty state for the old extent, then mark the new one dirty. */
    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
    block->used_length = newsize;
    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
                                        DIRTY_CLIENTS_ALL);
    memory_region_set_size(block->mr, newsize);
    /* Notify the owner through its optional resize callback. */
    if (block->resized) {
        block->resized(block->idstr, newsize, block->host);
    }
    return 0;
}
1815
/*
 * Grow the per-client dirty memory block arrays so that they cover
 * @new_ram_size instead of @old_ram_size.
 * NOTE(review): ram_block_add() passes both sizes as target page counts.
 *
 * For each dirty memory client a new DirtyMemoryBlocks array is
 * allocated, the existing block pointers are copied over, fresh bitmaps
 * are created for the added blocks, and the new array is published with
 * atomic_rcu_set().  The old array is released through g_free_rcu().
 *
 * Called with ram_list.mutex held
 */
static void dirty_memory_extend(ram_addr_t old_ram_size,
                                ram_addr_t new_ram_size)
{
    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    int i;

    /* Only need to extend if block count increased */
    if (new_num_blocks <= old_num_blocks) {
        return;
    }

    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
        DirtyMemoryBlocks *old_blocks;
        DirtyMemoryBlocks *new_blocks;
        int j;

        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
        new_blocks = g_malloc(sizeof(*new_blocks) +
                              sizeof(new_blocks->blocks[0]) * new_num_blocks);

        /* Existing bitmaps are shared with the old array, not duplicated. */
        if (old_num_blocks) {
            memcpy(new_blocks->blocks, old_blocks->blocks,
                   old_num_blocks * sizeof(old_blocks->blocks[0]));
        }

        for (j = old_num_blocks; j < new_num_blocks; j++) {
            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
        }

        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);

        /* Only the array itself is freed here; the bitmaps live on. */
        if (old_blocks) {
            g_free_rcu(old_blocks, rcu);
        }
    }
}
1856
/*
 * Insert @new_block into ram_list.
 *
 * Under the ramlist mutex: an offset is chosen for the block, host memory
 * is allocated if the block has none yet (through Xen or phys_mem_alloc),
 * the dirty memory bitmaps are extended if needed, and the block is
 * linked into the list, which is kept sorted by decreasing max_length.
 * After the mutex is dropped the block's used range is marked dirty for
 * all clients and madvise hints are applied to the host memory.
 *
 * On allocation failure @errp is set and the block is not inserted.
 */
static void ram_block_add(RAMBlock *new_block, Error **errp)
{
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    ram_addr_t old_ram_size, new_ram_size;
    Error *err = NULL;

    old_ram_size = last_ram_page();

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr, &err);
            if (err) {
                error_propagate(errp, err);
                qemu_mutex_unlock_ramlist();
                return;
            }
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
                                             &new_block->mr->align);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
    }

    /* Both sizes are in target pages, matching last_ram_page(). */
    new_ram_size = MAX(old_ram_size,
              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
    if (new_ram_size > old_ram_size) {
        dirty_memory_extend(old_ram_size, new_ram_size);
    }
    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    RAMBLOCK_FOREACH(block) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    /* block is non-NULL only if the loop stopped at a smaller block. */
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length,
                                        DIRTY_CLIENTS_ALL);

    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
        ram_block_notify_add(new_block->host, new_block->max_length);
    }
}
1933
1934 #ifdef __linux__
1935 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
1936                                  bool share, int fd,
1937                                  Error **errp)
1938 {
1939     RAMBlock *new_block;
1940     Error *local_err = NULL;
1941     int64_t file_size;
1942
1943     if (xen_enabled()) {
1944         error_setg(errp, "-mem-path not supported with Xen");
1945         return NULL;
1946     }
1947
1948     if (kvm_enabled() && !kvm_has_sync_mmu()) {
1949         error_setg(errp,
1950                    "host lacks kvm mmu notifiers, -mem-path unsupported");
1951         return NULL;
1952     }
1953
1954     if (phys_mem_alloc != qemu_anon_ram_alloc) {
1955         /*
1956          * file_ram_alloc() needs to allocate just like
1957          * phys_mem_alloc, but we haven't bothered to provide
1958          * a hook there.
1959          */
1960         error_setg(errp,
1961                    "-mem-path not supported with this accelerator");
1962         return NULL;
1963     }
1964
1965     size = HOST_PAGE_ALIGN(size);
1966     file_size = get_file_size(fd);
1967     if (file_size > 0 && file_size < size) {
1968         error_setg(errp, "backing store %s size 0x%" PRIx64
1969                    " does not match 'size' option 0x" RAM_ADDR_FMT,
1970                    mem_path, file_size, size);
1971         return NULL;
1972     }
1973
1974     new_block = g_malloc0(sizeof(*new_block));
1975     new_block->mr = mr;
1976     new_block->used_length = size;
1977     new_block->max_length = size;
1978     new_block->flags = share ? RAM_SHARED : 0;
1979     new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
1980     if (!new_block->host) {
1981         g_free(new_block);
1982         return NULL;
1983     }
1984
1985     ram_block_add(new_block, &local_err);
1986     if (local_err) {
1987         g_free(new_block);
1988         error_propagate(errp, local_err);
1989         return NULL;
1990     }
1991     return new_block;
1992
1993 }
1994
1995
1996 RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
1997                                    bool share, const char *mem_path,
1998                                    Error **errp)
1999 {
2000     int fd;
2001     bool created;
2002     RAMBlock *block;
2003
2004     fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
2005     if (fd < 0) {
2006         return NULL;
2007     }
2008
2009     block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp);
2010     if (!block) {
2011         if (created) {
2012             unlink(mem_path);
2013         }
2014         close(fd);
2015         return NULL;
2016     }
2017
2018     return block;
2019 }
2020 #endif
2021
2022 static
2023 RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
2024                                   void (*resized)(const char*,
2025                                                   uint64_t length,
2026                                                   void *host),
2027                                   void *host, bool resizeable,
2028                                   MemoryRegion *mr, Error **errp)
2029 {
2030     RAMBlock *new_block;
2031     Error *local_err = NULL;
2032
2033     size = HOST_PAGE_ALIGN(size);
2034     max_size = HOST_PAGE_ALIGN(max_size);
2035     new_block = g_malloc0(sizeof(*new_block));
2036     new_block->mr = mr;
2037     new_block->resized = resized;
2038     new_block->used_length = size;
2039     new_block->max_length = max_size;
2040     assert(max_size >= size);
2041     new_block->fd = -1;
2042     new_block->page_size = getpagesize();
2043     new_block->host = host;
2044     if (host) {
2045         new_block->flags |= RAM_PREALLOC;
2046     }
2047     if (resizeable) {
2048         new_block->flags |= RAM_RESIZEABLE;
2049     }
2050     ram_block_add(new_block, &local_err);
2051     if (local_err) {
2052         g_free(new_block);
2053         error_propagate(errp, local_err);
2054         return NULL;
2055     }
2056     return new_block;
2057 }
2058
2059 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2060                                    MemoryRegion *mr, Error **errp)
2061 {
2062     return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
2063 }
2064
2065 RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
2066 {
2067     return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
2068 }
2069
2070 RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2071                                      void (*resized)(const char*,
2072                                                      uint64_t length,
2073                                                      void *host),
2074                                      MemoryRegion *mr, Error **errp)
2075 {
2076     return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
2077 }
2078
2079 static void reclaim_ramblock(RAMBlock *block)
2080 {
2081     if (block->flags & RAM_PREALLOC) {
2082         ;
2083     } else if (xen_enabled()) {
2084         xen_invalidate_map_cache_entry(block->host);
2085 #ifndef _WIN32
2086     } else if (block->fd >= 0) {
2087         qemu_ram_munmap(block->host, block->max_length);
2088         close(block->fd);
2089 #endif
2090     } else {
2091         qemu_anon_ram_free(block->host, block->max_length);
2092     }
2093     g_free(block);
2094 }
2095
2096 void qemu_ram_free(RAMBlock *block)
2097 {
2098     if (!block) {
2099         return;
2100     }
2101
2102     if (block->host) {
2103         ram_block_notify_remove(block->host, block->max_length);
2104     }
2105
2106     qemu_mutex_lock_ramlist();
2107     QLIST_REMOVE_RCU(block, next);
2108     ram_list.mru_block = NULL;
2109     /* Write list before version */
2110     smp_wmb();
2111     ram_list.version++;
2112     call_rcu(block, reclaim_ramblock, rcu);
2113     qemu_mutex_unlock_ramlist();
2114 }
2115
2116 #ifndef _WIN32
2117 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
2118 {
2119     RAMBlock *block;
2120     ram_addr_t offset;
2121     int flags;
2122     void *area, *vaddr;
2123
2124     RAMBLOCK_FOREACH(block) {
2125         offset = addr - block->offset;
2126         if (offset < block->max_length) {
2127             vaddr = ramblock_ptr(block, offset);
2128             if (block->flags & RAM_PREALLOC) {
2129                 ;
2130             } else if (xen_enabled()) {
2131                 abort();
2132             } else {
2133                 flags = MAP_FIXED;
2134                 if (block->fd >= 0) {
2135                     flags |= (block->flags & RAM_SHARED ?
2136                               MAP_SHARED : MAP_PRIVATE);
2137                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2138                                 flags, block->fd, offset);
2139                 } else {
2140                     /*
2141                      * Remap needs to match alloc.  Accelerators that
2142                      * set phys_mem_alloc never remap.  If they did,
2143                      * we'd need a remap hook here.
2144                      */
2145                     assert(phys_mem_alloc == qemu_anon_ram_alloc);
2146
2147                     flags |= MAP_PRIVATE | MAP_ANONYMOUS;
2148                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2149                                 flags, -1, 0);
2150                 }
2151                 if (area != vaddr) {
2152                     fprintf(stderr, "Could not remap addr: "
2153                             RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
2154                             length, addr);
2155                     exit(1);
2156                 }
2157                 memory_try_enable_merging(vaddr, length);
2158                 qemu_ram_setup_dump(vaddr, length);
2159             }
2160         }
2161     }
2162 }
2163 #endif /* !_WIN32 */
2164
2165 /* Return a host pointer to ram allocated with qemu_ram_alloc.
2166  * This should not be used for general purpose DMA.  Use address_space_map
2167  * or address_space_rw instead. For local memory (e.g. video ram) that the
2168  * device owns, use memory_region_get_ram_ptr.
2169  *
2170  * Called within RCU critical section.
2171  */
2172 void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2173 {
2174     RAMBlock *block = ram_block;
2175
2176     if (block == NULL) {
2177         block = qemu_get_ram_block(addr);
2178         addr -= block->offset;
2179     }
2180
2181     if (xen_enabled() && block->host == NULL) {
2182         /* We need to check if the requested address is in the RAM
2183          * because we don't want to map the entire memory in QEMU.
2184          * In that case just map until the end of the page.
2185          */
2186         if (block->offset == 0) {
2187             return xen_map_cache(addr, 0, 0, false);
2188         }
2189
2190         block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2191     }
2192     return ramblock_ptr(block, addr);
2193 }
2194
2195 /* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2196  * but takes a size argument.
2197  *
2198  * Called within RCU critical section.
2199  */
2200 static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2201                                  hwaddr *size)
2202 {
2203     RAMBlock *block = ram_block;
2204     if (*size == 0) {
2205         return NULL;
2206     }
2207
2208     if (block == NULL) {
2209         block = qemu_get_ram_block(addr);
2210         addr -= block->offset;
2211     }
2212     *size = MIN(*size, block->max_length - addr);
2213
2214     if (xen_enabled() && block->host == NULL) {
2215         /* We need to check if the requested address is in the RAM
2216          * because we don't want to map the entire memory in QEMU.
2217          * In that case just map the requested area.
2218          */
2219         if (block->offset == 0) {
2220             return xen_map_cache(addr, *size, 1, true);
2221         }
2222
2223         block->host = xen_map_cache(block->offset, block->max_length, 1, true);
2224     }
2225
2226     return ramblock_ptr(block, addr);
2227 }
2228
2229 /*
2230  * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
2231  * in that RAMBlock.
2232  *
2233  * ptr: Host pointer to look up
2234  * round_offset: If true round the result offset down to a page boundary
2235  * *ram_addr: set to result ram_addr
2236  * *offset: set to result offset within the RAMBlock
2237  *
2238  * Returns: RAMBlock (or NULL if not found)
2239  *
2240  * By the time this function returns, the returned pointer is not protected
2241  * by RCU anymore.  If the caller is not within an RCU critical section and
2242  * does not hold the iothread lock, it must have other means of protecting the
2243  * pointer, such as a reference to the region that includes the incoming
2244  * ram_addr_t.
2245  */
2246 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
2247                                    ram_addr_t *offset)
2248 {
2249     RAMBlock *block;
2250     uint8_t *host = ptr;
2251
2252     if (xen_enabled()) {
2253         ram_addr_t ram_addr;
2254         rcu_read_lock();
2255         ram_addr = xen_ram_addr_from_mapcache(ptr);
2256         block = qemu_get_ram_block(ram_addr);
2257         if (block) {
2258             *offset = ram_addr - block->offset;
2259         }
2260         rcu_read_unlock();
2261         return block;
2262     }
2263
2264     rcu_read_lock();
2265     block = atomic_rcu_read(&ram_list.mru_block);
2266     if (block && block->host && host - block->host < block->max_length) {
2267         goto found;
2268     }
2269
2270     RAMBLOCK_FOREACH(block) {
2271         /* This case append when the block is not mapped. */
2272         if (block->host == NULL) {
2273             continue;
2274         }
2275         if (host - block->host < block->max_length) {
2276             goto found;
2277         }
2278     }
2279
2280     rcu_read_unlock();
2281     return NULL;
2282
2283 found:
2284     *offset = (host - block->host);
2285     if (round_offset) {
2286         *offset &= TARGET_PAGE_MASK;
2287     }
2288     rcu_read_unlock();
2289     return block;
2290 }
2291
2292 /*
2293  * Finds the named RAMBlock
2294  *
2295  * name: The name of RAMBlock to find
2296  *
2297  * Returns: RAMBlock (or NULL if not found)
2298  */
2299 RAMBlock *qemu_ram_block_by_name(const char *name)
2300 {
2301     RAMBlock *block;
2302
2303     RAMBLOCK_FOREACH(block) {
2304         if (!strcmp(name, block->idstr)) {
2305             return block;
2306         }
2307     }
2308
2309     return NULL;
2310 }
2311
2312 /* Some of the softmmu routines need to translate from a host pointer
2313    (typically a TLB entry) back to a ram offset.  */
2314 ram_addr_t qemu_ram_addr_from_host(void *ptr)
2315 {
2316     RAMBlock *block;
2317     ram_addr_t offset;
2318
2319     block = qemu_ram_block_from_host(ptr, false, &offset);
2320     if (!block) {
2321         return RAM_ADDR_INVALID;
2322     }
2323
2324     return block->offset + offset;
2325 }
2326
2327 /* Called within RCU critical section.  */
2328 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
2329                                uint64_t val, unsigned size)
2330 {
2331     bool locked = false;
2332
2333     assert(tcg_enabled());
2334     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
2335         locked = true;
2336         tb_lock();
2337         tb_invalidate_phys_page_fast(ram_addr, size);
2338     }
2339     switch (size) {
2340     case 1:
2341         stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2342         break;
2343     case 2:
2344         stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2345         break;
2346     case 4:
2347         stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2348         break;
2349     default:
2350         abort();
2351     }
2352
2353     if (locked) {
2354         tb_unlock();
2355     }
2356
2357     /* Set both VGA and migration bits for simplicity and to remove
2358      * the notdirty callback faster.
2359      */
2360     cpu_physical_memory_set_dirty_range(ram_addr, size,
2361                                         DIRTY_CLIENTS_NOCODE);
2362     /* we remove the notdirty callback only if the code has been
2363        flushed */
2364     if (!cpu_physical_memory_is_clean(ram_addr)) {
2365         tlb_set_dirty(current_cpu, current_cpu->mem_io_vaddr);
2366     }
2367 }
2368
2369 static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
2370                                  unsigned size, bool is_write)
2371 {
2372     return is_write;
2373 }
2374
2375 static const MemoryRegionOps notdirty_mem_ops = {
2376     .write = notdirty_mem_write,
2377     .valid.accepts = notdirty_mem_accepts,
2378     .endianness = DEVICE_NATIVE_ENDIAN,
2379 };
2380
2381 /* Generate a debug exception if a watchpoint has been hit.  */
2382 static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
2383 {
2384     CPUState *cpu = current_cpu;
2385     CPUClass *cc = CPU_GET_CLASS(cpu);
2386     CPUArchState *env = cpu->env_ptr;
2387     target_ulong pc, cs_base;
2388     target_ulong vaddr;
2389     CPUWatchpoint *wp;
2390     uint32_t cpu_flags;
2391
2392     assert(tcg_enabled());
2393     if (cpu->watchpoint_hit) {
2394         /* We re-entered the check after replacing the TB. Now raise
2395          * the debug interrupt so that is will trigger after the
2396          * current instruction. */
2397         cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2398         return;
2399     }
2400     vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
2401     vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
2402     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2403         if (cpu_watchpoint_address_matches(wp, vaddr, len)
2404             && (wp->flags & flags)) {
2405             if (flags == BP_MEM_READ) {
2406                 wp->flags |= BP_WATCHPOINT_HIT_READ;
2407             } else {
2408                 wp->flags |= BP_WATCHPOINT_HIT_WRITE;
2409             }
2410             wp->hitaddr = vaddr;
2411             wp->hitattrs = attrs;
2412             if (!cpu->watchpoint_hit) {
2413                 if (wp->flags & BP_CPU &&
2414                     !cc->debug_check_watchpoint(cpu, wp)) {
2415                     wp->flags &= ~BP_WATCHPOINT_HIT;
2416                     continue;
2417                 }
2418                 cpu->watchpoint_hit = wp;
2419
2420                 /* Both tb_lock and iothread_mutex will be reset when
2421                  * cpu_loop_exit or cpu_loop_exit_noexc longjmp
2422                  * back into the cpu_exec main loop.
2423                  */
2424                 tb_lock();
2425                 tb_check_watchpoint(cpu);
2426                 if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2427                     cpu->exception_index = EXCP_DEBUG;
2428                     cpu_loop_exit(cpu);
2429                 } else {
2430                     cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
2431                     tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
2432                     cpu_loop_exit_noexc(cpu);
2433                 }
2434             }
2435         } else {
2436             wp->flags &= ~BP_WATCHPOINT_HIT;
2437         }
2438     }
2439 }
2440
2441 /* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
2442    so these check for a hit then pass through to the normal out-of-line
2443    phys routines.  */
2444 static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
2445                                   unsigned size, MemTxAttrs attrs)
2446 {
2447     MemTxResult res;
2448     uint64_t data;
2449     int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2450     AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2451
2452     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2453     switch (size) {
2454     case 1:
2455         data = address_space_ldub(as, addr, attrs, &res);
2456         break;
2457     case 2:
2458         data = address_space_lduw(as, addr, attrs, &res);
2459         break;
2460     case 4:
2461         data = address_space_ldl(as, addr, attrs, &res);
2462         break;
2463     default: abort();
2464     }
2465     *pdata = data;
2466     return res;
2467 }
2468
2469 static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
2470                                    uint64_t val, unsigned size,
2471                                    MemTxAttrs attrs)
2472 {
2473     MemTxResult res;
2474     int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2475     AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2476
2477     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2478     switch (size) {
2479     case 1:
2480         address_space_stb(as, addr, val, attrs, &res);
2481         break;
2482     case 2:
2483         address_space_stw(as, addr, val, attrs, &res);
2484         break;
2485     case 4:
2486         address_space_stl(as, addr, val, attrs, &res);
2487         break;
2488     default: abort();
2489     }
2490     return res;
2491 }
2492
2493 static const MemoryRegionOps watch_mem_ops = {
2494     .read_with_attrs = watch_mem_read,
2495     .write_with_attrs = watch_mem_write,
2496     .endianness = DEVICE_NATIVE_ENDIAN,
2497 };
2498
2499 static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2500                                 unsigned len, MemTxAttrs attrs)
2501 {
2502     subpage_t *subpage = opaque;
2503     uint8_t buf[8];
2504     MemTxResult res;
2505
2506 #if defined(DEBUG_SUBPAGE)
2507     printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2508            subpage, len, addr);
2509 #endif
2510     res = address_space_read(subpage->as, addr + subpage->base,
2511                              attrs, buf, len);
2512     if (res) {
2513         return res;
2514     }
2515     switch (len) {
2516     case 1:
2517         *data = ldub_p(buf);
2518         return MEMTX_OK;
2519     case 2:
2520         *data = lduw_p(buf);
2521         return MEMTX_OK;
2522     case 4:
2523         *data = ldl_p(buf);
2524         return MEMTX_OK;
2525     case 8:
2526         *data = ldq_p(buf);
2527         return MEMTX_OK;
2528     default:
2529         abort();
2530     }
2531 }
2532
2533 static MemTxResult subpage_write(void *opaque, hwaddr addr,
2534                                  uint64_t value, unsigned len, MemTxAttrs attrs)
2535 {
2536     subpage_t *subpage = opaque;
2537     uint8_t buf[8];
2538
2539 #if defined(DEBUG_SUBPAGE)
2540     printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2541            " value %"PRIx64"\n",
2542            __func__, subpage, len, addr, value);
2543 #endif
2544     switch (len) {
2545     case 1:
2546         stb_p(buf, value);
2547         break;
2548     case 2:
2549         stw_p(buf, value);
2550         break;
2551     case 4:
2552         stl_p(buf, value);
2553         break;
2554     case 8:
2555         stq_p(buf, value);
2556         break;
2557     default:
2558         abort();
2559     }
2560     return address_space_write(subpage->as, addr + subpage->base,
2561                                attrs, buf, len);
2562 }
2563
2564 static bool subpage_accepts(void *opaque, hwaddr addr,
2565                             unsigned len, bool is_write)
2566 {
2567     subpage_t *subpage = opaque;
2568 #if defined(DEBUG_SUBPAGE)
2569     printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2570            __func__, subpage, is_write ? 'w' : 'r', len, addr);
2571 #endif
2572
2573     return address_space_access_valid(subpage->as, addr + subpage->base,
2574                                       len, is_write);
2575 }
2576
2577 static const MemoryRegionOps subpage_ops = {
2578     .read_with_attrs = subpage_read,
2579     .write_with_attrs = subpage_write,
2580     .impl.min_access_size = 1,
2581     .impl.max_access_size = 8,
2582     .valid.min_access_size = 1,
2583     .valid.max_access_size = 8,
2584     .valid.accepts = subpage_accepts,
2585     .endianness = DEVICE_NATIVE_ENDIAN,
2586 };
2587
2588 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2589                              uint16_t section)
2590 {
2591     int idx, eidx;
2592
2593     if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2594         return -1;
2595     idx = SUBPAGE_IDX(start);
2596     eidx = SUBPAGE_IDX(end);
2597 #if defined(DEBUG_SUBPAGE)
2598     printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2599            __func__, mmio, start, end, idx, eidx, section);
2600 #endif
2601     for (; idx <= eidx; idx++) {
2602         mmio->sub_section[idx] = section;
2603     }
2604
2605     return 0;
2606 }
2607
2608 static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
2609 {
2610     subpage_t *mmio;
2611
2612     mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2613     mmio->as = as;
2614     mmio->base = base;
2615     memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2616                           NULL, TARGET_PAGE_SIZE);
2617     mmio->iomem.subpage = true;
2618 #if defined(DEBUG_SUBPAGE)
2619     printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2620            mmio, base, TARGET_PAGE_SIZE);
2621 #endif
2622     subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2623
2624     return mmio;
2625 }
2626
2627 static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
2628                               MemoryRegion *mr)
2629 {
2630     assert(as);
2631     MemoryRegionSection section = {
2632         .address_space = as,
2633         .mr = mr,
2634         .offset_within_address_space = 0,
2635         .offset_within_region = 0,
2636         .size = int128_2_64(),
2637     };
2638
2639     return phys_section_add(map, &section);
2640 }
2641
2642 MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
2643 {
2644     int asidx = cpu_asidx_from_attrs(cpu, attrs);
2645     CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2646     AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2647     MemoryRegionSection *sections = d->map.sections;
2648
2649     return sections[index & ~TARGET_PAGE_MASK].mr;
2650 }
2651
2652 static void io_mem_init(void)
2653 {
2654     memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2655     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2656                           NULL, UINT64_MAX);
2657
2658     /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
2659      * which can be called without the iothread mutex.
2660      */
2661     memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2662                           NULL, UINT64_MAX);
2663     memory_region_clear_global_locking(&io_mem_notdirty);
2664
2665     memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2666                           NULL, UINT64_MAX);
2667 }
2668
2669 static void mem_begin(MemoryListener *listener)
2670 {
2671     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2672     AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2673     uint16_t n;
2674
2675     n = dummy_section(&d->map, as, &io_mem_unassigned);
2676     assert(n == PHYS_SECTION_UNASSIGNED);
2677     n = dummy_section(&d->map, as, &io_mem_notdirty);
2678     assert(n == PHYS_SECTION_NOTDIRTY);
2679     n = dummy_section(&d->map, as, &io_mem_rom);
2680     assert(n == PHYS_SECTION_ROM);
2681     n = dummy_section(&d->map, as, &io_mem_watch);
2682     assert(n == PHYS_SECTION_WATCH);
2683
2684     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2685     d->as = as;
2686     as->next_dispatch = d;
2687 }
2688
2689 static void address_space_dispatch_free(AddressSpaceDispatch *d)
2690 {
2691     phys_sections_free(&d->map);
2692     g_free(d);
2693 }
2694
2695 static void mem_commit(MemoryListener *listener)
2696 {
2697     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2698     AddressSpaceDispatch *cur = as->dispatch;
2699     AddressSpaceDispatch *next = as->next_dispatch;
2700
2701     phys_page_compact_all(next, next->map.nodes_nb);
2702
2703     atomic_rcu_set(&as->dispatch, next);
2704     if (cur) {
2705         call_rcu(cur, address_space_dispatch_free, rcu);
2706     }
2707 }
2708
2709 static void tcg_commit(MemoryListener *listener)
2710 {
2711     CPUAddressSpace *cpuas;
2712     AddressSpaceDispatch *d;
2713
2714     /* since each CPU stores ram addresses in its TLB cache, we must
2715        reset the modified entries */
2716     cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2717     cpu_reloading_memory_map();
2718     /* The CPU and TLB are protected by the iothread lock.
2719      * We reload the dispatch pointer now because cpu_reloading_memory_map()
2720      * may have split the RCU critical section.
2721      */
2722     d = atomic_rcu_read(&cpuas->as->dispatch);
2723     atomic_rcu_set(&cpuas->memory_dispatch, d);
2724     tlb_flush(cpuas->cpu);
2725 }
2726
2727 void address_space_init_dispatch(AddressSpace *as)
2728 {
2729     as->dispatch = NULL;
2730     as->dispatch_listener = (MemoryListener) {
2731         .begin = mem_begin,
2732         .commit = mem_commit,
2733         .region_add = mem_add,
2734         .region_nop = mem_add,
2735         .priority = 0,
2736     };
2737     memory_listener_register(&as->dispatch_listener, as);
2738 }
2739
2740 void address_space_unregister(AddressSpace *as)
2741 {
2742     memory_listener_unregister(&as->dispatch_listener);
2743 }
2744
2745 void address_space_destroy_dispatch(AddressSpace *as)
2746 {
2747     AddressSpaceDispatch *d = as->dispatch;
2748
2749     atomic_rcu_set(&as->dispatch, NULL);
2750     if (d) {
2751         call_rcu(d, address_space_dispatch_free, rcu);
2752     }
2753 }
2754
2755 static void memory_map_init(void)
2756 {
2757     system_memory = g_malloc(sizeof(*system_memory));
2758
2759     memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2760     address_space_init(&address_space_memory, system_memory, "memory");
2761
2762     system_io = g_malloc(sizeof(*system_io));
2763     memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2764                           65536);
2765     address_space_init(&address_space_io, system_io, "I/O");
2766 }
2767
2768 MemoryRegion *get_system_memory(void)
2769 {
2770     return system_memory;
2771 }
2772
2773 MemoryRegion *get_system_io(void)
2774 {
2775     return system_io;
2776 }
2777
2778 #endif /* !defined(CONFIG_USER_ONLY) */
2779
2780 /* physical memory access (slow version, mainly for debug) */
2781 #if defined(CONFIG_USER_ONLY)
2782 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2783                         uint8_t *buf, int len, int is_write)
2784 {
2785     int l, flags;
2786     target_ulong page;
2787     void * p;
2788
2789     while (len > 0) {
2790         page = addr & TARGET_PAGE_MASK;
2791         l = (page + TARGET_PAGE_SIZE) - addr;
2792         if (l > len)
2793             l = len;
2794         flags = page_get_flags(page);
2795         if (!(flags & PAGE_VALID))
2796             return -1;
2797         if (is_write) {
2798             if (!(flags & PAGE_WRITE))
2799                 return -1;
2800             /* XXX: this code should not depend on lock_user */
2801             if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
2802                 return -1;
2803             memcpy(p, buf, l);
2804             unlock_user(p, addr, l);
2805         } else {
2806             if (!(flags & PAGE_READ))
2807                 return -1;
2808             /* XXX: this code should not depend on lock_user */
2809             if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
2810                 return -1;
2811             memcpy(buf, p, l);
2812             unlock_user(p, addr, 0);
2813         }
2814         len -= l;
2815         buf += l;
2816         addr += l;
2817     }
2818     return 0;
2819 }
2820
2821 #else
2822
2823 static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
2824                                      hwaddr length)
2825 {
2826     uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2827     addr += memory_region_get_ram_addr(mr);
2828
2829     /* No early return if dirty_log_mask is or becomes 0, because
2830      * cpu_physical_memory_set_dirty_range will still call
2831      * xen_modified_memory.
2832      */
2833     if (dirty_log_mask) {
2834         dirty_log_mask =
2835             cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
2836     }
2837     if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2838         assert(tcg_enabled());
2839         tb_lock();
2840         tb_invalidate_phys_range(addr, addr + length);
2841         tb_unlock();
2842         dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2843     }
2844     cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2845 }
2846
2847 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2848 {
2849     unsigned access_size_max = mr->ops->valid.max_access_size;
2850
2851     /* Regions are assumed to support 1-4 byte accesses unless
2852        otherwise specified.  */
2853     if (access_size_max == 0) {
2854         access_size_max = 4;
2855     }
2856
2857     /* Bound the maximum access by the alignment of the address.  */
2858     if (!mr->ops->impl.unaligned) {
2859         unsigned align_size_max = addr & -addr;
2860         if (align_size_max != 0 && align_size_max < access_size_max) {
2861             access_size_max = align_size_max;
2862         }
2863     }
2864
2865     /* Don't attempt accesses larger than the maximum.  */
2866     if (l > access_size_max) {
2867         l = access_size_max;
2868     }
2869     l = pow2floor(l);
2870
2871     return l;
2872 }
2873
2874 static bool prepare_mmio_access(MemoryRegion *mr)
2875 {
2876     bool unlocked = !qemu_mutex_iothread_locked();
2877     bool release_lock = false;
2878
2879     if (unlocked && mr->global_locking) {
2880         qemu_mutex_lock_iothread();
2881         unlocked = false;
2882         release_lock = true;
2883     }
2884     if (mr->flush_coalesced_mmio) {
2885         if (unlocked) {
2886             qemu_mutex_lock_iothread();
2887         }
2888         qemu_flush_coalesced_mmio_buffer();
2889         if (unlocked) {
2890             qemu_mutex_unlock_iothread();
2891         }
2892     }
2893
2894     return release_lock;
2895 }
2896
2897 /* Called within RCU critical section.  */
2898 static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
2899                                                 MemTxAttrs attrs,
2900                                                 const uint8_t *buf,
2901                                                 int len, hwaddr addr1,
2902                                                 hwaddr l, MemoryRegion *mr)
2903 {
2904     uint8_t *ptr;
2905     uint64_t val;
2906     MemTxResult result = MEMTX_OK;
2907     bool release_lock = false;
2908
2909     for (;;) {
2910         if (!memory_access_is_direct(mr, true)) {
2911             release_lock |= prepare_mmio_access(mr);
2912             l = memory_access_size(mr, l, addr1);
2913             /* XXX: could force current_cpu to NULL to avoid
2914                potential bugs */
2915             switch (l) {
2916             case 8:
2917                 /* 64 bit write access */
2918                 val = ldq_p(buf);
2919                 result |= memory_region_dispatch_write(mr, addr1, val, 8,
2920                                                        attrs);
2921                 break;
2922             case 4:
2923                 /* 32 bit write access */
2924                 val = (uint32_t)ldl_p(buf);
2925                 result |= memory_region_dispatch_write(mr, addr1, val, 4,
2926                                                        attrs);
2927                 break;
2928             case 2:
2929                 /* 16 bit write access */
2930                 val = lduw_p(buf);
2931                 result |= memory_region_dispatch_write(mr, addr1, val, 2,
2932                                                        attrs);
2933                 break;
2934             case 1:
2935                 /* 8 bit write access */
2936                 val = ldub_p(buf);
2937                 result |= memory_region_dispatch_write(mr, addr1, val, 1,
2938                                                        attrs);
2939                 break;
2940             default:
2941                 abort();
2942             }
2943         } else {
2944             /* RAM case */
2945             ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l);
2946             memcpy(ptr, buf, l);
2947             invalidate_and_set_dirty(mr, addr1, l);
2948         }
2949
2950         if (release_lock) {
2951             qemu_mutex_unlock_iothread();
2952             release_lock = false;
2953         }
2954
2955         len -= l;
2956         buf += l;
2957         addr += l;
2958
2959         if (!len) {
2960             break;
2961         }
2962
2963         l = len;
2964         mr = address_space_translate(as, addr, &addr1, &l, true);
2965     }
2966
2967     return result;
2968 }
2969
2970 MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
2971                                 const uint8_t *buf, int len)
2972 {
2973     hwaddr l;
2974     hwaddr addr1;
2975     MemoryRegion *mr;
2976     MemTxResult result = MEMTX_OK;
2977
2978     if (len > 0) {
2979         rcu_read_lock();
2980         l = len;
2981         mr = address_space_translate(as, addr, &addr1, &l, true);
2982         result = address_space_write_continue(as, addr, attrs, buf, len,
2983                                               addr1, l, mr);
2984         rcu_read_unlock();
2985     }
2986
2987     return result;
2988 }
2989
2990 /* Called within RCU critical section.  */
2991 MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
2992                                         MemTxAttrs attrs, uint8_t *buf,
2993                                         int len, hwaddr addr1, hwaddr l,
2994                                         MemoryRegion *mr)
2995 {
2996     uint8_t *ptr;
2997     uint64_t val;
2998     MemTxResult result = MEMTX_OK;
2999     bool release_lock = false;
3000
3001     for (;;) {
3002         if (!memory_access_is_direct(mr, false)) {
3003             /* I/O case */
3004             release_lock |= prepare_mmio_access(mr);
3005             l = memory_access_size(mr, l, addr1);
3006             switch (l) {
3007             case 8:
3008                 /* 64 bit read access */
3009                 result |= memory_region_dispatch_read(mr, addr1, &val, 8,
3010                                                       attrs);
3011                 stq_p(buf, val);
3012                 break;
3013             case 4:
3014                 /* 32 bit read access */
3015                 result |= memory_region_dispatch_read(mr, addr1, &val, 4,
3016                                                       attrs);
3017                 stl_p(buf, val);
3018                 break;
3019             case 2:
3020                 /* 16 bit read access */
3021                 result |= memory_region_dispatch_read(mr, addr1, &val, 2,
3022                                                       attrs);
3023                 stw_p(buf, val);
3024                 break;
3025             case 1:
3026                 /* 8 bit read access */
3027                 result |= memory_region_dispatch_read(mr, addr1, &val, 1,
3028                                                       attrs);
3029                 stb_p(buf, val);
3030                 break;
3031             default:
3032                 abort();
3033             }
3034         } else {
3035             /* RAM case */
3036             ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l);
3037             memcpy(buf, ptr, l);
3038         }
3039
3040         if (release_lock) {
3041             qemu_mutex_unlock_iothread();
3042             release_lock = false;
3043         }
3044
3045         len -= l;
3046         buf += l;
3047         addr += l;
3048
3049         if (!len) {
3050             break;
3051         }
3052
3053         l = len;
3054         mr = address_space_translate(as, addr, &addr1, &l, false);
3055     }
3056
3057     return result;
3058 }
3059
3060 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
3061                                     MemTxAttrs attrs, uint8_t *buf, int len)
3062 {
3063     hwaddr l;
3064     hwaddr addr1;
3065     MemoryRegion *mr;
3066     MemTxResult result = MEMTX_OK;
3067
3068     if (len > 0) {
3069         rcu_read_lock();
3070         l = len;
3071         mr = address_space_translate(as, addr, &addr1, &l, false);
3072         result = address_space_read_continue(as, addr, attrs, buf, len,
3073                                              addr1, l, mr);
3074         rcu_read_unlock();
3075     }
3076
3077     return result;
3078 }
3079
3080 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
3081                              uint8_t *buf, int len, bool is_write)
3082 {
3083     if (is_write) {
3084         return address_space_write(as, addr, attrs, (uint8_t *)buf, len);
3085     } else {
3086         return address_space_read(as, addr, attrs, (uint8_t *)buf, len);
3087     }
3088 }
3089
3090 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
3091                             int len, int is_write)
3092 {
3093     address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
3094                      buf, len, is_write);
3095 }
3096
/* Operation selector for cpu_physical_memory_write_rom_internal(). */
enum write_rom_type {
    /* Copy the caller's buffer into the RAM/ROM range and mark it dirty. */
    WRITE_DATA,
    /* Flush the host instruction cache for the RAM/ROM range. */
    FLUSH_CACHE,
};
3101
3102 static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
3103     hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
3104 {
3105     hwaddr l;
3106     uint8_t *ptr;
3107     hwaddr addr1;
3108     MemoryRegion *mr;
3109
3110     rcu_read_lock();
3111     while (len > 0) {
3112         l = len;
3113         mr = address_space_translate(as, addr, &addr1, &l, true);
3114
3115         if (!(memory_region_is_ram(mr) ||
3116               memory_region_is_romd(mr))) {
3117             l = memory_access_size(mr, l, addr1);
3118         } else {
3119             /* ROM/RAM case */
3120             ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3121             switch (type) {
3122             case WRITE_DATA:
3123                 memcpy(ptr, buf, l);
3124                 invalidate_and_set_dirty(mr, addr1, l);
3125                 break;
3126             case FLUSH_CACHE:
3127                 flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
3128                 break;
3129             }
3130         }
3131         len -= l;
3132         buf += l;
3133         addr += l;
3134     }
3135     rcu_read_unlock();
3136 }
3137
3138 /* used for ROM loading : can write in RAM and ROM */
3139 void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
3140                                    const uint8_t *buf, int len)
3141 {
3142     cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
3143 }
3144
3145 void cpu_flush_icache_range(hwaddr start, int len)
3146 {
3147     /*
3148      * This function should do the same thing as an icache flush that was
3149      * triggered from within the guest. For TCG we are always cache coherent,
3150      * so there is no need to flush anything. For KVM / Xen we need to flush
3151      * the host's instruction cache at least.
3152      */
3153     if (tcg_enabled()) {
3154         return;
3155     }
3156
3157     cpu_physical_memory_write_rom_internal(&address_space_memory,
3158                                            start, NULL, len, FLUSH_CACHE);
3159 }
3160
/*
 * Temporary buffer handed out by address_space_map() when the target
 * region cannot be accessed directly.
 */
typedef struct {
    /* Region the mapping refers to; referenced while the buffer is live. */
    MemoryRegion *mr;
    /* Host buffer (allocated with qemu_memalign in address_space_map). */
    void *buffer;
    /* Guest address the buffer stands in for. */
    hwaddr addr;
    /* Number of bytes covered by the buffer. */
    hwaddr len;
    /* Set/cleared atomically to claim and release the buffer. */
    bool in_use;
} BounceBuffer;

/* The single bounce buffer instance used by this file. */
static BounceBuffer bounce;
3170
/*
 * One registered "map client": a bottom half to schedule when the bounce
 * buffer is not in use, linked into map_client_list.
 */
typedef struct MapClient {
    QEMUBH *bh;
    QLIST_ENTRY(MapClient) link;
} MapClient;

/* Protects map_client_list; initialised in cpu_exec_init_all(). */
QemuMutex map_client_list_lock;
/* List of registered map clients. */
static QLIST_HEAD(map_client_list, MapClient) map_client_list
    = QLIST_HEAD_INITIALIZER(map_client_list);
3179
/*
 * Unlink `client` from the list it is on and free it.
 * The callers in this file hold map_client_list_lock around this call.
 */
static void cpu_unregister_map_client_do(MapClient *client)
{
    QLIST_REMOVE(client, link);
    g_free(client);
}
3185
3186 static void cpu_notify_map_clients_locked(void)
3187 {
3188     MapClient *client;
3189
3190     while (!QLIST_EMPTY(&map_client_list)) {
3191         client = QLIST_FIRST(&map_client_list);
3192         qemu_bh_schedule(client->bh);
3193         cpu_unregister_map_client_do(client);
3194     }
3195 }
3196
3197 void cpu_register_map_client(QEMUBH *bh)
3198 {
3199     MapClient *client = g_malloc(sizeof(*client));
3200
3201     qemu_mutex_lock(&map_client_list_lock);
3202     client->bh = bh;
3203     QLIST_INSERT_HEAD(&map_client_list, client, link);
3204     if (!atomic_read(&bounce.in_use)) {
3205         cpu_notify_map_clients_locked();
3206     }
3207     qemu_mutex_unlock(&map_client_list_lock);
3208 }
3209
/*
 * Initialisation entry point for this file: sets up the RAM list mutex,
 * finalizes the target page size, calls io_mem_init() and
 * memory_map_init(), and initialises map_client_list_lock.
 */
void cpu_exec_init_all(void)
{
    qemu_mutex_init(&ram_list.mutex);
    /* The data structures we set up here depend on knowing the page size,
     * so no more changes can be made after this point.
     * In an ideal world, nothing we did before we had finished the
     * machine setup would care about the target page size, and we could
     * do this much later, rather than requiring board models to state
     * up front what their requirements are.
     */
    finalize_target_page_bits();
    io_mem_init();
    memory_map_init();
    qemu_mutex_init(&map_client_list_lock);
}
3225
3226 void cpu_unregister_map_client(QEMUBH *bh)
3227 {
3228     MapClient *client;
3229
3230     qemu_mutex_lock(&map_client_list_lock);
3231     QLIST_FOREACH(client, &map_client_list, link) {
3232         if (client->bh == bh) {
3233             cpu_unregister_map_client_do(client);
3234             break;
3235         }
3236     }
3237     qemu_mutex_unlock(&map_client_list_lock);
3238 }
3239
/*
 * Locking wrapper around cpu_notify_map_clients_locked(): takes
 * map_client_list_lock for the duration of the notification.
 */
static void cpu_notify_map_clients(void)
{
    qemu_mutex_lock(&map_client_list_lock);
    cpu_notify_map_clients_locked();
    qemu_mutex_unlock(&map_client_list_lock);
}
3246
3247 bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
3248 {
3249     MemoryRegion *mr;
3250     hwaddr l, xlat;
3251
3252     rcu_read_lock();
3253     while (len > 0) {
3254         l = len;
3255         mr = address_space_translate(as, addr, &xlat, &l, is_write);
3256         if (!memory_access_is_direct(mr, is_write)) {
3257             l = memory_access_size(mr, l, addr);
3258             if (!memory_region_access_valid(mr, xlat, l, is_write)) {
3259                 rcu_read_unlock();
3260                 return false;
3261             }
3262         }
3263
3264         len -= l;
3265         addr += l;
3266     }
3267     rcu_read_unlock();
3268     return true;
3269 }
3270
3271 static hwaddr
3272 address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_len,
3273                                  MemoryRegion *mr, hwaddr base, hwaddr len,
3274                                  bool is_write)
3275 {
3276     hwaddr done = 0;
3277     hwaddr xlat;
3278     MemoryRegion *this_mr;
3279
3280     for (;;) {
3281         target_len -= len;
3282         addr += len;
3283         done += len;
3284         if (target_len == 0) {
3285             return done;
3286         }
3287
3288         len = target_len;
3289         this_mr = address_space_translate(as, addr, &xlat, &len, is_write);
3290         if (this_mr != mr || xlat != base + done) {
3291             return done;
3292         }
3293     }
3294 }
3295
3296 /* Map a physical memory region into a host virtual address.
3297  * May map a subset of the requested range, given by and returned in *plen.
3298  * May return NULL if resources needed to perform the mapping are exhausted.
3299  * Use only for reads OR writes - not for read-modify-write operations.
3300  * Use cpu_register_map_client() to know when retrying the map operation is
3301  * likely to succeed.
3302  */
3303 void *address_space_map(AddressSpace *as,
3304                         hwaddr addr,
3305                         hwaddr *plen,
3306                         bool is_write)
3307 {
3308     hwaddr len = *plen;
3309     hwaddr l, xlat;
3310     MemoryRegion *mr;
3311     void *ptr;
3312
3313     if (len == 0) {
3314         return NULL;
3315     }
3316
3317     l = len;
3318     rcu_read_lock();
3319     mr = address_space_translate(as, addr, &xlat, &l, is_write);
3320
3321     if (!memory_access_is_direct(mr, is_write)) {
3322         if (atomic_xchg(&bounce.in_use, true)) {
3323             rcu_read_unlock();
3324             return NULL;
3325         }
3326         /* Avoid unbounded allocations */
3327         l = MIN(l, TARGET_PAGE_SIZE);
3328         bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3329         bounce.addr = addr;
3330         bounce.len = l;
3331
3332         memory_region_ref(mr);
3333         bounce.mr = mr;
3334         if (!is_write) {
3335             address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED,
3336                                bounce.buffer, l);
3337         }
3338
3339         rcu_read_unlock();
3340         *plen = l;
3341         return bounce.buffer;
3342     }
3343
3344
3345     memory_region_ref(mr);
3346     *plen = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write);
3347     ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen);
3348     rcu_read_unlock();
3349
3350     return ptr;
3351 }
3352
3353 /* Unmaps a memory region previously mapped by address_space_map().
3354  * Will also mark the memory as dirty if is_write == 1.  access_len gives
3355  * the amount of memory that was actually read or written by the caller.
3356  */
3357 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3358                          int is_write, hwaddr access_len)
3359 {
3360     if (buffer != bounce.buffer) {
3361         MemoryRegion *mr;
3362         ram_addr_t addr1;
3363
3364         mr = memory_region_from_host(buffer, &addr1);
3365         assert(mr != NULL);
3366         if (is_write) {
3367             invalidate_and_set_dirty(mr, addr1, access_len);
3368         }
3369         if (xen_enabled()) {
3370             xen_invalidate_map_cache_entry(buffer);
3371         }
3372         memory_region_unref(mr);
3373         return;
3374     }
3375     if (is_write) {
3376         address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3377                             bounce.buffer, access_len);
3378     }
3379     qemu_vfree(bounce.buffer);
3380     bounce.buffer = NULL;
3381     memory_region_unref(bounce.mr);
3382     atomic_mb_set(&bounce.in_use, false);
3383     cpu_notify_map_clients();
3384 }
3385
3386 void *cpu_physical_memory_map(hwaddr addr,
3387                               hwaddr *plen,
3388                               int is_write)
3389 {
3390     return address_space_map(&address_space_memory, addr, plen, is_write);
3391 }
3392
3393 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3394                                int is_write, hwaddr access_len)
3395 {
3396     return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3397 }
3398
/*
 * Parameters for the memory_ldst.inc.c template, instantiated here with an
 * AddressSpace as first argument and an empty function-name SUFFIX.
 * NOTE(review): the same macro names are defined again further down, so the
 * template presumably #undefs them at its end - confirm in the .inc.c file.
 */
#define ARG1_DECL                AddressSpace *as
#define ARG1                     as
#define SUFFIX
#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
#define IS_DIRECT(mr, is_write)  memory_access_is_direct(mr, is_write)
#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
#define RCU_READ_LOCK(...)       rcu_read_lock()
#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
#include "memory_ldst.inc.c"
3409
3410 int64_t address_space_cache_init(MemoryRegionCache *cache,
3411                                  AddressSpace *as,
3412                                  hwaddr addr,
3413                                  hwaddr len,
3414                                  bool is_write)
3415 {
3416     cache->len = len;
3417     cache->as = as;
3418     cache->xlat = addr;
3419     return len;
3420 }
3421
/*
 * Invalidation hook for a MemoryRegionCache range.  In this implementation
 * the body is empty, so the call has no effect.
 */
void address_space_cache_invalidate(MemoryRegionCache *cache,
                                    hwaddr addr,
                                    hwaddr access_len)
{
    /* Nothing to do. */
}
3427
/* Tear down `cache` by clearing its AddressSpace pointer. */
void address_space_cache_destroy(MemoryRegionCache *cache)
{
    cache->as = NULL;
}
3432
/*
 * Parameters for a second instantiation of the memory_ldst.inc.c template:
 * the first argument is a MemoryRegionCache, generated names get the
 * "_cached" SUFFIX, addresses are offsets added to cache->xlat, and every
 * access is treated as direct (IS_DIRECT is always true).
 */
#define ARG1_DECL                MemoryRegionCache *cache
#define ARG1                     cache
#define SUFFIX                   _cached
#define TRANSLATE(addr, ...)     \
    address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__)
#define IS_DIRECT(mr, is_write)  true
#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
#define RCU_READ_LOCK()          rcu_read_lock()
#define RCU_READ_UNLOCK()        rcu_read_unlock()
#include "memory_ldst.inc.c"
3444
3445 /* virtual memory access for debug (includes writing to ROM) */
3446 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3447                         uint8_t *buf, int len, int is_write)
3448 {
3449     int l;
3450     hwaddr phys_addr;
3451     target_ulong page;
3452
3453     cpu_synchronize_state(cpu);
3454     while (len > 0) {
3455         int asidx;
3456         MemTxAttrs attrs;
3457
3458         page = addr & TARGET_PAGE_MASK;
3459         phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3460         asidx = cpu_asidx_from_attrs(cpu, attrs);
3461         /* if no physical page mapped, return an error */
3462         if (phys_addr == -1)
3463             return -1;
3464         l = (page + TARGET_PAGE_SIZE) - addr;
3465         if (l > len)
3466             l = len;
3467         phys_addr += (addr & ~TARGET_PAGE_MASK);
3468         if (is_write) {
3469             cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
3470                                           phys_addr, buf, l);
3471         } else {
3472             address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3473                              MEMTXATTRS_UNSPECIFIED,
3474                              buf, l, 0);
3475         }
3476         len -= l;
3477         buf += l;
3478         addr += l;
3479     }
3480     return 0;
3481 }
3482
3483 /*
3484  * Allows code that needs to deal with migration bitmaps etc to still be built
3485  * target independent.
3486  */
3487 size_t qemu_target_page_size(void)
3488 {
3489     return TARGET_PAGE_SIZE;
3490 }
3491
3492 int qemu_target_page_bits(void)
3493 {
3494     return TARGET_PAGE_BITS;
3495 }
3496
3497 int qemu_target_page_bits_min(void)
3498 {
3499     return TARGET_PAGE_BITS_MIN;
3500 }
3501 #endif
3502
/*
 * A helper function for the _utterly broken_ virtio device model to find out if
 * it's running on a big endian machine. Don't do this at home kids!
 *
 * Returns true when TARGET_WORDS_BIGENDIAN is defined at build time,
 * false otherwise.
 */
bool target_words_bigendian(void);
bool target_words_bigendian(void)
{
#if defined(TARGET_WORDS_BIGENDIAN)
    const bool big_endian = true;
#else
    const bool big_endian = false;
#endif

    return big_endian;
}
3516
3517 #ifndef CONFIG_USER_ONLY
3518 bool cpu_physical_memory_is_io(hwaddr phys_addr)
3519 {
3520     MemoryRegion*mr;
3521     hwaddr l = 1;
3522     bool res;
3523
3524     rcu_read_lock();
3525     mr = address_space_translate(&address_space_memory,
3526                                  phys_addr, &phys_addr, &l, false);
3527
3528     res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3529     rcu_read_unlock();
3530     return res;
3531 }
3532
3533 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3534 {
3535     RAMBlock *block;
3536     int ret = 0;
3537
3538     rcu_read_lock();
3539     RAMBLOCK_FOREACH(block) {
3540         ret = func(block->idstr, block->host, block->offset,
3541                    block->used_length, opaque);
3542         if (ret) {
3543             break;
3544         }
3545     }
3546     rcu_read_unlock();
3547     return ret;
3548 }
3549
3550 /*
3551  * Unmap pages of memory from start to start+length such that
3552  * they a) read as 0, b) Trigger whatever fault mechanism
3553  * the OS provides for postcopy.
3554  * The pages must be unmapped by the end of the function.
3555  * Returns: 0 on success, none-0 on failure
3556  *
3557  */
3558 int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
3559 {
3560     int ret = -1;
3561
3562     uint8_t *host_startaddr = rb->host + start;
3563
3564     if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
3565         error_report("ram_block_discard_range: Unaligned start address: %p",
3566                      host_startaddr);
3567         goto err;
3568     }
3569
3570     if ((start + length) <= rb->used_length) {
3571         uint8_t *host_endaddr = host_startaddr + length;
3572         if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
3573             error_report("ram_block_discard_range: Unaligned end address: %p",
3574                          host_endaddr);
3575             goto err;
3576         }
3577
3578         errno = ENOTSUP; /* If we are missing MADVISE etc */
3579
3580         if (rb->page_size == qemu_host_page_size) {
3581 #if defined(CONFIG_MADVISE)
3582             /* Note: We need the madvise MADV_DONTNEED behaviour of definitely
3583              * freeing the page.
3584              */
3585             ret = madvise(host_startaddr, length, MADV_DONTNEED);
3586 #endif
3587         } else {
3588             /* Huge page case  - unfortunately it can't do DONTNEED, but
3589              * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
3590              * huge page file.
3591              */
3592 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
3593             ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3594                             start, length);
3595 #endif
3596         }
3597         if (ret) {
3598             ret = -errno;
3599             error_report("ram_block_discard_range: Failed to discard range "
3600                          "%s:%" PRIx64 " +%zx (%d)",
3601                          rb->idstr, start, length, ret);
3602         }
3603     } else {
3604         error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
3605                      "/%zx/" RAM_ADDR_FMT")",
3606                      rb->idstr, start, length, rb->used_length);
3607     }
3608
3609 err:
3610     return ret;
3611 }
3612
3613 #endif
3614
3615 void page_size_init(void)
3616 {
3617     /* NOTE: we can always suppose that qemu_host_page_size >=
3618        TARGET_PAGE_SIZE */
3619     qemu_real_host_page_size = getpagesize();
3620     qemu_real_host_page_mask = -(intptr_t)qemu_real_host_page_size;
3621     if (qemu_host_page_size == 0) {
3622         qemu_host_page_size = qemu_real_host_page_size;
3623     }
3624     if (qemu_host_page_size < TARGET_PAGE_SIZE) {
3625         qemu_host_page_size = TARGET_PAGE_SIZE;
3626     }
3627     qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
3628 }