exec.c

   1 /*
   2  *  Virtual page mapping
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19 #include "qemu/osdep.h"
  20 #include "qapi/error.h"
  21 #ifndef _WIN32
  22 #endif
  23
  24 #include "qemu/cutils.h"
  25 #include "cpu.h"
  26 #include "exec/exec-all.h"
  27 #include "tcg.h"
  28 #include "hw/qdev-core.h"
  29 #if !defined(CONFIG_USER_ONLY)
  30 #include "hw/boards.h"
  31 #include "hw/xen/xen.h"
  32 #endif
  33 #include "sysemu/kvm.h"
  34 #include "sysemu/sysemu.h"
  35 #include "qemu/timer.h"
  36 #include "qemu/config-file.h"
  37 #include "qemu/error-report.h"
  38 #if defined(CONFIG_USER_ONLY)
  39 #include "qemu.h"
  40 #else /* !CONFIG_USER_ONLY */
  41 #include "hw/hw.h"
  42 #include "exec/memory.h"
  43 #include "exec/ioport.h"
  44 #include "sysemu/dma.h"
  45 #include "exec/address-spaces.h"
  46 #include "sysemu/xen-mapcache.h"
  47 #include "trace.h"
  48 #endif
  49 #include "exec/cpu-all.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "qemu/main-loop.h"
  52 #include "translate-all.h"
  53 #include "sysemu/replay.h"
  54
  55 #include "exec/memory-internal.h"
  56 #include "exec/ram_addr.h"
  57 #include "exec/log.h"
  58
  59 #include "migration/vmstate.h"
  60
  61 #include "qemu/range.h"
  62 #ifndef _WIN32
  63 #include "qemu/mmap-alloc.h"
  64 #endif
  65
  66 //#define DEBUG_SUBPAGE
  67
  68 #if !defined(CONFIG_USER_ONLY)
  69 /* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
  70  * are protected by the ramlist lock.
  71  */
  72 RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
  73
  74 static MemoryRegion *system_memory;
  75 static MemoryRegion *system_io;
  76
  77 AddressSpace address_space_io;
  78 AddressSpace address_space_memory;
  79
  80 MemoryRegion io_mem_rom, io_mem_notdirty;
  81 static MemoryRegion io_mem_unassigned;
  82
  83 /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
  84 #define RAM_PREALLOC   (1 << 0)
  85
  86 /* RAM is mmap-ed with MAP_SHARED */
  87 #define RAM_SHARED     (1 << 1)
  88
  89 /* Only a portion of RAM (used_length) is actually used, and migrated.
  90  * This used_length size can change across reboots.
  91  */
  92 #define RAM_RESIZEABLE (1 << 2)
  93
  94 #endif
  95
  96 #ifdef TARGET_PAGE_BITS_VARY
  97 int target_page_bits;
  98 bool target_page_bits_decided;
  99 #endif
 100
 101 struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
 102 /* current CPU in the current thread. It is only valid inside
 103    cpu_exec() */
 104 __thread CPUState *current_cpu;
 105 /* 0 = Do not count executed instructions.
 106    1 = Precise instruction counting.
 107    2 = Adaptive rate instruction counting.  */
 108 int use_icount;
 109
 110 bool set_preferred_target_page_bits(int bits)
 111 {
 112     /* The target page size is the lowest common denominator for all
 113      * the CPUs in the system, so we can only make it smaller, never
 114      * larger. And we can't make it smaller once we've committed to
 115      * a particular size.
 116      */
 117 #ifdef TARGET_PAGE_BITS_VARY
 118     assert(bits >= TARGET_PAGE_BITS_MIN);
 119     if (target_page_bits == 0 || target_page_bits > bits) {
 120         if (target_page_bits_decided) {
 121             return false;
 122         }
 123         target_page_bits = bits;
 124     }
 125 #endif
 126     return true;
 127 }
 128
 129 #if !defined(CONFIG_USER_ONLY)
 130
 131 static void finalize_target_page_bits(void)
 132 {
 133 #ifdef TARGET_PAGE_BITS_VARY
 134     if (target_page_bits == 0) {
 135         target_page_bits = TARGET_PAGE_BITS_MIN;
 136     }
 137     target_page_bits_decided = true;
 138 #endif
 139 }
 140
 141 typedef struct PhysPageEntry PhysPageEntry;
 142
 143 struct PhysPageEntry {
 144     /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
 145     uint32_t skip : 6;
 146      /* index into phys_sections (!skip) or phys_map_nodes (skip) */
 147     uint32_t ptr : 26;
 148 };
 149
 150 #define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
 151
 152 /* Size of the L2 (and L3, etc) page tables.  */
 153 #define ADDR_SPACE_BITS 64
 154
 155 #define P_L2_BITS 9
 156 #define P_L2_SIZE (1 << P_L2_BITS)
 157
 158 #define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
 159
 160 typedef PhysPageEntry Node[P_L2_SIZE];
 161
 162 typedef struct PhysPageMap {
 163     struct rcu_head rcu;
 164
 165     unsigned sections_nb;
 166     unsigned sections_nb_alloc;
 167     unsigned nodes_nb;
 168     unsigned nodes_nb_alloc;
 169     Node *nodes;
 170     MemoryRegionSection *sections;
 171 } PhysPageMap;
 172
 173 struct AddressSpaceDispatch {
 174     struct rcu_head rcu;
 175
 176     MemoryRegionSection *mru_section;
 177     /* This is a multi-level map on the physical address space.
 178      * The bottom level has pointers to MemoryRegionSections.
 179      */
 180     PhysPageEntry phys_map;
 181     PhysPageMap map;
 182     AddressSpace *as;
 183 };
 184
 185 #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
 186 typedef struct subpage_t {
 187     MemoryRegion iomem;
 188     AddressSpace *as;
 189     hwaddr base;
 190     uint16_t sub_section[];
 191 } subpage_t;
 192
 193 #define PHYS_SECTION_UNASSIGNED 0
 194 #define PHYS_SECTION_NOTDIRTY 1
 195 #define PHYS_SECTION_ROM 2
 196 #define PHYS_SECTION_WATCH 3
 197
 198 static void io_mem_init(void);
 199 static void memory_map_init(void);
 200 static void tcg_commit(MemoryListener *listener);
 201
 202 static MemoryRegion io_mem_watch;
 203
 204 /**
 205  * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 206  * @cpu: the CPU whose AddressSpace this is
 207  * @as: the AddressSpace itself
 208  * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 209  * @tcg_as_listener: listener for tracking changes to the AddressSpace
 210  */
 211 struct CPUAddressSpace {
 212     CPUState *cpu;
 213     AddressSpace *as;
 214     struct AddressSpaceDispatch *memory_dispatch;
 215     MemoryListener tcg_as_listener;
 216 };
 217
 218 #endif
 219
 220 #if !defined(CONFIG_USER_ONLY)
 221
 222 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 223 {
 224     static unsigned alloc_hint = 16;
 225     if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 226         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
 227         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
 228         map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 229         alloc_hint = map->nodes_nb_alloc;
 230     }
 231 }
 232
 233 static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
 234 {
 235     unsigned i;
 236     uint32_t ret;
 237     PhysPageEntry e;
 238     PhysPageEntry *p;
 239
 240     ret = map->nodes_nb++;
 241     p = map->nodes[ret];
 242     assert(ret != PHYS_MAP_NODE_NIL);
 243     assert(ret != map->nodes_nb_alloc);
 244
 245     e.skip = leaf ? 0 : 1;
 246     e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
 247     for (i = 0; i < P_L2_SIZE; ++i) {
 248         memcpy(&p[i], &e, sizeof(e));
 249     }
 250     return ret;
 251 }
 252
 253 static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
 254                                 hwaddr *index, hwaddr *nb, uint16_t leaf,
 255                                 int level)
 256 {
 257     PhysPageEntry *p;
 258     hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 259
 260     if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
 261         lp->ptr = phys_map_node_alloc(map, level == 0);
 262     }
 263     p = map->nodes[lp->ptr];
 264     lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 265
 266     while (*nb && lp < &p[P_L2_SIZE]) {
 267         if ((*index & (step - 1)) == 0 && *nb >= step) {
 268             lp->skip = 0;
 269             lp->ptr = leaf;
 270             *index += step;
 271             *nb -= step;
 272         } else {
 273             phys_page_set_level(map, lp, index, nb, leaf, level - 1);
 274         }
 275         ++lp;
 276     }
 277 }
 278
 279 static void phys_page_set(AddressSpaceDispatch *d,
 280                           hwaddr index, hwaddr nb,
 281                           uint16_t leaf)
 282 {
 283     /* Wildly overreserve - it doesn't matter much. */
 284     phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 285
 286     phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 287 }
 288
 289 /* Compact a non leaf page entry. Simply detect that the entry has a single child,
 290  * and update our entry so we can skip it and go directly to the destination.
 291  */
 292 static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
 293 {
 294     unsigned valid_ptr = P_L2_SIZE;
 295     int valid = 0;
 296     PhysPageEntry *p;
 297     int i;
 298
 299     if (lp->ptr == PHYS_MAP_NODE_NIL) {
 300         return;
 301     }
 302
 303     p = nodes[lp->ptr];
 304     for (i = 0; i < P_L2_SIZE; i++) {
 305         if (p[i].ptr == PHYS_MAP_NODE_NIL) {
 306             continue;
 307         }
 308
 309         valid_ptr = i;
 310         valid++;
 311         if (p[i].skip) {
 312             phys_page_compact(&p[i], nodes);
 313         }
 314     }
 315
 316     /* We can only compress if there's only one child. */
 317     if (valid != 1) {
 318         return;
 319     }
 320
 321     assert(valid_ptr < P_L2_SIZE);
 322
 323     /* Don't compress if it won't fit in the # of bits we have. */
 324     if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
 325         return;
 326     }
 327
 328     lp->ptr = p[valid_ptr].ptr;
 329     if (!p[valid_ptr].skip) {
 330         /* If our only child is a leaf, make this a leaf. */
 331         /* By design, we should have made this node a leaf to begin with so we
 332          * should never reach here.
 333          * But since it's so simple to handle this, let's do it just in case we
 334          * change this rule.
 335          */
 336         lp->skip = 0;
 337     } else {
 338         lp->skip += p[valid_ptr].skip;
 339     }
 340 }
 341
 342 static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
 343 {
 344     if (d->phys_map.skip) {
 345         phys_page_compact(&d->phys_map, d->map.nodes);
 346     }
 347 }
 348
 349 static inline bool section_covers_addr(const MemoryRegionSection *section,
 350                                        hwaddr addr)
 351 {
 352     /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
 353      * the section must cover the entire address space.
 354      */
 355     return int128_gethi(section->size) ||
 356            range_covers_byte(section->offset_within_address_space,
 357                              int128_getlo(section->size), addr);
 358 }
 359
 360 static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
 361                                            Node *nodes, MemoryRegionSection *sections)
 362 {
 363     PhysPageEntry *p;
 364     hwaddr index = addr >> TARGET_PAGE_BITS;
 365     int i;
 366
 367     for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 368         if (lp.ptr == PHYS_MAP_NODE_NIL) {
 369             return &sections[PHYS_SECTION_UNASSIGNED];
 370         }
 371         p = nodes[lp.ptr];
 372         lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 373     }
 374
 375     if (section_covers_addr(&sections[lp.ptr], addr)) {
 376         return &sections[lp.ptr];
 377     } else {
 378         return &sections[PHYS_SECTION_UNASSIGNED];
 379     }
 380 }
 381
 382 bool memory_region_is_unassigned(MemoryRegion *mr)
 383 {
 384     return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
 385         && mr != &io_mem_watch;
 386 }
 387
 388 /* Called from RCU critical section */
 389 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 390                                                         hwaddr addr,
 391                                                         bool resolve_subpage)
 392 {
 393     MemoryRegionSection *section = atomic_read(&d->mru_section);
 394     subpage_t *subpage;
 395     bool update;
 396
 397     if (section && section != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
 398         section_covers_addr(section, addr)) {
 399         update = false;
 400     } else {
 401         section = phys_page_find(d->phys_map, addr, d->map.nodes,
 402                                  d->map.sections);
 403         update = true;
 404     }
 405     if (resolve_subpage && section->mr->subpage) {
 406         subpage = container_of(section->mr, subpage_t, iomem);
 407         section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 408     }
 409     if (update) {
 410         atomic_set(&d->mru_section, section);
 411     }
 412     return section;
 413 }
 414
 415 /* Called from RCU critical section */
 416 static MemoryRegionSection *
 417 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 418                                  hwaddr *plen, bool resolve_subpage)
 419 {
 420     MemoryRegionSection *section;
 421     MemoryRegion *mr;
 422     Int128 diff;
 423
 424     section = address_space_lookup_region(d, addr, resolve_subpage);
 425     /* Compute offset within MemoryRegionSection */
 426     addr -= section->offset_within_address_space;
 427
 428     /* Compute offset within MemoryRegion */
 429     *xlat = addr + section->offset_within_region;
 430
 431     mr = section->mr;
 432
 433     /* MMIO registers can be expected to perform full-width accesses based only
 434      * on their address, without considering adjacent registers that could
 435      * decode to completely different MemoryRegions.  When such registers
 436      * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
 437      * regions overlap wildly.  For this reason we cannot clamp the accesses
 438      * here.
 439      *
 440      * If the length is small (as is the case for address_space_ldl/stl),
 441      * everything works fine.  If the incoming length is large, however,
 442      * the caller really has to do the clamping through memory_access_size.
 443      */
 444     if (memory_region_is_ram(mr)) {
 445         diff = int128_sub(section->size, int128_make64(addr));
 446         *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 447     }
 448     return section;
 449 }
 450
 451 /* Called from RCU critical section */
 452 MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
 453                                       hwaddr *xlat, hwaddr *plen,
 454                                       bool is_write)
 455 {
 456     IOMMUTLBEntry iotlb;
 457     MemoryRegionSection *section;
 458     MemoryRegion *mr;
 459
 460     for (;;) {
 461         AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
 462         section = address_space_translate_internal(d, addr, &addr, plen, true);
 463         mr = section->mr;
 464
 465         if (!mr->iommu_ops) {
 466             break;
 467         }
 468
 469         iotlb = mr->iommu_ops->translate(mr, addr, is_write);
 470         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 471                 | (addr & iotlb.addr_mask));
 472         *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
 473         if (!(iotlb.perm & (1 << is_write))) {
 474             mr = &io_mem_unassigned;
 475             break;
 476         }
 477
 478         as = iotlb.target_as;
 479     }
 480
 481     if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 482         hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 483         *plen = MIN(page, *plen);
 484     }
 485
 486     *xlat = addr;
 487     return mr;
 488 }
 489
 490 /* Called from RCU critical section */
 491 MemoryRegionSection *
 492 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
 493                                   hwaddr *xlat, hwaddr *plen)
 494 {
 495     MemoryRegionSection *section;
 496     AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
 497
 498     section = address_space_translate_internal(d, addr, xlat, plen, false);
 499
 500     assert(!section->mr->iommu_ops);
 501     return section;
 502 }
 503 #endif
 504
 505 #if !defined(CONFIG_USER_ONLY)
 506
 507 static int cpu_common_post_load(void *opaque, int version_id)
 508 {
 509     CPUState *cpu = opaque;
 510
 511     /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 512        version_id is increased. */
 513     cpu->interrupt_request &= ~0x01;
 514     tlb_flush(cpu, 1);
 515
 516     return 0;
 517 }
 518
 519 static int cpu_common_pre_load(void *opaque)
 520 {
 521     CPUState *cpu = opaque;
 522
 523     cpu->exception_index = -1;
 524
 525     return 0;
 526 }
 527
 528 static bool cpu_common_exception_index_needed(void *opaque)
 529 {
 530     CPUState *cpu = opaque;
 531
 532     return tcg_enabled() && cpu->exception_index != -1;
 533 }
 534
 535 static const VMStateDescription vmstate_cpu_common_exception_index = {
 536     .name = "cpu_common/exception_index",
 537     .version_id = 1,
 538     .minimum_version_id = 1,
 539     .needed = cpu_common_exception_index_needed,
 540     .fields = (VMStateField[]) {
 541         VMSTATE_INT32(exception_index, CPUState),
 542         VMSTATE_END_OF_LIST()
 543     }
 544 };
 545
 546 static bool cpu_common_crash_occurred_needed(void *opaque)
 547 {
 548     CPUState *cpu = opaque;
 549
 550     return cpu->crash_occurred;
 551 }
 552
 553 static const VMStateDescription vmstate_cpu_common_crash_occurred = {
 554     .name = "cpu_common/crash_occurred",
 555     .version_id = 1,
 556     .minimum_version_id = 1,
 557     .needed = cpu_common_crash_occurred_needed,
 558     .fields = (VMStateField[]) {
 559         VMSTATE_BOOL(crash_occurred, CPUState),
 560         VMSTATE_END_OF_LIST()
 561     }
 562 };
 563
 564 const VMStateDescription vmstate_cpu_common = {
 565     .name = "cpu_common",
 566     .version_id = 1,
 567     .minimum_version_id = 1,
 568     .pre_load = cpu_common_pre_load,
 569     .post_load = cpu_common_post_load,
 570     .fields = (VMStateField[]) {
 571         VMSTATE_UINT32(halted, CPUState),
 572         VMSTATE_UINT32(interrupt_request, CPUState),
 573         VMSTATE_END_OF_LIST()
 574     },
 575     .subsections = (const VMStateDescription*[]) {
 576         &vmstate_cpu_common_exception_index,
 577         &vmstate_cpu_common_crash_occurred,
 578         NULL
 579     }
 580 };
 581
 582 #endif
 583
 584 CPUState *qemu_get_cpu(int index)
 585 {
 586     CPUState *cpu;
 587
 588     CPU_FOREACH(cpu) {
 589         if (cpu->cpu_index == index) {
 590             return cpu;
 591         }
 592     }
 593
 594     return NULL;
 595 }
 596
 597 #if !defined(CONFIG_USER_ONLY)
 598 void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
 599 {
 600     CPUAddressSpace *newas;
 601
 602     /* Target code should have set num_ases before calling us */
 603     assert(asidx < cpu->num_ases);
 604
 605     if (asidx == 0) {
 606         /* address space 0 gets the convenience alias */
 607         cpu->as = as;
 608     }
 609
 610     /* KVM cannot currently support multiple address spaces. */
 611     assert(asidx == 0 || !kvm_enabled());
 612
 613     if (!cpu->cpu_ases) {
 614         cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
 615     }
 616
 617     newas = &cpu->cpu_ases[asidx];
 618     newas->cpu = cpu;
 619     newas->as = as;
 620     if (tcg_enabled()) {
 621         newas->tcg_as_listener.commit = tcg_commit;
 622         memory_listener_register(&newas->tcg_as_listener, as);
 623     }
 624 }
 625
 626 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 627 {
 628     /* Return the AddressSpace corresponding to the specified index */
 629     return cpu->cpu_ases[asidx].as;
 630 }
 631 #endif
 632
 633 void cpu_exec_unrealizefn(CPUState *cpu)
 634 {
 635     CPUClass *cc = CPU_GET_CLASS(cpu);
 636
 637     cpu_list_remove(cpu);
 638
 639     if (cc->vmsd != NULL) {
 640         vmstate_unregister(NULL, cc->vmsd, cpu);
 641     }
 642     if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 643         vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
 644     }
 645 }
 646
 647 void cpu_exec_initfn(CPUState *cpu)
 648 {
 649     cpu->as = NULL;
 650     cpu->num_ases = 0;
 651
 652 #ifndef CONFIG_USER_ONLY
 653     cpu->thread_id = qemu_get_thread_id();
 654
 655     /* This is a softmmu CPU object, so create a property for it
 656      * so users can wire up its memory. (This can't go in qom/cpu.c
 657      * because that file is compiled only once for both user-mode
 658      * and system builds.) The default if no link is set up is to use
 659      * the system address space.
 660      */
 661     object_property_add_link(OBJECT(cpu), "memory", TYPE_MEMORY_REGION,
 662                              (Object **)&cpu->memory,
 663                              qdev_prop_allow_set_link_before_realize,
 664                              OBJ_PROP_LINK_UNREF_ON_RELEASE,
 665                              &error_abort);
 666     cpu->memory = system_memory;
 667     object_ref(OBJECT(cpu->memory));
 668 #endif
 669 }
 670
 671 void cpu_exec_realizefn(CPUState *cpu, Error **errp)
 672 {
 673     CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu);
 674
 675     cpu_list_add(cpu);
 676
 677 #ifndef CONFIG_USER_ONLY
 678     if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 679         vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
 680     }
 681     if (cc->vmsd != NULL) {
 682         vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
 683     }
 684 #endif
 685 }
 686
 687 #if defined(CONFIG_USER_ONLY)
 688 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 689 {
 690     mmap_lock();
 691     tb_lock();
 692     tb_invalidate_phys_page_range(pc, pc + 1, 0);
 693     tb_unlock();
 694     mmap_unlock();
 695 }
 696 #else
 697 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 698 {
 699     MemTxAttrs attrs;
 700     hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
 701     int asidx = cpu_asidx_from_attrs(cpu, attrs);
 702     if (phys != -1) {
 703         /* Locks grabbed by tb_invalidate_phys_addr */
 704         tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
 705                                 phys | (pc & ~TARGET_PAGE_MASK));
 706     }
 707 }
 708 #endif
 709
 710 #if defined(CONFIG_USER_ONLY)
 711 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 712
 713 {
 714 }
 715
 716 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 717                           int flags)
 718 {
 719     return -ENOSYS;
 720 }
 721
 722 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 723 {
 724 }
 725
 726 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 727                           int flags, CPUWatchpoint **watchpoint)
 728 {
 729     return -ENOSYS;
 730 }
 731 #else
 732 /* Add a watchpoint.  */
 733 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 734                           int flags, CPUWatchpoint **watchpoint)
 735 {
 736     CPUWatchpoint *wp;
 737
 738     /* forbid ranges which are empty or run off the end of the address space */
 739     if (len == 0 || (addr + len - 1) < addr) {
 740         error_report("tried to set invalid watchpoint at %"
 741                      VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
 742         return -EINVAL;
 743     }
 744     wp = g_malloc(sizeof(*wp));
 745
 746     wp->vaddr = addr;
 747     wp->len = len;
 748     wp->flags = flags;
 749
 750     /* keep all GDB-injected watchpoints in front */
 751     if (flags & BP_GDB) {
 752         QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
 753     } else {
 754         QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
 755     }
 756
 757     tlb_flush_page(cpu, addr);
 758
 759     if (watchpoint)
 760         *watchpoint = wp;
 761     return 0;
 762 }
 763
 764 /* Remove a specific watchpoint.  */
 765 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 766                           int flags)
 767 {
 768     CPUWatchpoint *wp;
 769
 770     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 771         if (addr == wp->vaddr && len == wp->len
 772                 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
 773             cpu_watchpoint_remove_by_ref(cpu, wp);
 774             return 0;
 775         }
 776     }
 777     return -ENOENT;
 778 }
 779
 780 /* Remove a specific watchpoint by reference.  */
 781 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 782 {
 783     QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
 784
 785     tlb_flush_page(cpu, watchpoint->vaddr);
 786
 787     g_free(watchpoint);
 788 }
 789
 790 /* Remove all matching watchpoints.  */
 791 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 792 {
 793     CPUWatchpoint *wp, *next;
 794
 795     QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
 796         if (wp->flags & mask) {
 797             cpu_watchpoint_remove_by_ref(cpu, wp);
 798         }
 799     }
 800 }
 801
 802 /* Return true if this watchpoint address matches the specified
 803  * access (ie the address range covered by the watchpoint overlaps
 804  * partially or completely with the address range covered by the
 805  * access).
 806  */
 807 static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
 808                                                   vaddr addr,
 809                                                   vaddr len)
 810 {
 811     /* We know the lengths are non-zero, but a little caution is
 812      * required to avoid errors in the case where the range ends
 813      * exactly at the top of the address space and so addr + len
 814      * wraps round to zero.
 815      */
 816     vaddr wpend = wp->vaddr + wp->len - 1;
 817     vaddr addrend = addr + len - 1;
 818
 819     return !(addr > wpend || wp->vaddr > addrend);
 820 }
 821
 822 #endif
 823
 824 /* Add a breakpoint.  */
 825 int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
 826                           CPUBreakpoint **breakpoint)
 827 {
 828     CPUBreakpoint *bp;
 829
 830     bp = g_malloc(sizeof(*bp));
 831
 832     bp->pc = pc;
 833     bp->flags = flags;
 834
 835     /* keep all GDB-injected breakpoints in front */
 836     if (flags & BP_GDB) {
 837         QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
 838     } else {
 839         QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
 840     }
 841
 842     breakpoint_invalidate(cpu, pc);
 843
 844     if (breakpoint) {
 845         *breakpoint = bp;
 846     }
 847     return 0;
 848 }
 849
 850 /* Remove a specific breakpoint.  */
 851 int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
 852 {
 853     CPUBreakpoint *bp;
 854
 855     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
 856         if (bp->pc == pc && bp->flags == flags) {
 857             cpu_breakpoint_remove_by_ref(cpu, bp);
 858             return 0;
 859         }
 860     }
 861     return -ENOENT;
 862 }
 863
 864 /* Remove a specific breakpoint by reference.  */
 865 void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
 866 {
 867     QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
 868
 869     breakpoint_invalidate(cpu, breakpoint->pc);
 870
 871     g_free(breakpoint);
 872 }
 873
 874 /* Remove all matching breakpoints. */
 875 void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
 876 {
 877     CPUBreakpoint *bp, *next;
 878
 879     QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
 880         if (bp->flags & mask) {
 881             cpu_breakpoint_remove_by_ref(cpu, bp);
 882         }
 883     }
 884 }
 885
 886 /* enable or disable single step mode. EXCP_DEBUG is returned by the
 887    CPU loop after each instruction */
 888 void cpu_single_step(CPUState *cpu, int enabled)
 889 {
 890     if (cpu->singlestep_enabled != enabled) {
 891         cpu->singlestep_enabled = enabled;
 892         if (kvm_enabled()) {
 893             kvm_update_guest_debug(cpu, 0);
 894         } else {
 895             /* must flush all the translated code to avoid inconsistencies */
 896             /* XXX: only flush what is necessary */
 897             tb_flush(cpu);
 898         }
 899     }
 900 }
 901
 902 void cpu_abort(CPUState *cpu, const char *fmt, ...)
 903 {
 904     va_list ap;
 905     va_list ap2;
 906
 907     va_start(ap, fmt);
 908     va_copy(ap2, ap);
 909     fprintf(stderr, "qemu: fatal: ");
 910     vfprintf(stderr, fmt, ap);
 911     fprintf(stderr, "\n");
 912     cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 913     if (qemu_log_separate()) {
 914         qemu_log("qemu: fatal: ");
 915         qemu_log_vprintf(fmt, ap2);
 916         qemu_log("\n");
 917         log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 918         qemu_log_flush();
 919         qemu_log_close();
 920     }
 921     va_end(ap2);
 922     va_end(ap);
 923     replay_finish();
 924 #if defined(CONFIG_USER_ONLY)
 925     {
 926         struct sigaction act;
 927         sigfillset(&act.sa_mask);
 928         act.sa_handler = SIG_DFL;
 929         sigaction(SIGABRT, &act, NULL);
 930     }
 931 #endif
 932     abort();
 933 }
 934
 935 #if !defined(CONFIG_USER_ONLY)
 936 /* Called from RCU critical section */
 937 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
 938 {
 939     RAMBlock *block;
 940
 941     block = atomic_rcu_read(&ram_list.mru_block);
 942     if (block && addr - block->offset < block->max_length) {
 943         return block;
 944     }
 945     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 946         if (addr - block->offset < block->max_length) {
 947             goto found;
 948         }
 949     }
 950
 951     fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
 952     abort();
 953
 954 found:
 955     /* It is safe to write mru_block outside the iothread lock.  This
 956      * is what happens:
 957      *
 958      *     mru_block = xxx
 959      *     rcu_read_unlock()
 960      *                                        xxx removed from list
 961      *                  rcu_read_lock()
 962      *                  read mru_block
 963      *                                        mru_block = NULL;
 964      *                                        call_rcu(reclaim_ramblock, xxx);
 965      *                  rcu_read_unlock()
 966      *
 967      * atomic_rcu_set is not needed here.  The block was already published
 968      * when it was placed into the list.  Here we're just making an extra
 969      * copy of the pointer.
 970      */
 971     ram_list.mru_block = block;
 972     return block;
 973 }
 974
 975 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
 976 {
 977     CPUState *cpu;
 978     ram_addr_t start1;
 979     RAMBlock *block;
 980     ram_addr_t end;
 981
 982     end = TARGET_PAGE_ALIGN(start + length);
 983     start &= TARGET_PAGE_MASK;
 984
 985     rcu_read_lock();
 986     block = qemu_get_ram_block(start);
 987     assert(block == qemu_get_ram_block(end - 1));
 988     start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
 989     CPU_FOREACH(cpu) {
 990         tlb_reset_dirty(cpu, start1, length);
 991     }
 992     rcu_read_unlock();
 993 }
 994
 995 /* Note: start and end must be within the same ram block.  */
 996 bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
 997                                               ram_addr_t length,
 998                                               unsigned client)
 999 {
1000     DirtyMemoryBlocks *blocks;
1001     unsigned long end, page;
1002     bool dirty = false;
1003
1004     if (length == 0) {
1005         return false;
1006     }
1007
1008     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1009     page = start >> TARGET_PAGE_BITS;
1010
1011     rcu_read_lock();
1012
1013     blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1014
1015     while (page < end) {
1016         unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1017         unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1018         unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1019
1020         dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1021                                               offset, num);
1022         page += num;
1023     }
1024
1025     rcu_read_unlock();
1026
1027     if (dirty && tcg_enabled()) {
1028         tlb_reset_dirty_range_all(start, length);
1029     }
1030
1031     return dirty;
1032 }
1033
1034 /* Called from RCU critical section */
1035 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1036                                        MemoryRegionSection *section,
1037                                        target_ulong vaddr,
1038                                        hwaddr paddr, hwaddr xlat,
1039                                        int prot,
1040                                        target_ulong *address)
1041 {
1042     hwaddr iotlb;
1043     CPUWatchpoint *wp;
1044
1045     if (memory_region_is_ram(section->mr)) {
1046         /* Normal RAM.  */
1047         iotlb = memory_region_get_ram_addr(section->mr) + xlat;
1048         if (!section->readonly) {
1049             iotlb |= PHYS_SECTION_NOTDIRTY;
1050         } else {
1051             iotlb |= PHYS_SECTION_ROM;
1052         }
1053     } else {
1054         AddressSpaceDispatch *d;
1055
1056         d = atomic_rcu_read(&section->address_space->dispatch);
1057         iotlb = section - d->map.sections;
1058         iotlb += xlat;
1059     }
1060
1061     /* Make accesses to pages with watchpoints go via the
1062        watchpoint trap routines.  */
1063     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1064         if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
1065             /* Avoid trapping reads of pages with a write breakpoint. */
1066             if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
1067                 iotlb = PHYS_SECTION_WATCH + paddr;
1068                 *address |= TLB_MMIO;
1069                 break;
1070             }
1071         }
1072     }
1073
1074     return iotlb;
1075 }
#endif /* !defined(CONFIG_USER_ONLY) */
1077
1078 #if !defined(CONFIG_USER_ONLY)
1079
/* Forward declarations of the subpage helpers used by register_subpage(). */
static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                             uint16_t section);
static subpage_t *subpage_init(AddressSpace *as, hwaddr base);

/*
 * Allocator used by ram_block_add() for blocks that have no host memory
 * yet.  Defaults to qemu_anon_ram_alloc; replaced via phys_mem_set_alloc().
 */
static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
                               qemu_anon_ram_alloc;
1086
/*
 * Set a custom physical guest memory allocator.
 * Accelerators with unusual needs may need this.  Hopefully, we can
 * get rid of it eventually.
 *
 * @alloc replaces the file-level phys_mem_alloc hook; it receives the
 * requested size and a pointer to an alignment value.
 */
void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
{
    phys_mem_alloc = alloc;
}
1096
1097 static uint16_t phys_section_add(PhysPageMap *map,
1098                                  MemoryRegionSection *section)
1099 {
1100     /* The physical section number is ORed with a page-aligned
1101      * pointer to produce the iotlb entries.  Thus it should
1102      * never overflow into the page-aligned value.
1103      */
1104     assert(map->sections_nb < TARGET_PAGE_SIZE);
1105
1106     if (map->sections_nb == map->sections_nb_alloc) {
1107         map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1108         map->sections = g_renew(MemoryRegionSection, map->sections,
1109                                 map->sections_nb_alloc);
1110     }
1111     map->sections[map->sections_nb] = *section;
1112     memory_region_ref(section->mr);
1113     return map->sections_nb++;
1114 }
1115
1116 static void phys_section_destroy(MemoryRegion *mr)
1117 {
1118     bool have_sub_page = mr->subpage;
1119
1120     memory_region_unref(mr);
1121
1122     if (have_sub_page) {
1123         subpage_t *subpage = container_of(mr, subpage_t, iomem);
1124         object_unref(OBJECT(&subpage->iomem));
1125         g_free(subpage);
1126     }
1127 }
1128
1129 static void phys_sections_free(PhysPageMap *map)
1130 {
1131     while (map->sections_nb > 0) {
1132         MemoryRegionSection *section = &map->sections[--map->sections_nb];
1133         phys_section_destroy(section->mr);
1134     }
1135     g_free(map->sections);
1136     g_free(map->nodes);
1137 }
1138
1139 static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
1140 {
1141     subpage_t *subpage;
1142     hwaddr base = section->offset_within_address_space
1143         & TARGET_PAGE_MASK;
1144     MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
1145                                                    d->map.nodes, d->map.sections);
1146     MemoryRegionSection subsection = {
1147         .offset_within_address_space = base,
1148         .size = int128_make64(TARGET_PAGE_SIZE),
1149     };
1150     hwaddr start, end;
1151
1152     assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1153
1154     if (!(existing->mr->subpage)) {
1155         subpage = subpage_init(d->as, base);
1156         subsection.address_space = d->as;
1157         subsection.mr = &subpage->iomem;
1158         phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1159                       phys_section_add(&d->map, &subsection));
1160     } else {
1161         subpage = container_of(existing->mr, subpage_t, iomem);
1162     }
1163     start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1164     end = start + int128_get64(section->size) - 1;
1165     subpage_register(subpage, start, end,
1166                      phys_section_add(&d->map, section));
1167 }
1168
1169
1170 static void register_multipage(AddressSpaceDispatch *d,
1171                                MemoryRegionSection *section)
1172 {
1173     hwaddr start_addr = section->offset_within_address_space;
1174     uint16_t section_index = phys_section_add(&d->map, section);
1175     uint64_t num_pages = int128_get64(int128_rshift(section->size,
1176                                                     TARGET_PAGE_BITS));
1177
1178     assert(num_pages);
1179     phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1180 }
1181
/*
 * MemoryListener callback: add @section to the dispatch being built for
 * the listener's address space (as->next_dispatch).
 *
 * The section is cut into pieces: a leading piece up to the next page
 * boundary when it does not start page-aligned, a run of whole pages,
 * and a trailing piece shorter than a page.  Partial-page pieces are
 * registered with register_subpage(), whole-page runs with
 * register_multipage().
 */
static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
    AddressSpaceDispatch *d = as->next_dispatch;
    /* 'now' is the piece being registered, 'remain' what is still left. */
    MemoryRegionSection now = *section, remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
        /* Unaligned start: register the part up to the page boundary. */
        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
                       - now.offset_within_address_space;

        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(d, &now);
    } else {
        now.size = int128_zero();
    }
    /* Loop until the last registered piece consumed everything left. */
    while (int128_ne(remain.size, now.size)) {
        /* Advance 'remain' past the piece that was just registered. */
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
        now = remain;
        if (int128_lt(remain.size, page_size)) {
            /* Less than a page left. */
            register_subpage(d, &now);
        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
            now.size = page_size;
            register_subpage(d, &now);
        } else {
            /* Page-aligned: take as many whole pages as possible. */
            now.size = int128_and(now.size, int128_neg(page_size));
            register_multipage(d, &now);
        }
    }
}
1214
/* Flush KVM's coalesced MMIO buffer; does nothing when KVM is not enabled. */
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (!kvm_enabled()) {
        return;
    }

    kvm_flush_coalesced_mmio_buffer();
}
1220
/* Acquire ram_list.mutex. */
void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}
1225
/* Release ram_list.mutex. */
void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}
1230
1231 #ifdef __linux__
/*
 * Return the size of the file open on @fd, obtained by seeking to its
 * end, or -errno if the seek fails.
 *
 * Note that this moves the file offset of @fd to the end of the file.
 */
static int64_t get_file_size(int fd)
{
    const int64_t end_pos = lseek(fd, 0, SEEK_END);

    return end_pos < 0 ? -errno : end_pos;
}
1240
1241 static void *file_ram_alloc(RAMBlock *block,
1242                             ram_addr_t memory,
1243                             const char *path,
1244                             Error **errp)
1245 {
1246     bool unlink_on_error = false;
1247     char *filename;
1248     char *sanitized_name;
1249     char *c;
1250     void *area = MAP_FAILED;
1251     int fd = -1;
1252     int64_t file_size;
1253
1254     if (kvm_enabled() && !kvm_has_sync_mmu()) {
1255         error_setg(errp,
1256                    "host lacks kvm mmu notifiers, -mem-path unsupported");
1257         return NULL;
1258     }
1259
1260     for (;;) {
1261         fd = open(path, O_RDWR);
1262         if (fd >= 0) {
1263             /* @path names an existing file, use it */
1264             break;
1265         }
1266         if (errno == ENOENT) {
1267             /* @path names a file that doesn't exist, create it */
1268             fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1269             if (fd >= 0) {
1270                 unlink_on_error = true;
1271                 break;
1272             }
1273         } else if (errno == EISDIR) {
1274             /* @path names a directory, create a file there */
1275             /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1276             sanitized_name = g_strdup(memory_region_name(block->mr));
1277             for (c = sanitized_name; *c != '\0'; c++) {
1278                 if (*c == '/') {
1279                     *c = '_';
1280                 }
1281             }
1282
1283             filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1284                                        sanitized_name);
1285             g_free(sanitized_name);
1286
1287             fd = mkstemp(filename);
1288             if (fd >= 0) {
1289                 unlink(filename);
1290                 g_free(filename);
1291                 break;
1292             }
1293             g_free(filename);
1294         }
1295         if (errno != EEXIST && errno != EINTR) {
1296             error_setg_errno(errp, errno,
1297                              "can't open backing store %s for guest RAM",
1298                              path);
1299             goto error;
1300         }
1301         /*
1302          * Try again on EINTR and EEXIST.  The latter happens when
1303          * something else creates the file between our two open().
1304          */
1305     }
1306
1307     block->page_size = qemu_fd_getpagesize(fd);
1308     block->mr->align = block->page_size;
1309 #if defined(__s390x__)
1310     if (kvm_enabled()) {
1311         block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1312     }
1313 #endif
1314
1315     file_size = get_file_size(fd);
1316
1317     if (memory < block->page_size) {
1318         error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1319                    "or larger than page size 0x%zx",
1320                    memory, block->page_size);
1321         goto error;
1322     }
1323
1324     if (file_size > 0 && file_size < memory) {
1325         error_setg(errp, "backing store %s size 0x%" PRIx64
1326                    " does not match 'size' option 0x" RAM_ADDR_FMT,
1327                    path, file_size, memory);
1328         goto error;
1329     }
1330
1331     memory = ROUND_UP(memory, block->page_size);
1332
1333     /*
1334      * ftruncate is not supported by hugetlbfs in older
1335      * hosts, so don't bother bailing out on errors.
1336      * If anything goes wrong with it under other filesystems,
1337      * mmap will fail.
1338      *
1339      * Do not truncate the non-empty backend file to avoid corrupting
1340      * the existing data in the file. Disabling shrinking is not
1341      * enough. For example, the current vNVDIMM implementation stores
1342      * the guest NVDIMM labels at the end of the backend file. If the
1343      * backend file is later extended, QEMU will not be able to find
1344      * those labels. Therefore, extending the non-empty backend file
1345      * is disabled as well.
1346      */
1347     if (!file_size && ftruncate(fd, memory)) {
1348         perror("ftruncate");
1349     }
1350
1351     area = qemu_ram_mmap(fd, memory, block->mr->align,
1352                          block->flags & RAM_SHARED);
1353     if (area == MAP_FAILED) {
1354         error_setg_errno(errp, errno,
1355                          "unable to map backing store for guest RAM");
1356         goto error;
1357     }
1358
1359     if (mem_prealloc) {
1360         os_mem_prealloc(fd, area, memory, errp);
1361         if (errp && *errp) {
1362             goto error;
1363         }
1364     }
1365
1366     block->fd = fd;
1367     return area;
1368
1369 error:
1370     if (area != MAP_FAILED) {
1371         qemu_ram_munmap(area, memory);
1372     }
1373     if (unlink_on_error) {
1374         unlink(path);
1375     }
1376     if (fd != -1) {
1377         close(fd);
1378     }
1379     return NULL;
1380 }
1381 #endif
1382
1383 /* Called with the ramlist lock held.  */
1384 static ram_addr_t find_ram_offset(ram_addr_t size)
1385 {
1386     RAMBlock *block, *next_block;
1387     ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1388
1389     assert(size != 0); /* it would hand out same offset multiple times */
1390
1391     if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1392         return 0;
1393     }
1394
1395     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1396         ram_addr_t end, next = RAM_ADDR_MAX;
1397
1398         end = block->offset + block->max_length;
1399
1400         QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
1401             if (next_block->offset >= end) {
1402                 next = MIN(next, next_block->offset);
1403             }
1404         }
1405         if (next - end >= size && next - end < mingap) {
1406             offset = end;
1407             mingap = next - end;
1408         }
1409     }
1410
1411     if (offset == RAM_ADDR_MAX) {
1412         fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1413                 (uint64_t)size);
1414         abort();
1415     }
1416
1417     return offset;
1418 }
1419
1420 ram_addr_t last_ram_offset(void)
1421 {
1422     RAMBlock *block;
1423     ram_addr_t last = 0;
1424
1425     rcu_read_lock();
1426     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1427         last = MAX(last, block->offset + block->max_length);
1428     }
1429     rcu_read_unlock();
1430     return last;
1431 }
1432
1433 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1434 {
1435     int ret;
1436
1437     /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1438     if (!machine_dump_guest_core(current_machine)) {
1439         ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1440         if (ret) {
1441             perror("qemu_madvise");
1442             fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1443                             "but dump_guest_core=off specified\n");
1444         }
1445     }
1446 }
1447
/* Return the identifier string stored in @rb (see qemu_ram_set_idstr()). */
const char *qemu_ram_get_idstr(RAMBlock *rb)
{
    return rb->idstr;
}
1452
1453 /* Called with iothread lock held.  */
1454 void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
1455 {
1456     RAMBlock *block;
1457
1458     assert(new_block);
1459     assert(!new_block->idstr[0]);
1460
1461     if (dev) {
1462         char *id = qdev_get_dev_path(dev);
1463         if (id) {
1464             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1465             g_free(id);
1466         }
1467     }
1468     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1469
1470     rcu_read_lock();
1471     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1472         if (block != new_block &&
1473             !strcmp(block->idstr, new_block->idstr)) {
1474             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1475                     new_block->idstr);
1476             abort();
1477         }
1478     }
1479     rcu_read_unlock();
1480 }
1481
1482 /* Called with iothread lock held.  */
1483 void qemu_ram_unset_idstr(RAMBlock *block)
1484 {
1485     /* FIXME: arch_init.c assumes that this is not called throughout
1486      * migration.  Ignore the problem since hot-unplug during migration
1487      * does not work anyway.
1488      */
1489     if (block) {
1490         memset(block->idstr, 0, sizeof(block->idstr));
1491     }
1492 }
1493
/* Return the page size recorded for @rb when the block was created. */
size_t qemu_ram_pagesize(RAMBlock *rb)
{
    return rb->page_size;
}
1498
1499 static int memory_try_enable_merging(void *addr, size_t len)
1500 {
1501     if (!machine_mem_merge(current_machine)) {
1502         /* disabled by the user */
1503         return 0;
1504     }
1505
1506     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1507 }
1508
1509 /* Only legal before guest might have detected the memory size: e.g. on
1510  * incoming migration, or right after reset.
1511  *
1512  * As memory core doesn't know how is memory accessed, it is up to
1513  * resize callback to update device state and/or add assertions to detect
1514  * misuse, if necessary.
1515  */
1516 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
1517 {
1518     assert(block);
1519
1520     newsize = HOST_PAGE_ALIGN(newsize);
1521
1522     if (block->used_length == newsize) {
1523         return 0;
1524     }
1525
1526     if (!(block->flags & RAM_RESIZEABLE)) {
1527         error_setg_errno(errp, EINVAL,
1528                          "Length mismatch: %s: 0x" RAM_ADDR_FMT
1529                          " in != 0x" RAM_ADDR_FMT, block->idstr,
1530                          newsize, block->used_length);
1531         return -EINVAL;
1532     }
1533
1534     if (block->max_length < newsize) {
1535         error_setg_errno(errp, EINVAL,
1536                          "Length too large: %s: 0x" RAM_ADDR_FMT
1537                          " > 0x" RAM_ADDR_FMT, block->idstr,
1538                          newsize, block->max_length);
1539         return -EINVAL;
1540     }
1541
1542     cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
1543     block->used_length = newsize;
1544     cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
1545                                         DIRTY_CLIENTS_ALL);
1546     memory_region_set_size(block->mr, newsize);
1547     if (block->resized) {
1548         block->resized(block->idstr, newsize, block->host);
1549     }
1550     return 0;
1551 }
1552
1553 /* Called with ram_list.mutex held */
1554 static void dirty_memory_extend(ram_addr_t old_ram_size,
1555                                 ram_addr_t new_ram_size)
1556 {
1557     ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
1558                                              DIRTY_MEMORY_BLOCK_SIZE);
1559     ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
1560                                              DIRTY_MEMORY_BLOCK_SIZE);
1561     int i;
1562
1563     /* Only need to extend if block count increased */
1564     if (new_num_blocks <= old_num_blocks) {
1565         return;
1566     }
1567
1568     for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
1569         DirtyMemoryBlocks *old_blocks;
1570         DirtyMemoryBlocks *new_blocks;
1571         int j;
1572
1573         old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
1574         new_blocks = g_malloc(sizeof(*new_blocks) +
1575                               sizeof(new_blocks->blocks[0]) * new_num_blocks);
1576
1577         if (old_num_blocks) {
1578             memcpy(new_blocks->blocks, old_blocks->blocks,
1579                    old_num_blocks * sizeof(old_blocks->blocks[0]));
1580         }
1581
1582         for (j = old_num_blocks; j < new_num_blocks; j++) {
1583             new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
1584         }
1585
1586         atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
1587
1588         if (old_blocks) {
1589             g_free_rcu(old_blocks, rcu);
1590         }
1591     }
1592 }
1593
/*
 * Insert @new_block into the global RAM block list.
 *
 * A ram address is chosen for the block, host memory is allocated when
 * the block does not bring its own (through Xen or phys_mem_alloc), the
 * migration and dirty bitmaps are extended if the RAM address space grew,
 * and the block is linked into ram_list.blocks.  Afterwards the block's
 * used range is marked dirty for all clients and madvise hints are
 * applied to the host memory.
 *
 * On allocation failure an error is stored through @errp and the block
 * is not inserted; the caller keeps ownership of @new_block.
 */
static void ram_block_add(RAMBlock *new_block, Error **errp)
{
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    ram_addr_t old_ram_size, new_ram_size;
    Error *err = NULL;

    /* Sizes here are counted in target pages. */
    old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr, &err);
            if (err) {
                error_propagate(errp, err);
                qemu_mutex_unlock_ramlist();
                return;
            }
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
                                             &new_block->mr->align);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
    }

    new_ram_size = MAX(old_ram_size,
              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
    if (new_ram_size > old_ram_size) {
        migration_bitmap_extend(old_ram_size, new_ram_size);
        dirty_memory_extend(old_ram_size, new_ram_size);
    }
    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    /* block != NULL: the loop stopped at the first smaller block. */
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    /* Drop the cached most-recently-used block after changing the list. */
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length,
                                        DIRTY_CLIENTS_ALL);

    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
    }
}
1670
1671 #ifdef __linux__
1672 RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
1673                                    bool share, const char *mem_path,
1674                                    Error **errp)
1675 {
1676     RAMBlock *new_block;
1677     Error *local_err = NULL;
1678
1679     if (xen_enabled()) {
1680         error_setg(errp, "-mem-path not supported with Xen");
1681         return NULL;
1682     }
1683
1684     if (phys_mem_alloc != qemu_anon_ram_alloc) {
1685         /*
1686          * file_ram_alloc() needs to allocate just like
1687          * phys_mem_alloc, but we haven't bothered to provide
1688          * a hook there.
1689          */
1690         error_setg(errp,
1691                    "-mem-path not supported with this accelerator");
1692         return NULL;
1693     }
1694
1695     size = HOST_PAGE_ALIGN(size);
1696     new_block = g_malloc0(sizeof(*new_block));
1697     new_block->mr = mr;
1698     new_block->used_length = size;
1699     new_block->max_length = size;
1700     new_block->flags = share ? RAM_SHARED : 0;
1701     new_block->host = file_ram_alloc(new_block, size,
1702                                      mem_path, errp);
1703     if (!new_block->host) {
1704         g_free(new_block);
1705         return NULL;
1706     }
1707
1708     ram_block_add(new_block, &local_err);
1709     if (local_err) {
1710         g_free(new_block);
1711         error_propagate(errp, local_err);
1712         return NULL;
1713     }
1714     return new_block;
1715 }
1716 #endif
1717
1718 static
1719 RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
1720                                   void (*resized)(const char*,
1721                                                   uint64_t length,
1722                                                   void *host),
1723                                   void *host, bool resizeable,
1724                                   MemoryRegion *mr, Error **errp)
1725 {
1726     RAMBlock *new_block;
1727     Error *local_err = NULL;
1728
1729     size = HOST_PAGE_ALIGN(size);
1730     max_size = HOST_PAGE_ALIGN(max_size);
1731     new_block = g_malloc0(sizeof(*new_block));
1732     new_block->mr = mr;
1733     new_block->resized = resized;
1734     new_block->used_length = size;
1735     new_block->max_length = max_size;
1736     assert(max_size >= size);
1737     new_block->fd = -1;
1738     new_block->page_size = getpagesize();
1739     new_block->host = host;
1740     if (host) {
1741         new_block->flags |= RAM_PREALLOC;
1742     }
1743     if (resizeable) {
1744         new_block->flags |= RAM_RESIZEABLE;
1745     }
1746     ram_block_add(new_block, &local_err);
1747     if (local_err) {
1748         g_free(new_block);
1749         error_propagate(errp, local_err);
1750         return NULL;
1751     }
1752     return new_block;
1753 }
1754
/*
 * Create a fixed-size RAM block of @size bytes for @mr that uses the
 * caller-provided memory at @host.
 *
 * Returns: the new block, or NULL with @errp set on failure.
 */
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                   MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
}
1760
/*
 * Create a fixed-size RAM block of @size bytes for @mr; no host pointer
 * is supplied, so the backing memory is allocated when the block is added.
 *
 * Returns: the new block, or NULL with @errp set on failure.
 */
RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
}
1765
/*
 * Create a resizeable RAM block for @mr with initial used length @size
 * and maximum length @maxsz.  @resized is stored in the block and is
 * invoked by qemu_ram_resize() after the used length has changed.
 *
 * Returns: the new block, or NULL with @errp set on failure.
 */
RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                     void (*resized)(const char*,
                                                     uint64_t length,
                                                     void *host),
                                     MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
}
1774
1775 static void reclaim_ramblock(RAMBlock *block)
1776 {
1777     if (block->flags & RAM_PREALLOC) {
1778         ;
1779     } else if (xen_enabled()) {
1780         xen_invalidate_map_cache_entry(block->host);
1781 #ifndef _WIN32
1782     } else if (block->fd >= 0) {
1783         qemu_ram_munmap(block->host, block->max_length);
1784         close(block->fd);
1785 #endif
1786     } else {
1787         qemu_anon_ram_free(block->host, block->max_length);
1788     }
1789     g_free(block);
1790 }
1791
1792 void qemu_ram_free(RAMBlock *block)
1793 {
1794     if (!block) {
1795         return;
1796     }
1797
1798     qemu_mutex_lock_ramlist();
1799     QLIST_REMOVE_RCU(block, next);
1800     ram_list.mru_block = NULL;
1801     /* Write list before version */
1802     smp_wmb();
1803     ram_list.version++;
1804     call_rcu(block, reclaim_ramblock, rcu);
1805     qemu_mutex_unlock_ramlist();
1806 }
1807
1808 #ifndef _WIN32
1809 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
1810 {
1811     RAMBlock *block;
1812     ram_addr_t offset;
1813     int flags;
1814     void *area, *vaddr;
1815
1816     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1817         offset = addr - block->offset;
1818         if (offset < block->max_length) {
1819             vaddr = ramblock_ptr(block, offset);
1820             if (block->flags & RAM_PREALLOC) {
1821                 ;
1822             } else if (xen_enabled()) {
1823                 abort();
1824             } else {
1825                 flags = MAP_FIXED;
1826                 if (block->fd >= 0) {
1827                     flags |= (block->flags & RAM_SHARED ?
1828                               MAP_SHARED : MAP_PRIVATE);
1829                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1830                                 flags, block->fd, offset);
1831                 } else {
1832                     /*
1833                      * Remap needs to match alloc.  Accelerators that
1834                      * set phys_mem_alloc never remap.  If they did,
1835                      * we'd need a remap hook here.
1836                      */
1837                     assert(phys_mem_alloc == qemu_anon_ram_alloc);
1838
1839                     flags |= MAP_PRIVATE | MAP_ANONYMOUS;
1840                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1841                                 flags, -1, 0);
1842                 }
1843                 if (area != vaddr) {
1844                     fprintf(stderr, "Could not remap addr: "
1845                             RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
1846                             length, addr);
1847                     exit(1);
1848                 }
1849                 memory_try_enable_merging(vaddr, length);
1850                 qemu_ram_setup_dump(vaddr, length);
1851             }
1852         }
1853     }
1854 }
1855 #endif /* !_WIN32 */
1856
1857 /* Return a host pointer to ram allocated with qemu_ram_alloc.
1858  * This should not be used for general purpose DMA.  Use address_space_map
1859  * or address_space_rw instead. For local memory (e.g. video ram) that the
1860  * device owns, use memory_region_get_ram_ptr.
1861  *
1862  * Called within RCU critical section.
1863  */
1864 void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
1865 {
1866     RAMBlock *block = ram_block;
1867
1868     if (block == NULL) {
1869         block = qemu_get_ram_block(addr);
1870         addr -= block->offset;
1871     }
1872
1873     if (xen_enabled() && block->host == NULL) {
1874         /* We need to check if the requested address is in the RAM
1875          * because we don't want to map the entire memory in QEMU.
1876          * In that case just map until the end of the page.
1877          */
1878         if (block->offset == 0) {
1879             return xen_map_cache(addr, 0, 0);
1880         }
1881
1882         block->host = xen_map_cache(block->offset, block->max_length, 1);
1883     }
1884     return ramblock_ptr(block, addr);
1885 }
1886
1887 /* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
1888  * but takes a size argument.
1889  *
1890  * Called within RCU critical section.
1891  */
1892 static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
1893                                  hwaddr *size)
1894 {
1895     RAMBlock *block = ram_block;
1896     if (*size == 0) {
1897         return NULL;
1898     }
1899
1900     if (block == NULL) {
1901         block = qemu_get_ram_block(addr);
1902         addr -= block->offset;
1903     }
1904     *size = MIN(*size, block->max_length - addr);
1905
1906     if (xen_enabled() && block->host == NULL) {
1907         /* We need to check if the requested address is in the RAM
1908          * because we don't want to map the entire memory in QEMU.
1909          * In that case just map the requested area.
1910          */
1911         if (block->offset == 0) {
1912             return xen_map_cache(addr, *size, 1);
1913         }
1914
1915         block->host = xen_map_cache(block->offset, block->max_length, 1);
1916     }
1917
1918     return ramblock_ptr(block, addr);
1919 }
1920
1921 /*
1922  * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
1923  * in that RAMBlock.
1924  *
1925  * ptr: Host pointer to look up
1926  * round_offset: If true round the result offset down to a page boundary
1927  * *ram_addr: set to result ram_addr
1928  * *offset: set to result offset within the RAMBlock
1929  *
1930  * Returns: RAMBlock (or NULL if not found)
1931  *
1932  * By the time this function returns, the returned pointer is not protected
1933  * by RCU anymore.  If the caller is not within an RCU critical section and
1934  * does not hold the iothread lock, it must have other means of protecting the
1935  * pointer, such as a reference to the region that includes the incoming
1936  * ram_addr_t.
1937  */
1938 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
1939                                    ram_addr_t *offset)
1940 {
1941     RAMBlock *block;
1942     uint8_t *host = ptr;
1943
1944     if (xen_enabled()) {
1945         ram_addr_t ram_addr;
1946         rcu_read_lock();
1947         ram_addr = xen_ram_addr_from_mapcache(ptr);
1948         block = qemu_get_ram_block(ram_addr);
1949         if (block) {
1950             *offset = ram_addr - block->offset;
1951         }
1952         rcu_read_unlock();
1953         return block;
1954     }
1955
1956     rcu_read_lock();
1957     block = atomic_rcu_read(&ram_list.mru_block);
1958     if (block && block->host && host - block->host < block->max_length) {
1959         goto found;
1960     }
1961
1962     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1963         /* This case append when the block is not mapped. */
1964         if (block->host == NULL) {
1965             continue;
1966         }
1967         if (host - block->host < block->max_length) {
1968             goto found;
1969         }
1970     }
1971
1972     rcu_read_unlock();
1973     return NULL;
1974
1975 found:
1976     *offset = (host - block->host);
1977     if (round_offset) {
1978         *offset &= TARGET_PAGE_MASK;
1979     }
1980     rcu_read_unlock();
1981     return block;
1982 }
1983
1984 /*
1985  * Finds the named RAMBlock
1986  *
1987  * name: The name of RAMBlock to find
1988  *
1989  * Returns: RAMBlock (or NULL if not found)
1990  */
1991 RAMBlock *qemu_ram_block_by_name(const char *name)
1992 {
1993     RAMBlock *block;
1994
1995     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1996         if (!strcmp(name, block->idstr)) {
1997             return block;
1998         }
1999     }
2000
2001     return NULL;
2002 }
2003
2004 /* Some of the softmmu routines need to translate from a host pointer
2005    (typically a TLB entry) back to a ram offset.  */
2006 ram_addr_t qemu_ram_addr_from_host(void *ptr)
2007 {
2008     RAMBlock *block;
2009     ram_addr_t offset;
2010
2011     block = qemu_ram_block_from_host(ptr, false, &offset);
2012     if (!block) {
2013         return RAM_ADDR_INVALID;
2014     }
2015
2016     return block->offset + offset;
2017 }
2018
2019 /* Called within RCU critical section.  */
2020 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
2021                                uint64_t val, unsigned size)
2022 {
2023     bool locked = false;
2024
2025     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
2026         locked = true;
2027         tb_lock();
2028         tb_invalidate_phys_page_fast(ram_addr, size);
2029     }
2030     switch (size) {
2031     case 1:
2032         stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2033         break;
2034     case 2:
2035         stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2036         break;
2037     case 4:
2038         stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2039         break;
2040     default:
2041         abort();
2042     }
2043
2044     if (locked) {
2045         tb_unlock();
2046     }
2047
2048     /* Set both VGA and migration bits for simplicity and to remove
2049      * the notdirty callback faster.
2050      */
2051     cpu_physical_memory_set_dirty_range(ram_addr, size,
2052                                         DIRTY_CLIENTS_NOCODE);
2053     /* we remove the notdirty callback only if the code has been
2054        flushed */
2055     if (!cpu_physical_memory_is_clean(ram_addr)) {
2056         tlb_set_dirty(current_cpu, current_cpu->mem_io_vaddr);
2057     }
2058 }
2059
/*
 * .valid.accepts callback of notdirty_mem_ops: only write accesses are
 * accepted; opaque, addr and size are ignored.
 */
static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
                                 unsigned size, bool is_write)
{
    return is_write;
}
2065
2066 static const MemoryRegionOps notdirty_mem_ops = {
2067     .write = notdirty_mem_write,
2068     .valid.accepts = notdirty_mem_accepts,
2069     .endianness = DEVICE_NATIVE_ENDIAN,
2070 };
2071
/*
 * Generate a debug exception if a watchpoint has been hit.
 *
 * offset: offset of the access within its page; it is combined with the
 *         page part of current_cpu->mem_io_vaddr to form the virtual address
 * len: length of the access in bytes
 * attrs: transaction attributes, recorded in the watchpoint that hits
 * flags: access kind to match against wp->flags (the callers in this file
 *        pass BP_MEM_READ or BP_MEM_WRITE)
 *
 * When a watchpoint fires for the first time this function does not return
 * normally: it leaves through cpu_loop_exit() or cpu_loop_exit_noexc().
 */
static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
{
    CPUState *cpu = current_cpu;
    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUArchState *env = cpu->env_ptr;
    target_ulong pc, cs_base;
    target_ulong vaddr;
    CPUWatchpoint *wp;
    uint32_t cpu_flags;

    if (cpu->watchpoint_hit) {
        /* We re-entered the check after replacing the TB. Now raise
         * the debug interrupt so that it will trigger after the
         * current instruction. */
        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
        return;
    }
    /* Rebuild the accessed virtual address from the page and the offset. */
    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (cpu_watchpoint_address_matches(wp, vaddr, len)
            && (wp->flags & flags)) {
            /* Record what kind of access hit, where, and with which attrs. */
            if (flags == BP_MEM_READ) {
                wp->flags |= BP_WATCHPOINT_HIT_READ;
            } else {
                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
            }
            wp->hitaddr = vaddr;
            wp->hitattrs = attrs;
            if (!cpu->watchpoint_hit) {
                /* For BP_CPU watchpoints the CPU class hook may veto the
                 * hit; in that case clear the hit flags and keep looking. */
                if (wp->flags & BP_CPU &&
                    !cc->debug_check_watchpoint(cpu, wp)) {
                    wp->flags &= ~BP_WATCHPOINT_HIT;
                    continue;
                }
                cpu->watchpoint_hit = wp;

                /* The tb_lock will be reset when cpu_loop_exit or
                 * cpu_loop_exit_noexc longjmp back into the cpu_exec
                 * main loop.
                 */
                tb_lock();
                tb_check_watchpoint(cpu);
                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                    /* Raise the debug exception right away. */
                    cpu->exception_index = EXCP_DEBUG;
                    cpu_loop_exit(cpu);
                } else {
                    /* Generate a new TB for the current CPU state and leave
                     * the loop without an exception; the check is then
                     * re-entered and takes the watchpoint_hit branch at the
                     * top of this function. */
                    cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
                    tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
                    cpu_loop_exit_noexc(cpu);
                }
            }
        } else {
            /* No match for this access: drop any stale hit flags. */
            wp->flags &= ~BP_WATCHPOINT_HIT;
        }
    }
}
2129
2130 /* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
2131    so these check for a hit then pass through to the normal out-of-line
2132    phys routines.  */
2133 static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
2134                                   unsigned size, MemTxAttrs attrs)
2135 {
2136     MemTxResult res;
2137     uint64_t data;
2138     int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2139     AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2140
2141     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2142     switch (size) {
2143     case 1:
2144         data = address_space_ldub(as, addr, attrs, &res);
2145         break;
2146     case 2:
2147         data = address_space_lduw(as, addr, attrs, &res);
2148         break;
2149     case 4:
2150         data = address_space_ldl(as, addr, attrs, &res);
2151         break;
2152     default: abort();
2153     }
2154     *pdata = data;
2155     return res;
2156 }
2157
2158 static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
2159                                    uint64_t val, unsigned size,
2160                                    MemTxAttrs attrs)
2161 {
2162     MemTxResult res;
2163     int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2164     AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2165
2166     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2167     switch (size) {
2168     case 1:
2169         address_space_stb(as, addr, val, attrs, &res);
2170         break;
2171     case 2:
2172         address_space_stw(as, addr, val, attrs, &res);
2173         break;
2174     case 4:
2175         address_space_stl(as, addr, val, attrs, &res);
2176         break;
2177     default: abort();
2178     }
2179     return res;
2180 }
2181
2182 static const MemoryRegionOps watch_mem_ops = {
2183     .read_with_attrs = watch_mem_read,
2184     .write_with_attrs = watch_mem_write,
2185     .endianness = DEVICE_NATIVE_ENDIAN,
2186 };
2187
2188 static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2189                                 unsigned len, MemTxAttrs attrs)
2190 {
2191     subpage_t *subpage = opaque;
2192     uint8_t buf[8];
2193     MemTxResult res;
2194
2195 #if defined(DEBUG_SUBPAGE)
2196     printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2197            subpage, len, addr);
2198 #endif
2199     res = address_space_read(subpage->as, addr + subpage->base,
2200                              attrs, buf, len);
2201     if (res) {
2202         return res;
2203     }
2204     switch (len) {
2205     case 1:
2206         *data = ldub_p(buf);
2207         return MEMTX_OK;
2208     case 2:
2209         *data = lduw_p(buf);
2210         return MEMTX_OK;
2211     case 4:
2212         *data = ldl_p(buf);
2213         return MEMTX_OK;
2214     case 8:
2215         *data = ldq_p(buf);
2216         return MEMTX_OK;
2217     default:
2218         abort();
2219     }
2220 }
2221
2222 static MemTxResult subpage_write(void *opaque, hwaddr addr,
2223                                  uint64_t value, unsigned len, MemTxAttrs attrs)
2224 {
2225     subpage_t *subpage = opaque;
2226     uint8_t buf[8];
2227
2228 #if defined(DEBUG_SUBPAGE)
2229     printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2230            " value %"PRIx64"\n",
2231            __func__, subpage, len, addr, value);
2232 #endif
2233     switch (len) {
2234     case 1:
2235         stb_p(buf, value);
2236         break;
2237     case 2:
2238         stw_p(buf, value);
2239         break;
2240     case 4:
2241         stl_p(buf, value);
2242         break;
2243     case 8:
2244         stq_p(buf, value);
2245         break;
2246     default:
2247         abort();
2248     }
2249     return address_space_write(subpage->as, addr + subpage->base,
2250                                attrs, buf, len);
2251 }
2252
2253 static bool subpage_accepts(void *opaque, hwaddr addr,
2254                             unsigned len, bool is_write)
2255 {
2256     subpage_t *subpage = opaque;
2257 #if defined(DEBUG_SUBPAGE)
2258     printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2259            __func__, subpage, is_write ? 'w' : 'r', len, addr);
2260 #endif
2261
2262     return address_space_access_valid(subpage->as, addr + subpage->base,
2263                                       len, is_write);
2264 }
2265
2266 static const MemoryRegionOps subpage_ops = {
2267     .read_with_attrs = subpage_read,
2268     .write_with_attrs = subpage_write,
2269     .impl.min_access_size = 1,
2270     .impl.max_access_size = 8,
2271     .valid.min_access_size = 1,
2272     .valid.max_access_size = 8,
2273     .valid.accepts = subpage_accepts,
2274     .endianness = DEVICE_NATIVE_ENDIAN,
2275 };
2276
2277 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2278                              uint16_t section)
2279 {
2280     int idx, eidx;
2281
2282     if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2283         return -1;
2284     idx = SUBPAGE_IDX(start);
2285     eidx = SUBPAGE_IDX(end);
2286 #if defined(DEBUG_SUBPAGE)
2287     printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2288            __func__, mmio, start, end, idx, eidx, section);
2289 #endif
2290     for (; idx <= eidx; idx++) {
2291         mmio->sub_section[idx] = section;
2292     }
2293
2294     return 0;
2295 }
2296
2297 static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
2298 {
2299     subpage_t *mmio;
2300
2301     mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2302     mmio->as = as;
2303     mmio->base = base;
2304     memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2305                           NULL, TARGET_PAGE_SIZE);
2306     mmio->iomem.subpage = true;
2307 #if defined(DEBUG_SUBPAGE)
2308     printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2309            mmio, base, TARGET_PAGE_SIZE);
2310 #endif
2311     subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2312
2313     return mmio;
2314 }
2315
2316 static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
2317                               MemoryRegion *mr)
2318 {
2319     assert(as);
2320     MemoryRegionSection section = {
2321         .address_space = as,
2322         .mr = mr,
2323         .offset_within_address_space = 0,
2324         .offset_within_region = 0,
2325         .size = int128_2_64(),
2326     };
2327
2328     return phys_section_add(map, &section);
2329 }
2330
2331 MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
2332 {
2333     int asidx = cpu_asidx_from_attrs(cpu, attrs);
2334     CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2335     AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2336     MemoryRegionSection *sections = d->map.sections;
2337
2338     return sections[index & ~TARGET_PAGE_MASK].mr;
2339 }
2340
2341 static void io_mem_init(void)
2342 {
2343     memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2344     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2345                           NULL, UINT64_MAX);
2346     memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2347                           NULL, UINT64_MAX);
2348     memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2349                           NULL, UINT64_MAX);
2350 }
2351
2352 static void mem_begin(MemoryListener *listener)
2353 {
2354     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2355     AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2356     uint16_t n;
2357
2358     n = dummy_section(&d->map, as, &io_mem_unassigned);
2359     assert(n == PHYS_SECTION_UNASSIGNED);
2360     n = dummy_section(&d->map, as, &io_mem_notdirty);
2361     assert(n == PHYS_SECTION_NOTDIRTY);
2362     n = dummy_section(&d->map, as, &io_mem_rom);
2363     assert(n == PHYS_SECTION_ROM);
2364     n = dummy_section(&d->map, as, &io_mem_watch);
2365     assert(n == PHYS_SECTION_WATCH);
2366
2367     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2368     d->as = as;
2369     as->next_dispatch = d;
2370 }
2371
/*
 * Release a dispatch structure: free its section map, then the structure
 * itself.  Within this file it is used as the call_rcu callback for
 * retired dispatches.
 */
static void address_space_dispatch_free(AddressSpaceDispatch *d)
{
    phys_sections_free(&d->map);
    g_free(d);
}
2377
2378 static void mem_commit(MemoryListener *listener)
2379 {
2380     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2381     AddressSpaceDispatch *cur = as->dispatch;
2382     AddressSpaceDispatch *next = as->next_dispatch;
2383
2384     phys_page_compact_all(next, next->map.nodes_nb);
2385
2386     atomic_rcu_set(&as->dispatch, next);
2387     if (cur) {
2388         call_rcu(cur, address_space_dispatch_free, rcu);
2389     }
2390 }
2391
/*
 * .commit callback of a CPU address space's TCG listener: refresh the
 * CPU's cached dispatch pointer from its address space and flush the
 * CPU's TLB.
 */
static void tcg_commit(MemoryListener *listener)
{
    CPUAddressSpace *cpuas;
    AddressSpaceDispatch *d;

    /* since each CPU stores ram addresses in its TLB cache, we must
       reset the modified entries */
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    cpu_reloading_memory_map();
    /* The CPU and TLB are protected by the iothread lock.
     * We reload the dispatch pointer now because cpu_reloading_memory_map()
     * may have split the RCU critical section.
     */
    d = atomic_rcu_read(&cpuas->as->dispatch);
    atomic_rcu_set(&cpuas->memory_dispatch, d);
    tlb_flush(cpuas->cpu, 1);
}
2409
2410 void address_space_init_dispatch(AddressSpace *as)
2411 {
2412     as->dispatch = NULL;
2413     as->dispatch_listener = (MemoryListener) {
2414         .begin = mem_begin,
2415         .commit = mem_commit,
2416         .region_add = mem_add,
2417         .region_nop = mem_add,
2418         .priority = 0,
2419     };
2420     memory_listener_register(&as->dispatch_listener, as);
2421 }
2422
/* Unregister the dispatch listener of an address space. */
void address_space_unregister(AddressSpace *as)
{
    memory_listener_unregister(&as->dispatch_listener);
}
2427
2428 void address_space_destroy_dispatch(AddressSpace *as)
2429 {
2430     AddressSpaceDispatch *d = as->dispatch;
2431
2432     atomic_rcu_set(&as->dispatch, NULL);
2433     if (d) {
2434         call_rcu(d, address_space_dispatch_free, rcu);
2435     }
2436 }
2437
2438 static void memory_map_init(void)
2439 {
2440     system_memory = g_malloc(sizeof(*system_memory));
2441
2442     memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2443     address_space_init(&address_space_memory, system_memory, "memory");
2444
2445     system_io = g_malloc(sizeof(*system_io));
2446     memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2447                           65536);
2448     address_space_init(&address_space_io, system_io, "I/O");
2449 }
2450
/* Return the root "system" memory region set up by memory_map_init. */
MemoryRegion *get_system_memory(void)
{
    return system_memory;
}
2455
/* Return the root "io" memory region set up by memory_map_init. */
MemoryRegion *get_system_io(void)
{
    return system_io;
}
2460
2461 #endif /* !defined(CONFIG_USER_ONLY) */
2462
2463 /* physical memory access (slow version, mainly for debug) */
2464 #if defined(CONFIG_USER_ONLY)
2465 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2466                         uint8_t *buf, int len, int is_write)
2467 {
2468     int l, flags;
2469     target_ulong page;
2470     void * p;
2471
2472     while (len > 0) {
2473         page = addr & TARGET_PAGE_MASK;
2474         l = (page + TARGET_PAGE_SIZE) - addr;
2475         if (l > len)
2476             l = len;
2477         flags = page_get_flags(page);
2478         if (!(flags & PAGE_VALID))
2479             return -1;
2480         if (is_write) {
2481             if (!(flags & PAGE_WRITE))
2482                 return -1;
2483             /* XXX: this code should not depend on lock_user */
2484             if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
2485                 return -1;
2486             memcpy(p, buf, l);
2487             unlock_user(p, addr, l);
2488         } else {
2489             if (!(flags & PAGE_READ))
2490                 return -1;
2491             /* XXX: this code should not depend on lock_user */
2492             if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
2493                 return -1;
2494             memcpy(buf, p, l);
2495             unlock_user(p, addr, 0);
2496         }
2497         len -= l;
2498         buf += l;
2499         addr += l;
2500     }
2501     return 0;
2502 }
2503
2504 #else
2505
2506 static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
2507                                      hwaddr length)
2508 {
2509     uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2510     addr += memory_region_get_ram_addr(mr);
2511
2512     /* No early return if dirty_log_mask is or becomes 0, because
2513      * cpu_physical_memory_set_dirty_range will still call
2514      * xen_modified_memory.
2515      */
2516     if (dirty_log_mask) {
2517         dirty_log_mask =
2518             cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
2519     }
2520     if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2521         tb_lock();
2522         tb_invalidate_phys_range(addr, addr + length);
2523         tb_unlock();
2524         dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2525     }
2526     cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2527 }
2528
2529 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2530 {
2531     unsigned access_size_max = mr->ops->valid.max_access_size;
2532
2533     /* Regions are assumed to support 1-4 byte accesses unless
2534        otherwise specified.  */
2535     if (access_size_max == 0) {
2536         access_size_max = 4;
2537     }
2538
2539     /* Bound the maximum access by the alignment of the address.  */
2540     if (!mr->ops->impl.unaligned) {
2541         unsigned align_size_max = addr & -addr;
2542         if (align_size_max != 0 && align_size_max < access_size_max) {
2543             access_size_max = align_size_max;
2544         }
2545     }
2546
2547     /* Don't attempt accesses larger than the maximum.  */
2548     if (l > access_size_max) {
2549         l = access_size_max;
2550     }
2551     l = pow2floor(l);
2552
2553     return l;
2554 }
2555
2556 static bool prepare_mmio_access(MemoryRegion *mr)
2557 {
2558     bool unlocked = !qemu_mutex_iothread_locked();
2559     bool release_lock = false;
2560
2561     if (unlocked && mr->global_locking) {
2562         qemu_mutex_lock_iothread();
2563         unlocked = false;
2564         release_lock = true;
2565     }
2566     if (mr->flush_coalesced_mmio) {
2567         if (unlocked) {
2568             qemu_mutex_lock_iothread();
2569         }
2570         qemu_flush_coalesced_mmio_buffer();
2571         if (unlocked) {
2572             qemu_mutex_unlock_iothread();
2573         }
2574     }
2575
2576     return release_lock;
2577 }
2578
2579 /* Called within RCU critical section.  */
2580 static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
2581                                                 MemTxAttrs attrs,
2582                                                 const uint8_t *buf,
2583                                                 int len, hwaddr addr1,
2584                                                 hwaddr l, MemoryRegion *mr)
2585 {
2586     uint8_t *ptr;
2587     uint64_t val;
2588     MemTxResult result = MEMTX_OK;
2589     bool release_lock = false;
2590
2591     for (;;) {
2592         if (!memory_access_is_direct(mr, true)) {
2593             release_lock |= prepare_mmio_access(mr);
2594             l = memory_access_size(mr, l, addr1);
2595             /* XXX: could force current_cpu to NULL to avoid
2596                potential bugs */
2597             switch (l) {
2598             case 8:
2599                 /* 64 bit write access */
2600                 val = ldq_p(buf);
2601                 result |= memory_region_dispatch_write(mr, addr1, val, 8,
2602                                                        attrs);
2603                 break;
2604             case 4:
2605                 /* 32 bit write access */
2606                 val = ldl_p(buf);
2607                 result |= memory_region_dispatch_write(mr, addr1, val, 4,
2608                                                        attrs);
2609                 break;
2610             case 2:
2611                 /* 16 bit write access */
2612                 val = lduw_p(buf);
2613                 result |= memory_region_dispatch_write(mr, addr1, val, 2,
2614                                                        attrs);
2615                 break;
2616             case 1:
2617                 /* 8 bit write access */
2618                 val = ldub_p(buf);
2619                 result |= memory_region_dispatch_write(mr, addr1, val, 1,
2620                                                        attrs);
2621                 break;
2622             default:
2623                 abort();
2624             }
2625         } else {
2626             /* RAM case */
2627             ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2628             memcpy(ptr, buf, l);
2629             invalidate_and_set_dirty(mr, addr1, l);
2630         }
2631
2632         if (release_lock) {
2633             qemu_mutex_unlock_iothread();
2634             release_lock = false;
2635         }
2636
2637         len -= l;
2638         buf += l;
2639         addr += l;
2640
2641         if (!len) {
2642             break;
2643         }
2644
2645         l = len;
2646         mr = address_space_translate(as, addr, &addr1, &l, true);
2647     }
2648
2649     return result;
2650 }
2651
2652 MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
2653                                 const uint8_t *buf, int len)
2654 {
2655     hwaddr l;
2656     hwaddr addr1;
2657     MemoryRegion *mr;
2658     MemTxResult result = MEMTX_OK;
2659
2660     if (len > 0) {
2661         rcu_read_lock();
2662         l = len;
2663         mr = address_space_translate(as, addr, &addr1, &l, true);
2664         result = address_space_write_continue(as, addr, attrs, buf, len,
2665                                               addr1, l, mr);
2666         rcu_read_unlock();
2667     }
2668
2669     return result;
2670 }
2671
2672 /* Called within RCU critical section.  */
2673 MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
2674                                         MemTxAttrs attrs, uint8_t *buf,
2675                                         int len, hwaddr addr1, hwaddr l,
2676                                         MemoryRegion *mr)
2677 {
2678     uint8_t *ptr;
2679     uint64_t val;
2680     MemTxResult result = MEMTX_OK;
2681     bool release_lock = false;
2682
2683     for (;;) {
2684         if (!memory_access_is_direct(mr, false)) {
2685             /* I/O case */
2686             release_lock |= prepare_mmio_access(mr);
2687             l = memory_access_size(mr, l, addr1);
2688             switch (l) {
2689             case 8:
2690                 /* 64 bit read access */
2691                 result |= memory_region_dispatch_read(mr, addr1, &val, 8,
2692                                                       attrs);
2693                 stq_p(buf, val);
2694                 break;
2695             case 4:
2696                 /* 32 bit read access */
2697                 result |= memory_region_dispatch_read(mr, addr1, &val, 4,
2698                                                       attrs);
2699                 stl_p(buf, val);
2700                 break;
2701             case 2:
2702                 /* 16 bit read access */
2703                 result |= memory_region_dispatch_read(mr, addr1, &val, 2,
2704                                                       attrs);
2705                 stw_p(buf, val);
2706                 break;
2707             case 1:
2708                 /* 8 bit read access */
2709                 result |= memory_region_dispatch_read(mr, addr1, &val, 1,
2710                                                       attrs);
2711                 stb_p(buf, val);
2712                 break;
2713             default:
2714                 abort();
2715             }
2716         } else {
2717             /* RAM case */
2718             ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2719             memcpy(buf, ptr, l);
2720         }
2721
2722         if (release_lock) {
2723             qemu_mutex_unlock_iothread();
2724             release_lock = false;
2725         }
2726
2727         len -= l;
2728         buf += l;
2729         addr += l;
2730
2731         if (!len) {
2732             break;
2733         }
2734
2735         l = len;
2736         mr = address_space_translate(as, addr, &addr1, &l, false);
2737     }
2738
2739     return result;
2740 }
2741
2742 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
2743                                     MemTxAttrs attrs, uint8_t *buf, int len)
2744 {
2745     hwaddr l;
2746     hwaddr addr1;
2747     MemoryRegion *mr;
2748     MemTxResult result = MEMTX_OK;
2749
2750     if (len > 0) {
2751         rcu_read_lock();
2752         l = len;
2753         mr = address_space_translate(as, addr, &addr1, &l, false);
2754         result = address_space_read_continue(as, addr, attrs, buf, len,
2755                                              addr1, l, mr);
2756         rcu_read_unlock();
2757     }
2758
2759     return result;
2760 }
2761
2762 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
2763                              uint8_t *buf, int len, bool is_write)
2764 {
2765     if (is_write) {
2766         return address_space_write(as, addr, attrs, (uint8_t *)buf, len);
2767     } else {
2768         return address_space_read(as, addr, attrs, (uint8_t *)buf, len);
2769     }
2770 }
2771
/*
 * Read or write len bytes of address_space_memory at addr, using
 * MEMTXATTRS_UNSPECIFIED.  The result of address_space_rw is discarded.
 */
void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                            int len, int is_write)
{
    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
                     buf, len, is_write);
}
2778
/* Operation selector for cpu_physical_memory_write_rom_internal. */
enum write_rom_type {
    WRITE_DATA = 0,     /* copy the caller's buffer into RAM/ROM */
    FLUSH_CACHE = 1,    /* flush the host icache for the range instead */
};
2783
2784 static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
2785     hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
2786 {
2787     hwaddr l;
2788     uint8_t *ptr;
2789     hwaddr addr1;
2790     MemoryRegion *mr;
2791
2792     rcu_read_lock();
2793     while (len > 0) {
2794         l = len;
2795         mr = address_space_translate(as, addr, &addr1, &l, true);
2796
2797         if (!(memory_region_is_ram(mr) ||
2798               memory_region_is_romd(mr))) {
2799             l = memory_access_size(mr, l, addr1);
2800         } else {
2801             /* ROM/RAM case */
2802             ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2803             switch (type) {
2804             case WRITE_DATA:
2805                 memcpy(ptr, buf, l);
2806                 invalidate_and_set_dirty(mr, addr1, l);
2807                 break;
2808             case FLUSH_CACHE:
2809                 flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
2810                 break;
2811             }
2812         }
2813         len -= l;
2814         buf += l;
2815         addr += l;
2816     }
2817     rcu_read_unlock();
2818 }
2819
2820 /* used for ROM loading : can write in RAM and ROM */
2821 void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
2822                                    const uint8_t *buf, int len)
2823 {
2824     cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
2825 }
2826
2827 void cpu_flush_icache_range(hwaddr start, int len)
2828 {
2829     /*
2830      * This function should do the same thing as an icache flush that was
2831      * triggered from within the guest. For TCG we are always cache coherent,
2832      * so there is no need to flush anything. For KVM / Xen we need to flush
2833      * the host's instruction cache at least.
2834      */
2835     if (tcg_enabled()) {
2836         return;
2837     }
2838
2839     cpu_physical_memory_write_rom_internal(&address_space_memory,
2840                                            start, NULL, len, FLUSH_CACHE);
2841 }
2842
2843 typedef struct {
2844     MemoryRegion *mr;
2845     void *buffer;
2846     hwaddr addr;
2847     hwaddr len;
2848     bool in_use;
2849 } BounceBuffer;
2850
2851 static BounceBuffer bounce;
2852
/* One registered map client: a bottom half queued on map_client_list. */
typedef struct MapClient {
    QEMUBH *bh;                     /* scheduled when clients are notified */
    QLIST_ENTRY(MapClient) link;    /* linkage within map_client_list */
} MapClient;

/* Held by the functions below while they walk or modify map_client_list. */
QemuMutex map_client_list_lock;
/* Registered map clients; new entries are inserted at the head. */
static QLIST_HEAD(map_client_list, MapClient) map_client_list
    = QLIST_HEAD_INITIALIZER(map_client_list);
2861
/*
 * Unlink @client from the list it is on and free it.  Callers in this file
 * hold map_client_list_lock around the call.
 */
static void cpu_unregister_map_client_do(MapClient *client)
{
    /* Unlink first; the entry must not be touched once it is freed */
    QLIST_REMOVE(client, link);
    g_free(client);
}
2867
2868 static void cpu_notify_map_clients_locked(void)
2869 {
2870     MapClient *client;
2871
2872     while (!QLIST_EMPTY(&map_client_list)) {
2873         client = QLIST_FIRST(&map_client_list);
2874         qemu_bh_schedule(client->bh);
2875         cpu_unregister_map_client_do(client);
2876     }
2877 }
2878
2879 void cpu_register_map_client(QEMUBH *bh)
2880 {
2881     MapClient *client = g_malloc(sizeof(*client));
2882
2883     qemu_mutex_lock(&map_client_list_lock);
2884     client->bh = bh;
2885     QLIST_INSERT_HEAD(&map_client_list, client, link);
2886     if (!atomic_read(&bounce.in_use)) {
2887         cpu_notify_map_clients_locked();
2888     }
2889     qemu_mutex_unlock(&map_client_list_lock);
2890 }
2891
/*
 * One-time setup for this file: initializes the RAM list mutex, finalizes
 * the target page size, sets up the I/O memory regions and the memory map,
 * and initializes the map client list lock.
 */
void cpu_exec_init_all(void)
{
    qemu_mutex_init(&ram_list.mutex);
    /* The data structures we set up here depend on knowing the page size,
     * so no more changes can be made after this point.
     * In an ideal world, nothing we did before we had finished the
     * machine setup would care about the target page size, and we could
     * do this much later, rather than requiring board models to state
     * up front what their requirements are.
     */
    finalize_target_page_bits();
    io_mem_init();
    memory_map_init();
    /* Lock protecting map_client_list */
    qemu_mutex_init(&map_client_list_lock);
}
2907
2908 void cpu_unregister_map_client(QEMUBH *bh)
2909 {
2910     MapClient *client;
2911
2912     qemu_mutex_lock(&map_client_list_lock);
2913     QLIST_FOREACH(client, &map_client_list, link) {
2914         if (client->bh == bh) {
2915             cpu_unregister_map_client_do(client);
2916             break;
2917         }
2918     }
2919     qemu_mutex_unlock(&map_client_list_lock);
2920 }
2921
/*
 * Notify and remove every registered map client, taking
 * map_client_list_lock around the operation.
 */
static void cpu_notify_map_clients(void)
{
    qemu_mutex_lock(&map_client_list_lock);
    cpu_notify_map_clients_locked();
    qemu_mutex_unlock(&map_client_list_lock);
}
2928
2929 bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
2930 {
2931     MemoryRegion *mr;
2932     hwaddr l, xlat;
2933
2934     rcu_read_lock();
2935     while (len > 0) {
2936         l = len;
2937         mr = address_space_translate(as, addr, &xlat, &l, is_write);
2938         if (!memory_access_is_direct(mr, is_write)) {
2939             l = memory_access_size(mr, l, addr);
2940             if (!memory_region_access_valid(mr, xlat, l, is_write)) {
2941                 return false;
2942             }
2943         }
2944
2945         len -= l;
2946         addr += l;
2947     }
2948     rcu_read_unlock();
2949     return true;
2950 }
2951
2952 /* Map a physical memory region into a host virtual address.
2953  * May map a subset of the requested range, given by and returned in *plen.
2954  * May return NULL if resources needed to perform the mapping are exhausted.
2955  * Use only for reads OR writes - not for read-modify-write operations.
2956  * Use cpu_register_map_client() to know when retrying the map operation is
2957  * likely to succeed.
2958  */
2959 void *address_space_map(AddressSpace *as,
2960                         hwaddr addr,
2961                         hwaddr *plen,
2962                         bool is_write)
2963 {
2964     hwaddr len = *plen;
2965     hwaddr done = 0;
2966     hwaddr l, xlat, base;
2967     MemoryRegion *mr, *this_mr;
2968     void *ptr;
2969
2970     if (len == 0) {
2971         return NULL;
2972     }
2973
2974     l = len;
2975     rcu_read_lock();
2976     mr = address_space_translate(as, addr, &xlat, &l, is_write);
2977
2978     if (!memory_access_is_direct(mr, is_write)) {
2979         if (atomic_xchg(&bounce.in_use, true)) {
2980             rcu_read_unlock();
2981             return NULL;
2982         }
2983         /* Avoid unbounded allocations */
2984         l = MIN(l, TARGET_PAGE_SIZE);
2985         bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
2986         bounce.addr = addr;
2987         bounce.len = l;
2988
2989         memory_region_ref(mr);
2990         bounce.mr = mr;
2991         if (!is_write) {
2992             address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED,
2993                                bounce.buffer, l);
2994         }
2995
2996         rcu_read_unlock();
2997         *plen = l;
2998         return bounce.buffer;
2999     }
3000
3001     base = xlat;
3002
3003     for (;;) {
3004         len -= l;
3005         addr += l;
3006         done += l;
3007         if (len == 0) {
3008             break;
3009         }
3010
3011         l = len;
3012         this_mr = address_space_translate(as, addr, &xlat, &l, is_write);
3013         if (this_mr != mr || xlat != base + done) {
3014             break;
3015         }
3016     }
3017
3018     memory_region_ref(mr);
3019     *plen = done;
3020     ptr = qemu_ram_ptr_length(mr->ram_block, base, plen);
3021     rcu_read_unlock();
3022
3023     return ptr;
3024 }
3025
3026 /* Unmaps a memory region previously mapped by address_space_map().
3027  * Will also mark the memory as dirty if is_write == 1.  access_len gives
3028  * the amount of memory that was actually read or written by the caller.
3029  */
3030 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3031                          int is_write, hwaddr access_len)
3032 {
3033     if (buffer != bounce.buffer) {
3034         MemoryRegion *mr;
3035         ram_addr_t addr1;
3036
3037         mr = memory_region_from_host(buffer, &addr1);
3038         assert(mr != NULL);
3039         if (is_write) {
3040             invalidate_and_set_dirty(mr, addr1, access_len);
3041         }
3042         if (xen_enabled()) {
3043             xen_invalidate_map_cache_entry(buffer);
3044         }
3045         memory_region_unref(mr);
3046         return;
3047     }
3048     if (is_write) {
3049         address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3050                             bounce.buffer, access_len);
3051     }
3052     qemu_vfree(bounce.buffer);
3053     bounce.buffer = NULL;
3054     memory_region_unref(bounce.mr);
3055     atomic_mb_set(&bounce.in_use, false);
3056     cpu_notify_map_clients();
3057 }
3058
3059 void *cpu_physical_memory_map(hwaddr addr,
3060                               hwaddr *plen,
3061                               int is_write)
3062 {
3063     return address_space_map(&address_space_memory, addr, plen, is_write);
3064 }
3065
3066 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3067                                int is_write, hwaddr access_len)
3068 {
3069     return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3070 }
3071
3072 /* warning: addr must be aligned */
3073 static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
3074                                                   MemTxAttrs attrs,
3075                                                   MemTxResult *result,
3076                                                   enum device_endian endian)
3077 {
3078     uint8_t *ptr;
3079     uint64_t val;
3080     MemoryRegion *mr;
3081     hwaddr l = 4;
3082     hwaddr addr1;
3083     MemTxResult r;
3084     bool release_lock = false;
3085
3086     rcu_read_lock();
3087     mr = address_space_translate(as, addr, &addr1, &l, false);
3088     if (l < 4 || !memory_access_is_direct(mr, false)) {
3089         release_lock |= prepare_mmio_access(mr);
3090
3091         /* I/O case */
3092         r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs);
3093 #if defined(TARGET_WORDS_BIGENDIAN)
3094         if (endian == DEVICE_LITTLE_ENDIAN) {
3095             val = bswap32(val);
3096         }
3097 #else
3098         if (endian == DEVICE_BIG_ENDIAN) {
3099             val = bswap32(val);
3100         }
3101 #endif
3102     } else {
3103         /* RAM case */
3104         ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3105         switch (endian) {
3106         case DEVICE_LITTLE_ENDIAN:
3107             val = ldl_le_p(ptr);
3108             break;
3109         case DEVICE_BIG_ENDIAN:
3110             val = ldl_be_p(ptr);
3111             break;
3112         default:
3113             val = ldl_p(ptr);
3114             break;
3115         }
3116         r = MEMTX_OK;
3117     }
3118     if (result) {
3119         *result = r;
3120     }
3121     if (release_lock) {
3122         qemu_mutex_unlock_iothread();
3123     }
3124     rcu_read_unlock();
3125     return val;
3126 }
3127
3128 uint32_t address_space_ldl(AddressSpace *as, hwaddr addr,
3129                            MemTxAttrs attrs, MemTxResult *result)
3130 {
3131     return address_space_ldl_internal(as, addr, attrs, result,
3132                                       DEVICE_NATIVE_ENDIAN);
3133 }
3134
3135 uint32_t address_space_ldl_le(AddressSpace *as, hwaddr addr,
3136                               MemTxAttrs attrs, MemTxResult *result)
3137 {
3138     return address_space_ldl_internal(as, addr, attrs, result,
3139                                       DEVICE_LITTLE_ENDIAN);
3140 }
3141
3142 uint32_t address_space_ldl_be(AddressSpace *as, hwaddr addr,
3143                               MemTxAttrs attrs, MemTxResult *result)
3144 {
3145     return address_space_ldl_internal(as, addr, attrs, result,
3146                                       DEVICE_BIG_ENDIAN);
3147 }
3148
3149 uint32_t ldl_phys(AddressSpace *as, hwaddr addr)
3150 {
3151     return address_space_ldl(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3152 }
3153
3154 uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr)
3155 {
3156     return address_space_ldl_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3157 }
3158
3159 uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr)
3160 {
3161     return address_space_ldl_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3162 }
3163
3164 /* warning: addr must be aligned */
3165 static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
3166                                                   MemTxAttrs attrs,
3167                                                   MemTxResult *result,
3168                                                   enum device_endian endian)
3169 {
3170     uint8_t *ptr;
3171     uint64_t val;
3172     MemoryRegion *mr;
3173     hwaddr l = 8;
3174     hwaddr addr1;
3175     MemTxResult r;
3176     bool release_lock = false;
3177
3178     rcu_read_lock();
3179     mr = address_space_translate(as, addr, &addr1, &l,
3180                                  false);
3181     if (l < 8 || !memory_access_is_direct(mr, false)) {
3182         release_lock |= prepare_mmio_access(mr);
3183
3184         /* I/O case */
3185         r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs);
3186 #if defined(TARGET_WORDS_BIGENDIAN)
3187         if (endian == DEVICE_LITTLE_ENDIAN) {
3188             val = bswap64(val);
3189         }
3190 #else
3191         if (endian == DEVICE_BIG_ENDIAN) {
3192             val = bswap64(val);
3193         }
3194 #endif
3195     } else {
3196         /* RAM case */
3197         ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3198         switch (endian) {
3199         case DEVICE_LITTLE_ENDIAN:
3200             val = ldq_le_p(ptr);
3201             break;
3202         case DEVICE_BIG_ENDIAN:
3203             val = ldq_be_p(ptr);
3204             break;
3205         default:
3206             val = ldq_p(ptr);
3207             break;
3208         }
3209         r = MEMTX_OK;
3210     }
3211     if (result) {
3212         *result = r;
3213     }
3214     if (release_lock) {
3215         qemu_mutex_unlock_iothread();
3216     }
3217     rcu_read_unlock();
3218     return val;
3219 }
3220
3221 uint64_t address_space_ldq(AddressSpace *as, hwaddr addr,
3222                            MemTxAttrs attrs, MemTxResult *result)
3223 {
3224     return address_space_ldq_internal(as, addr, attrs, result,
3225                                       DEVICE_NATIVE_ENDIAN);
3226 }
3227
3228 uint64_t address_space_ldq_le(AddressSpace *as, hwaddr addr,
3229                            MemTxAttrs attrs, MemTxResult *result)
3230 {
3231     return address_space_ldq_internal(as, addr, attrs, result,
3232                                       DEVICE_LITTLE_ENDIAN);
3233 }
3234
3235 uint64_t address_space_ldq_be(AddressSpace *as, hwaddr addr,
3236                            MemTxAttrs attrs, MemTxResult *result)
3237 {
3238     return address_space_ldq_internal(as, addr, attrs, result,
3239                                       DEVICE_BIG_ENDIAN);
3240 }
3241
3242 uint64_t ldq_phys(AddressSpace *as, hwaddr addr)
3243 {
3244     return address_space_ldq(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3245 }
3246
3247 uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr)
3248 {
3249     return address_space_ldq_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3250 }
3251
3252 uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr)
3253 {
3254     return address_space_ldq_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3255 }
3256
3257 /* XXX: optimize */
3258 uint32_t address_space_ldub(AddressSpace *as, hwaddr addr,
3259                             MemTxAttrs attrs, MemTxResult *result)
3260 {
3261     uint8_t val;
3262     MemTxResult r;
3263
3264     r = address_space_rw(as, addr, attrs, &val, 1, 0);
3265     if (result) {
3266         *result = r;
3267     }
3268     return val;
3269 }
3270
3271 uint32_t ldub_phys(AddressSpace *as, hwaddr addr)
3272 {
3273     return address_space_ldub(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3274 }
3275
3276 /* warning: addr must be aligned */
3277 static inline uint32_t address_space_lduw_internal(AddressSpace *as,
3278                                                    hwaddr addr,
3279                                                    MemTxAttrs attrs,
3280                                                    MemTxResult *result,
3281                                                    enum device_endian endian)
3282 {
3283     uint8_t *ptr;
3284     uint64_t val;
3285     MemoryRegion *mr;
3286     hwaddr l = 2;
3287     hwaddr addr1;
3288     MemTxResult r;
3289     bool release_lock = false;
3290
3291     rcu_read_lock();
3292     mr = address_space_translate(as, addr, &addr1, &l,
3293                                  false);
3294     if (l < 2 || !memory_access_is_direct(mr, false)) {
3295         release_lock |= prepare_mmio_access(mr);
3296
3297         /* I/O case */
3298         r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs);
3299 #if defined(TARGET_WORDS_BIGENDIAN)
3300         if (endian == DEVICE_LITTLE_ENDIAN) {
3301             val = bswap16(val);
3302         }
3303 #else
3304         if (endian == DEVICE_BIG_ENDIAN) {
3305             val = bswap16(val);
3306         }
3307 #endif
3308     } else {
3309         /* RAM case */
3310         ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3311         switch (endian) {
3312         case DEVICE_LITTLE_ENDIAN:
3313             val = lduw_le_p(ptr);
3314             break;
3315         case DEVICE_BIG_ENDIAN:
3316             val = lduw_be_p(ptr);
3317             break;
3318         default:
3319             val = lduw_p(ptr);
3320             break;
3321         }
3322         r = MEMTX_OK;
3323     }
3324     if (result) {
3325         *result = r;
3326     }
3327     if (release_lock) {
3328         qemu_mutex_unlock_iothread();
3329     }
3330     rcu_read_unlock();
3331     return val;
3332 }
3333
3334 uint32_t address_space_lduw(AddressSpace *as, hwaddr addr,
3335                            MemTxAttrs attrs, MemTxResult *result)
3336 {
3337     return address_space_lduw_internal(as, addr, attrs, result,
3338                                        DEVICE_NATIVE_ENDIAN);
3339 }
3340
3341 uint32_t address_space_lduw_le(AddressSpace *as, hwaddr addr,
3342                            MemTxAttrs attrs, MemTxResult *result)
3343 {
3344     return address_space_lduw_internal(as, addr, attrs, result,
3345                                        DEVICE_LITTLE_ENDIAN);
3346 }
3347
3348 uint32_t address_space_lduw_be(AddressSpace *as, hwaddr addr,
3349                            MemTxAttrs attrs, MemTxResult *result)
3350 {
3351     return address_space_lduw_internal(as, addr, attrs, result,
3352                                        DEVICE_BIG_ENDIAN);
3353 }
3354
3355 uint32_t lduw_phys(AddressSpace *as, hwaddr addr)
3356 {
3357     return address_space_lduw(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3358 }
3359
3360 uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr)
3361 {
3362     return address_space_lduw_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3363 }
3364
3365 uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr)
3366 {
3367     return address_space_lduw_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3368 }
3369
3370 /* warning: addr must be aligned. The ram page is not masked as dirty
3371    and the code inside is not invalidated. It is useful if the dirty
3372    bits are used to track modified PTEs */
3373 void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,
3374                                 MemTxAttrs attrs, MemTxResult *result)
3375 {
3376     uint8_t *ptr;
3377     MemoryRegion *mr;
3378     hwaddr l = 4;
3379     hwaddr addr1;
3380     MemTxResult r;
3381     uint8_t dirty_log_mask;
3382     bool release_lock = false;
3383
3384     rcu_read_lock();
3385     mr = address_space_translate(as, addr, &addr1, &l,
3386                                  true);
3387     if (l < 4 || !memory_access_is_direct(mr, true)) {
3388         release_lock |= prepare_mmio_access(mr);
3389
3390         r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
3391     } else {
3392         ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3393         stl_p(ptr, val);
3394
3395         dirty_log_mask = memory_region_get_dirty_log_mask(mr);
3396         dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
3397         cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
3398                                             4, dirty_log_mask);
3399         r = MEMTX_OK;
3400     }
3401     if (result) {
3402         *result = r;
3403     }
3404     if (release_lock) {
3405         qemu_mutex_unlock_iothread();
3406     }
3407     rcu_read_unlock();
3408 }
3409
3410 void stl_phys_notdirty(AddressSpace *as, hwaddr addr, uint32_t val)
3411 {
3412     address_space_stl_notdirty(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3413 }
3414
3415 /* warning: addr must be aligned */
3416 static inline void address_space_stl_internal(AddressSpace *as,
3417                                               hwaddr addr, uint32_t val,
3418                                               MemTxAttrs attrs,
3419                                               MemTxResult *result,
3420                                               enum device_endian endian)
3421 {
3422     uint8_t *ptr;
3423     MemoryRegion *mr;
3424     hwaddr l = 4;
3425     hwaddr addr1;
3426     MemTxResult r;
3427     bool release_lock = false;
3428
3429     rcu_read_lock();
3430     mr = address_space_translate(as, addr, &addr1, &l,
3431                                  true);
3432     if (l < 4 || !memory_access_is_direct(mr, true)) {
3433         release_lock |= prepare_mmio_access(mr);
3434
3435 #if defined(TARGET_WORDS_BIGENDIAN)
3436         if (endian == DEVICE_LITTLE_ENDIAN) {
3437             val = bswap32(val);
3438         }
3439 #else
3440         if (endian == DEVICE_BIG_ENDIAN) {
3441             val = bswap32(val);
3442         }
3443 #endif
3444         r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
3445     } else {
3446         /* RAM case */
3447         ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3448         switch (endian) {
3449         case DEVICE_LITTLE_ENDIAN:
3450             stl_le_p(ptr, val);
3451             break;
3452         case DEVICE_BIG_ENDIAN:
3453             stl_be_p(ptr, val);
3454             break;
3455         default:
3456             stl_p(ptr, val);
3457             break;
3458         }
3459         invalidate_and_set_dirty(mr, addr1, 4);
3460         r = MEMTX_OK;
3461     }
3462     if (result) {
3463         *result = r;
3464     }
3465     if (release_lock) {
3466         qemu_mutex_unlock_iothread();
3467     }
3468     rcu_read_unlock();
3469 }
3470
3471 void address_space_stl(AddressSpace *as, hwaddr addr, uint32_t val,
3472                        MemTxAttrs attrs, MemTxResult *result)
3473 {
3474     address_space_stl_internal(as, addr, val, attrs, result,
3475                                DEVICE_NATIVE_ENDIAN);
3476 }
3477
3478 void address_space_stl_le(AddressSpace *as, hwaddr addr, uint32_t val,
3479                        MemTxAttrs attrs, MemTxResult *result)
3480 {
3481     address_space_stl_internal(as, addr, val, attrs, result,
3482                                DEVICE_LITTLE_ENDIAN);
3483 }
3484
3485 void address_space_stl_be(AddressSpace *as, hwaddr addr, uint32_t val,
3486                        MemTxAttrs attrs, MemTxResult *result)
3487 {
3488     address_space_stl_internal(as, addr, val, attrs, result,
3489                                DEVICE_BIG_ENDIAN);
3490 }
3491
3492 void stl_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3493 {
3494     address_space_stl(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3495 }
3496
3497 void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3498 {
3499     address_space_stl_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3500 }
3501
3502 void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3503 {
3504     address_space_stl_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3505 }
3506
3507 /* XXX: optimize */
3508 void address_space_stb(AddressSpace *as, hwaddr addr, uint32_t val,
3509                        MemTxAttrs attrs, MemTxResult *result)
3510 {
3511     uint8_t v = val;
3512     MemTxResult r;
3513
3514     r = address_space_rw(as, addr, attrs, &v, 1, 1);
3515     if (result) {
3516         *result = r;
3517     }
3518 }
3519
3520 void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3521 {
3522     address_space_stb(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3523 }
3524
3525 /* warning: addr must be aligned */
3526 static inline void address_space_stw_internal(AddressSpace *as,
3527                                               hwaddr addr, uint32_t val,
3528                                               MemTxAttrs attrs,
3529                                               MemTxResult *result,
3530                                               enum device_endian endian)
3531 {
3532     uint8_t *ptr;
3533     MemoryRegion *mr;
3534     hwaddr l = 2;
3535     hwaddr addr1;
3536     MemTxResult r;
3537     bool release_lock = false;
3538
3539     rcu_read_lock();
3540     mr = address_space_translate(as, addr, &addr1, &l, true);
3541     if (l < 2 || !memory_access_is_direct(mr, true)) {
3542         release_lock |= prepare_mmio_access(mr);
3543
3544 #if defined(TARGET_WORDS_BIGENDIAN)
3545         if (endian == DEVICE_LITTLE_ENDIAN) {
3546             val = bswap16(val);
3547         }
3548 #else
3549         if (endian == DEVICE_BIG_ENDIAN) {
3550             val = bswap16(val);
3551         }
3552 #endif
3553         r = memory_region_dispatch_write(mr, addr1, val, 2, attrs);
3554     } else {
3555         /* RAM case */
3556         ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3557         switch (endian) {
3558         case DEVICE_LITTLE_ENDIAN:
3559             stw_le_p(ptr, val);
3560             break;
3561         case DEVICE_BIG_ENDIAN:
3562             stw_be_p(ptr, val);
3563             break;
3564         default:
3565             stw_p(ptr, val);
3566             break;
3567         }
3568         invalidate_and_set_dirty(mr, addr1, 2);
3569         r = MEMTX_OK;
3570     }
3571     if (result) {
3572         *result = r;
3573     }
3574     if (release_lock) {
3575         qemu_mutex_unlock_iothread();
3576     }
3577     rcu_read_unlock();
3578 }
3579
3580 void address_space_stw(AddressSpace *as, hwaddr addr, uint32_t val,
3581                        MemTxAttrs attrs, MemTxResult *result)
3582 {
3583     address_space_stw_internal(as, addr, val, attrs, result,
3584                                DEVICE_NATIVE_ENDIAN);
3585 }
3586
3587 void address_space_stw_le(AddressSpace *as, hwaddr addr, uint32_t val,
3588                        MemTxAttrs attrs, MemTxResult *result)
3589 {
3590     address_space_stw_internal(as, addr, val, attrs, result,
3591                                DEVICE_LITTLE_ENDIAN);
3592 }
3593
3594 void address_space_stw_be(AddressSpace *as, hwaddr addr, uint32_t val,
3595                        MemTxAttrs attrs, MemTxResult *result)
3596 {
3597     address_space_stw_internal(as, addr, val, attrs, result,
3598                                DEVICE_BIG_ENDIAN);
3599 }
3600
3601 void stw_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3602 {
3603     address_space_stw(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3604 }
3605
3606 void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3607 {
3608     address_space_stw_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3609 }
3610
3611 void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3612 {
3613     address_space_stw_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3614 }
3615
3616 /* XXX: optimize */
3617 void address_space_stq(AddressSpace *as, hwaddr addr, uint64_t val,
3618                        MemTxAttrs attrs, MemTxResult *result)
3619 {
3620     MemTxResult r;
3621     val = tswap64(val);
3622     r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1);
3623     if (result) {
3624         *result = r;
3625     }
3626 }
3627
3628 void address_space_stq_le(AddressSpace *as, hwaddr addr, uint64_t val,
3629                        MemTxAttrs attrs, MemTxResult *result)
3630 {
3631     MemTxResult r;
3632     val = cpu_to_le64(val);
3633     r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1);
3634     if (result) {
3635         *result = r;
3636     }
3637 }
3638 void address_space_stq_be(AddressSpace *as, hwaddr addr, uint64_t val,
3639                        MemTxAttrs attrs, MemTxResult *result)
3640 {
3641     MemTxResult r;
3642     val = cpu_to_be64(val);
3643     r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1);
3644     if (result) {
3645         *result = r;
3646     }
3647 }
3648
3649 void stq_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3650 {
3651     address_space_stq(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3652 }
3653
3654 void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3655 {
3656     address_space_stq_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3657 }
3658
3659 void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3660 {
3661     address_space_stq_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3662 }
3663
3664 /* virtual memory access for debug (includes writing to ROM) */
3665 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3666                         uint8_t *buf, int len, int is_write)
3667 {
3668     int l;
3669     hwaddr phys_addr;
3670     target_ulong page;
3671
3672     while (len > 0) {
3673         int asidx;
3674         MemTxAttrs attrs;
3675
3676         page = addr & TARGET_PAGE_MASK;
3677         phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3678         asidx = cpu_asidx_from_attrs(cpu, attrs);
3679         /* if no physical page mapped, return an error */
3680         if (phys_addr == -1)
3681             return -1;
3682         l = (page + TARGET_PAGE_SIZE) - addr;
3683         if (l > len)
3684             l = len;
3685         phys_addr += (addr & ~TARGET_PAGE_MASK);
3686         if (is_write) {
3687             cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
3688                                           phys_addr, buf, l);
3689         } else {
3690             address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3691                              MEMTXATTRS_UNSPECIFIED,
3692                              buf, l, 0);
3693         }
3694         len -= l;
3695         buf += l;
3696         addr += l;
3697     }
3698     return 0;
3699 }
3700
3701 /*
3702  * Allows code that needs to deal with migration bitmaps etc to still be built
3703  * target independent.
3704  */
3705 size_t qemu_target_page_bits(void)
3706 {
3707     return TARGET_PAGE_BITS;
3708 }
3709
3710 #endif
3711
/*
 * A helper function for the _utterly broken_ virtio device model to find out
 * if it's running on a big endian machine. Don't do this at home kids!
 *
 * Returns true when TARGET_WORDS_BIGENDIAN is defined at build time and
 * false otherwise.
 */
bool target_words_bigendian(void);
bool target_words_bigendian(void)
{
    bool big_endian = false;

#if defined(TARGET_WORDS_BIGENDIAN)
    big_endian = true;
#endif
    return big_endian;
}
3725
3726 #ifndef CONFIG_USER_ONLY
3727 bool cpu_physical_memory_is_io(hwaddr phys_addr)
3728 {
3729     MemoryRegion*mr;
3730     hwaddr l = 1;
3731     bool res;
3732
3733     rcu_read_lock();
3734     mr = address_space_translate(&address_space_memory,
3735                                  phys_addr, &phys_addr, &l, false);
3736
3737     res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3738     rcu_read_unlock();
3739     return res;
3740 }
3741
3742 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3743 {
3744     RAMBlock *block;
3745     int ret = 0;
3746
3747     rcu_read_lock();
3748     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
3749         ret = func(block->idstr, block->host, block->offset,
3750                    block->used_length, opaque);
3751         if (ret) {
3752             break;
3753         }
3754     }
3755     rcu_read_unlock();
3756     return ret;
3757 }
3758 #endif