Merge remote-tracking branch 'stweil/hax-new'
[qemu/ar7.git] / exec.c
1 /*
2 * Virtual page mapping
4 * Copyright (c) 2003 Fabrice Bellard
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include "qemu/osdep.h"
20 #include "qapi/error.h"
21 #ifndef _WIN32
22 #endif
24 #include "qemu/cutils.h"
25 #include "cpu.h"
26 #include "exec/exec-all.h"
27 #include "tcg.h"
28 #include "hw/qdev-core.h"
29 #if !defined(CONFIG_USER_ONLY)
30 #include "hw/boards.h"
31 #include "hw/xen/xen.h"
32 #endif
33 #include "sysemu/kvm.h"
34 #include "sysemu/hax.h"
35 #include "sysemu/sysemu.h"
36 #include "qemu/timer.h"
37 #include "qemu/config-file.h"
38 #include "qemu/error-report.h"
39 #if defined(CONFIG_USER_ONLY)
40 #include "qemu.h"
41 #else /* !CONFIG_USER_ONLY */
42 #include "hw/hw.h"
43 #include "exec/memory.h"
44 #include "exec/ioport.h"
45 #include "sysemu/dma.h"
46 #include "exec/address-spaces.h"
47 #include "sysemu/xen-mapcache.h"
48 #include "trace.h"
49 #endif
50 #include "exec/cpu-all.h"
51 #include "qemu/rcu_queue.h"
52 #include "qemu/main-loop.h"
53 #include "translate-all.h"
54 #include "sysemu/replay.h"
56 #include "exec/memory-internal.h"
57 #include "exec/ram_addr.h"
58 #include "exec/log.h"
60 #include "migration/vmstate.h"
62 #include "qemu/range.h"
63 #ifndef _WIN32
64 #include "qemu/mmap-alloc.h"
65 #endif
67 //#define DEBUG_SUBPAGE
69 #if !defined(CONFIG_USER_ONLY)
70 /* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
71 * are protected by the ramlist lock.
73 RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
75 static MemoryRegion *system_memory;
76 static MemoryRegion *system_io;
78 AddressSpace address_space_io;
79 AddressSpace address_space_memory;
81 MemoryRegion io_mem_rom, io_mem_notdirty;
82 static MemoryRegion io_mem_unassigned;
84 /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
85 #define RAM_PREALLOC (1 << 0)
87 /* RAM is mmap-ed with MAP_SHARED */
88 #define RAM_SHARED (1 << 1)
90 /* Only a portion of RAM (used_length) is actually used, and migrated.
91 * This used_length size can change across reboots.
93 #define RAM_RESIZEABLE (1 << 2)
95 #endif
97 #ifdef TARGET_PAGE_BITS_VARY
98 int target_page_bits;
99 bool target_page_bits_decided;
100 #endif
102 struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
103 /* current CPU in the current thread. It is only valid inside
104 cpu_exec() */
105 __thread CPUState *current_cpu;
106 /* 0 = Do not count executed instructions.
107 1 = Precise instruction counting.
108 2 = Adaptive rate instruction counting. */
109 int use_icount;
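/* For reference: use_icount is normally configured from the "-icount"
 * command line option; a fixed shift value selects precise counting (1)
 * and "auto" selects adaptive rate counting (2). */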
111 bool set_preferred_target_page_bits(int bits)
113 /* The target page size is the lowest common denominator for all
114 * the CPUs in the system, so we can only make it smaller, never
115 * larger. And we can't make it smaller once we've committed to
116 * a particular size.
118 #ifdef TARGET_PAGE_BITS_VARY
119 assert(bits >= TARGET_PAGE_BITS_MIN);
120 if (target_page_bits == 0 || target_page_bits > bits) {
121 if (target_page_bits_decided) {
122 return false;
124 target_page_bits = bits;
126 #endif
127 return true;
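/* Illustrative usage, not taken from this file: a target built with
 * TARGET_PAGE_BITS_VARY would call e.g. set_preferred_target_page_bits(12)
 * while realizing its CPUs, i.e. before finalize_target_page_bits() below
 * sets target_page_bits_decided. */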
130 #if !defined(CONFIG_USER_ONLY)
132 static void finalize_target_page_bits(void)
134 #ifdef TARGET_PAGE_BITS_VARY
135 if (target_page_bits == 0) {
136 target_page_bits = TARGET_PAGE_BITS_MIN;
138 target_page_bits_decided = true;
139 #endif
142 typedef struct PhysPageEntry PhysPageEntry;
144 struct PhysPageEntry {
145     /* How many bits to skip to the next level (in units of L2_SIZE). 0 for a leaf. */
146 uint32_t skip : 6;
147 /* index into phys_sections (!skip) or phys_map_nodes (skip) */
148 uint32_t ptr : 26;
151 #define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
153 /* Size of the L2 (and L3, etc) page tables. */
154 #define ADDR_SPACE_BITS 64
156 #define P_L2_BITS 9
157 #define P_L2_SIZE (1 << P_L2_BITS)
159 #define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
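/* For example, with 64-bit physical addresses and 4 KiB target pages
 * (TARGET_PAGE_BITS == 12) this gives ((64 - 12 - 1) / 9) + 1 = 6 levels,
 * each level resolving P_L2_BITS == 9 bits of the page index. */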
161 typedef PhysPageEntry Node[P_L2_SIZE];
163 typedef struct PhysPageMap {
164 struct rcu_head rcu;
166 unsigned sections_nb;
167 unsigned sections_nb_alloc;
168 unsigned nodes_nb;
169 unsigned nodes_nb_alloc;
170 Node *nodes;
171 MemoryRegionSection *sections;
172 } PhysPageMap;
174 struct AddressSpaceDispatch {
175 struct rcu_head rcu;
177 MemoryRegionSection *mru_section;
178 /* This is a multi-level map on the physical address space.
179 * The bottom level has pointers to MemoryRegionSections.
181 PhysPageEntry phys_map;
182 PhysPageMap map;
183 AddressSpace *as;
186 #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
187 typedef struct subpage_t {
188 MemoryRegion iomem;
189 AddressSpace *as;
190 hwaddr base;
191 uint16_t sub_section[];
192 } subpage_t;
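/* A subpage_t models a single target page whose bytes are backed by more
 * than one MemoryRegionSection: sub_section[] holds one section index per
 * byte offset within the page, looked up via SUBPAGE_IDX(addr) and filled
 * in by register_subpage()/subpage_register() below. */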
194 #define PHYS_SECTION_UNASSIGNED 0
195 #define PHYS_SECTION_NOTDIRTY 1
196 #define PHYS_SECTION_ROM 2
197 #define PHYS_SECTION_WATCH 3
199 static void io_mem_init(void);
200 static void memory_map_init(void);
201 static void tcg_commit(MemoryListener *listener);
203 static MemoryRegion io_mem_watch;
206 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
207 * @cpu: the CPU whose AddressSpace this is
208 * @as: the AddressSpace itself
209 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
210 * @tcg_as_listener: listener for tracking changes to the AddressSpace
212 struct CPUAddressSpace {
213 CPUState *cpu;
214 AddressSpace *as;
215 struct AddressSpaceDispatch *memory_dispatch;
216 MemoryListener tcg_as_listener;
219 #endif
221 #if !defined(CONFIG_USER_ONLY)
223 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
225 static unsigned alloc_hint = 16;
226 if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
227 map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
228 map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
229 map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
230 alloc_hint = map->nodes_nb_alloc;
234 static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
236 unsigned i;
237 uint32_t ret;
238 PhysPageEntry e;
239 PhysPageEntry *p;
241 ret = map->nodes_nb++;
242 p = map->nodes[ret];
243 assert(ret != PHYS_MAP_NODE_NIL);
244 assert(ret != map->nodes_nb_alloc);
246 e.skip = leaf ? 0 : 1;
247 e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
248 for (i = 0; i < P_L2_SIZE; ++i) {
249 memcpy(&p[i], &e, sizeof(e));
251 return ret;
254 static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
255 hwaddr *index, hwaddr *nb, uint16_t leaf,
256 int level)
258 PhysPageEntry *p;
259 hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
261 if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
262 lp->ptr = phys_map_node_alloc(map, level == 0);
264 p = map->nodes[lp->ptr];
265 lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
267 while (*nb && lp < &p[P_L2_SIZE]) {
268 if ((*index & (step - 1)) == 0 && *nb >= step) {
269 lp->skip = 0;
270 lp->ptr = leaf;
271 *index += step;
272 *nb -= step;
273 } else {
274 phys_page_set_level(map, lp, index, nb, leaf, level - 1);
276 ++lp;
280 static void phys_page_set(AddressSpaceDispatch *d,
281 hwaddr index, hwaddr nb,
282 uint16_t leaf)
284 /* Wildly overreserve - it doesn't matter much. */
285 phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
287 phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
290 /* Compact a non-leaf page entry. Simply detect that the entry has a single child,
291 * and update our entry so we can skip it and go directly to the destination.
293 static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
295 unsigned valid_ptr = P_L2_SIZE;
296 int valid = 0;
297 PhysPageEntry *p;
298 int i;
300 if (lp->ptr == PHYS_MAP_NODE_NIL) {
301 return;
304 p = nodes[lp->ptr];
305 for (i = 0; i < P_L2_SIZE; i++) {
306 if (p[i].ptr == PHYS_MAP_NODE_NIL) {
307 continue;
310 valid_ptr = i;
311 valid++;
312 if (p[i].skip) {
313 phys_page_compact(&p[i], nodes);
317 /* We can only compress if there's only one child. */
318 if (valid != 1) {
319 return;
322 assert(valid_ptr < P_L2_SIZE);
324 /* Don't compress if it won't fit in the # of bits we have. */
325 if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
326 return;
329 lp->ptr = p[valid_ptr].ptr;
330 if (!p[valid_ptr].skip) {
331 /* If our only child is a leaf, make this a leaf. */
332 /* By design, we should have made this node a leaf to begin with so we
333 * should never reach here.
334 * But since it's so simple to handle this, let's do it just in case we
335 * change this rule.
337 lp->skip = 0;
338 } else {
339 lp->skip += p[valid_ptr].skip;
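/* Example: if a node's single valid child itself skips two levels, the
 * parent entry now points directly at the grandchild's table with its skip
 * count increased accordingly, so phys_page_find() can walk fewer levels
 * for sparse address spaces. */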
343 static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
345 if (d->phys_map.skip) {
346 phys_page_compact(&d->phys_map, d->map.nodes);
350 static inline bool section_covers_addr(const MemoryRegionSection *section,
351 hwaddr addr)
353 /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
354 * the section must cover the entire address space.
356 return int128_gethi(section->size) ||
357 range_covers_byte(section->offset_within_address_space,
358 int128_getlo(section->size), addr);
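/* Example: a section spanning the whole 2^64 address space has
 * int128_gethi(size) != 0 and therefore trivially covers every address;
 * any smaller section falls through to the explicit range check. */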
361 static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
362 Node *nodes, MemoryRegionSection *sections)
364 PhysPageEntry *p;
365 hwaddr index = addr >> TARGET_PAGE_BITS;
366 int i;
368 for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
369 if (lp.ptr == PHYS_MAP_NODE_NIL) {
370 return &sections[PHYS_SECTION_UNASSIGNED];
372 p = nodes[lp.ptr];
373 lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
376 if (section_covers_addr(&sections[lp.ptr], addr)) {
377 return &sections[lp.ptr];
378 } else {
379 return &sections[PHYS_SECTION_UNASSIGNED];
383 bool memory_region_is_unassigned(MemoryRegion *mr)
385 return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
386 && mr != &io_mem_watch;
389 /* Called from RCU critical section */
390 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
391 hwaddr addr,
392 bool resolve_subpage)
394 MemoryRegionSection *section = atomic_read(&d->mru_section);
395 subpage_t *subpage;
396 bool update;
398 if (section && section != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
399 section_covers_addr(section, addr)) {
400 update = false;
401 } else {
402 section = phys_page_find(d->phys_map, addr, d->map.nodes,
403 d->map.sections);
404 update = true;
406 if (resolve_subpage && section->mr->subpage) {
407 subpage = container_of(section->mr, subpage_t, iomem);
408 section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
410 if (update) {
411 atomic_set(&d->mru_section, section);
413 return section;
416 /* Called from RCU critical section */
417 static MemoryRegionSection *
418 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
419 hwaddr *plen, bool resolve_subpage)
421 MemoryRegionSection *section;
422 MemoryRegion *mr;
423 Int128 diff;
425 section = address_space_lookup_region(d, addr, resolve_subpage);
426 /* Compute offset within MemoryRegionSection */
427 addr -= section->offset_within_address_space;
429 /* Compute offset within MemoryRegion */
430 *xlat = addr + section->offset_within_region;
432 mr = section->mr;
434 /* MMIO registers can be expected to perform full-width accesses based only
435 * on their address, without considering adjacent registers that could
436 * decode to completely different MemoryRegions. When such registers
437 * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
438 * regions overlap wildly. For this reason we cannot clamp the accesses
439 * here.
441 * If the length is small (as is the case for address_space_ldl/stl),
442 * everything works fine. If the incoming length is large, however,
443 * the caller really has to do the clamping through memory_access_size.
445 if (memory_region_is_ram(mr)) {
446 diff = int128_sub(section->size, int128_make64(addr));
447 *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
449 return section;
452 /* Called from RCU critical section */
453 MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
454 hwaddr *xlat, hwaddr *plen,
455 bool is_write)
457 IOMMUTLBEntry iotlb;
458 MemoryRegionSection *section;
459 MemoryRegion *mr;
461 for (;;) {
462 AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
463 section = address_space_translate_internal(d, addr, &addr, plen, true);
464 mr = section->mr;
466 if (!mr->iommu_ops) {
467 break;
470 iotlb = mr->iommu_ops->translate(mr, addr, is_write);
471 addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
472 | (addr & iotlb.addr_mask));
473 *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
474 if (!(iotlb.perm & (1 << is_write))) {
475 mr = &io_mem_unassigned;
476 break;
479 as = iotlb.target_as;
482 if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
483 hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
484 *plen = MIN(page, *plen);
487 *xlat = addr;
488 return mr;
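/* Illustrative sketch of a typical caller (along the lines of
 * address_space_rw); the returned MemoryRegion and *xlat are only valid
 * inside the RCU critical section:
 *
 *     rcu_read_lock();
 *     l = len;
 *     mr = address_space_translate(as, addr, &xlat, &l, is_write);
 *     ... access at most 'l' bytes of 'mr' starting at offset 'xlat' ...
 *     rcu_read_unlock();
 */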
491 /* Called from RCU critical section */
492 MemoryRegionSection *
493 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
494 hwaddr *xlat, hwaddr *plen)
496 MemoryRegionSection *section;
497 AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
499 section = address_space_translate_internal(d, addr, xlat, plen, false);
501 assert(!section->mr->iommu_ops);
502 return section;
504 #endif
506 #if !defined(CONFIG_USER_ONLY)
508 static int cpu_common_post_load(void *opaque, int version_id)
510 CPUState *cpu = opaque;
512 /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
513 version_id is increased. */
514 cpu->interrupt_request &= ~0x01;
515 tlb_flush(cpu, 1);
517 return 0;
520 static int cpu_common_pre_load(void *opaque)
522 CPUState *cpu = opaque;
524 cpu->exception_index = -1;
526 return 0;
529 static bool cpu_common_exception_index_needed(void *opaque)
531 CPUState *cpu = opaque;
533 return tcg_enabled() && cpu->exception_index != -1;
536 static const VMStateDescription vmstate_cpu_common_exception_index = {
537 .name = "cpu_common/exception_index",
538 .version_id = 1,
539 .minimum_version_id = 1,
540 .needed = cpu_common_exception_index_needed,
541 .fields = (VMStateField[]) {
542 VMSTATE_INT32(exception_index, CPUState),
543 VMSTATE_END_OF_LIST()
547 static bool cpu_common_crash_occurred_needed(void *opaque)
549 CPUState *cpu = opaque;
551 return cpu->crash_occurred;
554 static const VMStateDescription vmstate_cpu_common_crash_occurred = {
555 .name = "cpu_common/crash_occurred",
556 .version_id = 1,
557 .minimum_version_id = 1,
558 .needed = cpu_common_crash_occurred_needed,
559 .fields = (VMStateField[]) {
560 VMSTATE_BOOL(crash_occurred, CPUState),
561 VMSTATE_END_OF_LIST()
565 const VMStateDescription vmstate_cpu_common = {
566 .name = "cpu_common",
567 .version_id = 1,
568 .minimum_version_id = 1,
569 .pre_load = cpu_common_pre_load,
570 .post_load = cpu_common_post_load,
571 .fields = (VMStateField[]) {
572 VMSTATE_UINT32(halted, CPUState),
573 VMSTATE_UINT32(interrupt_request, CPUState),
574 VMSTATE_END_OF_LIST()
576 .subsections = (const VMStateDescription*[]) {
577 &vmstate_cpu_common_exception_index,
578 &vmstate_cpu_common_crash_occurred,
579 NULL
583 #endif
585 CPUState *qemu_get_cpu(int index)
587 CPUState *cpu;
589 CPU_FOREACH(cpu) {
590 if (cpu->cpu_index == index) {
591 return cpu;
595 return NULL;
598 #if !defined(CONFIG_USER_ONLY)
599 void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
601 CPUAddressSpace *newas;
603 /* Target code should have set num_ases before calling us */
604 assert(asidx < cpu->num_ases);
606 if (asidx == 0) {
607 /* address space 0 gets the convenience alias */
608 cpu->as = as;
611 /* KVM cannot currently support multiple address spaces. */
612 assert(asidx == 0 || !kvm_enabled());
614 if (!cpu->cpu_ases) {
615 cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
618 newas = &cpu->cpu_ases[asidx];
619 newas->cpu = cpu;
620 newas->as = as;
621 if (tcg_enabled()) {
622 newas->tcg_as_listener.commit = tcg_commit;
623 memory_listener_register(&newas->tcg_as_listener, as);
627 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
629 /* Return the AddressSpace corresponding to the specified index */
630 return cpu->cpu_ases[asidx].as;
632 #endif
634 void cpu_exec_unrealizefn(CPUState *cpu)
636 CPUClass *cc = CPU_GET_CLASS(cpu);
638 cpu_list_remove(cpu);
640 if (cc->vmsd != NULL) {
641 vmstate_unregister(NULL, cc->vmsd, cpu);
643 if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
644 vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
648 void cpu_exec_initfn(CPUState *cpu)
650 #ifdef TARGET_WORDS_BIGENDIAN
651 cpu->bigendian = true;
652 #else
653 cpu->bigendian = false;
654 #endif
655 cpu->as = NULL;
656 cpu->num_ases = 0;
658 #ifndef CONFIG_USER_ONLY
659 cpu->thread_id = qemu_get_thread_id();
661 /* This is a softmmu CPU object, so create a property for it
662 * so users can wire up its memory. (This can't go in qom/cpu.c
663 * because that file is compiled only once for both user-mode
664 * and system builds.) The default if no link is set up is to use
665 * the system address space.
667 object_property_add_link(OBJECT(cpu), "memory", TYPE_MEMORY_REGION,
668 (Object **)&cpu->memory,
669 qdev_prop_allow_set_link_before_realize,
670 OBJ_PROP_LINK_UNREF_ON_RELEASE,
671 &error_abort);
672 cpu->memory = system_memory;
673 object_ref(OBJECT(cpu->memory));
674 #endif
677 void cpu_exec_realizefn(CPUState *cpu, Error **errp)
679 CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu);
681 cpu_list_add(cpu);
683 #ifndef CONFIG_USER_ONLY
684 if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
685 vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
687 if (cc->vmsd != NULL) {
688 vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
690 #endif
693 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
695 /* Flush the whole TB as this will not have race conditions
696 * even if we don't have proper locking yet.
697 * Ideally we would just invalidate the TBs for the
698 * specified PC.
700 tb_flush(cpu);
703 #if defined(CONFIG_USER_ONLY)
704 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
709 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
710 int flags)
712 return -ENOSYS;
715 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
719 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
720 int flags, CPUWatchpoint **watchpoint)
722 return -ENOSYS;
724 #else
725 /* Add a watchpoint. */
726 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
727 int flags, CPUWatchpoint **watchpoint)
729 CPUWatchpoint *wp;
731 /* forbid ranges which are empty or run off the end of the address space */
732 if (len == 0 || (addr + len - 1) < addr) {
733 error_report("tried to set invalid watchpoint at %"
734 VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
735 return -EINVAL;
737 wp = g_malloc(sizeof(*wp));
739 wp->vaddr = addr;
740 wp->len = len;
741 wp->flags = flags;
743 /* keep all GDB-injected watchpoints in front */
744 if (flags & BP_GDB) {
745 QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
746 } else {
747 QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
750 tlb_flush_page(cpu, addr);
752 if (watchpoint)
753 *watchpoint = wp;
754 return 0;
757 /* Remove a specific watchpoint. */
758 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
759 int flags)
761 CPUWatchpoint *wp;
763 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
764 if (addr == wp->vaddr && len == wp->len
765 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
766 cpu_watchpoint_remove_by_ref(cpu, wp);
767 return 0;
770 return -ENOENT;
773 /* Remove a specific watchpoint by reference. */
774 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
776 QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
778 tlb_flush_page(cpu, watchpoint->vaddr);
780 g_free(watchpoint);
783 /* Remove all matching watchpoints. */
784 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
786 CPUWatchpoint *wp, *next;
788 QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
789 if (wp->flags & mask) {
790 cpu_watchpoint_remove_by_ref(cpu, wp);
795 /* Return true if this watchpoint address matches the specified
796 * access (ie the address range covered by the watchpoint overlaps
797 * partially or completely with the address range covered by the
798 * access).
800 static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
801 vaddr addr,
802 vaddr len)
804 /* We know the lengths are non-zero, but a little caution is
805 * required to avoid errors in the case where the range ends
806 * exactly at the top of the address space and so addr + len
807 * wraps round to zero.
809 vaddr wpend = wp->vaddr + wp->len - 1;
810 vaddr addrend = addr + len - 1;
812 return !(addr > wpend || wp->vaddr > addrend);
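/* Example of the wrap-around case handled above: a 1-byte watchpoint on the
 * last byte of the address space would make wp->vaddr + wp->len wrap to 0,
 * but wpend = wp->vaddr + wp->len - 1 stays at the top of the range, so the
 * overlap test remains correct. */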
815 #endif
817 /* Add a breakpoint. */
818 int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
819 CPUBreakpoint **breakpoint)
821 CPUBreakpoint *bp;
823 bp = g_malloc(sizeof(*bp));
825 bp->pc = pc;
826 bp->flags = flags;
828 /* keep all GDB-injected breakpoints in front */
829 if (flags & BP_GDB) {
830 QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
831 } else {
832 QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
835 breakpoint_invalidate(cpu, pc);
837 if (breakpoint) {
838 *breakpoint = bp;
840 return 0;
843 /* Remove a specific breakpoint. */
844 int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
846 CPUBreakpoint *bp;
848 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
849 if (bp->pc == pc && bp->flags == flags) {
850 cpu_breakpoint_remove_by_ref(cpu, bp);
851 return 0;
854 return -ENOENT;
857 /* Remove a specific breakpoint by reference. */
858 void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
860 QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
862 breakpoint_invalidate(cpu, breakpoint->pc);
864 g_free(breakpoint);
867 /* Remove all matching breakpoints. */
868 void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
870 CPUBreakpoint *bp, *next;
872 QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
873 if (bp->flags & mask) {
874 cpu_breakpoint_remove_by_ref(cpu, bp);
879 /* enable or disable single step mode. EXCP_DEBUG is returned by the
880 CPU loop after each instruction */
881 void cpu_single_step(CPUState *cpu, int enabled)
883 if (cpu->singlestep_enabled != enabled) {
884 cpu->singlestep_enabled = enabled;
885 if (kvm_enabled()) {
886 kvm_update_guest_debug(cpu, 0);
887 } else {
888 /* must flush all the translated code to avoid inconsistencies */
889 /* XXX: only flush what is necessary */
890 tb_flush(cpu);
895 void QEMU_NORETURN cpu_abort(CPUState *cpu, const char *fmt, ...)
897 va_list ap;
898 va_list ap2;
900 va_start(ap, fmt);
901 va_copy(ap2, ap);
902 fprintf(stderr, "qemu: fatal: ");
903 vfprintf(stderr, fmt, ap);
904 fprintf(stderr, "\n");
905 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
906 if (qemu_log_separate()) {
907 qemu_log_lock();
908 qemu_log("qemu: fatal: ");
909 qemu_log_vprintf(fmt, ap2);
910 qemu_log("\n");
911 log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
912 qemu_log_flush();
913 qemu_log_unlock();
914 qemu_log_close();
916 va_end(ap2);
917 va_end(ap);
918 replay_finish();
919 #if defined(CONFIG_USER_ONLY)
921 struct sigaction act;
922 sigfillset(&act.sa_mask);
923 act.sa_handler = SIG_DFL;
924 sigaction(SIGABRT, &act, NULL);
926 #endif
927 abort();
930 #if !defined(CONFIG_USER_ONLY)
931 /* Called from RCU critical section */
932 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
934 RAMBlock *block;
936 block = atomic_rcu_read(&ram_list.mru_block);
937 if (block && addr - block->offset < block->max_length) {
938 return block;
940 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
941 if (addr - block->offset < block->max_length) {
942 goto found;
946 fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
947 abort();
949 found:
950 /* It is safe to write mru_block outside the iothread lock. This
951 * is what happens:
953 * mru_block = xxx
954 * rcu_read_unlock()
955 * xxx removed from list
956 * rcu_read_lock()
957 * read mru_block
958 * mru_block = NULL;
959 * call_rcu(reclaim_ramblock, xxx);
960 * rcu_read_unlock()
962 * atomic_rcu_set is not needed here. The block was already published
963 * when it was placed into the list. Here we're just making an extra
964 * copy of the pointer.
966 ram_list.mru_block = block;
967 return block;
970 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
972 CPUState *cpu;
973 ram_addr_t start1;
974 RAMBlock *block;
975 ram_addr_t end;
977 end = TARGET_PAGE_ALIGN(start + length);
978 start &= TARGET_PAGE_MASK;
980 rcu_read_lock();
981 block = qemu_get_ram_block(start);
982 assert(block == qemu_get_ram_block(end - 1));
983 start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
984 CPU_FOREACH(cpu) {
985 tlb_reset_dirty(cpu, start1, length);
987 rcu_read_unlock();
990 /* Note: start and end must be within the same ram block. */
991 bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
992 ram_addr_t length,
993 unsigned client)
995 DirtyMemoryBlocks *blocks;
996 unsigned long end, page;
997 bool dirty = false;
999 if (length == 0) {
1000 return false;
1003 end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1004 page = start >> TARGET_PAGE_BITS;
1006 rcu_read_lock();
1008 blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1010 while (page < end) {
1011 unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1012 unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1013 unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1015 dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1016 offset, num);
1017 page += num;
1020 rcu_read_unlock();
1022 if (dirty && tcg_enabled()) {
1023 tlb_reset_dirty_range_all(start, length);
1026 return dirty;
1029 /* Called from RCU critical section */
1030 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1031 MemoryRegionSection *section,
1032 target_ulong vaddr,
1033 hwaddr paddr, hwaddr xlat,
1034 int prot,
1035 target_ulong *address)
1037 hwaddr iotlb;
1038 CPUWatchpoint *wp;
1040 if (memory_region_is_ram(section->mr)) {
1041 /* Normal RAM. */
1042 iotlb = memory_region_get_ram_addr(section->mr) + xlat;
1043 if (!section->readonly) {
1044 iotlb |= PHYS_SECTION_NOTDIRTY;
1045 } else {
1046 iotlb |= PHYS_SECTION_ROM;
1048 } else {
1049 AddressSpaceDispatch *d;
1051 d = atomic_rcu_read(&section->address_space->dispatch);
1052 iotlb = section - d->map.sections;
1053 iotlb += xlat;
1056 /* Make accesses to pages with watchpoints go via the
1057 watchpoint trap routines. */
1058 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1059 if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
1060 /* Avoid trapping reads of pages with a write breakpoint. */
1061 if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
1062 iotlb = PHYS_SECTION_WATCH + paddr;
1063 *address |= TLB_MMIO;
1064 break;
1069 return iotlb;
1071 #endif /* defined(CONFIG_USER_ONLY) */
1073 #if !defined(CONFIG_USER_ONLY)
1075 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
1076 uint16_t section);
1077 static subpage_t *subpage_init(AddressSpace *as, hwaddr base);
1079 static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
1080 qemu_anon_ram_alloc;
1083  * Set a custom physical guest memory allocator.
1084 * Accelerators with unusual needs may need this. Hopefully, we can
1085 * get rid of it eventually.
1087 void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
1089 phys_mem_alloc = alloc;
1092 static uint16_t phys_section_add(PhysPageMap *map,
1093 MemoryRegionSection *section)
1095 /* The physical section number is ORed with a page-aligned
1096 * pointer to produce the iotlb entries. Thus it should
1097 * never overflow into the page-aligned value.
1099 assert(map->sections_nb < TARGET_PAGE_SIZE);
1101 if (map->sections_nb == map->sections_nb_alloc) {
1102 map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1103 map->sections = g_renew(MemoryRegionSection, map->sections,
1104 map->sections_nb_alloc);
1106 map->sections[map->sections_nb] = *section;
1107 memory_region_ref(section->mr);
1108 return map->sections_nb++;
1111 static void phys_section_destroy(MemoryRegion *mr)
1113 bool have_sub_page = mr->subpage;
1115 memory_region_unref(mr);
1117 if (have_sub_page) {
1118 subpage_t *subpage = container_of(mr, subpage_t, iomem);
1119 object_unref(OBJECT(&subpage->iomem));
1120 g_free(subpage);
1124 static void phys_sections_free(PhysPageMap *map)
1126 while (map->sections_nb > 0) {
1127 MemoryRegionSection *section = &map->sections[--map->sections_nb];
1128 phys_section_destroy(section->mr);
1130 g_free(map->sections);
1131 g_free(map->nodes);
1134 static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
1136 subpage_t *subpage;
1137 hwaddr base = section->offset_within_address_space
1138 & TARGET_PAGE_MASK;
1139 MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
1140 d->map.nodes, d->map.sections);
1141 MemoryRegionSection subsection = {
1142 .offset_within_address_space = base,
1143 .size = int128_make64(TARGET_PAGE_SIZE),
1145 hwaddr start, end;
1147 assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1149 if (!(existing->mr->subpage)) {
1150 subpage = subpage_init(d->as, base);
1151 subsection.address_space = d->as;
1152 subsection.mr = &subpage->iomem;
1153 phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1154 phys_section_add(&d->map, &subsection));
1155 } else {
1156 subpage = container_of(existing->mr, subpage_t, iomem);
1158 start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1159 end = start + int128_get64(section->size) - 1;
1160 subpage_register(subpage, start, end,
1161 phys_section_add(&d->map, section));
1165 static void register_multipage(AddressSpaceDispatch *d,
1166 MemoryRegionSection *section)
1168 hwaddr start_addr = section->offset_within_address_space;
1169 uint16_t section_index = phys_section_add(&d->map, section);
1170 uint64_t num_pages = int128_get64(int128_rshift(section->size,
1171 TARGET_PAGE_BITS));
1173 assert(num_pages);
1174 phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1177 static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
1179 AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
1180 AddressSpaceDispatch *d = as->next_dispatch;
1181 MemoryRegionSection now = *section, remain = *section;
1182 Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1184 if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
1185 uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
1186 - now.offset_within_address_space;
1188 now.size = int128_min(int128_make64(left), now.size);
1189 register_subpage(d, &now);
1190 } else {
1191 now.size = int128_zero();
1193 while (int128_ne(remain.size, now.size)) {
1194 remain.size = int128_sub(remain.size, now.size);
1195 remain.offset_within_address_space += int128_get64(now.size);
1196 remain.offset_within_region += int128_get64(now.size);
1197 now = remain;
1198 if (int128_lt(remain.size, page_size)) {
1199 register_subpage(d, &now);
1200 } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1201 now.size = page_size;
1202 register_subpage(d, &now);
1203 } else {
1204 now.size = int128_and(now.size, int128_neg(page_size));
1205 register_multipage(d, &now);
1210 void qemu_flush_coalesced_mmio_buffer(void)
1212 if (kvm_enabled())
1213 kvm_flush_coalesced_mmio_buffer();
1216 void qemu_mutex_lock_ramlist(void)
1218 qemu_mutex_lock(&ram_list.mutex);
1221 void qemu_mutex_unlock_ramlist(void)
1223 qemu_mutex_unlock(&ram_list.mutex);
1226 #ifdef __linux__
1227 static int64_t get_file_size(int fd)
1229 int64_t size = lseek(fd, 0, SEEK_END);
1230 if (size < 0) {
1231 return -errno;
1233 return size;
1236 static void *file_ram_alloc(RAMBlock *block,
1237 ram_addr_t memory,
1238 const char *path,
1239 Error **errp)
1241 bool unlink_on_error = false;
1242 char *filename;
1243 char *sanitized_name;
1244 char *c;
1245 void * volatile area = MAP_FAILED;
1246 int fd = -1;
1247 int64_t file_size;
1249 if (kvm_enabled() && !kvm_has_sync_mmu()) {
1250 error_setg(errp,
1251 "host lacks kvm mmu notifiers, -mem-path unsupported");
1252 return NULL;
1255 for (;;) {
1256 fd = open(path, O_RDWR);
1257 if (fd >= 0) {
1258 /* @path names an existing file, use it */
1259 break;
1261 if (errno == ENOENT) {
1262 /* @path names a file that doesn't exist, create it */
1263 fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1264 if (fd >= 0) {
1265 unlink_on_error = true;
1266 break;
1268 } else if (errno == EISDIR) {
1269 /* @path names a directory, create a file there */
1270 /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1271 sanitized_name = g_strdup(memory_region_name(block->mr));
1272 for (c = sanitized_name; *c != '\0'; c++) {
1273 if (*c == '/') {
1274 *c = '_';
1278 filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1279 sanitized_name);
1280 g_free(sanitized_name);
1282 fd = mkstemp(filename);
1283 if (fd >= 0) {
1284 unlink(filename);
1285 g_free(filename);
1286 break;
1288 g_free(filename);
1290 if (errno != EEXIST && errno != EINTR) {
1291 error_setg_errno(errp, errno,
1292 "can't open backing store %s for guest RAM",
1293 path);
1294 goto error;
1297 * Try again on EINTR and EEXIST. The latter happens when
1298 * something else creates the file between our two open().
1302 block->page_size = qemu_fd_getpagesize(fd);
1303 block->mr->align = block->page_size;
1304 #if defined(__s390x__)
1305 if (kvm_enabled()) {
1306 block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1308 #endif
1310 file_size = get_file_size(fd);
1312 if (memory < block->page_size) {
1313 error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1314 "or larger than page size 0x%zx",
1315 memory, block->page_size);
1316 goto error;
1319 if (file_size > 0 && file_size < memory) {
1320 error_setg(errp, "backing store %s size 0x%" PRIx64
1321 " does not match 'size' option 0x" RAM_ADDR_FMT,
1322 path, file_size, memory);
1323 goto error;
1326 memory = ROUND_UP(memory, block->page_size);
1329 * ftruncate is not supported by hugetlbfs in older
1330 * hosts, so don't bother bailing out on errors.
1331 * If anything goes wrong with it under other filesystems,
1332 * mmap will fail.
1334 * Do not truncate the non-empty backend file to avoid corrupting
1335 * the existing data in the file. Disabling shrinking is not
1336 * enough. For example, the current vNVDIMM implementation stores
1337 * the guest NVDIMM labels at the end of the backend file. If the
1338 * backend file is later extended, QEMU will not be able to find
1339 * those labels. Therefore, extending the non-empty backend file
1340 * is disabled as well.
1342 if (!file_size && ftruncate(fd, memory)) {
1343 perror("ftruncate");
1346 area = qemu_ram_mmap(fd, memory, block->mr->align,
1347 block->flags & RAM_SHARED);
1348 if (area == MAP_FAILED) {
1349 error_setg_errno(errp, errno,
1350 "unable to map backing store for guest RAM");
1351 goto error;
1354 if (mem_prealloc) {
1355 os_mem_prealloc(fd, area, memory, errp);
1356 if (errp && *errp) {
1357 goto error;
1361 block->fd = fd;
1362 return area;
1364 error:
1365 if (area != MAP_FAILED) {
1366 qemu_ram_munmap(area, memory);
1368 if (unlink_on_error) {
1369 unlink(path);
1371 if (fd != -1) {
1372 close(fd);
1374 return NULL;
1376 #endif
1378 /* Called with the ramlist lock held. */
1379 static ram_addr_t find_ram_offset(ram_addr_t size)
1381 RAMBlock *block, *next_block;
1382 ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1384 assert(size != 0); /* it would hand out same offset multiple times */
1386 if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1387 return 0;
1390 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1391 ram_addr_t end, next = RAM_ADDR_MAX;
1393 end = block->offset + block->max_length;
1395 QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
1396 if (next_block->offset >= end) {
1397 next = MIN(next, next_block->offset);
1400 if (next - end >= size && next - end < mingap) {
1401 offset = end;
1402 mingap = next - end;
1406 if (offset == RAM_ADDR_MAX) {
1407 fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1408 (uint64_t)size);
1409 abort();
1412 return offset;
1415 ram_addr_t last_ram_offset(void)
1417 RAMBlock *block;
1418 ram_addr_t last = 0;
1420 rcu_read_lock();
1421 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1422 last = MAX(last, block->offset + block->max_length);
1424 rcu_read_unlock();
1425 return last;
1428 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1430 int ret;
1432     /* Use MADV_DONTDUMP if the user doesn't want the guest memory in the core dump */
1433 if (!machine_dump_guest_core(current_machine)) {
1434 ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1435 if (ret) {
1436 perror("qemu_madvise");
1437 fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1438 "but dump_guest_core=off specified\n");
1443 const char *qemu_ram_get_idstr(RAMBlock *rb)
1445 return rb->idstr;
1448 /* Called with iothread lock held. */
1449 void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
1451 RAMBlock *block;
1453 assert(new_block);
1454 assert(!new_block->idstr[0]);
1456 if (dev) {
1457 char *id = qdev_get_dev_path(dev);
1458 if (id) {
1459 snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1460 g_free(id);
1463 pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1465 rcu_read_lock();
1466 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1467 if (block != new_block &&
1468 !strcmp(block->idstr, new_block->idstr)) {
1469 fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1470 new_block->idstr);
1471 abort();
1474 rcu_read_unlock();
1477 /* Called with iothread lock held. */
1478 void qemu_ram_unset_idstr(RAMBlock *block)
1480 /* FIXME: arch_init.c assumes that this is not called throughout
1481 * migration. Ignore the problem since hot-unplug during migration
1482 * does not work anyway.
1484 if (block) {
1485 memset(block->idstr, 0, sizeof(block->idstr));
1489 size_t qemu_ram_pagesize(RAMBlock *rb)
1491 return rb->page_size;
1494 static int memory_try_enable_merging(void *addr, size_t len)
1496 if (!machine_mem_merge(current_machine)) {
1497 /* disabled by the user */
1498 return 0;
1501 return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1504 /* Only legal before guest might have detected the memory size: e.g. on
1505 * incoming migration, or right after reset.
1507  * As the memory core doesn't know how memory is accessed, it is up to the
1508 * resize callback to update device state and/or add assertions to detect
1509 * misuse, if necessary.
1511 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
1513 assert(block);
1515 newsize = HOST_PAGE_ALIGN(newsize);
1517 if (block->used_length == newsize) {
1518 return 0;
1521 if (!(block->flags & RAM_RESIZEABLE)) {
1522 error_setg_errno(errp, EINVAL,
1523 "Length mismatch: %s: 0x" RAM_ADDR_FMT
1524 " in != 0x" RAM_ADDR_FMT, block->idstr,
1525 newsize, block->used_length);
1526 return -EINVAL;
1529 if (block->max_length < newsize) {
1530 error_setg_errno(errp, EINVAL,
1531 "Length too large: %s: 0x" RAM_ADDR_FMT
1532 " > 0x" RAM_ADDR_FMT, block->idstr,
1533 newsize, block->max_length);
1534 return -EINVAL;
1537 cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
1538 block->used_length = newsize;
1539 cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
1540 DIRTY_CLIENTS_ALL);
1541 memory_region_set_size(block->mr, newsize);
1542 if (block->resized) {
1543 block->resized(block->idstr, newsize, block->host);
1545 return 0;
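/* dirty_memory_extend() below never modifies the bitmaps that the old
 * DirtyMemoryBlocks points to: it copies the existing block pointers into a
 * larger container, allocates fresh bitmaps only for the new indices,
 * publishes the result with atomic_rcu_set() and frees the old container
 * after a grace period via g_free_rcu(), so concurrent RCU readers always
 * see a consistent array. */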
1548 /* Called with ram_list.mutex held */
1549 static void dirty_memory_extend(ram_addr_t old_ram_size,
1550 ram_addr_t new_ram_size)
1552 ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
1553 DIRTY_MEMORY_BLOCK_SIZE);
1554 ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
1555 DIRTY_MEMORY_BLOCK_SIZE);
1556 int i;
1558 /* Only need to extend if block count increased */
1559 if (new_num_blocks <= old_num_blocks) {
1560 return;
1563 for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
1564 DirtyMemoryBlocks *old_blocks;
1565 DirtyMemoryBlocks *new_blocks;
1566 int j;
1568 old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
1569 new_blocks = g_malloc(sizeof(*new_blocks) +
1570 sizeof(new_blocks->blocks[0]) * new_num_blocks);
1572 if (old_num_blocks) {
1573 memcpy(new_blocks->blocks, old_blocks->blocks,
1574 old_num_blocks * sizeof(old_blocks->blocks[0]));
1577 for (j = old_num_blocks; j < new_num_blocks; j++) {
1578 new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
1581 atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
1583 if (old_blocks) {
1584 g_free_rcu(old_blocks, rcu);
1589 static void ram_block_add(RAMBlock *new_block, Error **errp)
1591 RAMBlock *block;
1592 RAMBlock *last_block = NULL;
1593 ram_addr_t old_ram_size, new_ram_size;
1594 Error *err = NULL;
1596 old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
1598 qemu_mutex_lock_ramlist();
1599 new_block->offset = find_ram_offset(new_block->max_length);
1601 if (!new_block->host) {
1602 if (xen_enabled()) {
1603 xen_ram_alloc(new_block->offset, new_block->max_length,
1604 new_block->mr, &err);
1605 if (err) {
1606 error_propagate(errp, err);
1607 qemu_mutex_unlock_ramlist();
1608 return;
1610 } else {
1611 new_block->host = phys_mem_alloc(new_block->max_length,
1612 &new_block->mr->align);
1614              * In HAX, QEMU allocates the virtual address space, and the HAX kernel
1615              * module populates it with physical memory. Currently there is no paging,
1616              * so the user should make sure enough free memory is available in advance.
1618 if (hax_enabled()) {
1619 int ret;
1620 ret = hax_populate_ram((uint64_t)(uintptr_t)new_block->host,
1621 new_block->max_length);
1622 if (ret < 0) {
1623 error_setg(errp, "Hax failed to populate ram");
1624 return;
1628 if (!new_block->host) {
1629 error_setg_errno(errp, errno,
1630 "cannot set up guest memory '%s'",
1631 memory_region_name(new_block->mr));
1632 qemu_mutex_unlock_ramlist();
1633 return;
1636          * In HAX, QEMU allocates the virtual address space, and the HAX kernel
1637          * module populates it with physical memory. Currently there is no paging,
1638          * so the user should make sure enough free memory is available in advance.
1640 if (hax_enabled()) {
1641 int ret;
1642 ret = hax_populate_ram((uint64_t)(uintptr_t)new_block->host,
1643 new_block->max_length);
1644 if (ret < 0) {
1645 error_setg(errp, "Hax failed to populate ram");
1646 return;
1650 memory_try_enable_merging(new_block->host, new_block->max_length);
1654 new_ram_size = MAX(old_ram_size,
1655 (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
1656 if (new_ram_size > old_ram_size) {
1657 migration_bitmap_extend(old_ram_size, new_ram_size);
1658 dirty_memory_extend(old_ram_size, new_ram_size);
1660 /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
1661 * QLIST (which has an RCU-friendly variant) does not have insertion at
1662 * tail, so save the last element in last_block.
1664 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1665 last_block = block;
1666 if (block->max_length < new_block->max_length) {
1667 break;
1670 if (block) {
1671 QLIST_INSERT_BEFORE_RCU(block, new_block, next);
1672 } else if (last_block) {
1673 QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
1674 } else { /* list is empty */
1675 QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
1677 ram_list.mru_block = NULL;
1679 /* Write list before version */
1680 smp_wmb();
1681 ram_list.version++;
1682 qemu_mutex_unlock_ramlist();
1684 cpu_physical_memory_set_dirty_range(new_block->offset,
1685 new_block->used_length,
1686 DIRTY_CLIENTS_ALL);
1688 if (new_block->host) {
1689 qemu_ram_setup_dump(new_block->host, new_block->max_length);
1690 qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
1691 /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
1692 qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
1696 #ifdef __linux__
1697 RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
1698 bool share, const char *mem_path,
1699 Error **errp)
1701 RAMBlock *new_block;
1702 Error *local_err = NULL;
1704 if (xen_enabled()) {
1705 error_setg(errp, "-mem-path not supported with Xen");
1706 return NULL;
1709 if (phys_mem_alloc != qemu_anon_ram_alloc) {
1711 * file_ram_alloc() needs to allocate just like
1712 * phys_mem_alloc, but we haven't bothered to provide
1713 * a hook there.
1715 error_setg(errp,
1716 "-mem-path not supported with this accelerator");
1717 return NULL;
1720 size = HOST_PAGE_ALIGN(size);
1721 new_block = g_malloc0(sizeof(*new_block));
1722 new_block->mr = mr;
1723 new_block->used_length = size;
1724 new_block->max_length = size;
1725 new_block->flags = share ? RAM_SHARED : 0;
1726 new_block->host = file_ram_alloc(new_block, size,
1727 mem_path, errp);
1728 if (!new_block->host) {
1729 g_free(new_block);
1730 return NULL;
1733 ram_block_add(new_block, &local_err);
1734 if (local_err) {
1735 g_free(new_block);
1736 error_propagate(errp, local_err);
1737 return NULL;
1739 return new_block;
1741 #endif
1743 static
1744 RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
1745 void (*resized)(const char*,
1746 uint64_t length,
1747 void *host),
1748 void *host, bool resizeable,
1749 MemoryRegion *mr, Error **errp)
1751 RAMBlock *new_block;
1752 Error *local_err = NULL;
1754 size = HOST_PAGE_ALIGN(size);
1755 max_size = HOST_PAGE_ALIGN(max_size);
1756 new_block = g_malloc0(sizeof(*new_block));
1757 new_block->mr = mr;
1758 new_block->resized = resized;
1759 new_block->used_length = size;
1760 new_block->max_length = max_size;
1761 assert(max_size >= size);
1762 new_block->fd = -1;
1763 new_block->page_size = getpagesize();
1764 new_block->host = host;
1765 if (host) {
1766 new_block->flags |= RAM_PREALLOC;
1768 if (resizeable) {
1769 new_block->flags |= RAM_RESIZEABLE;
1771 ram_block_add(new_block, &local_err);
1772 if (local_err) {
1773 g_free(new_block);
1774 error_propagate(errp, local_err);
1775 return NULL;
1777 return new_block;
1780 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
1781 MemoryRegion *mr, Error **errp)
1783 return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
1786 RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
1788 return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
1791 RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
1792 void (*resized)(const char*,
1793 uint64_t length,
1794 void *host),
1795 MemoryRegion *mr, Error **errp)
1797 return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
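/* The three public wrappers above all funnel into qemu_ram_alloc_internal():
 * qemu_ram_alloc_from_ptr() passes a host pointer, so the block is marked
 * RAM_PREALLOC and its memory is not freed by reclaim_ramblock();
 * qemu_ram_alloc() lets phys_mem_alloc provide the memory; and
 * qemu_ram_alloc_resizeable() sets RAM_RESIZEABLE so qemu_ram_resize() may
 * later grow used_length up to max_length. */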
1800 static void reclaim_ramblock(RAMBlock *block)
1802 if (block->flags & RAM_PREALLOC) {
1804 } else if (xen_enabled()) {
1805 xen_invalidate_map_cache_entry(block->host);
1806 #ifndef _WIN32
1807 } else if (block->fd >= 0) {
1808 qemu_ram_munmap(block->host, block->max_length);
1809 close(block->fd);
1810 #endif
1811 } else {
1812 qemu_anon_ram_free(block->host, block->max_length);
1814 g_free(block);
1817 void qemu_ram_free(RAMBlock *block)
1819 if (!block) {
1820 return;
1823 qemu_mutex_lock_ramlist();
1824 QLIST_REMOVE_RCU(block, next);
1825 ram_list.mru_block = NULL;
1826 /* Write list before version */
1827 smp_wmb();
1828 ram_list.version++;
1829 call_rcu(block, reclaim_ramblock, rcu);
1830 qemu_mutex_unlock_ramlist();
1833 #ifndef _WIN32
1834 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
1836 RAMBlock *block;
1837 ram_addr_t offset;
1838 int flags;
1839 void *area, *vaddr;
1841 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1842 offset = addr - block->offset;
1843 if (offset < block->max_length) {
1844 vaddr = ramblock_ptr(block, offset);
1845 if (block->flags & RAM_PREALLOC) {
1847 } else if (xen_enabled()) {
1848 abort();
1849 } else {
1850 flags = MAP_FIXED;
1851 if (block->fd >= 0) {
1852 flags |= (block->flags & RAM_SHARED ?
1853 MAP_SHARED : MAP_PRIVATE);
1854 area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1855 flags, block->fd, offset);
1856 } else {
1858 * Remap needs to match alloc. Accelerators that
1859 * set phys_mem_alloc never remap. If they did,
1860 * we'd need a remap hook here.
1862 assert(phys_mem_alloc == qemu_anon_ram_alloc);
1864 flags |= MAP_PRIVATE | MAP_ANONYMOUS;
1865 area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1866 flags, -1, 0);
1868 if (area != vaddr) {
1869 fprintf(stderr, "Could not remap addr: "
1870 RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
1871 length, addr);
1872 exit(1);
1874 memory_try_enable_merging(vaddr, length);
1875 qemu_ram_setup_dump(vaddr, length);
1880 #endif /* !_WIN32 */
1882 /* Return a host pointer to ram allocated with qemu_ram_alloc.
1883 * This should not be used for general purpose DMA. Use address_space_map
1884 * or address_space_rw instead. For local memory (e.g. video ram) that the
1885 * device owns, use memory_region_get_ram_ptr.
1887 * Called within RCU critical section.
1889 void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
1891 RAMBlock *block = ram_block;
1893 if (block == NULL) {
1894 block = qemu_get_ram_block(addr);
1895 addr -= block->offset;
1898 if (xen_enabled() && block->host == NULL) {
1899 /* We need to check if the requested address is in the RAM
1900 * because we don't want to map the entire memory in QEMU.
1901 * In that case just map until the end of the page.
1903 if (block->offset == 0) {
1904 return xen_map_cache(addr, 0, 0);
1907 block->host = xen_map_cache(block->offset, block->max_length, 1);
1909 return ramblock_ptr(block, addr);
1912 /* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
1913 * but takes a size argument.
1915 * Called within RCU critical section.
1917 static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
1918 hwaddr *size)
1920 RAMBlock *block = ram_block;
1921 if (*size == 0) {
1922 return NULL;
1925 if (block == NULL) {
1926 block = qemu_get_ram_block(addr);
1927 addr -= block->offset;
1929 *size = MIN(*size, block->max_length - addr);
1931 if (xen_enabled() && block->host == NULL) {
1932 /* We need to check if the requested address is in the RAM
1933 * because we don't want to map the entire memory in QEMU.
1934 * In that case just map the requested area.
1936 if (block->offset == 0) {
1937 return xen_map_cache(addr, *size, 1);
1940 block->host = xen_map_cache(block->offset, block->max_length, 1);
1943 return ramblock_ptr(block, addr);
1947  * Translates a host ptr back to a RAMBlock and an offset
1948  * in that RAMBlock.
1950  * ptr: Host pointer to look up
1951  * round_offset: If true round the result offset down to a page boundary
1953  * *offset: set to result offset within the RAMBlock
1955 * Returns: RAMBlock (or NULL if not found)
1957 * By the time this function returns, the returned pointer is not protected
1958 * by RCU anymore. If the caller is not within an RCU critical section and
1959 * does not hold the iothread lock, it must have other means of protecting the
1960 * pointer, such as a reference to the region that includes the incoming
1961 * ram_addr_t.
1963 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
1964 ram_addr_t *offset)
1966 RAMBlock *block;
1967 uint8_t *host = ptr;
1969 if (xen_enabled()) {
1970 ram_addr_t ram_addr;
1971 rcu_read_lock();
1972 ram_addr = xen_ram_addr_from_mapcache(ptr);
1973 block = qemu_get_ram_block(ram_addr);
1974 if (block) {
1975 *offset = ram_addr - block->offset;
1977 rcu_read_unlock();
1978 return block;
1981 rcu_read_lock();
1982 block = atomic_rcu_read(&ram_list.mru_block);
1983 if (block && block->host && host - block->host < block->max_length) {
1984 goto found;
1987 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1988         /* This can happen when the block is not mapped. */
1989 if (block->host == NULL) {
1990 continue;
1992 if (host - block->host < block->max_length) {
1993 goto found;
1997 rcu_read_unlock();
1998 return NULL;
2000 found:
2001 *offset = (host - block->host);
2002 if (round_offset) {
2003 *offset &= TARGET_PAGE_MASK;
2005 rcu_read_unlock();
2006 return block;
2010 * Finds the named RAMBlock
2012 * name: The name of RAMBlock to find
2014 * Returns: RAMBlock (or NULL if not found)
2016 RAMBlock *qemu_ram_block_by_name(const char *name)
2018 RAMBlock *block;
2020 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2021 if (!strcmp(name, block->idstr)) {
2022 return block;
2026 return NULL;
2029 /* Some of the softmmu routines need to translate from a host pointer
2030 (typically a TLB entry) back to a ram offset. */
2031 ram_addr_t qemu_ram_addr_from_host(void *ptr)
2033 RAMBlock *block;
2034 ram_addr_t offset;
2036 block = qemu_ram_block_from_host(ptr, false, &offset);
2037 if (!block) {
2038 return RAM_ADDR_INVALID;
2041 return block->offset + offset;
2044 /* Called within RCU critical section. */
2045 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
2046 uint64_t val, unsigned size)
2048 bool locked = false;
2050 if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
2051 locked = true;
2052 tb_lock();
2053 tb_invalidate_phys_page_fast(ram_addr, size);
2055 switch (size) {
2056 case 1:
2057 stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2058 break;
2059 case 2:
2060 stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2061 break;
2062 case 4:
2063 stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2064 break;
2065 default:
2066 abort();
2069 if (locked) {
2070 tb_unlock();
2073 /* Set both VGA and migration bits for simplicity and to remove
2074 * the notdirty callback faster.
2076 cpu_physical_memory_set_dirty_range(ram_addr, size,
2077 DIRTY_CLIENTS_NOCODE);
2078 /* we remove the notdirty callback only if the code has been
2079 flushed */
2080 if (!cpu_physical_memory_is_clean(ram_addr)) {
2081 tlb_set_dirty(current_cpu, current_cpu->mem_io_vaddr);
2085 static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
2086 unsigned size, bool is_write)
2088 return is_write;
2091 static const MemoryRegionOps notdirty_mem_ops = {
2092 .write = notdirty_mem_write,
2093 .valid.accepts = notdirty_mem_accepts,
2094 .endianness = DEVICE_NATIVE_ENDIAN,
2097 /* Generate a debug exception if a watchpoint has been hit. */
2098 static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
2100 CPUState *cpu = current_cpu;
2101 CPUClass *cc = CPU_GET_CLASS(cpu);
2102 CPUArchState *env = cpu->env_ptr;
2103 target_ulong pc, cs_base;
2104 target_ulong vaddr;
2105 CPUWatchpoint *wp;
2106 uint32_t cpu_flags;
2108 if (cpu->watchpoint_hit) {
2109 /* We re-entered the check after replacing the TB. Now raise
2110          * the debug interrupt so that it will trigger after the
2111 * current instruction. */
2112 cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2113 return;
2115 vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
2116 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2117 if (cpu_watchpoint_address_matches(wp, vaddr, len)
2118 && (wp->flags & flags)) {
2119 if (flags == BP_MEM_READ) {
2120 wp->flags |= BP_WATCHPOINT_HIT_READ;
2121 } else {
2122 wp->flags |= BP_WATCHPOINT_HIT_WRITE;
2124 wp->hitaddr = vaddr;
2125 wp->hitattrs = attrs;
2126 if (!cpu->watchpoint_hit) {
2127 if (wp->flags & BP_CPU &&
2128 !cc->debug_check_watchpoint(cpu, wp)) {
2129 wp->flags &= ~BP_WATCHPOINT_HIT;
2130 continue;
2132 cpu->watchpoint_hit = wp;
2134 /* The tb_lock will be reset when cpu_loop_exit or
2135 * cpu_loop_exit_noexc longjmp back into the cpu_exec
2136 * main loop.
2138 tb_lock();
2139 tb_check_watchpoint(cpu);
2140 if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2141 cpu->exception_index = EXCP_DEBUG;
2142 cpu_loop_exit(cpu);
2143 } else {
2144 cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
2145 tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
2146 cpu_loop_exit_noexc(cpu);
2149 } else {
2150 wp->flags &= ~BP_WATCHPOINT_HIT;
2155 /* Watchpoint access routines. Watchpoints are inserted using TLB tricks,
2156 so these check for a hit then pass through to the normal out-of-line
2157 phys routines. */
2158 static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
2159 unsigned size, MemTxAttrs attrs)
2161 MemTxResult res;
2162 uint64_t data;
2163 int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2164 AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2166 check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2167 switch (size) {
2168 case 1:
2169 data = address_space_ldub(as, addr, attrs, &res);
2170 break;
2171 case 2:
2172 data = address_space_lduw(as, addr, attrs, &res);
2173 break;
2174 case 4:
2175 data = address_space_ldl(as, addr, attrs, &res);
2176 break;
2177 default: abort();
2179 *pdata = data;
2180 return res;
2183 static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
2184 uint64_t val, unsigned size,
2185 MemTxAttrs attrs)
2187 MemTxResult res;
2188 int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2189 AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2191 check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2192 switch (size) {
2193 case 1:
2194 address_space_stb(as, addr, val, attrs, &res);
2195 break;
2196 case 2:
2197 address_space_stw(as, addr, val, attrs, &res);
2198 break;
2199 case 4:
2200 address_space_stl(as, addr, val, attrs, &res);
2201 break;
2202 default: abort();
2204 return res;
2207 static const MemoryRegionOps watch_mem_ops = {
2208 .read_with_attrs = watch_mem_read,
2209 .write_with_attrs = watch_mem_write,
2210 .endianness = DEVICE_NATIVE_ENDIAN,
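/*
 * Illustrative sketch (assumption, not code from this file): the watch_mem_*
 * handlers above are only reached once a watchpoint covering the accessed
 * address has been registered, e.g. with cpu_watchpoint_insert(). "cpu" and
 * "guest_vaddr" are placeholders supplied by the caller.
 *
 *     CPUWatchpoint *wp;
 *     if (cpu_watchpoint_insert(cpu, guest_vaddr, 4,
 *                               BP_MEM_WRITE | BP_GDB, &wp) == 0) {
 *         // guest writes to [guest_vaddr, guest_vaddr + 4) now trap
 *         // through watch_mem_write() and check_watchpoint()
 *     }
 */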
2213 static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2214 unsigned len, MemTxAttrs attrs)
2216 subpage_t *subpage = opaque;
2217 uint8_t buf[8];
2218 MemTxResult res;
2220 #if defined(DEBUG_SUBPAGE)
2221 printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2222 subpage, len, addr);
2223 #endif
2224 res = address_space_read(subpage->as, addr + subpage->base,
2225 attrs, buf, len);
2226 if (res) {
2227 return res;
2229 switch (len) {
2230 case 1:
2231 *data = ldub_p(buf);
2232 return MEMTX_OK;
2233 case 2:
2234 *data = lduw_p(buf);
2235 return MEMTX_OK;
2236 case 4:
2237 *data = ldl_p(buf);
2238 return MEMTX_OK;
2239 case 8:
2240 *data = ldq_p(buf);
2241 return MEMTX_OK;
2242 default:
2243 abort();
2247 static MemTxResult subpage_write(void *opaque, hwaddr addr,
2248 uint64_t value, unsigned len, MemTxAttrs attrs)
2250 subpage_t *subpage = opaque;
2251 uint8_t buf[8];
2253 #if defined(DEBUG_SUBPAGE)
2254 printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2255 " value %"PRIx64"\n",
2256 __func__, subpage, len, addr, value);
2257 #endif
2258 switch (len) {
2259 case 1:
2260 stb_p(buf, value);
2261 break;
2262 case 2:
2263 stw_p(buf, value);
2264 break;
2265 case 4:
2266 stl_p(buf, value);
2267 break;
2268 case 8:
2269 stq_p(buf, value);
2270 break;
2271 default:
2272 abort();
2274 return address_space_write(subpage->as, addr + subpage->base,
2275 attrs, buf, len);
2278 static bool subpage_accepts(void *opaque, hwaddr addr,
2279 unsigned len, bool is_write)
2281 subpage_t *subpage = opaque;
2282 #if defined(DEBUG_SUBPAGE)
2283 printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2284 __func__, subpage, is_write ? 'w' : 'r', len, addr);
2285 #endif
2287 return address_space_access_valid(subpage->as, addr + subpage->base,
2288 len, is_write);
2291 static const MemoryRegionOps subpage_ops = {
2292 .read_with_attrs = subpage_read,
2293 .write_with_attrs = subpage_write,
2294 .impl.min_access_size = 1,
2295 .impl.max_access_size = 8,
2296 .valid.min_access_size = 1,
2297 .valid.max_access_size = 8,
2298 .valid.accepts = subpage_accepts,
2299 .endianness = DEVICE_NATIVE_ENDIAN,
2302 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2303 uint16_t section)
2305 int idx, eidx;
2307 if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2308 return -1;
2309 idx = SUBPAGE_IDX(start);
2310 eidx = SUBPAGE_IDX(end);
2311 #if defined(DEBUG_SUBPAGE)
2312 printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2313 __func__, mmio, start, end, idx, eidx, section);
2314 #endif
2315 for (; idx <= eidx; idx++) {
2316 mmio->sub_section[idx] = section;
2319 return 0;
2322 static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
2324 subpage_t *mmio;
2326 mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2327 mmio->as = as;
2328 mmio->base = base;
2329 memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2330 NULL, TARGET_PAGE_SIZE);
2331 mmio->iomem.subpage = true;
2332 #if defined(DEBUG_SUBPAGE)
2333 printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2334 mmio, base, TARGET_PAGE_SIZE);
2335 #endif
2336 subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2338 return mmio;
2341 static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
2342 MemoryRegion *mr)
2344 assert(as);
2345 MemoryRegionSection section = {
2346 .address_space = as,
2347 .mr = mr,
2348 .offset_within_address_space = 0,
2349 .offset_within_region = 0,
2350 .size = int128_2_64(),
2353 return phys_section_add(map, &section);
2356 MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
2358 int asidx = cpu_asidx_from_attrs(cpu, attrs);
2359 CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2360 AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2361 MemoryRegionSection *sections = d->map.sections;
2363 return sections[index & ~TARGET_PAGE_MASK].mr;
2366 static void io_mem_init(void)
2368 memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2369 memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2370 NULL, UINT64_MAX);
2371 memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2372 NULL, UINT64_MAX);
2373 memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2374 NULL, UINT64_MAX);
2377 static void mem_begin(MemoryListener *listener)
2379 AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2380 AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2381 uint16_t n;
2383 n = dummy_section(&d->map, as, &io_mem_unassigned);
2384 assert(n == PHYS_SECTION_UNASSIGNED);
2385 n = dummy_section(&d->map, as, &io_mem_notdirty);
2386 assert(n == PHYS_SECTION_NOTDIRTY);
2387 n = dummy_section(&d->map, as, &io_mem_rom);
2388 assert(n == PHYS_SECTION_ROM);
2389 n = dummy_section(&d->map, as, &io_mem_watch);
2390 assert(n == PHYS_SECTION_WATCH);
2392 d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2393 d->as = as;
2394 as->next_dispatch = d;
2397 static void address_space_dispatch_free(AddressSpaceDispatch *d)
2399 phys_sections_free(&d->map);
2400 g_free(d);
2403 static void mem_commit(MemoryListener *listener)
2405 AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2406 AddressSpaceDispatch *cur = as->dispatch;
2407 AddressSpaceDispatch *next = as->next_dispatch;
2409 phys_page_compact_all(next, next->map.nodes_nb);
2411 atomic_rcu_set(&as->dispatch, next);
2412 if (cur) {
2413 call_rcu(cur, address_space_dispatch_free, rcu);
2417 static void tcg_commit(MemoryListener *listener)
2419 CPUAddressSpace *cpuas;
2420 AddressSpaceDispatch *d;
2422 /* since each CPU stores ram addresses in its TLB cache, we must
2423 reset the modified entries */
2424 cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2425 cpu_reloading_memory_map();
2426 /* The CPU and TLB are protected by the iothread lock.
2427 * We reload the dispatch pointer now because cpu_reloading_memory_map()
2428 * may have split the RCU critical section. */
2430 d = atomic_rcu_read(&cpuas->as->dispatch);
2431 atomic_rcu_set(&cpuas->memory_dispatch, d);
2432 tlb_flush(cpuas->cpu, 1);
2435 void address_space_init_dispatch(AddressSpace *as)
2437 as->dispatch = NULL;
2438 as->dispatch_listener = (MemoryListener) {
2439 .begin = mem_begin,
2440 .commit = mem_commit,
2441 .region_add = mem_add,
2442 .region_nop = mem_add,
2443 .priority = 0,
2445 memory_listener_register(&as->dispatch_listener, as);
2448 void address_space_unregister(AddressSpace *as)
2450 memory_listener_unregister(&as->dispatch_listener);
2453 void address_space_destroy_dispatch(AddressSpace *as)
2455 AddressSpaceDispatch *d = as->dispatch;
2457 atomic_rcu_set(&as->dispatch, NULL);
2458 if (d) {
2459 call_rcu(d, address_space_dispatch_free, rcu);
2463 static void memory_map_init(void)
2465 system_memory = g_malloc(sizeof(*system_memory));
2467 memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2468 address_space_init(&address_space_memory, system_memory, "memory");
2470 system_io = g_malloc(sizeof(*system_io));
2471 memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2472 65536);
2473 address_space_init(&address_space_io, system_io, "I/O");
2476 MemoryRegion *get_system_memory(void)
2478 return system_memory;
2481 MemoryRegion *get_system_io(void)
2483 return system_io;
2486 #endif /* !defined(CONFIG_USER_ONLY) */
2488 /* physical memory access (slow version, mainly for debug) */
2489 #if defined(CONFIG_USER_ONLY)
2490 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2491 uint8_t *buf, int len, int is_write)
2493 int l, flags;
2494 target_ulong page;
2495 void * p;
2497 while (len > 0) {
2498 page = addr & TARGET_PAGE_MASK;
2499 l = (page + TARGET_PAGE_SIZE) - addr;
2500 if (l > len)
2501 l = len;
2502 flags = page_get_flags(page);
2503 if (!(flags & PAGE_VALID))
2504 return -1;
2505 if (is_write) {
2506 if (!(flags & PAGE_WRITE))
2507 return -1;
2508 /* XXX: this code should not depend on lock_user */
2509 if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
2510 return -1;
2511 memcpy(p, buf, l);
2512 unlock_user(p, addr, l);
2513 } else {
2514 if (!(flags & PAGE_READ))
2515 return -1;
2516 /* XXX: this code should not depend on lock_user */
2517 if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
2518 return -1;
2519 memcpy(buf, p, l);
2520 unlock_user(p, addr, 0);
2522 len -= l;
2523 buf += l;
2524 addr += l;
2526 return 0;
2529 #else
2531 static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
2532 hwaddr length)
2534 uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2535 addr += memory_region_get_ram_addr(mr);
2537 /* No early return if dirty_log_mask is or becomes 0, because
2538 * cpu_physical_memory_set_dirty_range will still call
2539 * xen_modified_memory. */
2541 if (dirty_log_mask) {
2542 dirty_log_mask =
2543 cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
2545 if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2546 tb_lock();
2547 tb_invalidate_phys_range(addr, addr + length);
2548 tb_unlock();
2549 dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2551 cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2554 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2556 unsigned access_size_max = mr->ops->valid.max_access_size;
2558 /* Regions are assumed to support 1-4 byte accesses unless
2559 otherwise specified. */
2560 if (access_size_max == 0) {
2561 access_size_max = 4;
2564 /* Bound the maximum access by the alignment of the address. */
2565 if (!mr->ops->impl.unaligned) {
2566 unsigned align_size_max = addr & -addr;
2567 if (align_size_max != 0 && align_size_max < access_size_max) {
2568 access_size_max = align_size_max;
2572 /* Don't attempt accesses larger than the maximum. */
2573 if (l > access_size_max) {
2574 l = access_size_max;
2576 l = pow2floor(l);
2578 return l;
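/*
 * Worked example (illustrative): a region with valid.max_access_size == 0 is
 * treated as supporting up to 4-byte accesses. For an access at addr == 0x1002
 * the alignment bound is 0x1002 & -0x1002 == 2, which lowers access_size_max
 * from 4 to 2, so a requested l of 8 is clamped to 2 and pow2floor() leaves it
 * unchanged.
 */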
2581 static bool prepare_mmio_access(MemoryRegion *mr)
2583 bool unlocked = !qemu_mutex_iothread_locked();
2584 bool release_lock = false;
2586 if (unlocked && mr->global_locking) {
2587 qemu_mutex_lock_iothread();
2588 unlocked = false;
2589 release_lock = true;
2591 if (mr->flush_coalesced_mmio) {
2592 if (unlocked) {
2593 qemu_mutex_lock_iothread();
2595 qemu_flush_coalesced_mmio_buffer();
2596 if (unlocked) {
2597 qemu_mutex_unlock_iothread();
2601 return release_lock;
2604 /* Called within RCU critical section. */
2605 static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
2606 MemTxAttrs attrs,
2607 const uint8_t *buf,
2608 int len, hwaddr addr1,
2609 hwaddr l, MemoryRegion *mr)
2611 uint8_t *ptr;
2612 uint64_t val;
2613 MemTxResult result = MEMTX_OK;
2614 bool release_lock = false;
2616 for (;;) {
2617 if (!memory_access_is_direct(mr, true)) {
2618 release_lock |= prepare_mmio_access(mr);
2619 l = memory_access_size(mr, l, addr1);
2620 /* XXX: could force current_cpu to NULL to avoid
2621 potential bugs */
2622 switch (l) {
2623 case 8:
2624 /* 64 bit write access */
2625 val = ldq_p(buf);
2626 result |= memory_region_dispatch_write(mr, addr1, val, 8,
2627 attrs);
2628 break;
2629 case 4:
2630 /* 32 bit write access */
2631 val = ldl_p(buf);
2632 result |= memory_region_dispatch_write(mr, addr1, val, 4,
2633 attrs);
2634 break;
2635 case 2:
2636 /* 16 bit write access */
2637 val = lduw_p(buf);
2638 result |= memory_region_dispatch_write(mr, addr1, val, 2,
2639 attrs);
2640 break;
2641 case 1:
2642 /* 8 bit write access */
2643 val = ldub_p(buf);
2644 result |= memory_region_dispatch_write(mr, addr1, val, 1,
2645 attrs);
2646 break;
2647 default:
2648 abort();
2650 } else {
2651 /* RAM case */
2652 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2653 memcpy(ptr, buf, l);
2654 invalidate_and_set_dirty(mr, addr1, l);
2657 if (release_lock) {
2658 qemu_mutex_unlock_iothread();
2659 release_lock = false;
2662 len -= l;
2663 buf += l;
2664 addr += l;
2666 if (!len) {
2667 break;
2670 l = len;
2671 mr = address_space_translate(as, addr, &addr1, &l, true);
2674 return result;
2677 MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
2678 const uint8_t *buf, int len)
2680 hwaddr l;
2681 hwaddr addr1;
2682 MemoryRegion *mr;
2683 MemTxResult result = MEMTX_OK;
2685 if (len > 0) {
2686 rcu_read_lock();
2687 l = len;
2688 mr = address_space_translate(as, addr, &addr1, &l, true);
2689 result = address_space_write_continue(as, addr, attrs, buf, len,
2690 addr1, l, mr);
2691 rcu_read_unlock();
2694 return result;
2697 /* Called within RCU critical section. */
2698 MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
2699 MemTxAttrs attrs, uint8_t *buf,
2700 int len, hwaddr addr1, hwaddr l,
2701 MemoryRegion *mr)
2703 uint8_t *ptr;
2704 uint64_t val;
2705 MemTxResult result = MEMTX_OK;
2706 bool release_lock = false;
2708 for (;;) {
2709 if (!memory_access_is_direct(mr, false)) {
2710 /* I/O case */
2711 release_lock |= prepare_mmio_access(mr);
2712 l = memory_access_size(mr, l, addr1);
2713 switch (l) {
2714 case 8:
2715 /* 64 bit read access */
2716 result |= memory_region_dispatch_read(mr, addr1, &val, 8,
2717 attrs);
2718 stq_p(buf, val);
2719 break;
2720 case 4:
2721 /* 32 bit read access */
2722 result |= memory_region_dispatch_read(mr, addr1, &val, 4,
2723 attrs);
2724 stl_p(buf, val);
2725 break;
2726 case 2:
2727 /* 16 bit read access */
2728 result |= memory_region_dispatch_read(mr, addr1, &val, 2,
2729 attrs);
2730 stw_p(buf, val);
2731 break;
2732 case 1:
2733 /* 8 bit read access */
2734 result |= memory_region_dispatch_read(mr, addr1, &val, 1,
2735 attrs);
2736 stb_p(buf, val);
2737 break;
2738 default:
2739 abort();
2741 } else {
2742 /* RAM case */
2743 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2744 memcpy(buf, ptr, l);
2747 if (release_lock) {
2748 qemu_mutex_unlock_iothread();
2749 release_lock = false;
2752 len -= l;
2753 buf += l;
2754 addr += l;
2756 if (!len) {
2757 break;
2760 l = len;
2761 mr = address_space_translate(as, addr, &addr1, &l, false);
2764 return result;
2767 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
2768 MemTxAttrs attrs, uint8_t *buf, int len)
2770 hwaddr l;
2771 hwaddr addr1;
2772 MemoryRegion *mr;
2773 MemTxResult result = MEMTX_OK;
2775 if (len > 0) {
2776 rcu_read_lock();
2777 l = len;
2778 mr = address_space_translate(as, addr, &addr1, &l, false);
2779 result = address_space_read_continue(as, addr, attrs, buf, len,
2780 addr1, l, mr);
2781 rcu_read_unlock();
2784 return result;
2787 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
2788 uint8_t *buf, int len, bool is_write)
2790 if (is_write) {
2791 return address_space_write(as, addr, attrs, (uint8_t *)buf, len);
2792 } else {
2793 return address_space_read(as, addr, attrs, (uint8_t *)buf, len);
2797 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
2798 int len, int is_write)
2800 address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
2801 buf, len, is_write);
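/*
 * Illustrative sketch: copying a small buffer to and from guest physical
 * memory through this convenience wrapper; device models would normally use
 * address_space_rw() with explicit attributes instead. "GUEST_PADDR" is a
 * hypothetical address chosen by the caller.
 *
 *     uint8_t blob[16] = { 0 };
 *     cpu_physical_memory_rw(GUEST_PADDR, blob, sizeof(blob), 1);  // write
 *     cpu_physical_memory_rw(GUEST_PADDR, blob, sizeof(blob), 0);  // read back
 */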
2804 enum write_rom_type {
2805 WRITE_DATA,
2806 FLUSH_CACHE,
2809 static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
2810 hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
2812 hwaddr l;
2813 uint8_t *ptr;
2814 hwaddr addr1;
2815 MemoryRegion *mr;
2817 rcu_read_lock();
2818 while (len > 0) {
2819 l = len;
2820 mr = address_space_translate(as, addr, &addr1, &l, true);
2822 if (!(memory_region_is_ram(mr) ||
2823 memory_region_is_romd(mr))) {
2824 l = memory_access_size(mr, l, addr1);
2825 } else {
2826 /* ROM/RAM case */
2827 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2828 switch (type) {
2829 case WRITE_DATA:
2830 memcpy(ptr, buf, l);
2831 invalidate_and_set_dirty(mr, addr1, l);
2832 break;
2833 case FLUSH_CACHE:
2834 flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
2835 break;
2838 len -= l;
2839 buf += l;
2840 addr += l;
2842 rcu_read_unlock();
2845 /* used for ROM loading: can write in RAM and ROM */
2846 void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
2847 const uint8_t *buf, int len)
2849 cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
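/*
 * Illustrative sketch: a board model loading a firmware image into a region
 * that may be ROM, where a plain address_space_write() would be refused.
 * "ROM_BASE", "blob" and "blob_size" are hypothetical.
 *
 *     cpu_physical_memory_write_rom(&address_space_memory, ROM_BASE,
 *                                   blob, blob_size);
 */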
2852 void cpu_flush_icache_range(hwaddr start, int len)
2855 /* This function should do the same thing as an icache flush that was
2856 * triggered from within the guest. For TCG we are always cache coherent,
2857 * so there is no need to flush anything. For KVM / Xen we need to flush
2858 * the host's instruction cache at least. */
2860 if (tcg_enabled()) {
2861 return;
2864 cpu_physical_memory_write_rom_internal(&address_space_memory,
2865 start, NULL, len, FLUSH_CACHE);
2868 typedef struct {
2869 MemoryRegion *mr;
2870 void *buffer;
2871 hwaddr addr;
2872 hwaddr len;
2873 bool in_use;
2874 } BounceBuffer;
2876 static BounceBuffer bounce;
2878 typedef struct MapClient {
2879 QEMUBH *bh;
2880 QLIST_ENTRY(MapClient) link;
2881 } MapClient;
2883 QemuMutex map_client_list_lock;
2884 static QLIST_HEAD(map_client_list, MapClient) map_client_list
2885 = QLIST_HEAD_INITIALIZER(map_client_list);
2887 static void cpu_unregister_map_client_do(MapClient *client)
2889 QLIST_REMOVE(client, link);
2890 g_free(client);
2893 static void cpu_notify_map_clients_locked(void)
2895 MapClient *client;
2897 while (!QLIST_EMPTY(&map_client_list)) {
2898 client = QLIST_FIRST(&map_client_list);
2899 qemu_bh_schedule(client->bh);
2900 cpu_unregister_map_client_do(client);
2904 void cpu_register_map_client(QEMUBH *bh)
2906 MapClient *client = g_malloc(sizeof(*client));
2908 qemu_mutex_lock(&map_client_list_lock);
2909 client->bh = bh;
2910 QLIST_INSERT_HEAD(&map_client_list, client, link);
2911 if (!atomic_read(&bounce.in_use)) {
2912 cpu_notify_map_clients_locked();
2914 qemu_mutex_unlock(&map_client_list_lock);
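/*
 * Illustrative sketch of the retry protocol (assumption, not code from this
 * file): a caller whose address_space_map() returned NULL because the bounce
 * buffer was busy registers a bottom half and retries when it runs.
 * "dma_retry" and "opaque" are hypothetical.
 *
 *     static void dma_retry(void *opaque)
 *     {
 *         // the bounce buffer has been released; try address_space_map() again
 *     }
 *
 *     QEMUBH *bh = qemu_bh_new(dma_retry, opaque);
 *     cpu_register_map_client(bh);
 */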
2917 void cpu_exec_init_all(void)
2919 qemu_mutex_init(&ram_list.mutex);
2920 /* The data structures we set up here depend on knowing the page size,
2921 * so no more changes can be made after this point.
2922 * In an ideal world, nothing we did before we had finished the
2923 * machine setup would care about the target page size, and we could
2924 * do this much later, rather than requiring board models to state
2925 * up front what their requirements are. */
2927 finalize_target_page_bits();
2928 io_mem_init();
2929 memory_map_init();
2930 qemu_mutex_init(&map_client_list_lock);
2933 void cpu_unregister_map_client(QEMUBH *bh)
2935 MapClient *client;
2937 qemu_mutex_lock(&map_client_list_lock);
2938 QLIST_FOREACH(client, &map_client_list, link) {
2939 if (client->bh == bh) {
2940 cpu_unregister_map_client_do(client);
2941 break;
2944 qemu_mutex_unlock(&map_client_list_lock);
2947 static void cpu_notify_map_clients(void)
2949 qemu_mutex_lock(&map_client_list_lock);
2950 cpu_notify_map_clients_locked();
2951 qemu_mutex_unlock(&map_client_list_lock);
2954 bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
2956 MemoryRegion *mr;
2957 hwaddr l, xlat;
2959 rcu_read_lock();
2960 while (len > 0) {
2961 l = len;
2962 mr = address_space_translate(as, addr, &xlat, &l, is_write);
2963 if (!memory_access_is_direct(mr, is_write)) {
2964 l = memory_access_size(mr, l, addr);
2965 if (!memory_region_access_valid(mr, xlat, l, is_write)) {
2966 return false;
2970 len -= l;
2971 addr += l;
2973 rcu_read_unlock();
2974 return true;
2977 /* Map a physical memory region into a host virtual address.
2978 * May map a subset of the requested range, given by and returned in *plen.
2979 * May return NULL if resources needed to perform the mapping are exhausted.
2980 * Use only for reads OR writes - not for read-modify-write operations.
2981 * Use cpu_register_map_client() to know when retrying the map operation is
2982 * likely to succeed. */
2984 void *address_space_map(AddressSpace *as,
2985 hwaddr addr,
2986 hwaddr *plen,
2987 bool is_write)
2989 hwaddr len = *plen;
2990 hwaddr done = 0;
2991 hwaddr l, xlat, base;
2992 MemoryRegion *mr, *this_mr;
2993 void *ptr;
2995 if (len == 0) {
2996 return NULL;
2999 l = len;
3000 rcu_read_lock();
3001 mr = address_space_translate(as, addr, &xlat, &l, is_write);
3003 if (!memory_access_is_direct(mr, is_write)) {
3004 if (atomic_xchg(&bounce.in_use, true)) {
3005 rcu_read_unlock();
3006 return NULL;
3008 /* Avoid unbounded allocations */
3009 l = MIN(l, TARGET_PAGE_SIZE);
3010 bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3011 bounce.addr = addr;
3012 bounce.len = l;
3014 memory_region_ref(mr);
3015 bounce.mr = mr;
3016 if (!is_write) {
3017 address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED,
3018 bounce.buffer, l);
3021 rcu_read_unlock();
3022 *plen = l;
3023 return bounce.buffer;
3026 base = xlat;
3028 for (;;) {
3029 len -= l;
3030 addr += l;
3031 done += l;
3032 if (len == 0) {
3033 break;
3036 l = len;
3037 this_mr = address_space_translate(as, addr, &xlat, &l, is_write);
3038 if (this_mr != mr || xlat != base + done) {
3039 break;
3043 memory_region_ref(mr);
3044 *plen = done;
3045 ptr = qemu_ram_ptr_length(mr->ram_block, base, plen);
3046 rcu_read_unlock();
3048 return ptr;
3051 /* Unmaps a memory region previously mapped by address_space_map().
3052 * Will also mark the memory as dirty if is_write == 1. access_len gives
3053 * the amount of memory that was actually read or written by the caller. */
3055 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3056 int is_write, hwaddr access_len)
3058 if (buffer != bounce.buffer) {
3059 MemoryRegion *mr;
3060 ram_addr_t addr1;
3062 mr = memory_region_from_host(buffer, &addr1);
3063 assert(mr != NULL);
3064 if (is_write) {
3065 invalidate_and_set_dirty(mr, addr1, access_len);
3067 if (xen_enabled()) {
3068 xen_invalidate_map_cache_entry(buffer);
3070 memory_region_unref(mr);
3071 return;
3073 if (is_write) {
3074 address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3075 bounce.buffer, access_len);
3077 qemu_vfree(bounce.buffer);
3078 bounce.buffer = NULL;
3079 memory_region_unref(bounce.mr);
3080 atomic_mb_set(&bounce.in_use, false);
3081 cpu_notify_map_clients();
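/*
 * Illustrative sketch (hypothetical caller, not part of this file): a typical
 * zero-copy read using the map/unmap pair above. Only *plen bytes are
 * guaranteed to be mapped, which may be less than the length requested.
 *
 *     hwaddr plen = len;
 *     void *p = address_space_map(as, addr, &plen, false);
 *     if (!p) {
 *         // resources exhausted; see cpu_register_map_client() above
 *         return;
 *     }
 *     // ... consume up to plen bytes at p ...
 *     address_space_unmap(as, p, plen, false, plen);
 */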
3084 void *cpu_physical_memory_map(hwaddr addr,
3085 hwaddr *plen,
3086 int is_write)
3088 return address_space_map(&address_space_memory, addr, plen, is_write);
3091 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3092 int is_write, hwaddr access_len)
3094 return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3097 /* warning: addr must be aligned */
3098 static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
3099 MemTxAttrs attrs,
3100 MemTxResult *result,
3101 enum device_endian endian)
3103 uint8_t *ptr;
3104 uint64_t val;
3105 MemoryRegion *mr;
3106 hwaddr l = 4;
3107 hwaddr addr1;
3108 MemTxResult r;
3109 bool release_lock = false;
3111 rcu_read_lock();
3112 mr = address_space_translate(as, addr, &addr1, &l, false);
3113 if (l < 4 || !memory_access_is_direct(mr, false)) {
3114 release_lock |= prepare_mmio_access(mr);
3116 /* I/O case */
3117 r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs);
3118 #if defined(TARGET_WORDS_BIGENDIAN)
3119 if (endian == DEVICE_LITTLE_ENDIAN) {
3120 val = bswap32(val);
3122 #else
3123 if (endian == DEVICE_BIG_ENDIAN) {
3124 val = bswap32(val);
3126 #endif
3127 } else {
3128 /* RAM case */
3129 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3130 switch (endian) {
3131 case DEVICE_LITTLE_ENDIAN:
3132 val = ldl_le_p(ptr);
3133 break;
3134 case DEVICE_BIG_ENDIAN:
3135 val = ldl_be_p(ptr);
3136 break;
3137 default:
3138 val = ldl_p(ptr);
3139 break;
3141 r = MEMTX_OK;
3143 if (result) {
3144 *result = r;
3146 if (release_lock) {
3147 qemu_mutex_unlock_iothread();
3149 rcu_read_unlock();
3150 return val;
3153 uint32_t address_space_ldl(AddressSpace *as, hwaddr addr,
3154 MemTxAttrs attrs, MemTxResult *result)
3156 return address_space_ldl_internal(as, addr, attrs, result,
3157 DEVICE_NATIVE_ENDIAN);
3160 uint32_t address_space_ldl_le(AddressSpace *as, hwaddr addr,
3161 MemTxAttrs attrs, MemTxResult *result)
3163 return address_space_ldl_internal(as, addr, attrs, result,
3164 DEVICE_LITTLE_ENDIAN);
3167 uint32_t address_space_ldl_be(AddressSpace *as, hwaddr addr,
3168 MemTxAttrs attrs, MemTxResult *result)
3170 return address_space_ldl_internal(as, addr, attrs, result,
3171 DEVICE_BIG_ENDIAN);
3174 uint32_t ldl_phys(AddressSpace *as, hwaddr addr)
3176 return address_space_ldl(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3179 uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr)
3181 return address_space_ldl_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3184 uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr)
3186 return address_space_ldl_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
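/*
 * Illustrative sketch: reading a 32-bit little-endian value from guest
 * physical memory, with and without transaction-result reporting.
 * "GUEST_PADDR" is hypothetical.
 *
 *     MemTxResult res;
 *     uint32_t v = address_space_ldl_le(&address_space_memory, GUEST_PADDR,
 *                                       MEMTXATTRS_UNSPECIFIED, &res);
 *     uint32_t w = ldl_le_phys(&address_space_memory, GUEST_PADDR);
 */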
3189 /* warning: addr must be aligned */
3190 static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
3191 MemTxAttrs attrs,
3192 MemTxResult *result,
3193 enum device_endian endian)
3195 uint8_t *ptr;
3196 uint64_t val;
3197 MemoryRegion *mr;
3198 hwaddr l = 8;
3199 hwaddr addr1;
3200 MemTxResult r;
3201 bool release_lock = false;
3203 rcu_read_lock();
3204 mr = address_space_translate(as, addr, &addr1, &l,
3205 false);
3206 if (l < 8 || !memory_access_is_direct(mr, false)) {
3207 release_lock |= prepare_mmio_access(mr);
3209 /* I/O case */
3210 r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs);
3211 #if defined(TARGET_WORDS_BIGENDIAN)
3212 if (endian == DEVICE_LITTLE_ENDIAN) {
3213 val = bswap64(val);
3215 #else
3216 if (endian == DEVICE_BIG_ENDIAN) {
3217 val = bswap64(val);
3219 #endif
3220 } else {
3221 /* RAM case */
3222 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3223 switch (endian) {
3224 case DEVICE_LITTLE_ENDIAN:
3225 val = ldq_le_p(ptr);
3226 break;
3227 case DEVICE_BIG_ENDIAN:
3228 val = ldq_be_p(ptr);
3229 break;
3230 default:
3231 val = ldq_p(ptr);
3232 break;
3234 r = MEMTX_OK;
3236 if (result) {
3237 *result = r;
3239 if (release_lock) {
3240 qemu_mutex_unlock_iothread();
3242 rcu_read_unlock();
3243 return val;
3246 uint64_t address_space_ldq(AddressSpace *as, hwaddr addr,
3247 MemTxAttrs attrs, MemTxResult *result)
3249 return address_space_ldq_internal(as, addr, attrs, result,
3250 DEVICE_NATIVE_ENDIAN);
3253 uint64_t address_space_ldq_le(AddressSpace *as, hwaddr addr,
3254 MemTxAttrs attrs, MemTxResult *result)
3256 return address_space_ldq_internal(as, addr, attrs, result,
3257 DEVICE_LITTLE_ENDIAN);
3260 uint64_t address_space_ldq_be(AddressSpace *as, hwaddr addr,
3261 MemTxAttrs attrs, MemTxResult *result)
3263 return address_space_ldq_internal(as, addr, attrs, result,
3264 DEVICE_BIG_ENDIAN);
3267 uint64_t ldq_phys(AddressSpace *as, hwaddr addr)
3269 return address_space_ldq(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3272 uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr)
3274 return address_space_ldq_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3277 uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr)
3279 return address_space_ldq_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3282 /* XXX: optimize */
3283 uint32_t address_space_ldub(AddressSpace *as, hwaddr addr,
3284 MemTxAttrs attrs, MemTxResult *result)
3286 uint8_t val;
3287 MemTxResult r;
3289 r = address_space_rw(as, addr, attrs, &val, 1, 0);
3290 if (result) {
3291 *result = r;
3293 return val;
3296 uint32_t ldub_phys(AddressSpace *as, hwaddr addr)
3298 return address_space_ldub(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3301 /* warning: addr must be aligned */
3302 static inline uint32_t address_space_lduw_internal(AddressSpace *as,
3303 hwaddr addr,
3304 MemTxAttrs attrs,
3305 MemTxResult *result,
3306 enum device_endian endian)
3308 uint8_t *ptr;
3309 uint64_t val;
3310 MemoryRegion *mr;
3311 hwaddr l = 2;
3312 hwaddr addr1;
3313 MemTxResult r;
3314 bool release_lock = false;
3316 rcu_read_lock();
3317 mr = address_space_translate(as, addr, &addr1, &l,
3318 false);
3319 if (l < 2 || !memory_access_is_direct(mr, false)) {
3320 release_lock |= prepare_mmio_access(mr);
3322 /* I/O case */
3323 r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs);
3324 #if defined(TARGET_WORDS_BIGENDIAN)
3325 if (endian == DEVICE_LITTLE_ENDIAN) {
3326 val = bswap16(val);
3328 #else
3329 if (endian == DEVICE_BIG_ENDIAN) {
3330 val = bswap16(val);
3332 #endif
3333 } else {
3334 /* RAM case */
3335 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3336 switch (endian) {
3337 case DEVICE_LITTLE_ENDIAN:
3338 val = lduw_le_p(ptr);
3339 break;
3340 case DEVICE_BIG_ENDIAN:
3341 val = lduw_be_p(ptr);
3342 break;
3343 default:
3344 val = lduw_p(ptr);
3345 break;
3347 r = MEMTX_OK;
3349 if (result) {
3350 *result = r;
3352 if (release_lock) {
3353 qemu_mutex_unlock_iothread();
3355 rcu_read_unlock();
3356 return val;
3359 uint32_t address_space_lduw(AddressSpace *as, hwaddr addr,
3360 MemTxAttrs attrs, MemTxResult *result)
3362 return address_space_lduw_internal(as, addr, attrs, result,
3363 DEVICE_NATIVE_ENDIAN);
3366 uint32_t address_space_lduw_le(AddressSpace *as, hwaddr addr,
3367 MemTxAttrs attrs, MemTxResult *result)
3369 return address_space_lduw_internal(as, addr, attrs, result,
3370 DEVICE_LITTLE_ENDIAN);
3373 uint32_t address_space_lduw_be(AddressSpace *as, hwaddr addr,
3374 MemTxAttrs attrs, MemTxResult *result)
3376 return address_space_lduw_internal(as, addr, attrs, result,
3377 DEVICE_BIG_ENDIAN);
3380 uint32_t lduw_phys(AddressSpace *as, hwaddr addr)
3382 return address_space_lduw(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3385 uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr)
3387 return address_space_lduw_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3390 uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr)
3392 return address_space_lduw_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3395 /* warning: addr must be aligned. The ram page is not masked as dirty
3396 and the code inside is not invalidated. It is useful if the dirty
3397 bits are used to track modified PTEs */
3398 void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,
3399 MemTxAttrs attrs, MemTxResult *result)
3401 uint8_t *ptr;
3402 MemoryRegion *mr;
3403 hwaddr l = 4;
3404 hwaddr addr1;
3405 MemTxResult r;
3406 uint8_t dirty_log_mask;
3407 bool release_lock = false;
3409 rcu_read_lock();
3410 mr = address_space_translate(as, addr, &addr1, &l,
3411 true);
3412 if (l < 4 || !memory_access_is_direct(mr, true)) {
3413 release_lock |= prepare_mmio_access(mr);
3415 r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
3416 } else {
3417 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3418 stl_p(ptr, val);
3420 dirty_log_mask = memory_region_get_dirty_log_mask(mr);
3421 dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
3422 cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
3423 4, dirty_log_mask);
3424 r = MEMTX_OK;
3426 if (result) {
3427 *result = r;
3429 if (release_lock) {
3430 qemu_mutex_unlock_iothread();
3432 rcu_read_unlock();
3435 void stl_phys_notdirty(AddressSpace *as, hwaddr addr, uint32_t val)
3437 address_space_stl_notdirty(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
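/*
 * Illustrative sketch: a target MMU helper setting the dirty bit in a guest
 * page-table entry without flagging the page for code invalidation, which is
 * what the notdirty variant is intended for. "as", "pte_addr" and "PTE_DIRTY"
 * are hypothetical target-specific names.
 *
 *     uint32_t pte = ldl_phys(as, pte_addr);
 *     stl_phys_notdirty(as, pte_addr, pte | PTE_DIRTY);
 */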
3440 /* warning: addr must be aligned */
3441 static inline void address_space_stl_internal(AddressSpace *as,
3442 hwaddr addr, uint32_t val,
3443 MemTxAttrs attrs,
3444 MemTxResult *result,
3445 enum device_endian endian)
3447 uint8_t *ptr;
3448 MemoryRegion *mr;
3449 hwaddr l = 4;
3450 hwaddr addr1;
3451 MemTxResult r;
3452 bool release_lock = false;
3454 rcu_read_lock();
3455 mr = address_space_translate(as, addr, &addr1, &l,
3456 true);
3457 if (l < 4 || !memory_access_is_direct(mr, true)) {
3458 release_lock |= prepare_mmio_access(mr);
3460 #if defined(TARGET_WORDS_BIGENDIAN)
3461 if (endian == DEVICE_LITTLE_ENDIAN) {
3462 val = bswap32(val);
3464 #else
3465 if (endian == DEVICE_BIG_ENDIAN) {
3466 val = bswap32(val);
3468 #endif
3469 r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
3470 } else {
3471 /* RAM case */
3472 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3473 switch (endian) {
3474 case DEVICE_LITTLE_ENDIAN:
3475 stl_le_p(ptr, val);
3476 break;
3477 case DEVICE_BIG_ENDIAN:
3478 stl_be_p(ptr, val);
3479 break;
3480 default:
3481 stl_p(ptr, val);
3482 break;
3484 invalidate_and_set_dirty(mr, addr1, 4);
3485 r = MEMTX_OK;
3487 if (result) {
3488 *result = r;
3490 if (release_lock) {
3491 qemu_mutex_unlock_iothread();
3493 rcu_read_unlock();
3496 void address_space_stl(AddressSpace *as, hwaddr addr, uint32_t val,
3497 MemTxAttrs attrs, MemTxResult *result)
3499 address_space_stl_internal(as, addr, val, attrs, result,
3500 DEVICE_NATIVE_ENDIAN);
3503 void address_space_stl_le(AddressSpace *as, hwaddr addr, uint32_t val,
3504 MemTxAttrs attrs, MemTxResult *result)
3506 address_space_stl_internal(as, addr, val, attrs, result,
3507 DEVICE_LITTLE_ENDIAN);
3510 void address_space_stl_be(AddressSpace *as, hwaddr addr, uint32_t val,
3511 MemTxAttrs attrs, MemTxResult *result)
3513 address_space_stl_internal(as, addr, val, attrs, result,
3514 DEVICE_BIG_ENDIAN);
3517 void stl_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3519 address_space_stl(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3522 void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3524 address_space_stl_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3527 void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3529 address_space_stl_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3532 /* XXX: optimize */
3533 void address_space_stb(AddressSpace *as, hwaddr addr, uint32_t val,
3534 MemTxAttrs attrs, MemTxResult *result)
3536 uint8_t v = val;
3537 MemTxResult r;
3539 r = address_space_rw(as, addr, attrs, &v, 1, 1);
3540 if (result) {
3541 *result = r;
3545 void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3547 address_space_stb(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3550 /* warning: addr must be aligned */
3551 static inline void address_space_stw_internal(AddressSpace *as,
3552 hwaddr addr, uint32_t val,
3553 MemTxAttrs attrs,
3554 MemTxResult *result,
3555 enum device_endian endian)
3557 uint8_t *ptr;
3558 MemoryRegion *mr;
3559 hwaddr l = 2;
3560 hwaddr addr1;
3561 MemTxResult r;
3562 bool release_lock = false;
3564 rcu_read_lock();
3565 mr = address_space_translate(as, addr, &addr1, &l, true);
3566 if (l < 2 || !memory_access_is_direct(mr, true)) {
3567 release_lock |= prepare_mmio_access(mr);
3569 #if defined(TARGET_WORDS_BIGENDIAN)
3570 if (endian == DEVICE_LITTLE_ENDIAN) {
3571 val = bswap16(val);
3573 #else
3574 if (endian == DEVICE_BIG_ENDIAN) {
3575 val = bswap16(val);
3577 #endif
3578 r = memory_region_dispatch_write(mr, addr1, val, 2, attrs);
3579 } else {
3580 /* RAM case */
3581 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3582 switch (endian) {
3583 case DEVICE_LITTLE_ENDIAN:
3584 stw_le_p(ptr, val);
3585 break;
3586 case DEVICE_BIG_ENDIAN:
3587 stw_be_p(ptr, val);
3588 break;
3589 default:
3590 stw_p(ptr, val);
3591 break;
3593 invalidate_and_set_dirty(mr, addr1, 2);
3594 r = MEMTX_OK;
3596 if (result) {
3597 *result = r;
3599 if (release_lock) {
3600 qemu_mutex_unlock_iothread();
3602 rcu_read_unlock();
3605 void address_space_stw(AddressSpace *as, hwaddr addr, uint32_t val,
3606 MemTxAttrs attrs, MemTxResult *result)
3608 address_space_stw_internal(as, addr, val, attrs, result,
3609 DEVICE_NATIVE_ENDIAN);
3612 void address_space_stw_le(AddressSpace *as, hwaddr addr, uint32_t val,
3613 MemTxAttrs attrs, MemTxResult *result)
3615 address_space_stw_internal(as, addr, val, attrs, result,
3616 DEVICE_LITTLE_ENDIAN);
3619 void address_space_stw_be(AddressSpace *as, hwaddr addr, uint32_t val,
3620 MemTxAttrs attrs, MemTxResult *result)
3622 address_space_stw_internal(as, addr, val, attrs, result,
3623 DEVICE_BIG_ENDIAN);
3626 void stw_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3628 address_space_stw(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3631 void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3633 address_space_stw_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3636 void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3638 address_space_stw_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3641 /* XXX: optimize */
3642 void address_space_stq(AddressSpace *as, hwaddr addr, uint64_t val,
3643 MemTxAttrs attrs, MemTxResult *result)
3645 MemTxResult r;
3646 val = tswap64(val);
3647 r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1);
3648 if (result) {
3649 *result = r;
3653 void address_space_stq_le(AddressSpace *as, hwaddr addr, uint64_t val,
3654 MemTxAttrs attrs, MemTxResult *result)
3656 MemTxResult r;
3657 val = cpu_to_le64(val);
3658 r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1);
3659 if (result) {
3660 *result = r;
3663 void address_space_stq_be(AddressSpace *as, hwaddr addr, uint64_t val,
3664 MemTxAttrs attrs, MemTxResult *result)
3666 MemTxResult r;
3667 val = cpu_to_be64(val);
3668 r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1);
3669 if (result) {
3670 *result = r;
3674 void stq_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3676 address_space_stq(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3679 void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3681 address_space_stq_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3684 void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3686 address_space_stq_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3689 /* virtual memory access for debug (includes writing to ROM) */
3690 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3691 uint8_t *buf, int len, int is_write)
3693 int l;
3694 hwaddr phys_addr;
3695 target_ulong page;
3697 while (len > 0) {
3698 int asidx;
3699 MemTxAttrs attrs;
3701 page = addr & TARGET_PAGE_MASK;
3702 phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3703 asidx = cpu_asidx_from_attrs(cpu, attrs);
3704 /* if no physical page mapped, return an error */
3705 if (phys_addr == -1)
3706 return -1;
3707 l = (page + TARGET_PAGE_SIZE) - addr;
3708 if (l > len)
3709 l = len;
3710 phys_addr += (addr & ~TARGET_PAGE_MASK);
3711 if (is_write) {
3712 cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
3713 phys_addr, buf, l);
3714 } else {
3715 address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3716 MEMTXATTRS_UNSPECIFIED,
3717 buf, l, 0);
3719 len -= l;
3720 buf += l;
3721 addr += l;
3723 return 0;
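/*
 * Illustrative sketch: how a debug front end (for example the gdbstub) reads
 * guest virtual memory through this helper. "cpu", "vaddr" and the buffer are
 * hypothetical.
 *
 *     uint8_t buf[64];
 *     if (cpu_memory_rw_debug(cpu, vaddr, buf, sizeof(buf), 0) < 0) {
 *         // no physical page is mapped at vaddr
 *     }
 */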
3727 /* Allows code that needs to deal with migration bitmaps etc to still be built
3728 * target independent. */
3730 size_t qemu_target_page_bits(void)
3732 return TARGET_PAGE_BITS;
3735 #endif
3738 /* A helper function for the _utterly broken_ virtio device model to find out if
3739 * it's running on a big endian machine. Don't do this at home kids! */
3741 bool target_words_bigendian(void);
3742 bool target_words_bigendian(void)
3744 #if defined(TARGET_WORDS_BIGENDIAN)
3745 return true;
3746 #else
3747 return false;
3748 #endif
3751 #ifndef CONFIG_USER_ONLY
3752 bool cpu_physical_memory_is_io(hwaddr phys_addr)
3754 MemoryRegion *mr;
3755 hwaddr l = 1;
3756 bool res;
3758 rcu_read_lock();
3759 mr = address_space_translate(&address_space_memory,
3760 phys_addr, &phys_addr, &l, false);
3762 res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3763 rcu_read_unlock();
3764 return res;
3767 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3769 RAMBlock *block;
3770 int ret = 0;
3772 rcu_read_lock();
3773 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
3774 ret = func(block->idstr, block->host, block->offset,
3775 block->used_length, opaque);
3776 if (ret) {
3777 break;
3780 rcu_read_unlock();
3781 return ret;
3783 #endif
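/*
 * Illustrative sketch (the callback signature is an assumption inferred from
 * the call above): summing the used length of every RAMBlock.
 *
 *     static int add_block_size(const char *name, void *host, ram_addr_t offset,
 *                               ram_addr_t length, void *opaque)
 *     {
 *         *(ram_addr_t *)opaque += length;
 *         return 0;   // keep iterating
 *     }
 *
 *     ram_addr_t total = 0;
 *     qemu_ram_foreach_block(add_block_size, &total);
 */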